In [2]:
from utils import build_sentence_window_index, get_openai_api_key, get_sentence_window_query_engine
from llama_index.core.response.notebook_utils import display_response

# Prompt sent to the query engine. It asks for the datasets and models the
# paper used for training/fine-tuning, and fixes the output format.
# NOTE: the literal is passed verbatim, including its leading newline and
# the four-space indentation of each line.
query = """
    Please extract and list all dataset dependencies and model dependencies mentioned in the research paper that were used for training or fine-tuning the main model

    - Include pre-trained models that were fine-tuned or further trained as part of the model development process.
    - Exclude all datasets and models used solely for validation, testing, evaluation, baseline comparisons or benchmarking.
    - For datasets, if a subset was used, list the original, larger dataset as the dependency.
    - Provide a brief explanation for each dependency, showing how it was used in the model development.
    - Exclude general concepts, libraries, tools, and architectures (e.g., Scikit-learn, Logistic Regression, Variational Autoencoder, Text Transformer, etc).

    For instance, if a paper states 'we fine-tuned a pre-trained Model X', then Model X should be listed as a dependency.

    Present the information in this format:
    Dataset dependencies:
    - [Dataset name]: [Brief explanation of its use in training/fine-tuning]
    Model dependencies:
    - [Model name]: [Brief explanation of its use in training/fine-tuning]

    If no relevant datasets or models are identified, state "None identified" under the respective category.
    DO NOT include any other information in your response.
"""

# Inputs: the paper to analyse and the OpenAI credential from the utils helper.
file_path = "./model-papers/plip.pdf"
api_key = get_openai_api_key()

# Sentence-window index over the paper, then the query engine on top of it.
# Both helpers live in the project-local `utils` module; whether the index is
# rebuilt or loaded from a cache is decided there (not visible in this cell).
sentence_index = build_sentence_window_index(file_path, api_key)
sentence_window_engine = get_sentence_window_query_engine(sentence_index)

# Run the prompt against the index and render the answer in the notebook.
window_response = sentence_window_engine.query(query)
display_response(window_response)

**`Final Response:`** Dataset dependencies:
- LAION-5B: Used to establish the PathLAION collection, which contains pathology image–text data from sources beyond Twitter. This subset was used for training the main model.

Model dependencies:
- CLIP: Fine-tuned to develop the PLIP model for visual–language representation and learning in pathology.