In [3]:
import os

import openai
from llama_index.core import (
    Document,
    load_index_from_storage,
    Settings,
    SimpleDirectoryReader,
    StorageContext,
    VectorStoreIndex,
)
from llama_index.core.node_parser import (
    get_leaf_nodes,
    HierarchicalNodeParser,
    SentenceWindowNodeParser,
)
from llama_index.core.indices.postprocessor import (
    MetadataReplacementPostProcessor,
    SentenceTransformerRerank,
)
from llama_index.core.query_engine import RetrieverQueryEngine
from llama_index.core.retrievers import AutoMergingRetriever
from llama_index.llms.openai import OpenAI
from llama_index.embeddings.openai import OpenAIEmbedding

In [1]:
# Function to build or load the sentence window index.
def build_sentence_window_index(
    file_path,
    api_key,
    model="gpt-4o",
    embedding_model="text-embedding-3-large",
    index_dir="./sentence_index",
):
    """Build a sentence-window vector index for one file, or reload a persisted one.

    As a side effect the global ``Settings`` object is configured with the
    LLM, the embedding model and the sentence-window node parser, so query
    engines created afterwards use the same configuration.

    Args:
        file_path: Path of the single input file given to ``SimpleDirectoryReader``.
        api_key: OpenAI API key. It is assigned to ``openai.api_key`` and
            passed explicitly to both the LLM and the embedding client.
        model: Name of the OpenAI model used as ``Settings.llm``.
        embedding_model: Name of the OpenAI embedding model used as
            ``Settings.embed_model``.
        index_dir: Directory the index is persisted to. When it already
            exists the index is loaded from it and ``file_path`` is not read.

    Returns:
        The index, either freshly built and persisted or loaded from
        ``index_dir``.

    Note:
        A persisted index is reused as-is; nothing here re-embeds it. Delete
        ``index_dir`` after changing ``embedding_model`` or the input file.
    """
    # Set up OpenAI integration and embeddings
    openai.api_key = api_key
    llm = OpenAI(model=model, temperature=0.1, api_key=api_key)
    embedding = OpenAIEmbedding(model=embedding_model, api_key=api_key)

    # Segment documents using a Sentence Window Parser; the "window" key is
    # the one get_sentence_window_query_engine's post-processor targets.
    node_parser = SentenceWindowNodeParser.from_defaults(
        window_size=4,
        window_metadata_key="window",
        original_text_metadata_key="original_sentence",
    )

    # Global configuration. The embedding model must go on `embed_model`:
    # any other attribute name is silently ignored and the default is used.
    Settings.llm = llm
    Settings.embed_model = embedding
    Settings.node_parser = node_parser

    # Reuse a persisted index without touching the source document.
    if os.path.exists(index_dir):
        return load_index_from_storage(
            StorageContext.from_defaults(persist_dir=index_dir)
        )

    # Load the file and merge its parts into a single document
    documents = SimpleDirectoryReader(input_files=[file_path]).load_data()
    document = Document(text="\n\n".join(doc.text for doc in documents))

    # Create and persist the index (models and parser come from Settings)
    sentence_index = VectorStoreIndex.from_documents([document])
    sentence_index.storage_context.persist(persist_dir=index_dir)
    return sentence_index


# Function to get the sentence window query engine
def get_sentence_window_query_engine(sentence_index, similarity_top_k=10):
    """Create a query engine over a sentence-window index.

    Args:
        sentence_index: Index object exposing ``as_query_engine``, e.g. the
            value returned by ``build_sentence_window_index``.
        similarity_top_k: Number of nodes requested from the index per query.
            Defaults to 10, the value that was previously hard-coded.

    Returns:
        The query engine returned by ``sentence_index.as_query_engine``.
    """
    # Post-processor keyed on "window", the metadata key the sentence-window
    # parser in this notebook is configured to write.
    window_postprocessor = MetadataReplacementPostProcessor(
        target_metadata_key="window"
    )

    return sentence_index.as_query_engine(
        similarity_top_k=similarity_top_k,
        node_postprocessors=[window_postprocessor],
    )

In [None]:

def build_automerging_index(
    file_path,
    api_key,
    model="gpt-4o",
    embedding_model="text-embedding-3-large",
    index_dir="./merging_index",
    chunk_sizes=None,
):
    """Build a hierarchical (auto-merging) vector index, or reload a persisted one.

    As a side effect the global ``Settings`` object is configured with the
    LLM and the embedding model.

    Args:
        file_path: Path of the single input file given to ``SimpleDirectoryReader``.
        api_key: OpenAI API key. It is assigned to ``openai.api_key`` and
            passed explicitly to both the LLM and the embedding client.
        model: Name of the OpenAI model used as ``Settings.llm``.
        embedding_model: Name of the OpenAI embedding model used as
            ``Settings.embed_model``.
        index_dir: Directory the index is persisted to. When it already
            exists the index is loaded from it; ``file_path`` is not read and
            ``chunk_sizes`` is not applied.
        chunk_sizes: Chunk sizes handed to ``HierarchicalNodeParser``.
            ``None`` (or an empty list) selects ``[2048, 512, 128]``.

    Returns:
        The index, either freshly built and persisted or loaded from
        ``index_dir``.

    Note:
        A persisted index is reused as-is; nothing here re-embeds it. Delete
        ``index_dir`` after changing ``embedding_model``, ``chunk_sizes`` or
        the input file.
    """
    chunk_sizes = chunk_sizes or [2048, 512, 128]

    # Set up OpenAI integration and embeddings. The embedding model must go
    # on `embed_model`: any other attribute name is silently ignored.
    openai.api_key = api_key
    Settings.llm = OpenAI(model=model, temperature=0.1, api_key=api_key)
    Settings.embed_model = OpenAIEmbedding(model=embedding_model, api_key=api_key)

    # Reuse a persisted index without re-reading or re-chunking the document.
    if os.path.exists(index_dir):
        return load_index_from_storage(
            StorageContext.from_defaults(persist_dir=index_dir)
        )

    # Load the file and merge its parts into a single document
    documents = SimpleDirectoryReader(input_files=[file_path]).load_data()
    document = Document(text="\n\n".join(doc.text for doc in documents))

    # Segment the document using a Hierarchical Node Parser
    node_parser = HierarchicalNodeParser.from_defaults(chunk_sizes=chunk_sizes)
    nodes = node_parser.get_nodes_from_documents([document])
    leaf_nodes = get_leaf_nodes(nodes)

    # All nodes go into the docstore; only the leaf nodes are passed to the
    # vector index below.
    storage_context = StorageContext.from_defaults()
    storage_context.docstore.add_documents(nodes)

    automerging_index = VectorStoreIndex(leaf_nodes, storage_context=storage_context)
    automerging_index.storage_context.persist(persist_dir=index_dir)
    return automerging_index


# Function to get the automerging query engine.
def get_automerging_query_engine(automerge_index, similarity_top_k=12, rerank_top_n=6):
    """Create a query engine over an auto-merging index.

    Args:
        automerge_index: Index object exposing ``as_retriever`` and
            ``storage_context``, e.g. the value returned by
            ``build_automerging_index``.
        similarity_top_k: Number of nodes requested from the index's retriever.
        rerank_top_n: ``top_n`` passed to the sentence-transformer reranker.

    Returns:
        The engine built by ``RetrieverQueryEngine.from_args`` from the
        auto-merging retriever and the reranker.
    """
    # Plain retriever over the index, wrapped by the auto-merging retriever,
    # which also receives the index's storage context.
    leaf_retriever = automerge_index.as_retriever(similarity_top_k=similarity_top_k)
    merging_retriever = AutoMergingRetriever(
        leaf_retriever,
        automerge_index.storage_context,
        verbose=True,
    )

    # Reranker applied as a node post-processor on the retrieved nodes.
    reranker = SentenceTransformerRerank(
        top_n=rerank_top_n,
        model="BAAI/bge-reranker-base",
    )

    return RetrieverQueryEngine.from_args(
        merging_retriever,
        node_postprocessors=[reranker],
    )


In [2]:
# Driver cell: build (or load) the sentence-window index for one paper and ask
# the query engine to list the paper's training-time dataset and model
# dependencies.
# NOTE(review): this cell relies on names defined in other cells
# (build_sentence_window_index, get_sentence_window_query_engine), and its two
# imports sit outside the main import cell — consider hoisting them so a
# fresh-kernel "Run All" has a single import block.
import utils
from llama_index.core.response.notebook_utils import display_response

# `utils` is defined outside this view; presumably it reads the key from the
# environment or a local config — TODO confirm.
api_key = utils.get_openai_api_key()
# Relative path: resolved against the notebook's working directory.
file_path = "./model-papers/plip.pdf"

# Build or load the sentence window index
sentence_index = build_sentence_window_index(file_path, api_key)

# Get the query engine
sentence_window_engine = get_sentence_window_query_engine(sentence_index)

# Prompt passed unchanged to the query engine. It restricts the answer to
# dependencies used for training/fine-tuning and fixes the output format to
# two bulleted lists ("Dataset dependencies:" / "Model dependencies:").
query = """
    Please extract and list all dataset dependencies and model dependencies mentioned in the research paper that were used for training or fine-tuning the main model

    - Include pre-trained models that were fine-tuned or further trained as part of the model development process.
    - Exclude all datasets and models used solely for validation, testing, evaluation, baseline comparisons or benchmarking.
    - For datasets, if a subset was used, list the original, larger dataset as the dependency.
    - Provide a brief explanation for each dependency, showing how it was used in the model development.
    - Exclude general concepts, libraries, tools, and architectures (e.g., Scikit-learn, Logistic Regression, Variational Autoencoder, Text Transformer, etc).

    For instance, if a paper states 'we fine-tuned a pre-trained Model X', then Model X should be listed as a dependency.

    Present the information in this format:
    Dataset dependencies:
    - [Dataset name]: [Brief explanation of its use in training/fine-tuning]
    Model dependencies:
    - [Model name]: [Brief explanation of its use in training/fine-tuning]

    If no relevant datasets or models are identified, state "None identified" under the respective category.
    DO NOT include any other information in your response.
"""

# Query the index
window_response = sentence_window_engine.query(query)
# Render the response as the cell's output.
display_response(window_response)

**`Final Response:`** Dataset dependencies:
- LAION-5B: Used to establish the PathLAION collection, which contains pathology image–text data from sources beyond Twitter. This subset was used for training the main model.

Model dependencies:
- CLIP: Fine-tuned to develop the PLIP model for visual–language representation and learning in pathology.