In [3]:
## Logging
import logging
import sys

# Send log records to stdout at DEBUG level.
# `basicConfig` already attaches a stdout StreamHandler to the root logger, so
# adding a second StreamHandler here would print every record twice.
logging.basicConfig(stream=sys.stdout, level=logging.DEBUG)

In [4]:
import os
import shutil
from llama_index.readers.smart_pdf_loader import SmartPDFLoader
from llama_index.llms.openai import OpenAI
from llama_index.core import Settings, StorageContext, load_index_from_storage, VectorStoreIndex, SimpleDirectoryReader
from llama_index.core.node_parser import SentenceSplitter
from llama_index.core.storage.docstore import SimpleDocumentStore
from llama_index.core import VectorStoreIndex, get_response_synthesizer
from llama_index.core.retrievers import VectorIndexRetriever
from llama_index.core.query_engine import RetrieverQueryEngine
from llama_index.core.postprocessor import SimilarityPostprocessor, PrevNextNodePostprocessor
from llama_index.core import PromptTemplate
from llama_index.core.schema import MetadataMode
from IPython.display import Markdown, display
from dotenv import load_dotenv


# Load environment variables (e.g. OPENAI_API_KEY) from the local env file
load_dotenv('config.env')
llmsherpa_api_url = "http://localhost:5010/api/parseDocument?renderFormat=all"
PERSIST_DIR = "./storage"

# Verify the key is set; report a missing key explicitly instead of staying
# silent, so the problem is visible before the first API call fails.
if os.getenv('OPENAI_API_KEY'):
    print('API ok')
else:
    print('WARNING: OPENAI_API_KEY is not set; check config.env')


# define prompt viewing function
def display_prompt_dict(prompts_dict):
    """Render each prompt of a prompt dictionary in the notebook.

    Args:
        prompts_dict: mapping of prompt key -> prompt object exposing
            ``get_template()``.

    For every entry the key is shown as Markdown, followed by the raw
    template text and a blank spacer.
    """
    for k, p in prompts_dict.items():
        # Explicit <br> tags keep the key and the "Text:" label on separate
        # lines; without them the two labels run together when rendered.
        text_md = f"**Prompt Key**: {k}<br>" f"**Text:** <br>"
        display(Markdown(text_md))
        print(p.get_template())
        # Visual spacer between prompts (an empty Markdown renders nothing)
        display(Markdown("<br><br>"))

def display_prompt_from_query_engine(query_engine):
    """Show every prompt returned by ``query_engine.get_prompts()``.

    Args:
        query_engine: object exposing ``get_prompts()``, which returns the
            prompt dictionary handed to ``display_prompt_dict``.
    """
    display_prompt_dict(query_engine.get_prompts())

def print_response(response, width=80):
    """Print ``response.response`` word-wrapped to at most ``width`` columns.

    Args:
        response: object whose ``response`` attribute holds the answer text
            (``None`` is treated as empty text).
        width: maximum line length; defaults to 80.

    Line breaks already present in the text are kept (blank lines stay
    blank). Words are never split, so a single word longer than ``width``
    is printed on a line of its own.
    """
    import textwrap

    text = response.response or ""
    for paragraph in text.splitlines():
        if paragraph.strip():
            print(
                textwrap.fill(
                    paragraph,
                    width=width,
                    # keep file names / hyphenated terms intact
                    break_long_words=False,
                    break_on_hyphens=False,
                )
            )
        else:
            # preserve paragraph separation
            print()

def move_parsed_doc(temp_docs='./data_to_add', docs_dir='./data'):
    """Move every entry of ``temp_docs`` into ``docs_dir``.

    Args:
        temp_docs: folder holding the newly ingested documents.
        docs_dir: destination folder; created if it does not exist yet.
    """
    # shutil.move fails when the destination folder is missing
    os.makedirs(docs_dir, exist_ok=True)
    for document in os.listdir(temp_docs):
        shutil.move(os.path.join(temp_docs, document), os.path.join(docs_dir, document))

def remove_useless_metadata(documents):
    """Drop file-system bookkeeping keys from each document's metadata.

    Args:
        documents: iterable of objects with a ``metadata`` dict and an
            ``excluded_llm_metadata_keys`` attribute.

    Returns:
        The same ``documents`` object, modified in place.

    Keys that are absent are skipped instead of raising ``KeyError``.
    ``excluded_llm_metadata_keys`` is reset to an empty list on every
    document.
    """
    useless_keys = (
        'file_path',
        'file_size',
        'creation_date',
        'last_modified_date',
        'file_type',
        'chunk_type',
    )
    for document in documents:
        for key in useless_keys:
            document.metadata.pop(key, None)
        document.excluded_llm_metadata_keys = []
    return documents
        
def remove_references_and_acknowledgments(documents):
    """Remove, in place, documents whose text contains a boilerplate marker.

    Args:
        documents: list of objects with a ``text`` attribute.

    Returns:
        The same list object, with matching documents removed.

    Matching is a case-insensitive substring test, so a document is dropped
    as soon as any marker appears anywhere in its text. Both spellings of
    "acknowledgment"/"acknowledgement" are covered.
    """
    markers = (
        'reference',
        'acknowledgment',
        'acknowledgement',
        'conclusions and perspectives',
        'literature cited',
        'cell science at a glance',
    )

    def _is_boilerplate(document):
        text = document.text.lower()
        return any(marker in text for marker in markers)

    # Single pass with slice assignment: keeps the caller's list object while
    # avoiding a linear `list.remove` scan for every dropped document.
    documents[:] = [document for document in documents if not _is_boilerplate(document)]
    return documents

def remove_first_line(documents):
    """Strip the first line from every document's text, in place.

    Args:
        documents: iterable of objects with a writable ``text`` attribute.

    Returns:
        The same ``documents`` object.

    The remaining lines are re-joined with ``"\\n"``; a text with one line
    or none becomes the empty string.
    """
    for doc in documents:
        # Slicing past the first element yields [] for 0- or 1-line texts,
        # which joins to "".
        doc.text = "\n".join(doc.text.splitlines()[1:])
    return documents

def documents_parsing(documents):
    """Run the document clean-up steps and return the result.

    The steps run in this order: drop reference/acknowledgment documents,
    strip unwanted metadata keys, then remove the first line of each text.
    """
    cleaning_steps = (
        remove_references_and_acknowledgments,
        remove_useless_metadata,
        remove_first_line,
    )
    for step in cleaning_steps:
        documents = step(documents)
    return documents

def add_new_documents_to_index(index, PERSIST_DIR="./storage", llmsherpa_api_url="http://localhost:5010/api/parseDocument?renderFormat=all"):
    """Ingest the files waiting in ``./data_to_add`` into an existing index.

    Args:
        index: index exposing ``insert_nodes`` and ``storage_context``.
        PERSIST_DIR: directory the updated index is persisted to.
        llmsherpa_api_url: endpoint handed to ``SmartPDFLoader`` for PDFs.

    Does nothing when ``./data_to_add`` is missing or empty. Otherwise the
    files are loaded, cleaned with ``documents_parsing``, split into nodes,
    inserted into ``index``, persisted, and finally moved away by
    ``move_parsed_doc``.
    """
    new_docs_dir = "./data_to_add"
    # Guard: a missing folder used to crash os.listdir; treat it like "empty"
    if not os.path.isdir(new_docs_dir) or not os.listdir(new_docs_dir):
        return

    # SmartPDFLoader with `SimpleDirectoryReader`
    parser = SmartPDFLoader(llmsherpa_api_url=llmsherpa_api_url)
    file_extractor = {".pdf": parser}
    documents = SimpleDirectoryReader(
        new_docs_dir,
        file_extractor=file_extractor,
    ).load_data()
    documents = documents_parsing(documents)

    # Split documents into nodes
    splitter = SentenceSplitter(chunk_size=1024, chunk_overlap=128)
    nodes = splitter.get_nodes_from_documents(documents)
    index.insert_nodes(nodes)

    # store it for later
    index.storage_context.persist(persist_dir=PERSIST_DIR)

    # Move the ingested files out of the drop folder
    move_parsed_doc()


API ok


In [2]:
# Override the persist directory assigned in the setup cell ("./storage");
# the cells below read and write the index under this path instead.
PERSIST_DIR = "./storage_smart_pdf"

In [3]:
## Load or read data

# check if storage already exists
if not os.path.exists(PERSIST_DIR):
    # SmartPDFLoader with `SimpleDirectoryReader`
    parser = SmartPDFLoader(llmsherpa_api_url=llmsherpa_api_url)
    file_extractor = {".pdf": parser}
    documents = SimpleDirectoryReader(
        "./data", file_extractor=file_extractor
    ).load_data()
    # Remove useless metadata and remove citation sections from documents
    documents = documents_parsing(documents)
    splitter = SentenceSplitter(chunk_size=1024, chunk_overlap=128)
    # Split documents into nodes
    print('Splitting Nodes')
    nodes = splitter.get_nodes_from_documents(documents)
    # Create an index from the nodes
    print('Indexing the nodes')
    index = VectorStoreIndex(nodes, show_progress=True)
    # store it for later
    index.storage_context.persist(persist_dir=PERSIST_DIR)
else:
    # load the existing index
    storage_context = StorageContext.from_defaults(persist_dir=PERSIST_DIR)
    index = load_index_from_storage(storage_context)
    # Pass the active settings explicitly: the function's own defaults point
    # at "./storage", so without this the updated index would be persisted
    # to a different directory than the one it was just loaded from.
    add_new_documents_to_index(
        index,
        PERSIST_DIR=PERSIST_DIR,
        llmsherpa_api_url=llmsherpa_api_url,
    )

Splitting Nodes


  from .autonotebook import tqdm as notebook_tqdm


Indexing the nodes
Some nodes are missing content, skipping them...


Generating embeddings: 100%|██████████| 1758/1758 [00:28<00:00, 61.65it/s]


In [None]:
# Filter out one file

from llama_index.core import StorageContext
from llama_index.core.vector_stores import MetadataFilters, MetadataFilter, FilterOperator

# Metadata filter selecting the nodes whose `file_name` equals one specific PDF
filters = MetadataFilters(
    filters=[
        MetadataFilter(
            key="file_name",
            operator=FilterOperator.EQ,
            value='The kinetochore–microtubule interface at a glance.pdf',
        )
    ]
)

# Deletion is left disabled on purpose; uncomment both lines to delete the
# matching nodes and persist the change.
#index.vector_store.delete_nodes(filters=filters)
#index.storage_context.persist(persist_dir=PERSIST_DIR)

In [4]:
# Collect every node stored in the index's docstore and report the count
nodes = [*index.docstore.docs.values()]
total_nodes = len(nodes)
print(f"Total number of nodes: {total_nodes}")

Total number of nodes: 1758


In [22]:
# Assign the LLM on the shared `Settings` object (o1-preview snapshot).
# NOTE(review): o1-series models may not honour a custom temperature — confirm
# that 0.6 actually takes effect for this model.
Settings.llm = OpenAI(temperature=0.6, model="o1-preview-2024-09-12")

In [23]:
## Query Engine for dissertation
# Builds a query engine that retrieves many chunks (top 80), keeps only those
# above the similarity cutoff, and summarizes them with a custom prompt that
# asks for a long, citation-annotated single paragraph.

# configure retriever
retriever = VectorIndexRetriever(
    index=index,
    similarity_top_k=80,
)

# configure response synthesizer
response_synthesizer = get_response_synthesizer(
    response_mode="tree_summarize",
)

# assemble query engine with similarity postprocessor
query_engine_dissertation = RetrieverQueryEngine(
    retriever=retriever,
    response_synthesizer=response_synthesizer,
    node_postprocessors=[SimilarityPostprocessor(similarity_cutoff=0.85)]
)

# Custom summary prompt: `{context_str}` and `{query_str}` are the template
# placeholders; adjacent string literals are concatenated into one template.
qa_prompt_tmpl_str = (
    "Context information from multiple scientific papers is below.\n"
    "---------------------\n"
    "---------------------\n"
    "{context_str}\n"
    "---------------------\n"
    "---------------------\n"
    "Given the context information and not prior knowledge, "
    "I want you to write a complete and exhaustive dissertation on this topic(s):\n"
    "Topics: {query_str}\n"
    "Please use the given topics to compose a complete and exhaustive dissertation, as it is a paragraph of a scientific review on the topic. Drawing on information only from the context provided.\n"
    "Be complete and exhaustive and make sure to include all the relevant information from the given context but avoid redundancy in the information.\n"
    "Avoid redundancy in the information you write.\n"
    # Disabled instruction (kept for reference):
#   "Give also examples of experimetns made that prove the statments and so where a piece of knowledge cames from.\n"
    "Write this as a single paragraph without any subsections or titles and ensure that there is logical flow and coherence between different concepts and pieces of information. Avoid making final summaries or conclusion paragraphs\n"
    "I want you to mention also from which file_name and section each piece of information is taken, in brakets as they were citations.\n"
    "Use neutral, clear, easy and appropriate language. Write at least 500 words.\n"
    "Answer: "
)

qa_prompt_tmpl = PromptTemplate(qa_prompt_tmpl_str)

# Replace the synthesizer's summary template with the custom prompt
query_engine_dissertation.update_prompts(
    {"response_synthesizer:summary_template": qa_prompt_tmpl}
)

In [7]:
## Query Engine
# General-purpose engine: top-10 retrieval, similarity cutoff, tree summarization.

# response synthesizer
response_synthesizer = get_response_synthesizer(response_mode="tree_summarize")

# retriever
retriever = VectorIndexRetriever(index=index, similarity_top_k=10)

# query engine; nodes scoring below the cutoff are discarded before synthesis
query_engine = RetrieverQueryEngine(
    node_postprocessors=[SimilarityPostprocessor(similarity_cutoff=0.85)],
    response_synthesizer=response_synthesizer,
    retriever=retriever,
)

In [7]:
# Normal Query
# Read the question interactively from the notebook prompt
query = input()

print('Query: ', query)

# The question is sent with an extra instruction appended, asking the model to
# name the source file_name and section for each piece of information.
response = query_engine.query(f'{query} I want you to mention also from which file_name and section each piece of information is taken.')
# Print the answer using the word-wrapping helper
print_response(response)

Query:  The role of intra complex duplications in Ska and Dam1 complex evolution. 
Intra-complex duplications played a significant role in the evolution of the Ska
and Dam1 complexes. Gene duplications contributed to the invention of Dam1-C and
Ska-C, indicating that such duplications were a mode of invention shared with other
protein complexes (Pereira-Leal et al. 2007). This is evident from the fact that
within the Ska-C, all three subunits are homologous to one another, and within Dam1-C,
there are two sets of homologous subunits (Duo1-Dad2 and Dad1-Dad4-Ask1). The intra-complex
homology reveals that gene duplications were crucial in the formation and evolution
of these complexes. This information is derived from the section on page 1297 of
the file "Unique Phylogenetic Distributions of the Ska and Dam1.pdf".


In [27]:
# Dissertation Query
# Read the topic(s) interactively from the notebook prompt
query = input()

print('Query: \n', query)

# Sent as-is to the dissertation query engine
response = query_engine_dissertation.query(query)
print('Answer: ')
# Print the answer using the word-wrapping helper
print_response(response)

Query: 
 Winged Helix domain (or winged Helix turn Helix - wHTH). Its structure, functions and evolutionary history.
Answer: 
The Winged Helix domain (WHD), also known as the winged helix-turn-helix (wHTH)
domain, is a specific subtype of the helix-turn-helix (HTH) family of DNA-binding
motifs, characterized by a compact α/β structure consisting of three α-helices (H1,
H2, H3), three β-strands (S1, S2, S3), and two characteristic loops forming the
wings (W1 and W2), arranged in the order H1-S1-H2-H3-S2-W1-S3-W2 (*Winged helix
proteins.pdf*). The N-terminal half of the WHD is largely helical, while the C-terminal
half comprises the twisted antiparallel β-sheet and the wings, which flank helix
H3 like the wings of a butterfly, inspiring the name winged helix motif (*Winged
helix proteins.pdf*). The wings, particularly W1, often provide additional interfaces
for nucleic acid contact, typically interacting with the minor groove of DNA through
charged residues in the hairpin (*The many face

In [28]:
# Print the response object directly, without the word-wrapping helper
print(response)

The Winged Helix domain (WHD), also known as the winged helix-turn-helix (wHTH) domain, is a specific subtype of the helix-turn-helix (HTH) family of DNA-binding motifs, characterized by a compact α/β structure consisting of three α-helices (H1, H2, H3), three β-strands (S1, S2, S3), and two characteristic loops forming the wings (W1 and W2), arranged in the order H1-S1-H2-H3-S2-W1-S3-W2 (*Winged helix proteins.pdf*). The N-terminal half of the WHD is largely helical, while the C-terminal half comprises the twisted antiparallel β-sheet and the wings, which flank helix H3 like the wings of a butterfly, inspiring the name winged helix motif (*Winged helix proteins.pdf*). The wings, particularly W1, often provide additional interfaces for nucleic acid contact, typically interacting with the minor groove of DNA through charged residues in the hairpin (*The many faces of the helix-turn-helix domain.pdf*). Variations of the WHD exist, with different versions containing two, three, or four β-

In [None]:
# Dump the source nodes behind the last answer: id, metadata and full text
print(f'Used {len(response.source_nodes)} nodes to answer:')
for node_with_score in response.source_nodes:
    # One print call per node; sep='\n' puts every item on its own line
    print(
        '---------------------------------------------------',
        '---------------------------------------------------',
        f"Document ID: {node_with_score.node.id_}",
        f"Metadata: {node_with_score.node.metadata}",
        f"Text: {node_with_score.node.text}",
        sep='\n',
    )

#### Nodes retrieval

In [None]:
from llama_index.embeddings.openai import OpenAIEmbedding
from llama_index.core.retrievers import VectorIndexRetriever

# Load the index from storage
storage_context = StorageContext.from_defaults(persist_dir=PERSIST_DIR)
index = load_index_from_storage(storage_context)

# Input the query from the user
query_str = input("Enter your query: ")

# Create a retriever and perform the query.
# The query text is passed directly: `retrieve` takes a string (or a query
# bundle) and lets the retriever embed it itself, rather than using a
# separately constructed embedding model that may differ from the one the
# index was built with. The previous code also used `VectorStoreQuery`
# without importing it.
retriever = VectorIndexRetriever(index=index, similarity_top_k=5)
result = retriever.retrieve(query_str)

# Process and print the retrieved nodes; each item pairs a node with its score
for node_with_score in result:
    print(f"Node ID: {node_with_score.node.id_}, Similarity: {node_with_score.score}")
    print(f"Content: {node_with_score.node.get_content()}")


#### Query the model directly

In [None]:
import os
# Imported under an alias so it does not shadow the LlamaIndex `OpenAI` class
# used for `Settings.llm`; shadowing it would break re-running that cell.
from openai import OpenAI as OpenAIClient

client = OpenAIClient(
    api_key=os.environ.get("OPENAI_API_KEY"),  # This is the default and can be omitted
)

# Send a single user message, read interactively, straight to the model
chat_completion = client.chat.completions.create(
    messages=[
        {
            "role": "user",
            "content": input(),
        }
    ],
    model="o1-preview-2024-09-12",
)

# Print the text of the first returned choice
print(chat_completion.choices[0].message.content)