In [3]:
!pip3 install --upgrade --quiet langchain langchain-community langchain-openai chromadb 
!pip3 install --upgrade --quiet pypdf pandas streamlit python-dotenv

In [4]:
# Import Langchain modules
from langchain.document_loaders import PyPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_openai import OpenAIEmbeddings, ChatOpenAI
from langchain.vectorstores import Chroma
from langchain_core.runnables import RunnablePassthrough
from langchain_core.prompts import ChatPromptTemplate
from langchain_core.pydantic_v1 import BaseModel, Field

# Other modules and packages
import os
import tempfile
import streamlit as st  
import pandas as pd
from dotenv import load_dotenv


For example, replace imports like: `from langchain_core.pydantic_v1 import BaseModel`
with: `from pydantic import BaseModel`
or the v1 compatibility namespace if you are working in a code base that has not been fully upgraded to pydantic 2 yet. 	from pydantic.v1 import BaseModel

  exec(code_obj, self.user_global_ns, self.user_ns)


In [5]:
# Read variables from a local .env file (if one is found) into the process environment,
# so the API key lookup in the next cell can find OPENAI_API_KEY.
load_dotenv()

True

In [6]:
# API key for the OpenAI clients below; None when the environment variable is not set
OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")

## Define our LLM

In [8]:
# Chat model reused by every generation step later in the notebook
llm = ChatOpenAI(model="gpt-4o-mini", api_key=OPENAI_API_KEY)
# Smoke test: a single request to confirm the key and model name are accepted
llm.invoke("Tell me a joke about cats")

AIMessage(content='Why was the cat sitting on the computer?\n\nBecause it wanted to keep an eye on the mouse!', additional_kwargs={'refusal': None}, response_metadata={'token_usage': {'completion_tokens': 20, 'prompt_tokens': 13, 'total_tokens': 33, 'completion_tokens_details': {'audio_tokens': None, 'reasoning_tokens': 0}, 'prompt_tokens_details': {'audio_tokens': None, 'cached_tokens': 0}}, 'model_name': 'gpt-4o-mini-2024-07-18', 'system_fingerprint': 'fp_482c22a7bc', 'finish_reason': 'stop', 'logprobs': None}, id='run-53758b6b-a8e7-4b31-b074-d13cdf6aba00-0', usage_metadata={'input_tokens': 13, 'output_tokens': 20, 'total_tokens': 33, 'input_token_details': {'cache_read': 0}, 'output_token_details': {'reasoning': 0}})

## 1. Process PDF document

### Load PDF document

In [17]:
!pip3 install --upgrade --quiet rapidocr-onnxruntime

In [114]:
# Load the PDF; the displayed result is a list of Document objects whose metadata
# carries the 'source' path and the 'page' index.
loader = PyPDFLoader("data/FeSe_strain.pdf")  # active input document
# loader = PyPDFLoader("data/FeSe_doping.pdf")
pages = loader.load()
pages

[Document(metadata={'source': 'data/FeSe_strain.pdf', 'page': 0}, page_content='Evolution of pairing symmetry in FeSe 1−xSxas probed by uniaxial-strain tuning of Tc\nRuixian Liu§,1Qi Tang§,1Chang Liu,1Chunyi Li,1Kaijuan Zhou,1Qiaoyu Wang,1and Xingye Lu1,∗\n1Center for Advanced Quantum Studies, School of Physics and Astronomy, Beijing Normal University, Beijing, 100875, China\n(Dated: October 18, 2024)\nIn iron-based superconductors (FeSCs), the interplay between electronic nematicity and superconductivity\nis essential for understanding the exotic superconducting ground state. In the nematic regime, uniaxial-strain\n(ε) tuning of the superconducting transition temperature Tc[∆Tc(ε) =αε+βε2] offers a unique approach\nto investigating the evolution of pairing symmetry if both sanddwave pairing instabilities are relevant. Here,\nwe employ uniaxial strain to tune the Tcof FeSe 1−xSx, in which both nematicity and superconductivity un-\ndergo significant changes with doping. While Tcis usual

### Split document

In [115]:
# Split the pages into overlapping chunks for embedding.
# Sizes are measured in characters (length_function=len): target chunk size 1500, overlap 200.
# Separators are listed from coarsest to finest: paragraph break, line break, space.
text_splitter = RecursiveCharacterTextSplitter(chunk_size=1500,
                                            chunk_overlap=200,
                                            length_function=len,
                                            separators=["\n\n", "\n", " "])
chunks = text_splitter.split_documents(pages)

### Create embeddings

In [116]:
def get_embedding_function():
    """Build the OpenAI embedding client used throughout the notebook.

    Returns:
        An ``OpenAIEmbeddings`` instance configured for the
        "text-embedding-ada-002" model and authenticated with ``OPENAI_API_KEY``.
    """
    return OpenAIEmbeddings(
        openai_api_key=OPENAI_API_KEY,
        model="text-embedding-ada-002",
    )

# Single embedding client shared by the evaluator and the vector store cells below
embedding_function = get_embedding_function()
# Optional sanity check (disabled): embed one word and inspect the resulting vector
# test_vector = embedding_function.embed_query("cat")

In [31]:
# len(test_vector)

1536

In [117]:
from langchain.evaluation import load_evaluator

# Evaluator that scores a prediction/reference pair with the "embedding_distance" metric,
# computed with the same embedding client as the vector store.
evaluator = load_evaluator(evaluator="embedding_distance", 
                            embeddings=embedding_function)

# evaluator.evaluate_strings(prediction="Feynman", reference="physics")

In [33]:
# evaluator.evaluate_strings(prediction="Feynman", reference="biology")

{'score': 0.2021738115865368}

### Create vector database

In [118]:
import uuid

def create_vectorstore(chunks, embedding_function, vectorstore_path):
    """Create a Chroma vector store from de-duplicated document chunks.

    Each chunk gets a deterministic UUIDv5 derived from its ``page_content``, so
    chunks with identical text share an id and only the first occurrence is kept.

    Args:
        chunks: Iterable of documents exposing a ``page_content`` string.
        embedding_function: Embedding object forwarded to ``Chroma.from_documents``.
        vectorstore_path: Directory handed to Chroma as ``persist_directory``.

    Returns:
        The vector store returned by ``Chroma.from_documents``.
    """
    # A dict keeps insertion order, so every id stays paired with the chunk it was
    # derived from. (Passing ``list(a_set_of_ids)`` would hand Chroma the ids in
    # arbitrary set order and attach them to the wrong documents.)
    chunks_by_id = {}
    for chunk in chunks:
        chunk_id = str(uuid.uuid5(uuid.NAMESPACE_DNS, chunk.page_content))
        chunks_by_id.setdefault(chunk_id, chunk)  # first occurrence wins

    # Create a new Chroma database from the unique documents
    vectorstore = Chroma.from_documents(documents=list(chunks_by_id.values()),
                                        ids=list(chunks_by_id.keys()),
                                        embedding=embedding_function,
                                        persist_directory=vectorstore_path)

    return vectorstore

In [119]:
# Create vectorstore
# Embeds the de-duplicated chunks and stores them under the "vectorstore_1" directory
vectorstore = create_vectorstore(chunks=chunks, 
                                 embedding_function=embedding_function, 
                                 vectorstore_path="vectorstore_1")

In [120]:
len(chunks)

24

In [121]:
len(vectorstore)

24

## 2. Query for relevant data

In [6]:
# Load vectorstore
# Re-opens the store kept in the "vectorstore_1" directory. `Chroma` and `embedding_function`
# must already be defined in the current kernel (run the import and embedding cells first).
vectorstore = Chroma(persist_directory="vectorstore_1", embedding_function=embedding_function)

NameError: name 'Chroma' is not defined

In [123]:
len(vectorstore)

24

In [5]:
# Create retriever and get relevant chunks
# The retriever wraps the vector store with search_type="similarity"; invoking it with a
# question returns the chunks it considers most relevant to that question.
retriever = vectorstore.as_retriever(search_type="similarity")
relevant_chunks = retriever.invoke("What is the material used in the paper?")
relevant_chunks

NameError: name 'vectorstore' is not defined

In [3]:
# Prompt template
# Placeholders: {context} receives the retrieved chunk text, {question} the user question.
# The instructions tell the model to answer only from the supplied context.
PROMPT_TEMPLATE = """
You are an assistant for question-answering tasks.
Use the following pieces of retrieved context to answer
the question. If the answer isn't in the text, say that you
don't know. DON'T MAKE UP ANYTHING.

{context}

---

Answer the question based on the above context: {question}
"""

## 3. Generate responses

In [4]:
# Concatenate context text
# Chunks are separated by a '---' rule so their boundaries stay visible in the prompt
context_text = "\n\n---\n\n".join([doc.page_content for doc in relevant_chunks])

# Create prompt
# prompt_template is also reused by the LCEL chains defined further down
prompt_template = ChatPromptTemplate.from_template(PROMPT_TEMPLATE)
prompt = prompt_template.format(context=context_text, 
                                question="What is the material used in the paper?")
print(prompt)

NameError: name 'relevant_chunks' is not defined

In [127]:
llm.invoke(prompt)

AIMessage(content='The material used in the paper is FeSe (iron selenide) and its variant FeSe 1−xSx, along with references to iron pnictides and other related materials.', additional_kwargs={'refusal': None}, response_metadata={'token_usage': {'completion_tokens': 40, 'prompt_tokens': 1342, 'total_tokens': 1382, 'completion_tokens_details': {'audio_tokens': None, 'reasoning_tokens': 0}, 'prompt_tokens_details': {'audio_tokens': None, 'cached_tokens': 0}}, 'model_name': 'gpt-4o-mini-2024-07-18', 'system_fingerprint': 'fp_f59a81427f', 'finish_reason': 'stop', 'logprobs': None}, id='run-416b9cd3-41cf-462a-af6a-5ea5f34f0590-0', usage_metadata={'input_tokens': 1342, 'output_tokens': 40, 'total_tokens': 1382, 'input_token_details': {'cache_read': 0}, 'output_token_details': {'reasoning': 0}})

### Using Langchain Expression Language

In [128]:
def format_docs(docs):
    """Merge the text of several documents into one context string.

    Args:
        docs: Iterable of objects exposing a ``page_content`` string.

    Returns:
        The ``page_content`` values joined by a blank line; an empty string
        when ``docs`` is empty.
    """
    page_texts = [document.page_content for document in docs]
    return "\n\n".join(page_texts)

# LCEL pipeline: the input question is passed through unchanged as "question", while the
# retriever output is run through format_docs to become "context"; both fill prompt_template,
# and the rendered prompt is sent to the LLM.
rag_chain = (
            {"context": retriever | format_docs, "question": RunnablePassthrough()}
            | prompt_template
            | llm
        )
rag_chain.invoke("What is the material used in the paper?")

AIMessage(content='The material used in the paper is titanium, which serves as a platform for applying strain to a thin crystal affixed to it. Additionally, the study also discusses iron-based superconductors, specifically FeSe and its variants.', additional_kwargs={'refusal': None}, response_metadata={'token_usage': {'completion_tokens': 44, 'prompt_tokens': 1337, 'total_tokens': 1381, 'completion_tokens_details': {'audio_tokens': None, 'reasoning_tokens': 0}, 'prompt_tokens_details': {'audio_tokens': None, 'cached_tokens': 0}}, 'model_name': 'gpt-4o-mini-2024-07-18', 'system_fingerprint': 'fp_482c22a7bc', 'finish_reason': 'stop', 'logprobs': None}, id='run-2dbf396a-2913-4db6-9540-b744d2b74284-0', usage_metadata={'input_tokens': 1337, 'output_tokens': 44, 'total_tokens': 1381, 'input_token_details': {'cache_read': 0}, 'output_token_details': {'reasoning': 0}})

### Generate structured responses

In [129]:
from typing import Tuple, List

# class IntWithSources(BaseModel):
#     """An answer to the question, with sources and reasoning."""
#     answer: int = Field(description="Answer to question")
#     sources: str = Field(description="Full direct text chunk from the context used to answer the question")
#     reasoning: str = Field(description="Explain the reasoning of the answer based on the sources")

class StrWithSources(BaseModel):
    """An answer to the question, with sources and reasoning."""
    # Generic free-text answer together with the evidence and rationale backing it.
    # NOTE(review): the docstring and Field descriptions are presumably forwarded to the model
    # as part of the structured-output schema, so edit them as prompt text, not as comments.
    answer: str = Field(description="Answer to question")
    sources: str = Field(description="Full direct text chunk from the context used to answer the question")
    reasoning: str = Field(description="Explain the reasoning of the answer based on the sources")

# class TransitionTemp(BaseModel):
#     """An answer to the question, with sources and reasoning."""
#     answer: Tuple[str, int, str] = Field(description="Answer to question: Material, critical temperature, and temperature unit, e.g. ('CuO4', 8, 'K')")
#     sources: str = Field(description="Full direct text chunk from the context used to answer the question")
#     reasoning: str = Field(description="Explain the reasoning of the answer based on the sources")

class TransitionTemp(BaseModel):
    """Information related to the transition temperature, with sources and reasoning."""
    # Descriptions are built from adjacent string literals. A backslash continuation inside
    # a single literal would embed the next line's indentation (a long run of spaces) in the text.
    # NOTE(review): these descriptions are presumably forwarded to the model as part of the
    # structured-output schema, so they act as prompt text.
    material: str = Field(
        description=(
            "Material used (chemical formula), e.g. 'CuO4' - "
            "answer should not contain any unknowns, e.g. 'x' - "
            "if there is no valid answer, the answer should be 'None'"
        )
    )
    critical_temp: str = Field(
        description=(
            "Critical temperature (Tc), e.g. '8 K' - must contain the temperature unit, "
            "and must be a direct quote from the text - "
            "if there is no numerical answer, the answer should be 'None'"
        )
    )
    sources: str = Field(description="Full direct text chunk from the context used to answer the question")
    reasoning: str = Field(description="Explain the reasoning of the answer based on the sources")
    
class ExtractedInfo(BaseModel):
    """Extracted information about the research article"""
    # Top-level schema given to llm.with_structured_output; one nested model per extracted item.
    paper_summary: StrWithSources  # free-text summary with sources and reasoning
    transition_temp: TransitionTemp  # material and critical temperature with sources and reasoning

In [130]:
# Same retrieval + prompt pipeline as before, but the LLM is wrapped so that it returns an
# ExtractedInfo object instead of a plain message. This rebinds the name `rag_chain`.
rag_chain = (
            {"context": retriever | format_docs, "question": RunnablePassthrough()}
            | prompt_template
            | llm.with_structured_output(ExtractedInfo, strict=True)
        )

rag_chain.invoke("Give me a summary of the research paper, the material used (chemical formula), and the critical temperature, with its unit.")

ExtractedInfo(paper_summary=StrWithSources(answer='The research paper investigates the relationship between uniaxial strain and superconducting transition temperature (Tc) in the iron selenide system FeSe1−xSx, focusing on the doping dependence and nematic fluctuations. It presents experimental data showing how the application of strain affects Tc, with results indicating a predominant s±wave pairing in lightly doped samples and variations in nematic susceptibility with doping concentrations. The study employs techniques like the four-electrode method to measure resistivity and explores the interplay between nematicity and superconductivity in this material.', sources='The functional form of ∆Tc(φ)[equivalent to ∆Tc(ε)], dictated by the coupling FSC−nem∝φ∆s∆dcosθ, allows us to unveil the evolution of underlying pairing symmetry in the nematic regime of FeSe 1−xSx. In undoped and lightly doped FeSe 1−xSx, ε[110]-induced change in Tc is dominated by ∆Tc(ε) = βε2 with β < 0, consistent wi

### Transform response into a dataframe

In [75]:
structured_response = rag_chain.invoke("Give me the title, summary, publication date, authors of the research paper.")
# NOTE(review): the question asks for title/date/authors, but the resulting columns are whatever
# fields the chain's output schema declares — confirm that the question and the schema agree.
response_fields = structured_response.dict()
df = pd.DataFrame([response_fields])


def _field_answer(field):
    """Return a field's 'answer', or its remaining values when it has no 'answer' key."""
    if 'answer' in field:
        return field['answer']
    # e.g. TransitionTemp exposes material / critical_temp instead of a single 'answer';
    # indexing ['answer'] unconditionally would raise KeyError for such fields.
    return ", ".join(f"{key}: {value}" for key, value in field.items()
                     if key not in ('sources', 'reasoning'))


# Transforming into a table with three rows: 'answer', 'source' and 'reasoning'
# (values are read from the plain dict rather than via chained DataFrame indexing)
answer_row = [_field_answer(response_fields[col]) for col in df.columns]
source_row = [response_fields[col]['sources'] for col in df.columns]
reasoning_row = [response_fields[col]['reasoning'] for col in df.columns]

# Create new dataframe with one column per extracted field
structured_response_df = pd.DataFrame([answer_row, source_row, reasoning_row], columns=df.columns, index=['answer', 'source', 'reasoning'])
structured_response_df

Unnamed: 0,paper_title,paper_summary,publication_year,paper_authors
answer,Consequences of Erudite Vernacular Utilized Ir...,The paper explores the negative relationship b...,2006,Daniel M. Oppenheimer
source,"Copyright#2005 John Wiley & Sons, Ltd. Appl. C...",Most texts on writing style encourage authors ...,Appl. Cognit. Psychol. 20: 139–156 (2006),"Correspondence to: D. M. Oppenheimer, Departme..."
reasoning,The title is explicitly mentioned at the begin...,The summary is derived from the overall conten...,The publication year is indicated in the citat...,The author’s name is provided in the correspon...
