In [2]:
# Import Langchain modules
from langchain.document_loaders import PyPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_openai import OpenAIEmbeddings, ChatOpenAI
from langchain.vectorstores import Chroma
from langchain_core.runnables import RunnablePassthrough
from langchain_core.prompts import ChatPromptTemplate
from langchain_core.pydantic_v1 import BaseModel, Field

# Other modules and packages
import os
import tempfile
import streamlit as st  
import pandas as pd
from dotenv import load_dotenv


For example, replace imports like: `from langchain_core.pydantic_v1 import BaseModel`
with: `from pydantic import BaseModel`
or the v1 compatibility namespace if you are working in a code base that has not been fully upgraded to pydantic 2 yet. 	from pydantic.v1 import BaseModel

  exec(code_obj, self.user_global_ns, self.user_ns)


In [3]:
load_dotenv()

True

In [4]:
OPENAI_API_KEY = os.environ.get("OPENAI_API_KEY")

In [6]:
llm = ChatOpenAI(model="gpt-4o-mini", api_key=OPENAI_API_KEY)

In [42]:
loader = PyPDFLoader("data/Transition temp example.pdf")#
# loader = PyPDFLoader("data/FeSe_doping.pdf")
pages = loader.load()
pages

[Document(metadata={'source': 'data/Transition temp example.pdf', 'page': 0}, page_content='Superconductivity at 250 K in lanthanum hydride  under high pressures  \n \nA. P.  Drozdov1, P. P. Kong1, V. S. Minkov1, S. P. Besedin1, M. A. Kuzovnikov1,6, S. Mozaffari2, L. \nBalicas2,  F. Balakirev3, D. Graf2, V. B. Prakapenka4, E. Greenberg4, D. A. Knyazev1, M. Tkacz5, and \nM. I. Eremets1 \n \n \n1Max-Planck -Institut fur  Chemie, Hahn -Meitner Weg 1, 55128 Mainz, Germany   \n2National High Magnetic Field Laboratory  (NHMFL ), Florida State University, Tallahassee, Florida \n32310, USA  \n3NHMFL, Los Alamos National Laboratory, MS E536, Los Alamos, New Mexico 87545, USA  \n4Center for Advanced Radiation Sources, University of Chicago, 5640 South Ellis Avenue, Chicago, \nIllinois, 60637, USA  \n5Institute of Physical Chemistry PAS, Kasprzaka 44/ 52, 01 -224 Warsaw, Poland  \n6Institute of Solid State Physics RAS, Chernogolovka,  Moscow District,  142432 Russia  \n \n \nThe discovery of supe

In [43]:
# split the abstract into chunks for RAG
splitter = RecursiveCharacterTextSplitter(
    chunk_size=1000,
    chunk_overlap=200,
    length_function=len,
    separators=['.  \n', '. \n', '.\n', '. ', '.'],
)

chunks = splitter.split_documents(pages)
print(f'Split into {len(chunks)} chunks')

Split into 74 chunks


In [44]:
def get_embedding_function():
    embeddings = OpenAIEmbeddings(
        model="text-embedding-ada-002", openai_api_key=OPENAI_API_KEY
    )
    return embeddings

embedding_function = get_embedding_function()
# test_vector = embedding_function.embed_query("cat")

In [45]:
from langchain.evaluation import load_evaluator

evaluator = load_evaluator(evaluator="embedding_distance", 
                            embeddings=embedding_function)

# evaluator.evaluate_strings(prediction="Feynman", reference="physics")

In [46]:
import uuid

def create_vectorstore(chunks, embedding_function, vectorstore_path):

    # Create a list of unique ids for each document based on the content
    ids = [str(uuid.uuid5(uuid.NAMESPACE_DNS, doc.page_content)) for doc in chunks]
    
    # Ensure that only unique docs with unique ids are kept
    unique_ids = set()
    unique_chunks = []
    
    unique_chunks = [] 
    for chunk, id in zip(chunks, ids):     
        if id not in unique_ids:       
            unique_ids.add(id)
            unique_chunks.append(chunk) 

    # Create a new Chroma database from the documents
    vectorstore = Chroma.from_documents(documents=unique_chunks, 
                                        ids=list(unique_ids),
                                        embedding=embedding_function, 
                                        persist_directory = vectorstore_path)

    # vectorstore.persist()
    
    return vectorstore

In [47]:
# Create vectorstore
vectorstore = create_vectorstore(chunks=chunks, 
                                 embedding_function=embedding_function, 
                                 vectorstore_path="vectorstore_2")

In [48]:
# Load vectorstore
vectorstore = Chroma(persist_directory="vectorstore_2", embedding_function=embedding_function)

In [49]:
len(chunks), len(vectorstore)

(74, 74)

In [50]:
# Create retriever and get relevant chunks
retriever = vectorstore.as_retriever(search_type="similarity")
# relevant_chunks = retriever.invoke("Material superconducting transition temperature (Tc) i.e., number with unit (K) or range (K-K) or superconducting critical temperature (Tc) in the abstract")
relevant_chunks = retriever.invoke("transition temperature")
relevant_chunks

[Document(metadata={'page': 1, 'source': 'data/Transition temp example.pdf'}, page_content='. \nThe extrapolation of the  temperature depende nt upper critical fields 𝐻𝑐2(𝑇) towards T = 0 K,  Fig. 2b,  \nyields values between 95 and 136 T for 𝐻𝑐2(0). Notice the two steps  near 245 K and 230 K in the \nsuperconducting transition at zero -field. The higher  temperature step gradually broadens with \nincreasing magnetic field and completely disappears above 3  T. This behavior is consistent with \ninhomogeneous superconductivity. While it is difficult to investigate the local inhomogeneity of the \nsuperconducting state in a DAC, multiple examples of m ulti-step transitions in inhomogeneous samples'),
 Document(metadata={'page': 5, 'source': 'data/Transition temp example.pdf'}, page_content='9 Gor’kov, L. P. & Kresin, V. Z. High Pressure and Road to Room Temperature \nSuperconductivity. Rev. Modern Phys. 90, 011001 (2018).  \n10 Allen, P. B. & Dynes, R. C. Transition temperature of strong

In [56]:
print(relevant_chunks[0].page_content)

. 
The extrapolation of the  temperature depende nt upper critical fields 𝐻𝑐2(𝑇) towards T = 0 K,  Fig. 2b,  
yields values between 95 and 136 T for 𝐻𝑐2(0). Notice the two steps  near 245 K and 230 K in the 
superconducting transition at zero -field. The higher  temperature step gradually broadens with 
increasing magnetic field and completely disappears above 3  T. This behavior is consistent with 
inhomogeneous superconductivity. While it is difficult to investigate the local inhomogeneity of the 
superconducting state in a DAC, multiple examples of m ulti-step transitions in inhomogeneous samples


In [57]:
# Prompt template
PROMPT_TEMPLATE = """
You are an assistant for question-answering tasks.
Use the following pieces of retrieved context to answer
the question. If the answer isn't in the text, don't give
a positive answer (there is the option to output "None").
Do not add any information that is not present in the text.

{context}

---

Answer the question based on the above context: {question}
"""

In [59]:
# Concatenate context text
context_text = "\n\n---\n\n".join([doc.page_content for doc in relevant_chunks])

# Create prompt
prompt_template = ChatPromptTemplate.from_template(PROMPT_TEMPLATE)
prompt = prompt_template.format(context=context_text, 
                                question="What is the material used in the paper?")
print(prompt)

Human: 
You are an assistant for question-answering tasks.
Use the following pieces of retrieved context to answer
the question. If the answer isn't in the text, don't give
a positive answer (there is the option to output "None").
Do not add any information that is not present in the text.

. 
The extrapolation of the  temperature depende nt upper critical fields 𝐻𝑐2(𝑇) towards T = 0 K,  Fig. 2b,  
yields values between 95 and 136 T for 𝐻𝑐2(0). Notice the two steps  near 245 K and 230 K in the 
superconducting transition at zero -field. The higher  temperature step gradually broadens with 
increasing magnetic field and completely disappears above 3  T. This behavior is consistent with 
inhomogeneous superconductivity. While it is difficult to investigate the local inhomogeneity of the 
superconducting state in a DAC, multiple examples of m ulti-step transitions in inhomogeneous samples

---

9 Gor’kov, L. P. & Kresin, V. Z. High Pressure and Road to Room Temperature 
Superconductivity.

In [61]:
def format_docs(docs):
    return "\n\n".join(doc.page_content for doc in docs)

rag_chain = (
            {"context": retriever | format_docs, "question": RunnablePassthrough()}
            | prompt_template
            | llm
        )
rag_chain.invoke("What is the material used in the paper?")

AIMessage(content='None', additional_kwargs={'refusal': None}, response_metadata={'token_usage': {'completion_tokens': 1, 'prompt_tokens': 513, 'total_tokens': 514, 'completion_tokens_details': {'audio_tokens': None, 'reasoning_tokens': 0}, 'prompt_tokens_details': {'audio_tokens': None, 'cached_tokens': 0}}, 'model_name': 'gpt-4o-mini-2024-07-18', 'system_fingerprint': 'fp_f59a81427f', 'finish_reason': 'stop', 'logprobs': None}, id='run-3df9d951-df9e-40a1-aaca-d51b26334f05-0', usage_metadata={'input_tokens': 513, 'output_tokens': 1, 'total_tokens': 514, 'input_token_details': {'cache_read': 0}, 'output_token_details': {'reasoning': 0}})

In [62]:
from typing import Tuple, List

# class IntWithSources(BaseModel):
#     """An answer to the question, with sources and reasoning."""
#     answer: int = Field(description="Answer to question")
#     sources: str = Field(description="Full direct text chunk from the context used to answer the question")
#     reasoning: str = Field(description="Explain the reasoning of the answer based on the sources")

class StrWithSources(BaseModel):
    """An answer to the question, with sources and reasoning."""
    answer: str = Field(description="Answer to question")
    sources: str = Field(description="Full direct text chunk from the context used to answer the question")
    reasoning: str = Field(description="Explain the reasoning of the answer based on the sources")

# class TransitionTemp(BaseModel):
#     """An answer to the question, with sources and reasoning."""
#     answer: Tuple[str, int, str] = Field(description="Answer to question: Material, critical temperature, and temperature unit, e.g. ('CuO4', 8, 'K')")
#     sources: str = Field(description="Full direct text chunk from the context used to answer the question")
#     reasoning: str = Field(description="Explain the reasoning of the answer based on the sources")

class TransitionTemp(BaseModel):
    """Information related to the transition temperature, with sources and reasoning."""
    material: str = Field(description="Material used (chemical formula), e.g. 'CuO4' - answer should not contain any unknowns, e.g. 'x' - \
                          if there is no valid answer, the answer should be 'None'")
    critical_temp: str = Field(description="Critical temperature (Tc), e.g. '8 K' - must contain the temperature unit, \
                               and must be a direct quote from the text - if there is no numerical answer, the answer should be 'None'")
    sources: str = Field(description="Full direct text chunk from the context used to answer the question")
    reasoning: str = Field(description="Explain the reasoning of the answer based on the sources")
    
class ExtractedInfo(BaseModel):
    """Extracted information about the research article"""
    paper_summary: StrWithSources
    transition_temp: TransitionTemp

In [None]:
class TransitionTemp(BaseModel):
    """Information related to the transition temperature, with sources and reasoning."""
    material: str = Field(description="Material used (chemical formula), e.g. 'CuO4' - answer should not contain any unknowns, e.g. 'x' - \
                          if there is no valid answer, the answer should be 'None'")
    critical_temp: str = Field(description="Critical temperature (Tc), e.g. '8 K' - must contain the temperature unit, \
                               and must be a direct quote from the text - if there is no numerical answer, the answer should be 'None'")
    sources: str = Field(description="Full direct text chunk from the context used to answer the question")
    reasoning: str = Field(description="Explain the reasoning of the answer based on the sources")

In [63]:
rag_chain = (
            {"context": retriever | format_docs, "question": RunnablePassthrough()}
            | prompt_template
            | llm.with_structured_output(ExtractedInfo, strict=True)
        )

rag_chain.invoke("Give me a summary of the research paper, the material used (chemical formula), and the critical temperature, with its unit.")

ExtractedInfo(paper_summary=StrWithSources(answer='The research paper discusses the quest for room temperature superconductivity (RTSC), highlighting the limits of known superconductors with critical temperatures (Tcs) below approximately 30 K until the discovery of cuprates, which can reach Tcs up to 164 K. It mentions ongoing theoretical and experimental efforts to find materials that can exhibit RTSC, particularly focusing on yttrium super hydrides and carbon-based materials, with an encouraging example of superconductivity found in Q-carbon with Tc approximately 55 K.', sources='The quest for room temperature superconductivity is a longstanding challenge. Superconductivity was considered as a low-temperature phenomenon as the known materials had Tcs inferior to 30 K, until 1986, when Bednorz and Müller discovered the cuprates – copper based superconductors with enormous Tcs that can reach 164 K – or the so-called high temperature superconductors (HTSCs). The cuprates stimulated an 

In [74]:
# do the same using this context:
manual_context = """
The discovery of superconductivity at 203 K in H3S
1 brought attention back to conventional
superconductors whose properties can be described by the Bardeen-Cooper-Schrieffer (BCS) and
the Migdal-Eliashberg theories. These theories predict that high, and even room temperature
superconductivity (RTSC) is possible in metals possessing certain favorable parameters such as
lattice vibrations at high frequencies. However, these general theories do not suffice to predict
real superconductors. New superconducting materials can be predicted now with the aid of first
principles calculations based on Density Functional Theory (DFT). In particular, the calculations
suggested a new family of hydrides possessing a clathrate structure, where the host atom (Ca, Y,
La) is at the center of the cage formed by hydrogen atoms2-4
. For LaH10 and YH10
superconductivity, with critical temperatures Tc ranging between 240 and 320 K is predicted at
megabar pressures3-6
. Here, we report superconductivity with a record Tc  250 K within the
Fm3m structure of LaH10 at a pressure P  170 GPa. We proved the existence of superconductivity
at 250 K through the observation of zero-resistance, isotope effect, and the decrease of Tc under
an external magnetic field, which suggests an upper critical magnetic field of 120 T at zerotemperature. The pressure dependence of the transition temperatures Tc (P) has a maximum of
250-252 K at the pressure of about 170 GPa. This leap, by  50 K, from the previous Tc record of
203 K1
indicates the real possibility of achieving RTSC (that is at 273 K) in the near future at high
pressures and the perspective of conventional superconductivity at ambient pressure.
"""

# get structured output with the manual context and the same question
rag_chain.invoke("Give me a summary of the research paper, the material used (chemical formula), and the critical temperature, with its unit.", context=manual_context)

# this isn't structured output - fix please
llm_struc = llm.with_structured_output(ExtractedInfo, strict=True)

rag_chain = (
            {"context": retriever | format_docs, "question": RunnablePassthrough()}
            | prompt_template
            | llm_struc
        )

rag_chain.invoke("Give me a summary of the research paper, the material used (chemical formula), and the critical temperature, with its unit.")

ExtractedInfo(paper_summary=StrWithSources(answer='The research paper discusses the quest for room temperature superconductivity (RTSC) and highlights the importance of theoretical predictions for high temperature superconductors. It emphasizes the significance of experiments on yttrium super hydrides and mentions an encouraging discovery of superconductivity in Q-carbon with a critical temperature of approximately 55 K.', sources='The quest for room temperature superconductivity is a longstanding challenge... One encouraging example is the discovery of superconductivity with Tc \uf040 55 K in Q-carbon26.', reasoning='The summary focuses on the ongoing research and discoveries related to superconductivity, particularly at room temperature, mentioning specific materials and their critical temperatures.'), transition_temp=TransitionTemp(material='Q-carbon', critical_temp='55 K', sources='One encouraging example is the discovery of superconductivity with Tc \uf040 55 K in Q-carbon26.', re

In [None]:

# make manual_context_dict 
manual_context_dict = {"paper_summary": manual_page, "transition_temp": manual_page}
# invoke the chain with the manual context
rag_chain_manual = (
            {"context": manual_context, "question": RunnablePassthrough()}
            | prompt_template
            | llm.with_structured_output(ExtractedInfo, strict=True)
        )

rag_chain_manual.invoke("Give me a summary of the research paper, the material used (chemical formula), and the critical temperature, with its unit.")


TypeError: Expected a Runnable, callable or dict.Instead got an unsupported type: <class 'str'>