In [19]:
# PDF Loader: read the source paper into LangChain Document objects.
from langchain_community.document_loaders import PyPDFLoader
# The path is relative, so the PDF must be in the notebook's working directory.
loader = PyPDFLoader('1-s2.0-S2772899424000302-main.pdf')
# `data` is a list of Document objects. The displayed metadata carries
# 'page' and 'total_pages', which suggests one Document per PDF page.
data = loader.load()
# NOTE(review): this echoes every loaded Document in full; consider showing
# only len(data) or data[0].metadata to keep the saved output small.
data

[Document(metadata={'producer': 'Acrobat Distiller 8.1.0 (Windows)', 'creator': 'Elsevier', 'creationdate': '2025-02-24T19:46:21+05:30', 'crossmarkdomains[2]': 'elsevier.com', 'crossmarkmajorversiondate': '2010-04-23', 'subject': 'Crop Design, 4 (2025) 100081. doi:10.1016/j.cropd.2024.100081', 'author': 'Zhao Liang', 'elsevierwebpdfspecifications': '7.0.1', 'crossmarkdomainexclusive': 'true', 'robots': 'noindex', 'moddate': '2025-02-24T19:48:16+05:30', 'doi': '10.1016/j.cropd.2024.100081', 'crossmarkdomains[1]': 'sciencedirect.com', 'title': 'Analysis of CYP701A1 genes in gossypium species and functional characterization through gene silencing', 'source': '1-s2.0-S2772899424000302-main.pdf', 'total_pages': 12, 'page': 0, 'page_label': '1'}, page_content='Analysis of CYP701A1 genes in gossypium species and functional\ncharacterization through gene silencing\nZhao Lianga, Di Jiachunb, Guo Qia, Xu Zhenzhena, Zhao Juna, Xu Penga, Xu Jianwena,\nLiu Jianguanga, Shen Xinliana, Chen Xushenga,*

In [20]:
# Creating chunks from the documents we loaded from the PDF, so that each
# piece embedded later is small enough to be retrieved on its own.
from langchain_text_splitters import RecursiveCharacterTextSplitter

# chunk_size / chunk_overlap are presumably measured in characters (the
# splitter's default length function) -- TODO confirm against the API docs.
# The overlap repeats the tail of one chunk at the head of the next.
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=1000,
    chunk_overlap=200,
)

# `docs` holds the chunked Documents derived from `data`; the count is printed
# as a quick sanity check on the split.
docs = text_splitter.split_documents(data)
print(len(docs))

58


In [None]:
# Creating embeddings for every chunk in `docs` and storing them in a vector store.
# NOTE(review): this cell has no execution count in the saved notebook, yet the
# retriever cell further down needs `vectorstore`; re-run from a fresh kernel
# (Restart & Run All) to confirm the notebook works top to bottom.

# We can also use other embedding models like OpenAIEmbeddings, HuggingFaceEmbeddings, etc.
from langchain_ollama import OllamaEmbeddings
# We can also use other vector stores like Chroma, Pinecone, etc.
from langchain_community.vectorstores import FAISS

# Presumably requires a reachable Ollama server with the "nomic-embed-text"
# model available -- TODO confirm the setup steps and document them.
embedding_model = OllamaEmbeddings(model="nomic-embed-text")
# Builds the FAISS store from the chunked documents using the embedding model above.
vectorstore = FAISS.from_documents(docs, embedding_model)

In [22]:
# Optional sanity check (left disabled): query the vector store directly to
# inspect which chunks come back for a question, without involving the LLM.
# query = "Who funded for the research?"
# result = vectorstore.similarity_search(query)

In [28]:
# Language model used as the final step of the RAG chain.
# NOTE(review): this cell's execution count (28) is higher than that of the
# cells below it, so it was run out of order; verify with Restart & Run All.
from langchain_ollama import OllamaLLM
# temperature=0 is presumably chosen to make answers as repeatable as possible.
llm = OllamaLLM(model="llama3", temperature=0)

In [24]:
# Design Prompt: the template has two placeholders, {context} and {question},
# which the chain fills from its "context" and "question" inputs. It instructs
# the model to answer only from the supplied context and to say 'I don't know'
# when that context is not sufficient.
from langchain_core.prompts import ChatPromptTemplate

prompt = ChatPromptTemplate.from_template(
    """Only use the following context to answer the question, if the context is not sufficient, say 'I don't know'.
    context: {context}
    question: {question}
    """)

In [25]:
# Creating the retriever that the chain will use to look up context.
# (This cell only wraps the vector store; the chain itself is built separately.)

# search_kwargs is forwarded to the vector store search; "k": 4 is presumably
# the number of chunks returned per query -- TODO confirm against the API docs.
retriever = vectorstore.as_retriever(search_kwargs={"k": 4})
# Echo the retriever so its configuration is visible in the cell output.
retriever

VectorStoreRetriever(tags=['FAISS', 'OllamaEmbeddings'], vectorstore=<langchain_community.vectorstores.faiss.FAISS object at 0x000001413602E2A0>, search_kwargs={'k': 4})

In [26]:
from langchain_core.runnables import RunnablePassthrough


def format_docs(documents):
    """Join the text of retrieved documents into a single context string.

    Args:
        documents: iterable of objects exposing a ``page_content`` string
            (the Documents returned by the retriever).

    Returns:
        The ``page_content`` of each document, in order, separated by a
        blank line. An empty iterable yields an empty string.
    """
    return "\n\n".join(doc.page_content for doc in documents)


# Retrieval-augmented chain: the input question is sent both to the retriever
# (to fetch context) and, unchanged, to the prompt's {question} slot.
#
# The retriever output is piped through format_docs so that {context} receives
# only the chunk text. Without this step the prompt would be filled with the
# repr of a list of Document objects, including every metadata field of the
# PDF (producer, creator, DOI, ...), which wastes context and adds noise.
rag_chain = (
    {
        "context": retriever | format_docs,
        "question": RunnablePassthrough()
    }
    | prompt
    | llm
)


In [27]:
# Run the chain end to end: the question string is the chain's single input
# and `result` holds the model's answer, printed below.
result = rag_chain.invoke("Who funded for the research?")
print(result)

The research was financially supported in part by grants from the National Science Foundation in China (31401429) and the Natural Science Foundation in Jiangsu Province (BK20140747).
