In [32]:
import getpass
import os
from dotenv import load_dotenv
from langchain_openai import ChatOpenAI

os.environ["LANGCHAIN_TRACING_V2"] = "true"

load_dotenv()
if "OPENAI_API_KEY" not in os.environ:
    os.environ["OPENAI_API_KEY"] = getpass.getpass("Enter your OpenAI API key: ")
if "LANGCHAIN_API_KEY" not in os.environ:
    os.environ["LANGCHAIN_API_KEY"] = getpass.getpass("Enter your LangChain API key: ")

llm = ChatOpenAI(model="gpt-4o-mini")

In [None]:
from langchain_community.document_loaders import PyPDFLoader
#load
file_path = "test.pdf"
loader = PyPDFLoader(file_path) # extracts text from PDF file
docs = loader.load()  # create langchain document for each page of the pdf (page content + metadata)

In [34]:
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_chroma import Chroma             # vector store
from langchain_openai import OpenAIEmbeddings   # embedding model

# split
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=1000, 
    chunk_overlap=200, 
)
all_splits = text_splitter.split_documents(docs)
#embed
vector_store = Chroma.from_documents(documents=all_splits, embedding=OpenAIEmbeddings())
#retrieve
retriever = vector_store.as_retriever()

In [None]:
from langchain.chains import create_retrieval_chain
from langchain.chains.combine_documents import create_stuff_documents_chain
from langchain_core.prompts import ChatPromptTemplate

system_prompt = (
    """
        You are a helpful assistant for answering questions based on the content of a PDF document.
        Use the following pieces of context to answer the question.
        If you don't know the answer, just say that you don't know, don't try to make
        up an answer. Use maximum 3 sentences to keep the answer concise.

        Context: {context}
    """
)
prompt = ChatPromptTemplate.from_messages(
    [
        ("system", system_prompt),
        ("human", "{input}"), #create_retrieval_chain expects "input" keyword
    ]
)
qa_chain = create_stuff_documents_chain(llm, prompt)    # 2. inject context into the prompt and pass to LLM
rag_chain = create_retrieval_chain(retriever, qa_chain) # 1. retrieve data, passes to qa_chain



results = rag_chain.invoke({"input": "What are the main topics covered in the PDF?"})
results

{'input': 'What are the main topics covered in the PDF?',
 'context': [Document(id='2d58e7f9-397f-4c3a-888f-1570f8a5192b', metadata={'producer': 'PDFTron PDFNet, V9.21306', 'creationdate': '2022-09-29T07:28:32+00:00', 'total_pages': 30, 'page_label': '5', 'page': 4, 'moddate': '2022-09-29T09:40:10+02:00', 'source': 'test.pdf', 'title': 'Food safety aspects of cell-based food', 'creator': 'PyPDF'}, page_content='iii \n \nContents \n \n \nAcknowledgements ............................................................................................................................................... iv \nAbbreviations and acronyms .................................................................................................................................. v \nExecutive summary ................................................................................................................................................ vi \n1. Introduction ...............................................