RAG with Chroma and GPT-3.5

In [1]:
from langchain.document_loaders import PyPDFLoader,CSVLoader
from langchain.embeddings.openai import OpenAIEmbeddings
from langchain.text_splitter import CharacterTextSplitter, RecursiveCharacterTextSplitter
from langchain_openai import ChatOpenAI


In [2]:
import os

# Never hard-code the API key in the notebook: read it from the environment so
# the secret is not saved or committed alongside the code. Set OPENAI_API_KEY
# before starting the kernel. Falls back to the original empty string when the
# variable is not set.
openai_key = os.environ.get('OPENAI_API_KEY', '')

We'll create an instance of the LLM that we'll use in the rest of the notebook.

In [3]:
from langchain_openai import OpenAI, OpenAIEmbeddings

# Single model instance reused by the question-answering chains further down.
llm = OpenAI(
    openai_api_key=openai_key,
    model_name="gpt-3.5-turbo-instruct",
)

We'll load a copy of my resume and ask questions based on the information in it.

In [4]:
# Splitter configuration: chunk length and overlap handed to the splitter.
chunk_size = 200
chunk_overlap = 50
rc_splitter = RecursiveCharacterTextSplitter(
    chunk_overlap=chunk_overlap,
    chunk_size=chunk_size,
)

# Read the resume PDF, then break the loaded documents into chunks.
loader = PyPDFLoader(file_path='Data/RAG_Documents/Lokesh_subramany_resume.pdf')
data = loader.load()
docs = rc_splitter.split_documents(data)

We'll now create a Chroma vector store that is persisted to disk, then load the document chunks into it using the specified embedding function.

In [5]:
from langchain_openai import OpenAIEmbeddings,OpenAI
from langchain_community.vectorstores import Chroma

# On-disk location of the Chroma collection.
CHROMA_DIR = 'Data/Chroma/'

embedding_function = OpenAIEmbeddings(openai_api_key=openai_key)

# Deterministic ids, one per chunk, so that re-running this cell overwrites the
# same records instead of appending duplicates to the persisted collection.
# NOTE: if the chunking settings change and fewer chunks are produced, stale
# records from an earlier run remain in CHROMA_DIR; delete the directory then.
chunk_ids = [f"resume-chunk-{i}" for i in range(len(docs))]

# Build the store directly in the persisted directory. Previously the
# documents were loaded into a second store created without a
# persist_directory, so the on-disk store stayed empty.
docstorage = Chroma.from_documents(
    docs,
    embedding_function,
    ids=chunk_ids,
    persist_directory=CHROMA_DIR,
)

# Keep the original `chromadb` name pointing at the (now populated) store.
chromadb = docstorage

# Flush to disk *after* the documents are added. This matters on Chroma < 0.4;
# newer versions persist automatically and only emit a deprecation warning.
chromadb.persist()

Now we can ask questions related to the loaded document.

In [10]:
from langchain.chains import RetrievalQA

# Expose the vector store as a retriever and wire it to the LLM.
retriever = docstorage.as_retriever()
qa = RetrievalQA.from_llm(retriever=retriever, llm=llm)

# Ask a first question about the loaded resume and show the raw response dict.
results = qa.invoke('Whose resume is it?')
print(results)

{'query': 'Whose resume is it?', 'result': " Lokesh Subramany's resume."}


In [11]:
from langchain.chains import RetrievalQAWithSourcesChain

# Chain variant whose response also names the source document.
qa_source = RetrievalQAWithSourcesChain.from_chain_type(
    retriever=retriever,
    llm=llm,
)

results = qa_source.invoke(
    'Whose resume is it?',
    return_only_outputs=True,
)
print(results)

{'answer': ' The resume belongs to Lokesh Subramany.\n', 'sources': 'Data/RAG_Documents/Lokesh_subramany_resume.pdf'}


In [12]:
# Follow-up question sent through the same source-reporting chain.
results = qa_source.invoke(
    "What company did lokesh last work",
    return_only_outputs=True,
)
print(results)

{'answer': ' Lokesh last worked at Qualcomm.\n', 'sources': 'Data/RAG_Documents/Lokesh_subramany_resume.pdf'}
