In [None]:
import getpass
import os

from langchain.callbacks.manager import CallbackManager
from langchain.chains import RetrievalQA
from langchain.chat_models import ChatOpenAI
from langchain.document_loaders import DirectoryLoader
from langchain.document_loaders import PyPDFLoader
from langchain.embeddings import HuggingFaceInstructEmbeddings
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.vectorstores import Chroma
from langchain_core.prompts import ChatPromptTemplate

# Credentials must not live in the notebook. Reuse a key that is already
# exported in the environment instead of overwriting it with a blank
# placeholder; only when none is set, prompt for it without echoing the input.
if not os.environ.get("OPENAI_API_KEY", "").strip():
    os.environ["OPENAI_API_KEY"] = getpass.getpass("OpenAI API key: ")

In [None]:
# Embedding model shared by the vector-store cells below (it is passed to
# Chroma both when building and when reopening the index). Embedding
# normalisation is switched on through encode_kwargs.
# NOTE(review): a BGE checkpoint is being loaded through the *Instruct*
# embeddings wrapper — confirm this pairing is intentional.
embeddings = HuggingFaceInstructEmbeddings(
    encode_kwargs={"normalize_embeddings": True},
    model_name="BAAI/bge-small-en-v1.5",
)

In [None]:
# Location of the PDF to ingest (placeholder value — set it before running).
pdf_path = " "
# Start from an empty list so that re-running this cell never accumulates
# duplicates from a previous run.
document = []

# Read the PDF and collect every Document object the loader returns.
loader = PyPDFLoader(pdf_path)
document.extend(loader.load())

In [None]:
# Break the loaded documents into chunks for embedding.
# Current setting: chunk_size=2000 with chunk_overlap=50
# (chunk_size=2500 was tried previously).
text_splitter = RecursiveCharacterTextSplitter(
    chunk_overlap=50,
    chunk_size=2000,
)
texts = text_splitter.split_documents(document)

In [None]:
# Run this cell only if you have added any new documents.
# It embeds the chunks in `texts` and stores them in a Chroma collection
# backed by PERSIST_DIRECTORY.

PERSIST_DIRECTORY = 'doc_db'

db = Chroma.from_documents(
    embedding=embeddings,
    persist_directory=PERSIST_DIRECTORY,
    documents=texts,
)


In [None]:
# Execute this cell only if no new docs are loaded.
# It opens the Chroma store located in PERSIST_DIRECTORY without adding
# any documents to it.

PERSIST_DIRECTORY = 'doc_db'

db = Chroma(
    embedding_function=embeddings,
    persist_directory=PERSIST_DIRECTORY,
)



In [None]:
# Expose the vector store as a retriever using the
# "similarity_score_threshold" search type with a score threshold of 0.7.
retriever = db.as_retriever(
    search_kwargs={'score_threshold': 0.7},
    search_type="similarity_score_threshold",
)

In [None]:
# Prompt handed to the RetrievalQA "stuff" chain built further down.
# The chain fills two variables: {context} (the retrieved documents) and
# {question} (the user's query). The question placeholder must therefore be
# named `question`; a differently named placeholder (e.g. `input`) is never
# supplied by RetrievalQA and makes the chain fail with a missing-input-key
# error.
template = """
You are an experienced document analyzer. You need to extract information from the document \
and answer to the USER QUESTION based on the CONTEXT accordingly.
Use the provided context only to answer the following question:
<context>
{context}
</context>

To answer the question do the following:
* Read the CONTEXT clearly before answering the QUESTION.
* Provide a clear and consise answer to the QUESTION only. NEVER hallucinate any information. NEVER fetch information from any other sources.
* If no relevant documents are found in the CONTEXT then inform the user "I cannot provide answer to the above question as it is out of scope". 
* Do not generate extra questions. ONLY provide answer to the QUESTION.
* Do not use any other information, other than what is mentioned in the CONTEXT for answering.
* Try to give the answers in bullet points.

Question: {question}
"""

prompt = ChatPromptTemplate.from_template(template)

In [None]:
# Chat model that answers from the retrieved context. ChatOpenAI is imported
# in the imports cell at the top of the notebook; it authenticates with the
# OPENAI_API_KEY environment variable.
llm = ChatOpenAI(model_name="gpt-3.5-turbo")

In [None]:
# `callback_manager` was referenced without ever being defined, which raises
# NameError on a fresh kernel. Create it here with no handlers attached; add
# handlers to the list to enable tracing/streaming.
callback_manager = CallbackManager(handlers=[])

# Retrieval-augmented QA chain: the retriever fetches matching chunks, and the
# "stuff" chain places all of them into the prompt's {context} slot before
# calling the LLM.
qa = RetrievalQA.from_chain_type(
    llm=llm,
    chain_type="stuff",  # try other chains types as well. refine, map_reduce, map_rerank
    retriever=retriever,
    return_source_documents=True,  # also expose the retrieved docs in the result; verbose=True is another option
    callbacks=callback_manager,
    chain_type_kwargs={
        "prompt": prompt,
    },
)

In [None]:
# Ask a question about the document as a whole, print the generated answer,
# and display the `source` metadata of each returned source document.
query = "What is the content of the document?"
res = qa(query)
answer = res["result"]
docs = res["source_documents"]
print(answer)
[doc.metadata['source'] for doc in docs]

In [None]:
# Ask a follow-up question (placeholder text — replace the ellipsis), print
# the answer, and display the `source` metadata of each source document.
query = "Explain the concept of ...?"
res = qa(query)
answer = res["result"]
docs = res["source_documents"]
print(answer)
[doc.metadata['source'] for doc in docs]