# Ask questions about your own collection of pdfs.
# The pdfs are loaded into a chroma database.
## Items we see here
### reading in all pdfs in a specified directory
### splitting up the pdfs into chunks and vectorizing them
### storing vectors to a chroma database
### load all vectors from chroma database
### for specified questions get similar documents
### langchain -> ask questions using the loaded documents 

1.)  Takes a list of questions to ask about one input directory of pdfs.
2.)  Similarity search on questions.
3.)  YES:  memory in this example.

For RAG Remembering chat history use 
https://python.langchain.com/docs/use_cases/question_answering/chat_vector_db

...use ConversationalRetrievalChain
https://medium.com/@kbdhunga/enhancing-conversational-ai-the-power-of-langchains-question-answer-framework-4974e1cab3cf
section on Memory Based Conversation

In [2]:
from langchain.llms import OpenAI 
from dotenv import load_dotenv
from pathlib import Path
import os
# Settings are split across two dotenv files: one holding the OpenAI key and
# one holding the run parameters for this notebook.
dotenv_path = Path('.env_secure')
load_dotenv(dotenv_path=dotenv_path)
# Kept as .get(): a missing key does not fail at this point.
open_api_key = os.environ.get('OPENAI_API_KEY')
dotenv_path = Path('.env_rag_0006')
load_dotenv(dotenv_path=dotenv_path)
# Required settings are read by indexing os.environ, so a missing variable
# raises KeyError naming that variable instead of an opaque TypeError coming
# from int(None) or None + str.
text_chunk_size = int(os.environ['TEXT_CHUNK_SIZE'])
text_chunk_overlap = int(os.environ['TEXT_CHUNK_OVERLAP'])
query_file_name = os.environ['QUERY_FILE_NAME'] + '.txt'
input_pdf_directory = os.environ['INPUT_PDF_DIRECTORY']
input_query_directory = os.environ['INPUT_QUERY_DIRECTORY']
chroma_directory = './' + os.environ['CHROMA_DIRECTORY']


In [3]:
# Configure logging for the queries, then emit one probe record per severity
# so the effective threshold can be read off the cell output.
import logging

logging.basicConfig(level=logging.INFO)
logging.getLogger('langchain.retrievers.multi_query').setLevel(logging.INFO)
for probe_level, probe_text in (
    (logging.DEBUG, 'test debug'),
    (logging.INFO, 'test info'),
    (logging.WARNING, 'test warning'),
    (logging.ERROR, 'test error'),
    (logging.CRITICAL, 'test critical'),
):
    logging.log(probe_level, probe_text)

INFO:root:test info
ERROR:root:test error
CRITICAL:root:test critical


In [4]:
# To simplify things, delete the contents of the chroma directory and start
# fresh on every run.
# If an 'in use' error is reported, restart the kernel and try again.
import os
import shutil

folder = chroma_directory
# On a first run the directory may not exist yet: there is nothing to clear,
# and os.listdir would raise FileNotFoundError.
if os.path.isdir(folder):
    for filename in os.listdir(folder):
        file_path = os.path.join(folder, filename)
        try:
            if os.path.isfile(file_path) or os.path.islink(file_path):
                os.unlink(file_path)
            elif os.path.isdir(file_path):
                shutil.rmtree(file_path)
        except OSError as e:
            # Best effort: report the entry and keep clearing the rest.
            print(f'Failed to delete {file_path}. Reason: {e}')

In [5]:

from langchain.docstore.document import Document
#from langchain.document_loaders import TextLoader
#from langchain.text_splitter import CharacterTextSplitter
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.vectorstores import Chroma
from langchain.embeddings.openai import OpenAIEmbeddings
from langchain.chat_models import ChatOpenAI
from langchain.chains import RetrievalQA
#from langchain.memory import ConversationBufferMemory


In [6]:

def get_query(specified_file_name):
    """Read a query file and return its lines.

    Args:
        specified_file_name: File name (including extension) inside the
            module-level ``input_query_directory``.

    Returns:
        list[str]: The lines of the file, without line terminators.

    Raises:
        OSError: If the file cannot be opened (e.g. it does not exist).
    """
    specified_full_name = './' + input_query_directory + '/' + specified_file_name
    logging.info(specified_full_name)
    # The context manager guarantees the handle is closed, even if read() fails.
    with open(specified_full_name, "r") as text_file:
        return text_file.read().splitlines()

In [7]:
class StopExecution(Exception):
    """Exception used to stop the notebook run with a short message.

    NOTE(review): ``_render_traceback_`` is presumably the IPython hook for
    custom traceback rendering; returning an empty list would then suppress
    the traceback display - confirm against the IPython documentation.
    """

    def _render_traceback_(self):
        """Log this exception's message and return an empty traceback list."""
        # Log the instance (its message), not the builtin ``Exception`` class.
        logging.info(f'Exception: {self}')
        return []

In [8]:


def check_f(file_to_be_checked):
    """Verify that a file exists in the query directory.

    Prints whether the file exists and raises ``StopExecution`` when it does
    not.

    Args:
        file_to_be_checked: File name inside the module-level
            ``input_query_directory``.

    Raises:
        StopExecution: If the path is not an existing regular file.
    """
    candidate_path = '/'.join(['.', input_query_directory, file_to_be_checked])
    found = os.path.isfile(candidate_path)
    print(found)
    if found:
        return
    raise StopExecution(f'error on file {candidate_path}')


In [10]:
#get all pdf files in directory
#step 1.)  fill up loaders

import glob
from os import path
from langchain.document_loaders import PyPDFLoader
# Collect every pdf in the input directory and wrap each one in a loader.
full_path = './' + input_pdf_directory + '/*.pdf'
a_list = glob.glob(full_path)
logging.info(f'specific pdf list->{a_list}')
pdf_loaders = [PyPDFLoader(pdf_file) for pdf_file in a_list]
logging.info(f'number pdf documents->{len(pdf_loaders)}')

# Load each pdf in turn; the results of load() are concatenated in glob order.
pdf_page_chunks = []
for pdf_loader in pdf_loaders:
    pdf_page_chunks += pdf_loader.load()
    

INFO:root:specific pdf list->['./datafiles2_input_pdf_0006\\afdb23-01_aeo_main_english_0602.pdf', './datafiles2_input_pdf_0006\\Enhancing_understanding_of_SARS-CoV-2_infection_am.pdf', './datafiles2_input_pdf_0006\\GEP-January-2023.pdf', './datafiles2_input_pdf_0006\\Prevalence_of_tuberculosis_COVID-19_chronic_condit.pdf', './datafiles2_input_pdf_0006\\Ramosetal.2023.Hypsolebiaslulai.pdf', './datafiles2_input_pdf_0006\\rep-0523.pdf', './datafiles2_input_pdf_0006\\Siapatisetal2023.pdf', './datafiles2_input_pdf_0006\\SocioeconomicStatusoftheFishingHouseholdsInsightfromSomeSelectedCoastalAreaofBangladesh.pdf', './datafiles2_input_pdf_0006\\Variations_in_growth_performance_of_Catla_Cattla_c.pdf']
INFO:root:number pdf documents->9


In [11]:
# Text splitter configured with the chunk size / overlap settings
text_splitter = RecursiveCharacterTextSplitter(chunk_size=text_chunk_size, chunk_overlap=text_chunk_overlap)

# Split every loaded pdf page into chunks and report how many were produced
splits = text_splitter.split_documents(pdf_page_chunks)
logging.info(f'number of data chunks->{len(splits)}')

INFO:root:number of data chunks->2809


In [12]:

# Build the vector store from the chunks, using chroma_directory as the
# persist directory.
persist_directory = chroma_directory
embedding = OpenAIEmbeddings()
# Explicit string ids, numbered from "1" to str(len(splits))
ids = list(map(str, range(1, len(splits) + 1)))
vectordb = Chroma.from_documents(documents=splits, embedding=embedding, persist_directory=persist_directory, ids=ids)

stored_count = vectordb._collection.count()
logging.info(f'vector database number of stored entries->{stored_count}')


INFO:chromadb.telemetry.product.posthog:Anonymized telemetry enabled. See                     https://docs.trychroma.com/telemetry for more information.
INFO:root:vector database number of stored entries->2809


In [13]:
#https://stackoverflow.com/questions/76551067/how-to-create-a-langchain-doc-from-an-str
#a search "with score" returns a list of (Document, score) tuples instead of a list of Documents
#score_docs converts each tuple back into a langchain Document and stores the score in its metadata
#so this is only necessary when the "with score" variant of the search is used
#entries with duplicate scores are removed, just in case the database was not deleted prior to the run
#and therefore contains duplicate entries
class score_docs:
    """Turn ``(Document, score)`` search hits into score-tagged Documents.

    Each hit becomes a new ``Document`` whose metadata carries the score under
    the ``"score"`` key. Hits whose score was already seen are skipped.
    """

    def __init__(self, score_similarity):
        # Sequence of (Document, score) tuples to convert.
        self.score_similarity = score_similarity
        # Scores accepted by the most recent process_list() call.
        self.score_list = []

    def get_each_docs(self, one_with_score_item):
        """Return a new Document for one ``(Document, score)`` tuple.

        Args:
            one_with_score_item: Tuple whose first item is the source document
                and whose second item is its score.

        Returns:
            Document: Copy of the source content with ``metadata["score"]`` set.
        """
        source_doc = one_with_score_item[0]
        score = one_with_score_item[1]
        # Build a fresh metadata dict so the source document's metadata is
        # never mutated through a shared reference.
        return Document(
            page_content=source_doc.page_content,
            metadata={**source_doc.metadata, "score": score},
        )

    def process_list(self):
        """Convert all hits, dropping those with an already-seen score.

        Returns:
            list: Score-tagged Documents in their original order.
        """
        # Reset the seen scores so repeated calls give the same result instead
        # of treating every score as a duplicate.
        self.score_list = []
        parsed_docs = []
        logging.info(len(self.score_similarity))
        for one_returned_item in self.score_similarity:
            the_returned_doc = self.get_each_docs(one_returned_item)
            score = the_returned_doc.metadata["score"]
            if score in self.score_list:
                logging.info(f'duplicate similarity found, skipping -> {score}')
            else:
                self.score_list.append(score)
                parsed_docs.append(the_returned_doc)
        return parsed_docs
    

In [14]:
# Read the list of questions; fewer than four stops the run
all_questions = get_query(query_file_name)
q_len = len(all_questions)
if q_len < 4:
    raise StopExecution(f'query question length wrong, len->{q_len}')

# Scored similarity search for the first question
question = all_questions[0]
docs = search_results = vectordb.similarity_search_with_score(question, k=5)
logging.info(f'note type is list of tuples0>{type(docs)} / {type(docs[0])}')

# Convert the scored hits into Documents, then persist the vector store
similar_items = score_docs(search_results)
similar_items_list = similar_items.process_list()
vectordb.persist()

INFO:root:./datafiles2_input_query_0006/specified_query.txt
INFO:root:note type is list of tuples0><class 'list'> / <class 'tuple'>
INFO:root:5


In [15]:
#reference-> https://medium.com/@onkarmishra/using-langchain-for-question-answering-on-own-data-3af0a82789ed
#load vector db 
# Load vector database that was persisted earlier and check collection count in it
from langchain.vectorstores import Chroma
from langchain.embeddings.openai import OpenAIEmbeddings
# Re-open the collection from the persist directory and print its entry count
embedding = OpenAIEmbeddings()
persist_directory = chroma_directory
vectordb = Chroma(embedding_function=embedding, persist_directory=persist_directory)
print(vectordb._collection.count())

2809


In [16]:
# Search the reloaded vector store for the first question.
question = all_questions[0]
# A single scored search supplies both the documents and their scores, so no
# separate unscored similarity_search query is issued.
docs = vectordb.similarity_search_with_score(question, k=5)
similar_items = score_docs(docs)
similar_items_list = similar_items.process_list()

print(similar_items_list)
print(len(similar_items_list))

INFO:root:5


[Document(page_content='of Central African States tightened monetary policy \nin 2022 by raising key rates several times to respond \nto inflationary pressures and boost foreign reserves. \nInflation rose to 4.2% in 2022 from 1.1% in 2021 due \nto higher food prices and the effects of Russia’s inva -\nsion of Ukraine. The fiscal balance turned to a surplus \nof 0.8% in 2022 from a deficit of 1.1% in 2021 due to \nhigher oil revenue (up 51.8%). Debt fell to 52.6% of GDP \nin 2022 from 66.0% in 2021 thanks to lower financing', metadata={'page': 180, 'source': './datafiles2_input_pdf_0006\\afdb23-01_aeo_main_english_0602.pdf', 'score': 0.2246154546737671}), Document(page_content='Africa’s average consumer price inflation \nis projected to increase from an estimated \n14.2\xa0percent in 2022 to 15.1\xa0percent in 2023, \nand to decline to 9.5\xa0 percent in 2024. The \nprojected increase in 2023 mirrors structural \nweaknesses in most African countries: supply \nconstraints to offset the e

In [17]:

def one_question(particular_question, search_ty):
    """Answer one question with a RetrievalQA chain and print the outcome.

    Prints the answer and, when available, the file name and page of the
    first source document.

    Args:
        particular_question: Question text sent to the chain as the query.
        search_ty: Search type handed to ``vectordb.as_retriever`` (the
            notebook calls this with ``'mmr'`` and ``'similarity'``).

    Returns:
        None. All output is printed.
    """
    llm_name = "gpt-3.5-turbo"
    llm = ChatOpenAI(model_name=llm_name, temperature=0)
    # fetch_k / lambda_mult tune MMR only; send them just for 'mmr' so other
    # search types are not handed keyword arguments they may not accept.
    search_kwargs = {'k': 3}
    if search_ty == 'mmr':
        search_kwargs.update({'fetch_k': 10, 'lambda_mult': .8})
    retriever = vectordb.as_retriever(search_type=search_ty, search_kwargs=search_kwargs)
    qa_chain = RetrievalQA.from_chain_type(
        llm=llm,
        chain_type="stuff",
        retriever=retriever,
        return_source_documents=True,
    )

    result = qa_chain({"query": particular_question})
    print(f'search type:{search_ty}/result->{result["result"]}')

    source_documents = result['source_documents']
    if not source_documents:
        # Nothing was retrieved: report that instead of raising IndexError.
        print('specific metadata->(no source documents returned)')
        return
    top_metadata = source_documents[0].metadata
    base_name = os.path.basename(top_metadata['source'])
    base_name_page = base_name + '(page:' + str(top_metadata['page']) + ')'
    print(f"specific metadata->{base_name_page}")




In [18]:
import time

# Ask each question with both search types, pausing 5 seconds after every call.
# The loop stops after the question processed while loop_cnt has reached loop_max.
loop_max = 2
loop_cnt = 0
for this_question in all_questions:
    print(this_question)
    for search_mode in ('mmr', 'similarity'):
        one_question(this_question, search_mode)
        time.sleep(5)
    if loop_cnt >= loop_max:
        break
    loop_cnt += 1
    logging.info(f'looping->{loop_cnt}')

What fueled strong inflation pressures in African economies in 2022?
search type:mmr/result->The strong inflation pressures in African economies in 2022 were fueled by higher food prices and the effects of Russia's invasion of Ukraine.
specific metadata->afdb23-01_aeo_main_english_0602.pdf(page:180)
search type:similarity/result->The strong inflation pressures in African economies in 2022 were fueled by higher food prices and the effects of Russia's invasion of Ukraine.
specific metadata->afdb23-01_aeo_main_english_0602.pdf(page:180)


INFO:root:looping->1


What is Down syndrome (DS) characterized by?
search type:mmr/result->Down syndrome (DS) is characterized by intellectual and developmental disabilities, facial dysmorphisms, muscular hypotonia, and numerous birth defects, including cardiac and gastrointestinal anomalies. Individuals with DS also have immune defects, making them more susceptible to autoimmune diseases and infections.
specific metadata->Enhancing_understanding_of_SARS-CoV-2_infection_am.pdf(page:0)
search type:similarity/result->Down syndrome (DS) is characterized by intellectual and developmental disabilities, facial dysmorphisms, muscular hypotonia, and numerous birth defects, including cardiac and gastrointestinal anomalies. Individuals with DS also have immune defects, making them more susceptible to autoimmune diseases and infections.
specific metadata->Enhancing_understanding_of_SARS-CoV-2_infection_am.pdf(page:0)


INFO:root:looping->2


What is the World Bank Evolution Roadmap?
search type:mmr/result->The World Bank Evolution Roadmap is a document being developed by the World Bank to address the challenges faced by developing countries due to the COVID-19 pandemic and other factors such as the Ukraine war. It aims to propose a way forward for the World Bank in supporting countries' efforts to achieve sustainable, inclusive, and resilient development. The roadmap also includes an enhanced formulation of the World Bank's mission, which is to end extreme poverty and boost shared prosperity.
specific metadata->GEP-January-2023.pdf(page:3)
search type:similarity/result->The World Bank Evolution Roadmap is a document being developed by the World Bank at a critical moment in time. It takes into account the impact of the COVID-19 pandemic and the Ukraine war on developing countries, which has led to loss of fiscal buffers, increased indebtedness, and erosion of creditworthiness. The roadmap aims to propose a way forward for t

Right now we have the vector database loaded


What follows is a simple demonstration of chat history with RAG.  
This chain has two steps. First, it condenses the current question and the chat history into a standalone question. This is necessary to create a standalone vector to use for retrieval. After that, it does retrieval and then answers the question using retrieval augmented generation with a separate model. Part of the power of the declarative nature of LangChain is that you can easily use a separate language model for each call. This can be useful to use a cheaper and faster model for the simpler task of condensing the question, and then a more expensive model for answering the question. Here is an example of doing so.
(https://python.langchain.com/docs/use_cases/question_answering/chat_vector_db)

In [19]:
#uses article Siapatisetal2023.pdf
from langchain.chains import ConversationalRetrievalChain

llm_name = "gpt-3.5-turbo"
llm = ChatOpenAI(model_name=llm_name, temperature=0)
retriever = vectordb.as_retriever(search_type='similarity', search_kwargs={'k': 3, 'fetch_k': 10, 'lambda_mult': .8})
qa_chain = ConversationalRetrievalChain.from_llm(
    llm=llm,
    retriever=retriever,
    condense_question_llm=ChatOpenAI(temperature=0, model='gpt-3.5-turbo'),
)


def _ask_with_history(question_text, history):
    """Run one conversational turn, print the raw result and the answer, and return the result."""
    turn_result = qa_chain({"question": question_text, "chat_history": history})
    print(turn_result)
    print(f'result->{turn_result["answer"]}')
    return turn_result


# First turn: starts from an empty history, then seeds it with the first exchange
query = particular_question = 'Where was the first reported record of the Atlantic soft pout?'
chat_history = []
result = _ask_with_history(query, chat_history)
chat_history = [(query, result["answer"])]
print(type(chat_history))
print(type(chat_history[0]))

# Follow-up turns: each one sees the accumulated (question, answer) pairs
for follow_up in ('How was a specimen caught?', 'What waters does it inhabit?'):
    query = particular_question = follow_up
    result = _ask_with_history(query, chat_history)
    chat_history.append((query, result["answer"]))
    print(type(chat_history))
    print(chat_history)


{'question': 'Where was the first reported record of the Atlantic soft pout?', 'chat_history': [], 'answer': 'The first reported record of the Atlantic soft pout, Melanostigma atlanticum, was in the eastern Mediterranean Sea.'}
result->The first reported record of the Atlantic soft pout, Melanostigma atlanticum, was in the eastern Mediterranean Sea.
<class 'list'>
<class 'tuple'>
{'question': 'How was a specimen caught?', 'chat_history': [('Where was the first reported record of the Atlantic soft pout?', 'The first reported record of the Atlantic soft pout, Melanostigma atlanticum, was in the eastern Mediterranean Sea.')], 'answer': 'The specimen of the Atlantic soft pout was caught in a pelagic trawl during a research trip carried out north of the island of Crete (Greece) in December 2019.'}
result->The specimen of the Atlantic soft pout was caught in a pelagic trawl during a research trip carried out north of the island of Crete (Greece) in December 2019.
<class 'list'>
[('Where was 