# Project: RAG - Q&A on Private Documents using LangChain, OpenAI and Pinecone

This notebook uses **the latest versions** of the OpenAI and LangChain libraries.

In [137]:
pip install -q -r ./requirements.txt

Note: you may need to restart the kernel to use updated packages.


In [3]:
import os
from dotenv import load_dotenv, find_dotenv
# Load variables from the .env file located by find_dotenv(). override=True is passed so that
# values from the file take precedence over variables already set in the environment.
# NOTE(review): presumably this supplies the API keys for the OpenAI and Pinecone clients, which
# are constructed later without explicit credentials -- confirm against the .env file.
load_dotenv(find_dotenv(), override=True)

True

### Loading Documents

In [4]:
# loading PDF, DOCX and TXT files as LangChain Documents
def load_document(file):
    """Load a PDF, DOCX or TXT file with the matching LangChain document loader.

    Args:
        file: Path of the document to load. The loader is chosen from the file
            extension, compared case-insensitively (so 'report.PDF' is accepted).

    Returns:
        The result of the selected loader's ``load()`` call, or ``None`` when the
        extension is not one of '.pdf', '.docx' or '.txt'.
    """
    import os

    # Only the extension is needed; lower-case it so upper/mixed-case suffixes match.
    extension = os.path.splitext(file)[1].lower()

    # Loaders are imported lazily so only the dependency for the requested format is needed.
    if extension == '.pdf':
        from langchain_community.document_loaders import PyPDFLoader
        loader_cls = PyPDFLoader
    elif extension == '.docx':
        from langchain_community.document_loaders import Docx2txtLoader
        loader_cls = Docx2txtLoader
    elif extension == '.txt':
        from langchain_community.document_loaders import TextLoader
        loader_cls = TextLoader
    else:
        print('Document format is not supported!')
        return None

    # Report progress for every supported format (previously the .txt branch was silent).
    print(f'Loading {file}')
    return loader_cls(file).load()
  

In [5]:
# wikipedia
def load_from_wikipedia(query, lang='en', load_max_docs=2):
    """Load Wikipedia search results through LangChain's ``WikipediaLoader``.

    Args:
        query: Search text passed to the loader.
        lang: Language code passed to the loader (default 'en').
        load_max_docs: Value passed as the loader's ``load_max_docs`` (default 2).

    Returns:
        The result of the loader's ``load()`` call.
    """
    # Imported from langchain_community, matching the vector-store import used elsewhere in this notebook.
    from langchain_community.document_loaders import WikipediaLoader
    loader = WikipediaLoader(query=query, lang=lang, load_max_docs=load_max_docs)
    return loader.load()


### Chunking Data

In [6]:
def chunk_data(data, chunk_size=256, chunk_overlap=0):
    """Split documents into chunks with ``RecursiveCharacterTextSplitter``.

    Args:
        data: Documents to split; passed to the splitter's ``split_documents``.
        chunk_size: Value passed as the splitter's ``chunk_size`` (default 256).
        chunk_overlap: Value passed as the splitter's ``chunk_overlap``. Defaults
            to 0, which reproduces the previous hard-coded behaviour.

    Returns:
        The result of ``split_documents(data)``.
    """
    from langchain.text_splitter import RecursiveCharacterTextSplitter
    text_splitter = RecursiveCharacterTextSplitter(chunk_size=chunk_size, chunk_overlap=chunk_overlap)
    return text_splitter.split_documents(data)
    

### Calculating Cost

In [7]:
def print_embedding_cost(texts):
    """Print the token count of ``texts`` and the estimated embedding cost.

    Each item's ``page_content`` is encoded with the tiktoken encoding for
    'text-embedding-3-small'; the cost is computed at 0.00002 USD per 1,000 tokens.
    Nothing is returned.
    """
    import tiktoken

    encoder = tiktoken.encoding_for_model('text-embedding-3-small')

    total_tokens = 0
    for doc in texts:
        total_tokens += len(encoder.encode(doc.page_content))

    # check prices here: https://openai.com/pricing
    cost_usd = total_tokens / 1000 * 0.00002
    print(f'Total Tokens: {total_tokens}')
    print(f'Embedding Cost in USD: {cost_usd:.6f}')

### Embedding and Uploading to a Vector Database (Pinecone)

In [8]:
def insert_or_fetch_embeddings(index_name, chunks):
    """Return a Pinecone vector store for ``index_name``, creating the index if needed.

    If an index called ``index_name`` is already listed by the Pinecone client, a
    vector store is built from it and ``chunks`` is not used. Otherwise a new
    1536-dimension cosine index is created (serverless, AWS us-east-1) and
    ``chunks`` are embedded and inserted via ``Pinecone.from_documents``.

    Args:
        index_name: Name of the Pinecone index to load or create.
        chunks: Documents to embed when the index has to be created.

    Returns:
        The vector store object produced by the LangChain ``Pinecone`` class.
    """
    # importing the necessary libraries and initializing the Pinecone client
    import pinecone
    from langchain_community.vectorstores import Pinecone
    from langchain_openai import OpenAIEmbeddings
    from pinecone import ServerlessSpec

    client = pinecone.Pinecone()
    embeddings = OpenAIEmbeddings(model='text-embedding-3-small', dimensions=1536)  # 512 works as well

    index_exists = index_name in client.list_indexes().names()

    if not index_exists:
        # creating the index and embedding the chunks into the index
        print(f'Creating index {index_name} and embeddings ...', end='')

        serverless = ServerlessSpec(cloud="aws", region="us-east-1")
        client.create_index(name=index_name, dimension=1536, metric='cosine', spec=serverless)

        # embed the documents with the OpenAIEmbeddings instance, insert them into the
        # new index and get back a vector store bound to that index
        store = Pinecone.from_documents(chunks, embeddings, index_name=index_name)
        print('Ok')
        return store

    # loading from existing index
    print(f'Index {index_name} already exists. Loading embeddings ... ', end='')
    store = Pinecone.from_existing_index(index_name, embeddings)
    print('Ok')
    return store
    

In [9]:
def delete_pinecone_index(index_name='all'):
    """Delete one Pinecone index, or every index when ``index_name`` is 'all'.

    Args:
        index_name: Name of the index to delete. The default value 'all' deletes
            every index returned by the client's ``list_indexes().names()``.
    """
    import pinecone
    client = pinecone.Pinecone()

    # Single-index case first; the 'all' sentinel is handled below.
    if index_name != 'all':
        print(f'Deleting index {index_name} ...', end='')
        client.delete_index(index_name)
        print('Ok')
        return

    index_names = client.list_indexes().names()
    print('Deleting all indexes ... ')
    for name in index_names:
        client.delete_index(name)
    print('Ok')
    

### Asking and Getting Answers

In [10]:
def ask_and_get_answer(vector_store, q, k=3):
    """Answer question ``q`` with a RetrievalQA chain backed by ``vector_store``.

    Args:
        vector_store: Object providing ``as_retriever``; used as the retrieval source.
        q: The question passed to the chain's ``invoke``.
        k: Passed to the retriever as ``search_kwargs={'k': k}`` (default 3).

    Returns:
        Whatever the chain's ``invoke(q)`` returns.
    """
    from langchain.chains import RetrievalQA
    from langchain_openai import ChatOpenAI

    chat_model = ChatOpenAI(model='gpt-3.5-turbo', temperature=1)
    doc_retriever = vector_store.as_retriever(
        search_type='similarity',
        search_kwargs={'k': k},
    )
    qa_chain = RetrievalQA.from_chain_type(
        llm=chat_model,
        chain_type="stuff",
        retriever=doc_retriever,
    )
    return qa_chain.invoke(q)


### Running Code

In [11]:
# import warnings
# warnings.filterwarnings('ignore')

#### Ask a PDF

In [12]:
# Load the PDF into the notebook-wide `data` variable used by the following cells.
data = load_document('files/us_constitution.pdf')
# print(data[1].page_content)
# print(data[10].metadata)

# NOTE: the index 20 below is hard-coded, so a document that yields fewer than 21 items
# raises IndexError on the second print.
print(f'You have {len(data)} pages in your data')
print(f'There are {len(data[20].page_content)} characters in the page')

Loading files/us_constitution.pdf
You have 41 pages in your data
There are 1137 characters in the page


In [13]:
# data = load_document('files/the_great_gatsby.docx')
# print(data[0].page_content)

In [14]:
# data = load_from_wikipedia('GPT-4', 'de')
# print(data[0].page_content)

In [15]:
# Split `data` using chunk_data's default chunk_size and print how many chunks were produced.
chunks = chunk_data(data)
print(len(chunks))
# print(chunks[10].page_content)

190


In [16]:
# Print the token count and the estimated embedding cost for `chunks`.
print_embedding_cost(chunks)

Total Tokens: 16711
Embedding Cost in USD: 0.000334


In [17]:
# Destructive: called without arguments, index_name defaults to 'all', so every index
# listed by the Pinecone client is deleted.
delete_pinecone_index()

Deleting all indexes ... 
Ok


In [18]:
# Loads the 'askadocument' index if the Pinecone client already lists it; otherwise the
# index is created and `chunks` are embedded and inserted into it.
index_name = 'askadocument'
vector_store = insert_or_fetch_embeddings(index_name=index_name, chunks=chunks)

Creating index askadocument and embeddings ...Ok


In [19]:
# Ask one question against `vector_store` (k defaults to 3) and print the chain's return value as-is.
q = 'What is the Bill of Rights?'
answer = ask_and_get_answer(vector_store, q)
print(answer)

{'query': 'What is the Bill of Rights?', 'result': 'The Bill of Rights refers to the first ten amendments to the United States Constitution.'}


#### While Loop for Asking Questions

In [20]:
import time

# Interactive Q&A loop over the current `vector_store`; type Quit or Exit to stop.
i = 1
print('Write Quit or Exit to quit.')
while True:
    # Strip surrounding whitespace so inputs such as ' exit ' still end the loop.
    q = input(f'Question #{i}: ').strip()
    if not q:
        # Blank input: prompt again without consuming a question number or calling the LLM chain.
        continue
    i = i + 1
    if q.lower() in ['quit', 'exit']:
        print('Quitting ... bye bye!')
        time.sleep(2)
        break

    answer = ask_and_get_answer(vector_store, q)
    print(f'\nAnswer: {answer}')
    print(f'\n {"-" * 50} \n')

    

Write Quit or Exit to quit.


Question #1:  what is the most controversial bill passed?



Answer: {'query': 'what is the most controversial bill passed?', 'result': "I don't know."}

 -------------------------------------------------- 



Question #2:  How long can a president be there in powwer in USA?



Answer: {'query': 'How long can a president be there in powwer in USA?', 'result': 'A president in the USA can be elected to the office no more than twice according to the 22nd Amendment to the Constitution of the United States, and no person who has held the office of President for more than two years of a term to which some other person was elected President shall be elected to office. A president must also be at least 35 years old and have been a resident of the United States for at least 14 years.'}

 -------------------------------------------------- 



Question #3:  exit


Quitting ... bye bye!


#### Ask Wikipedia

In [21]:
# Destructive: with the default index_name='all', every index listed by the Pinecone client
# is deleted, including the 'askadocument' index built in the PDF section.
delete_pinecone_index()

Deleting all indexes ... 
Ok


In [22]:
# Load 'Google Gemini' results with lang='de' (load_max_docs keeps its default of 2) and chunk them.
# This rebinds the `data` and `chunks` variables that previously held the PDF content.
data = load_from_wikipedia('Google Gemini', 'de')
chunks = chunk_data(data)

In [23]:
# Bare expression: the notebook displays the repr of the entire `chunks` list.
chunks

[Document(page_content='Google Gemini (ehemals Google Bard) ist ein von Google entwickelter KI-basierter, multimodaler Chatbot. Er wurde als direkte Reaktion auf den Erfolg von ChatGPT entwickelt und im März 2023 in eingeschränkter Kapazität veröffentlicht, bevor er im Laufe des', metadata={'title': 'Google Gemini', 'summary': 'Google Gemini (ehemals Google Bard) ist ein von Google entwickelter KI-basierter, multimodaler Chatbot. Er wurde als direkte Reaktion auf den Erfolg von ChatGPT entwickelt und im März 2023 in eingeschränkter Kapazität veröffentlicht, bevor er im Laufe des Sommers in weiteren Ländern verfügbar wurde. Google Gemini ist in 40 Sprachen verfügbar.\n\nÄhnlich wie OpenAIs Konkurrenzprodukt ChatGPT kann Google Gemini in einer gesprächsähnlichen Art eine Vielzahl von Fragen beantworten. Gemini erlaubt multimodales Arbeiten, das heißt, man kann sowohl diverse Medien wie Sprache oder Fotos eingeben als auch unterschiedliche Formate, wie Code oder Sprache, ausgegeben bekomm

In [49]:
# Loads the 'gemini' index if the Pinecone client already lists it; otherwise the index is
# created and the Wikipedia `chunks` are embedded and inserted into it.
index_name = 'gemini'
vector_store = insert_or_fetch_embeddings(index_name=index_name, chunks=chunks)

Creating index gemini and embeddings ...Ok


In [51]:
# Ask a German question against the 'gemini' vector store and print the chain's return value as-is.
q = 'Was ist Google Gemini?'
answer = ask_and_get_answer(vector_store, q)
print(answer)

{'query': 'Was ist Google Gemini?', 'result': 'Google Gemini (ehemals Google Bard) ist ein von Google entwickelter KI-basierter, multimodaler Chatbot. Es wurde entwickelt als Reaktion auf den Erfolg von ChatGPT und bietet eine kostenlose Version sowie eine kostenpflichtige Version namens Gemini Advanced an. Es ist in 40 Sprachen verfügbar.'}
