## Question and Answer Application - on Private Documents


In [1]:
import os
from dotenv import load_dotenv, find_dotenv

# Load settings from a .env file into the process environment; the Pinecone
# helpers further down read PINECONE_API_KEY and PINECONE_ENV via os.environ.
# NOTE(review): override=True presumably lets .env values replace variables
# that are already set in the environment — confirm against python-dotenv docs.
load_dotenv(find_dotenv(), override=True)

True

In [2]:
# pip install pypdf -q

In [3]:
# pip install docx2txt -q

In [4]:
pip install wikipedia -q

Note: you may need to restart the kernel to use updated packages.


In [5]:
from langchain.document_loaders import PyPDFLoader, Docx2txtLoader
def load_document(file):
    """Load a PDF or DOCX file with the matching LangChain loader.

    The loader is chosen from the file extension. The comparison is
    case-insensitive, so names such as ``REPORT.PDF`` are accepted too.

    Args:
        file: Path of the document to load.

    Returns:
        Whatever the selected loader's ``load()`` returns, or ``None`` when
        the extension is neither ``.pdf`` nor ``.docx``.
    """
    import os

    # Normalise the case so '.PDF' / '.Docx' are treated like '.pdf' / '.docx'.
    extension = os.path.splitext(file)[1].lower()
    if extension == '.pdf':
        loader_cls = PyPDFLoader
    elif extension == '.docx':
        loader_cls = Docx2txtLoader
    else:
        print(f"Document format for {file} not supported")
        return None

    print(f'loading {file}')
    return loader_cls(file).load()

In [6]:
from langchain.document_loaders import WikipediaLoader

def load_from_wikipedia(query, lang='en', load_max_docs=2):
    """Download Wikipedia content for ``query`` through ``WikipediaLoader``.

    Args:
        query: Search text forwarded to the loader.
        lang: Language argument forwarded to the loader (default ``'en'``).
        load_max_docs: Limits the number of downloaded documents. Defaults to
            2, the value that used to be hard-coded here.

    Returns:
        The result of the loader's ``load()`` call.
    """
    loader = WikipediaLoader(query=query, lang=lang, load_max_docs=load_max_docs)
    return loader.load()

#### Chunk data

In [10]:
from langchain.text_splitter import RecursiveCharacterTextSplitter
def chunk_data(data, chunk_size=256, chunk_overlap=0):
    """Split already-loaded documents into smaller chunks.

    Args:
        data: Documents to split, as returned by the loader helpers.
        chunk_size: ``chunk_size`` forwarded to ``RecursiveCharacterTextSplitter``.
        chunk_overlap: ``chunk_overlap`` forwarded to the splitter. Defaults
            to 0, the value that used to be hard-coded here.

    Returns:
        The chunks produced by ``split_documents``.
    """
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
    )
    # Use create_documents instead of split_documents when the input is not
    # already split in pages.
    return text_splitter.split_documents(data)

#### Calculating cost

In [11]:
import tiktoken
def print_embedding_cost(texts, price_per_1k_tokens=0.0004):
    """Print the token count of ``texts`` and the estimated embedding cost.

    Tokens are counted with the tiktoken encoding for
    ``text-embedding-ada-002``.

    Args:
        texts: Iterable of objects exposing a ``page_content`` string.
        price_per_1k_tokens: USD price per 1000 tokens. Defaults to 0.0004,
            the value that used to be hard-coded here.
            NOTE(review): confirm this against current OpenAI pricing.

    Returns:
        None. The two summary lines are printed.
    """
    enc = tiktoken.encoding_for_model('text-embedding-ada-002')
    # Generator expression: no need to build a throwaway list just to sum it.
    total_tokens = sum(len(enc.encode(page.page_content)) for page in texts)
    print(f'Total Tokens: {total_tokens}')
    print(f'Embedding Cost in USD: {total_tokens/1000 * price_per_1k_tokens:.6f}')

### Embedding and uploading to a vector database (Pinecone)

In [13]:
import pinecone
from langchain.vectorstores import Pinecone
from langchain.embeddings.openai import OpenAIEmbeddings

def insert_or_fetch_embeddings(index_name, chunks=None):
    """Return a Pinecone vector store for ``index_name``, creating it if needed.

    If the index already exists it is opened as-is. Otherwise the index is
    created and ``chunks`` are passed to ``Pinecone.from_documents``.

    Args:
        index_name: Name of the Pinecone index to open or create.
        chunks: Documents to index when the index has to be created. When
            omitted, the notebook-level ``chunks`` variable is used, which
            keeps existing one-argument calls working.

    Returns:
        The vector store bound to ``index_name``.

    Raises:
        ValueError: The index must be created but no chunks are available.
            This is raised before the index is created, so no empty index is
            left behind.
    """
    embeddings = OpenAIEmbeddings()
    pinecone.init(api_key=os.environ.get('PINECONE_API_KEY'), environment=os.environ.get('PINECONE_ENV'))

    if index_name in pinecone.list_indexes():
        print(f"Index {index_name} already exist")
        vector_store = Pinecone.from_existing_index(index_name, embeddings)
        print("Completed")
        return vector_store

    if chunks is None:
        # Backward compatibility: older calls relied on a global named `chunks`.
        chunks = globals().get('chunks')
    if chunks is None:
        raise ValueError(
            f"Cannot create index {index_name}: pass `chunks` or define a global `chunks` first"
        )

    print(f"Creating index: {index_name}")
    # NOTE(review): dimension=1536 presumably matches the size of the vectors
    # produced by OpenAIEmbeddings — confirm if the embedding model changes.
    pinecone.create_index(index_name, dimension=1536, metric='cosine')
    vector_store = Pinecone.from_documents(chunks, embeddings, index_name=index_name)
    print("completed")
    return vector_store

In [14]:
import pinecone
def delete_pinecone_index(index_name="all"):
    """Delete one Pinecone index, or every listed index.

    Args:
        index_name: Name of the index to delete. The default ``"all"`` deletes
            every index returned by ``pinecone.list_indexes()``.
    """
    pinecone.init(
        api_key=os.environ.get('PINECONE_API_KEY'),
        environment=os.environ.get('PINECONE_ENV'),
    )

    if index_name != 'all':
        # Single, explicitly named index.
        print(f"Deleting index {index_name}")
        pinecone.delete_index(index_name)
    else:
        # Wipe everything the account currently lists.
        existing = pinecone.list_indexes()
        print("Deleting all indexes")
        for existing_name in existing:
            pinecone.delete_index(existing_name)
    print('Done')
        
    

### Asking and getting answers

In [15]:
from langchain.chains import RetrievalQA
from langchain.chat_models import ChatOpenAI
from langchain.chains import ConversationalRetrievalChain

def ask_and_get_answer(vectore_store, q):
    """Answer ``q`` using documents retrieved from the given vector store.

    Args:
        vectore_store: Vector store to retrieve from. The misspelled name is
            kept so existing keyword callers keep working. The body now uses
            this argument; it previously read a global named ``vector_store``
            and silently ignored the argument.
        q: The question text.

    Returns:
        The value returned by the chain's ``run(q)``.
    """
    # llm = ChatOpenAI(model='gpt-4', temperature=0.7)

    llm = ChatOpenAI(model='gpt-3.5-turbo', temperature=1)
    # Retrieve the 3 most similar chunks and pass them to the LLM ("stuff" chain).
    retriever = vectore_store.as_retriever(search_type='similarity', search_kwargs={'k': 3})
    chain = RetrievalQA.from_chain_type(llm=llm, chain_type="stuff", retriever=retriever)
    return chain.run(q)

def ask_with_memory(vector_store, question, chat_history=None):
    """Ask ``question`` with earlier turns supplied as conversation context.

    Args:
        vector_store: Vector store used to build the retriever.
        question: The new user question.
        chat_history: List of ``(question, answer)`` tuples from earlier
            turns. A passed list is extended in place, as before. When
            omitted, a fresh list is created for this call instead of sharing
            one mutable default across calls.

    Returns:
        ``(result, chat_history)``: the chain's result mapping and the history
        including the new turn.
    """
    if chat_history is None:
        chat_history = []

    llm = ChatOpenAI(temperature=1)
    retriever = vector_store.as_retriever(search_type='similarity', search_kwargs={'k': 3})

    crc = ConversationalRetrievalChain.from_llm(llm, retriever)
    result = crc({'question': question, 'chat_history': chat_history})
    chat_history.append((question, result['answer']))

    return result, chat_history
    

### Running Code

In [16]:
# Load the sample PDF; each returned document carries 'source' and 'page'
# metadata (see the recorded output of the metadata print below).
pdf_data = load_document('files/us_constitution.pdf')
print(len(pdf_data))
# print(pdf_data[1].page_content)
print(pdf_data[2].metadata)
print(f"You file has {len(pdf_data)} pages")

loading files/us_constitution.pdf
41
{'source': 'files/us_constitution.pdf', 'page': 2}
You file has 41 pages


In [17]:
# Load the sample DOCX; in the recorded run it comes back as a single document.
doc_data = load_document('files/the_great_gatsby.docx')
print(len(doc_data))
# print(doc_data[0].page_content)

loading files/the_great_gatsby.docx
1


In [18]:
# Fetch Wikipedia content for the query; load_from_wikipedia limits the
# download to 2 documents, which matches the count printed below.
data = load_from_wikipedia('GPT-4')
print(len(data))
# print(data[0].page_content)

2


In [19]:
# Split the PDF pages into chunks (chunk_size defaults to 256) and estimate
# the embedding cost before anything is uploaded.
# `chunks` is also read as a notebook-level variable by insert_or_fetch_embeddings.
chunks = chunk_data(pdf_data)
print(len(chunks))
# print(chunks[0].page_content)
print_embedding_cost(chunks)

190
Total Tokens: 16711
Embedding Cost in USD: 0.006684


In [20]:
# WARNING: with no argument, index_name defaults to "all", so this deletes
# every index returned by pinecone.list_indexes().
delete_pinecone_index()

Deleting all indexes
Done


In [21]:
# Open the index if it already exists, otherwise create it and upload the
# embeddings of the notebook-level `chunks` produced by the chunking cell.
index_name = 'ask-document'
vector_store = insert_or_fetch_embeddings(index_name)

Creating index: ask-document
completed


In [165]:
# One-off question against the index stored in `vector_store`.
q = "What is the document about?"
answer = ask_and_get_answer(vector_store, q)
print(answer)

The document provided is the United States Constitution. It outlines the framework and principles of the American government, establishes the rights and responsibilities of its citizens, and defines the relationship between the federal government and the states.


In [22]:
import time

# Interactive question loop: keeps asking until the user types quit or exit.
count = 1
print("Type quit or exit to quit")
while True:
    # Strip surrounding whitespace so inputs such as "exit " still end the loop
    # instead of being sent to the LLM as a question.
    q = input(f"Question #{count}: ").strip()
    if not q:
        # Nothing typed: do not spend an LLM call (or a question number) on it.
        continue
    count += 1
    if q.lower() in ['quit', 'exit']:
        print("Quitting... \nBye!")
        time.sleep(2)
        break
    answer = ask_and_get_answer(vector_store, q)
    print(f'Answer: \n {answer}')
    print("*" * 50)

Type quit or exit to quit


Question #1:  What is the bill of rights


Answer: 
 The Bill of Rights refers to the first ten amendments to the United States Constitution. These amendments outline certain fundamental rights and liberties that protect individual freedoms. They cover a range of topics, including freedom of speech, religion, and the press; the right to bear arms; protection against unreasonable searches and seizures; and the right to a fair trial.
**************************************************


Question #2:  exit


Quitting... 
Bye!


In [23]:
# WARNING: with no argument, index_name defaults to "all", so this deletes
# every index returned by pinecone.list_indexes(), including the one just queried.
delete_pinecone_index()

Deleting all indexes
Done


In [24]:
# Rebuild the pipeline on Wikipedia content: load, chunk, and index it.
# `chunks` and `vector_store` are rebound here, so the cells below query the
# "chatgpt" index rather than the constitution one.
data = load_from_wikipedia("chatGPT")
chunks = chunk_data(data)
index_name = "chatgpt"
vector_store = insert_or_fetch_embeddings(index_name)

Creating index: chatgpt
completed


In [25]:
# One-off question against the "chatgpt" index created in the previous cell.
q = "what is chatGPT?"
answer = ask_and_get_answer(vector_store, q)
print(answer)

ChatGPT is a language model developed by OpenAI. It is based on the GPT (Generative Pre-trained Transformer) architecture and trained using a large amount of internet text data. ChatGPT is designed to generate human-like responses based on the input it receives, making it suitable for conversational applications and interactive dialogue.


In [26]:
# asking with memory
# Start from an empty history; ask_with_memory returns it with the new turn appended.
# NOTE(review): `vector_store` was last bound to the "chatgpt" index, yet this
# question is about the U.S. Constitution — confirm which index this demo is
# meant to query.
chat_history = []
question = "how many amendments are in the U.S. constitution?"
result, chat_history = ask_with_memory(vector_store, question, chat_history)
print(result['answer'])
print(chat_history)

There are 27 amendments in the U.S. Constitution.
[('how many amendments are in the U.S. constitution?', 'There are 27 amendments in the U.S. Constitution.')]


In [28]:
# Follow-up that only makes sense together with the previous turn in chat_history.
# Each run of this cell appends another turn; the recorded history below
# contains this question twice, i.e. the cell was executed two times.
question = "Multiply the number by 3"
result, chat_history = ask_with_memory(vector_store, question, chat_history)
print(result['answer'])
print(chat_history)

The number of amendments in the U.S. Constitution is 27. So, multiplying this number by 3 would result in 81.
[('how many amendments are in the U.S. constitution?', 'There are 27 amendments in the U.S. Constitution.'), ('Multiply the number by 3', "I'm sorry, but I don't have the specific number mentioned in the context to give you the result of multiplying it by 3."), ('Multiply the number by 3', 'The number of amendments in the U.S. Constitution is 27. So, multiplying this number by 3 would result in 81.')]


In [31]:
import time

# Interactive loop with conversational memory: every turn is added to
# chat_history so follow-up questions can refer to earlier answers.
count = 1
chat_history = []
print("Write Quit or Exit to quit")

while True:
    # Strip surrounding whitespace so inputs such as "exit " still end the loop
    # instead of being sent to the LLM as a question.
    q = input(f"Question #{count}").strip()
    if not q:
        # Nothing typed: do not spend an LLM call (or a question number) on it.
        continue
    count += 1
    if q.lower() in ["quit", "exit"]:
        print("Quitting...")
        time.sleep(2)
        break
    # Keep the returned history instead of relying on the helper mutating the
    # list that was passed in.
    result, chat_history = ask_with_memory(vector_store, q, chat_history)
    print (result['answer'])
    print("*" * 50)

Write Quit or Exit to quit


Question #1 how many amendments do we have in us constitution


The US Constitution currently has 27 amendments.
**************************************************


Question #2 add 5 to it


No, I cannot add 5 to the number of amendments in the US Constitution, as I don't have access to real-time data or the ability to perform calculations. As of September 2021, there are 27 amendments in the US Constitution.
**************************************************


Question #3 add 5 to the number


The US Constitution currently has 27 amendments. Adding 5 to this number would result in a total of 32 amendments.
**************************************************


Question #4 exit


Quitting...
