## Question-and-Answer Application on Private Documents


In [47]:
import os
from dotenv import load_dotenv, find_dotenv

# Populate os.environ from the .env file located by find_dotenv(). The Pinecone
# helpers below read PINECONE_API_KEY and PINECONE_ENV from os.environ.
# override=True is passed so values from the file are presumably preferred over
# variables already set in the environment (see python-dotenv docs to confirm).
load_dotenv(find_dotenv(), override=True)

True

In [48]:
# pip install pypdf -q

In [49]:
# pip install docx2txt -q

In [50]:
# pip install wikipedia -q

In [51]:
from langchain.document_loaders import PyPDFLoader, Docx2txtLoader
import os

def load_document(file):
    """Load a PDF or DOCX file with the matching LangChain loader.

    Args:
        file: Path to the document. The loader is chosen from the file
            extension, compared case-insensitively ('.pdf' or '.docx').

    Returns:
        Whatever the selected loader's ``load()`` returns, or ``None`` (after
        printing a message) when the extension is not supported.
    """
    _, extension = os.path.splitext(file)
    # Normalise case so 'REPORT.PDF' is treated like 'report.pdf'.
    extension = extension.lower()

    if extension == '.pdf':
        loader_cls = PyPDFLoader
    elif extension == '.docx':
        loader_cls = Docx2txtLoader
    else:
        print(f"Document format for {file} not supported")
        return None

    # Announce before constructing the loader so the message is shown even if
    # the loader itself fails on the file.
    print(f'loading {file}')
    return loader_cls(file).load()

In [52]:
from langchain.document_loaders import WikipediaLoader

def load_from_wikipedia(query, lang='en', load_max_docs=2):
    """Download Wikipedia articles matching `query` via WikipediaLoader.

    Args:
        query: Search string forwarded to WikipediaLoader.
        lang: Wikipedia language code forwarded to WikipediaLoader.
        load_max_docs: Upper limit on the number of downloaded documents
            (defaults to 2, the value previously hard-coded here).

    Returns:
        The result of the loader's ``load()`` call.
    """
    loader = WikipediaLoader(query=query, lang=lang, load_max_docs=load_max_docs)
    return loader.load()

#### Chunk data

In [53]:
from langchain.text_splitter import RecursiveCharacterTextSplitter
def chunk_data(data, chunk_size=256):
    """Split already-loaded documents into chunks.

    `chunk_size` is forwarded to RecursiveCharacterTextSplitter and the
    overlap between chunks is fixed at 0.

    Note: `split_documents` is used here; use `create_documents` instead
    when the input is not already split in pages.
    """
    splitter_options = dict(chunk_size=chunk_size, chunk_overlap=0)
    return RecursiveCharacterTextSplitter(**splitter_options).split_documents(data)

#### Calculating cost

In [54]:
import tiktoken
def print_embedding_cost(texts, model='text-embedding-ada-002', price_per_1k_tokens=0.0004):
    """Print the token count and estimated embedding cost for `texts`.

    Args:
        texts: Iterable of objects exposing a ``page_content`` string.
        model: Model name used to pick the tiktoken encoding.
        price_per_1k_tokens: Price in USD per 1000 tokens. The default is the
            value previously hard-coded here; NOTE(review): confirm it against
            current provider pricing before relying on the estimate.

    Returns:
        None. The figures are only printed.
    """
    enc = tiktoken.encoding_for_model(model)
    total_tokens = sum(len(enc.encode(page.page_content)) for page in texts)
    print(f'Total Tokens: {total_tokens}')
    print(f'Embedding Cost in USD: {total_tokens / 1000 * price_per_1k_tokens:.6f}')

### Embedding and uploading to a vector database (Pinecone)

In [55]:
import pinecone
from langchain.vectorstores import Pinecone
from langchain.embeddings.openai import OpenAIEmbeddings

def insert_or_fetch_embeddings(index_name, chunks=None):
    """Return a Pinecone vector store for `index_name`, creating it if needed.

    If the index already exists it is opened as-is. Otherwise a new index
    (dimension 1536, cosine metric) is created and `chunks` are embedded with
    OpenAIEmbeddings and uploaded into it.

    Args:
        index_name: Name of the Pinecone index to open or create.
        chunks: Documents to embed when the index has to be created. When
            omitted, the notebook-level variable ``chunks`` is used so that
            existing one-argument calls keep working; prefer passing it
            explicitly to avoid depending on hidden kernel state.

    Returns:
        The vector store bound to `index_name`.

    Raises:
        ValueError: If the index does not exist and no chunks are available.
            Raised before the index is created, so no empty index is left
            behind.
    """
    embeddings = OpenAIEmbeddings()
    pinecone.init(api_key=os.environ.get('PINECONE_API_KEY'), environment=os.environ.get('PINECONE_ENV'))

    if index_name in pinecone.list_indexes():
        print(f"Index {index_name} already exist")
        vector_store = Pinecone.from_existing_index(index_name, embeddings)
        print("Completed")
        return vector_store

    if chunks is None:
        # Backward compatibility: older cells call this with only the index
        # name and expect the module-level `chunks` to be picked up.
        chunks = globals().get('chunks')
    if not chunks:
        raise ValueError(
            f"Index {index_name} does not exist and no chunks were provided to create it"
        )

    print(f"Creating index: {index_name}")
    pinecone.create_index(index_name, dimension=1536, metric='cosine')
    vector_store = Pinecone.from_documents(chunks, embeddings, index_name=index_name)
    print("completed")
    return vector_store

In [56]:
import pinecone
def delete_pinecone_index(index_name="all"):
    """Delete one Pinecone index, or every listed index.

    Args:
        index_name: Name of the index to delete. The default value 'all'
            deletes every index returned by ``pinecone.list_indexes()``.
    """
    pinecone.init(
        api_key=os.environ.get('PINECONE_API_KEY'),
        environment=os.environ.get('PINECONE_ENV'),
    )

    # Single-index case first, so the destructive "all" path stands on its own.
    if index_name != 'all':
        print(f"Deleting index {index_name}")
        pinecone.delete_index(index_name)
        print('Done')
        return

    existing = pinecone.list_indexes()
    print("Deleting all indexes")
    for name in existing:
        pinecone.delete_index(name)
    print('Done')

### Asking and getting answers

In [57]:
from langchain.chains import RetrievalQA
from langchain.chat_models import ChatOpenAI
from langchain.chains import ConversationalRetrievalChain

def ask_and_get_answer(vector_store, q):
    """Answer question `q` using documents retrieved from `vector_store`.

    Builds a gpt-3.5-turbo chat model (temperature 1), a similarity retriever
    returning the top 3 matches, and a "stuff" RetrievalQA chain, then runs the
    chain on `q` and returns its result.
    """
    chat_model = ChatOpenAI(model='gpt-3.5-turbo', temperature=1)
    top_matches = vector_store.as_retriever(
        search_type='similarity',
        search_kwargs=dict(k=3),
    )
    qa_chain = RetrievalQA.from_chain_type(
        llm=chat_model,
        chain_type="stuff",
        retriever=top_matches,
    )
    return qa_chain.run(q)

def ask_with_memory(vector_store, question, chat_history=None):
    """Ask `question` through a ConversationalRetrievalChain, tracking history.

    Args:
        vector_store: Store used to build a similarity retriever (top 3 matches).
        question: The user's question.
        chat_history: List of ``(question, answer)`` tuples from earlier turns.
            When a list is supplied it is extended in place with the new turn
            (callers that ignore the returned history rely on this). When
            omitted, a fresh list is created for this call.

    Returns:
        A ``(result, chat_history)`` tuple, where ``result`` is the chain output
        (its ``'answer'`` key holds the reply) and ``chat_history`` includes the
        turn just completed.
    """
    # A literal [] default would be created once and shared by every call that
    # omits the argument, leaking one conversation's history into the next.
    if chat_history is None:
        chat_history = []

    llm = ChatOpenAI(temperature=1)
    retriever = vector_store.as_retriever(search_type='similarity', search_kwargs={'k': 3})

    crc = ConversationalRetrievalChain.from_llm(llm, retriever)
    result = crc({'question': question, 'chat_history': chat_history})
    chat_history.append((question, result['answer']))

    return result, chat_history
    

### Running Code

In [58]:
# Load the sample PDF. The length of the returned list is reported below as the
# page count, and each item carries 'source' and 'page' metadata.
pdf_data = load_document('../files/us_constitution.pdf')
print(len(pdf_data))
# print(pdf_data[1].page_content)
print(pdf_data[2].metadata)
print(f"You file has {len(pdf_data)} pages")

loading ../files/us_constitution.pdf
41
{'source': '../files/us_constitution.pdf', 'page': 2}
You file has 41 pages


In [59]:
# Load the sample DOCX; the recorded output shows the whole file comes back as
# a single document.
doc_data = load_document('../files/the_great_gatsby.docx')
print(len(doc_data))
# print(doc_data[0].page_content)

loading ../files/the_great_gatsby.docx
1


In [60]:
# Fetch Wikipedia articles for the query (load_from_wikipedia limits the
# download to 2 documents by default).
data = load_from_wikipedia('GPT-4')
print(len(data))
# print(data[0].page_content)

2


In [61]:
# Split the PDF pages into chunks and estimate what embedding them will cost.
# NOTE: insert_or_fetch_embeddings, when called with only an index name, picks
# up this notebook-level `chunks` variable, so this cell must run first.
chunks = chunk_data(pdf_data)
print(len(chunks))
# print(chunks[0].page_content)
print_embedding_cost(chunks)

190
Total Tokens: 16711
Embedding Cost in USD: 0.006684


In [62]:
# WARNING: called without an argument, this deletes every index returned by
# pinecone.list_indexes(), not only the ones this notebook created.
delete_pinecone_index()

Deleting all indexes
Done


In [63]:
# Create (or reopen) the index for the constitution chunks. Called with only
# the index name, the helper embeds the notebook-level `chunks` defined above.
index_name = 'ask-document'
vector_store = insert_or_fetch_embeddings(index_name)

Creating index: ask-document
completed


In [66]:
# One-off question against whichever index `vector_store` currently points to.
q = "What is the document about?"
answer = ask_and_get_answer(vector_store, q)
print(answer)

The document is the United States Constitution. It establishes the framework for the government of the United States and outlines the rights and responsibilities of its citizens. It aims to form a more perfect union, establish justice, ensure domestic tranquility, provide for the common defense, promote the general welfare, and secure the blessings of liberty for the people of the United States.


In [68]:
import time

# Interactive question loop (no conversation memory): each question is answered
# independently from `vector_store` until the user types quit/exit.
count = 1
print("Type quit or exit to quit")
while True:
    # Strip surrounding whitespace so input such as "exit " still quits instead
    # of being sent to the model as a question.
    q = input(f"Question #{count}: ").strip()
    if not q:
        # Blank submission: ask again without consuming a question number or
        # spending an API call.
        continue
    count += 1
    if q.lower() in ['quit', 'exit']:
        print("Quitting... \nBye!")
        # Presumably gives the notebook front end time to show the message
        # before the cell finishes; TODO confirm it is still needed.
        time.sleep(2)
        break
    answer = ask_and_get_answer(vector_store, q)
    print(f'Answer: \n {answer}')
    print("*" * 50)

Type quit or exit to quit


Question #1:  What is the bill of right


Answer: 
 The Bill of Rights refers to the first ten amendments to the United States Constitution. These amendments outline specific rights and freedoms that are guaranteed to the American people, including the freedoms of speech, religion, and assembly, as well as the right to bear arms, the right to a fair trial, and protections against unreasonable searches and seizures.
**************************************************


Question #2:  exit


Quitting... 
Bye!


Quitting... 
Bye!


In [40]:
# Build a second index from Wikipedia content.
# NOTE(review): this rebinds `data`, `chunks`, `index_name` and `vector_store`,
# so every cell executed after this one queries the "chatgpt" index rather than
# the constitution index.
data = load_from_wikipedia("chatGPT")
chunks = chunk_data(data)
index_name = "chatgpt"
vector_store = insert_or_fetch_embeddings(index_name)

Creating index: chatgpt
completed


In [41]:
# One-off question against the "chatgpt" index bound to `vector_store` above.
q = "what is chatGPT?"
answer = ask_and_get_answer(vector_store, q)
print(answer)

ChatGPT is a language model developed by OpenAI. It is designed to generate human-like responses in conversational settings. It uses a deep learning architecture called transformer, trained on a large amount of internet text to learn patterns in language and generate coherent responses. ChatGPT can be used for a variety of tasks such as answering questions, writing code, creating conversational agents, and more.


In [42]:
# Asking with memory: start from an empty history and keep the list returned
# by ask_with_memory for the follow-up question.
# NOTE(review): `vector_store` was last bound to the "chatgpt" index, yet the
# question is about the U.S. constitution; confirm this is the intended index.
chat_history = []
question = "how many amendments are in the U.S. constitution?"
result, chat_history = ask_with_memory(vector_store, question, chat_history)
print(result['answer'])
print(chat_history)

There are 27 amendments in the U.S. Constitution.
[('how many amendments are in the U.S. constitution?', 'There are 27 amendments in the U.S. Constitution.')]


In [43]:
# Follow-up that only makes sense with the previous turn: "the number" refers
# to the answer stored in `chat_history` by the preceding cell.
question = "Multiply the number by 3"
result, chat_history = ask_with_memory(vector_store, question, chat_history)
print(result['answer'])
print(chat_history)

There have been 27 amendments to the U.S. Constitution. When you multiply this number by 3, the result is 81.
[('how many amendments are in the U.S. constitution?', 'There are 27 amendments in the U.S. Constitution.'), ('Multiply the number by 3', 'There have been 27 amendments to the U.S. Constitution. When you multiply this number by 3, the result is 81.')]


In [44]:
import time

# Interactive question loop with conversation memory: every turn is appended to
# `chat_history`, so follow-up questions can refer to earlier answers.
count = 1
chat_history = []
print("Write Quit or Exit to quit")

while True:
    # Strip surrounding whitespace so input such as "exit " still quits instead
    # of being sent to the model as a question.
    q = input(f"Question #{count}").strip()
    if not q:
        # Blank submission: ask again without consuming a question number,
        # spending an API call, or recording an empty turn in the history.
        continue
    count += 1
    if q.lower() in ["quit", "exit"]:
        print("Quitting...")
        # Presumably gives the notebook front end time to show the message
        # before the cell finishes; TODO confirm it is still needed.
        time.sleep(2)
        break
    # Keep the returned history explicitly rather than relying on the helper
    # mutating the list passed in.
    result, chat_history = ask_with_memory(vector_store, q, chat_history)
    print(result['answer'])
    print("*" * 50)

Write Quit or Exit to quit


Question #1 What rights do US citizens have


US citizens have a wide range of rights guaranteed by the Constitution and federal laws. Some of the key rights include:

1. Freedom of speech: The First Amendment protects citizens' rights to express their opinions and ideas.

2. Freedom of religion: Citizens have the right to practice any religion or no religion at all, as guaranteed by the First Amendment.

3. Right to bear arms: The Second Amendment grants citizens the right to own firearms, although this right is subject to certain regulations.

4. Right to due process: The Fifth Amendment ensures that citizens cannot be deprived of life, liberty, or property without due process of law.

5. Right to a fair trial: The Sixth Amendment guarantees citizens the right to a fair and speedy trial by an impartial jury.

6. Protection from unreasonable searches and seizures: The Fourth Amendment protects citizens against invasive searches and seizures without a warrant or probable cause.

7. Right to equal protection under the law: The Four

Question #2 Which one is mostly exercised


The right most commonly exercised by US citizens is the right to vote.
**************************************************


Question #3 How many are the rights 


US citizens have various rights guaranteed by the Constitution and other laws. Some of these rights include freedom of speech, religion, assembly, and press, as well as the right to bear arms, due process, and equal protection under the law. There are numerous other rights and freedoms provided to US citizens, and it is recommended to refer to the Constitution and legal resources for a comprehensive list.
**************************************************


Question #4 exit


Quitting...
