# Question Answering on Private Documents

In [1]:
import os
from dotenv import load_dotenv, find_dotenv
load_dotenv(find_dotenv(), override=True)

True

In [2]:
pip install pypdf -q

Note: you may need to restart the kernel to use updated packages.


In [3]:
pip install docx2txt -q

Note: you may need to restart the kernel to use updated packages.


In [4]:
pip install wikipedia -q

Note: you may need to restart the kernel to use updated packages.


### Loaders

In [5]:
def load_document(file):
    """Load a document from disk using the loader that matches its extension.

    The extension is compared case-insensitively, so ``report.PDF`` is
    handled the same way as ``report.pdf``.

    Args:
        file: Path to a ``.pdf``, ``.docx`` or ``.txt`` file.

    Returns:
        Whatever the selected loader's ``load()`` returns, or ``None`` (after
        printing a message) when the extension is not supported.
    """
    import os

    # Normalise case so upper- or mixed-case extensions are not rejected.
    extension = os.path.splitext(file)[1].lower()

    # Imports stay local to the branch that needs them.
    if extension == '.pdf':
        from langchain.document_loaders import PyPDFLoader as loader_cls
    elif extension == '.docx':
        from langchain.document_loaders import Docx2txtLoader as loader_cls
    elif extension == '.txt':
        from langchain.document_loaders import TextLoader as loader_cls
    else:
        print('Document format is not supported!')
        return None

    print(f'Loading {file}')
    return loader_cls(file).load()

# Load articles from Wikipedia
def load_from_wikipedia(query, lang='en', load_max_docs=2):
    """Fetch Wikipedia content for ``query`` through LangChain's WikipediaLoader.

    Args:
        query: Search text passed to the loader.
        lang: Language code passed to the loader (default ``'en'``).
        load_max_docs: Value forwarded as the loader's ``load_max_docs``.

    Returns:
        The result of the loader's ``load()`` call.
    """
    from langchain.document_loaders import WikipediaLoader

    loader_options = {'query': query, 'lang': lang, 'load_max_docs': load_max_docs}
    return WikipediaLoader(**loader_options).load()

### Chunking Data

In [6]:
def chunk_data(data, chunk_size=256, chunk_overlap=0):
    """Split loaded documents into smaller chunks.

    Args:
        data: Documents to split, passed to ``split_documents``.
        chunk_size: Forwarded to ``RecursiveCharacterTextSplitter``.
        chunk_overlap: Forwarded to ``RecursiveCharacterTextSplitter``.

    Returns:
        The chunks produced by the splitter's ``split_documents``.
    """
    from langchain.text_splitter import RecursiveCharacterTextSplitter

    splitter_options = dict(chunk_size=chunk_size, chunk_overlap=chunk_overlap)
    return RecursiveCharacterTextSplitter(**splitter_options).split_documents(data)

### Calculating Cost

In [7]:
def print_embedding_cost(texts, model='text-embedding-ada-002', price_per_1k_tokens=0.0004):
    """Print the token count of ``texts`` and an estimated embedding cost.

    Args:
        texts: Iterable of objects exposing a ``page_content`` string.
        model: Model name handed to ``tiktoken.encoding_for_model``.
        price_per_1k_tokens: USD price per 1,000 tokens used for the
            estimate. The default keeps the rate this function has always
            used; provider prices change, so pass the current rate when the
            figure matters.

    Returns:
        None. The two summary lines are printed.
    """
    import tiktoken

    enc = tiktoken.encoding_for_model(model)
    # Generator instead of a temporary list: only the running total is needed.
    total_tokens = sum(len(enc.encode(page.page_content)) for page in texts)
    print(f'Total Tokens: {total_tokens}')
    print(f'Embedding Cost in USD: {total_tokens / 1000 * price_per_1k_tokens:.6f}')


### Embedding and Uploading to a Vector Database

In [8]:
def insert_or_fetch_embeddings(index_name, chunks=None):
    """Return a Pinecone vector store for ``index_name``, creating it if needed.

    If the index already exists it is opened as-is. Otherwise the index is
    created and ``chunks`` are embedded and uploaded into it.

    Args:
        index_name: Name of the Pinecone index to open or create.
        chunks: Documents to embed when the index has to be created. When
            omitted, the notebook-level variable ``chunks`` is used, so
            existing calls that pass only ``index_name`` keep working.

    Returns:
        The LangChain ``Pinecone`` vector store bound to ``index_name``.

    Raises:
        ValueError: If the index does not exist and there are no chunks to
            upload. Raised before anything is created, so no empty index is
            left behind.
    """
    import os

    import pinecone
    from langchain.vectorstores import Pinecone
    from langchain.embeddings.openai import OpenAIEmbeddings

    embeddings = OpenAIEmbeddings()
    pinecone.init(api_key=os.environ.get('PINECONE_API_KEY'), environment=os.environ.get('PINECONE_ENV'))

    if index_name in pinecone.list_indexes():
        print(f'Index {index_name} already exists. Loading embeddings ... ', end='')
        vector_store = Pinecone.from_existing_index(index_name, embeddings)
        print('Done')
        return vector_store

    if chunks is None:
        # Backward compatibility: fall back to the notebook-level `chunks`.
        chunks = globals().get('chunks')
    if not chunks:
        raise ValueError(
            f'Index {index_name} does not exist and there are no chunks to embed; '
            'pass chunks=... or define a non-empty `chunks` first.'
        )

    print(f'Creating index {index_name} and embeddings ... ', end='')
    # dimension=1536 presumably matches the vector size of the default
    # OpenAIEmbeddings model -- confirm if the embedding model changes.
    pinecone.create_index(index_name, dimension=1536, metric='cosine')
    vector_store = Pinecone.from_documents(chunks, embeddings, index_name=index_name)
    print('Done')
    return vector_store

In [9]:
def delete_pinecone_index(index_name='all'):
    """Delete one Pinecone index, or every index when ``index_name`` is ``'all'``.

    Args:
        index_name: Name of the index to delete. The default ``'all'``
            deletes every index returned by ``pinecone.list_indexes()``.

    Returns:
        None. Progress is printed.
    """
    import pinecone

    pinecone.init(
        api_key=os.environ.get('PINECONE_API_KEY'),
        environment=os.environ.get('PINECONE_ENV'),
    )

    # Single-index case first; the bulk deletion follows as the fall-through.
    if index_name != 'all':
        print(f'Deleting index {index_name} ...', end='')
        pinecone.delete_index(index_name)
        print('Done')
        return

    existing = pinecone.list_indexes()
    print('Deleting all indexes ... ')
    for name in existing:
        pinecone.delete_index(name)
    print('Done')

### Asking and Getting Answers

In [10]:
def ask_and_get_answer(vector_store, query, model='gpt-4', temperature=1, k=3):
    """Answer ``query`` with a RetrievalQA chain over ``vector_store``.

    Args:
        vector_store: Object exposing ``as_retriever(search_type=..., search_kwargs=...)``.
        query: Question passed to the chain.
        model: Chat model name passed to ``ChatOpenAI`` (default ``'gpt-4'``).
        temperature: Sampling temperature passed to ``ChatOpenAI`` (default 1).
        k: Value passed as ``search_kwargs['k']`` to the similarity retriever
            (default 3).

    Returns:
        The value returned by the chain's ``run(query)``.
    """
    from langchain.chains import RetrievalQA
    from langchain.chat_models import ChatOpenAI

    llm = ChatOpenAI(model=model, temperature=temperature)
    retriever = vector_store.as_retriever(search_type='similarity', search_kwargs={'k': k})
    chain = RetrievalQA.from_chain_type(llm=llm, chain_type="stuff", retriever=retriever)
    return chain.run(query)

def ask_with_memory(vector_store, query, chat_history=None, model='gpt-3.5-turbo', temperature=1, k=3):
    """Ask ``query`` through a ConversationalRetrievalChain, keeping chat history.

    Args:
        vector_store: Object exposing ``as_retriever(search_type=..., search_kwargs=...)``.
        query: Question for this turn.
        chat_history: List of ``(question, answer)`` tuples from earlier
            turns. A passed list is appended to in place. When omitted, a
            fresh list is created for this call, so history is never shared
            between unrelated conversations.
        model: Chat model name passed to ``ChatOpenAI``.
        temperature: Sampling temperature passed to ``ChatOpenAI``.
        k: Value passed as ``search_kwargs['k']`` to the similarity retriever.

    Returns:
        Tuple ``(result, chat_history)``: the chain's result (its
        ``'answer'`` entry is read here) and the history including this turn.
    """
    from langchain.chains import ConversationalRetrievalChain
    from langchain.chat_models import ChatOpenAI

    # A `[]` default would be created once and reused by every call.
    if chat_history is None:
        chat_history = []

    llm = ChatOpenAI(model=model, temperature=temperature)
    retriever = vector_store.as_retriever(search_type='similarity', search_kwargs={'k': k})
    crc = ConversationalRetrievalChain.from_llm(llm, retriever)
    result = crc({'question': query, 'chat_history': chat_history})
    chat_history.append((query, result['answer']))

    return result, chat_history

### Running Code

In [11]:
# Load the sample PDF; `data` is consumed by the chunking cell further down.
data = load_document('./constitution.pdf')
# Optional spot checks, left disabled:
# print(data[1].page_content)
# print(data[10].metadata)
print(f'You have {len(data)} pages in your data.')
# Index 11 is an arbitrary sample; this line needs at least 12 loaded entries.
print(f'There are {len(data[11].page_content)} characters in the page ')

Loading ./constitution.pdf
You have 19 pages in your data.
There are 3174 characters in the page 


In [12]:
# Same entry point, this time exercising the .docx branch of load_document.
data2 = load_document('./World_History.docx')
print(data2[0].page_content)

Loading ./World_History.docx
Summer Assignment: World History 

Objective: The student understands the impact of Mahatma Gandhi on the political resistance to British Rule in India in the 20th Century. 

TEKS Alignment:

WH.20(B) describe the rights and responsibilities of citizens and noncitizens in civic participation throughout history

WH.21(E) identify examples of individuals who led resistance to political oppression such as Mohandas Gandhi



Instructions: Read through the following documents carefully.  Respond to the following prompts:

With reference to its origin, purpose and content, analyze the value and limitations of Document 4 to historians studying Gandhi’s resistance to British rule in India.  Please respond in one full and complete paragraph. (10 Points)

Use the sources to evaluate the impact of Gandhi’s resistance against the British government in India. In other words, do these sources indicate that Gandhi’s resistance was successful or unsuccessful? If he was suc

In [13]:
# The second positional argument is `lang`, so this requests the French Wikipedia.
data3 = load_from_wikipedia('GPT-4', 'fr')
print(data3[0].page_content)

GPT-4 (acronyme de Generative Pre-trained Transformer 4) est un modèle de langage multimodalmultimodal, de type transformeur génératif pré-entraîné, développé par la société OpenAI et sorti le 14 mars 2023, il succède à GPT-3.


== Caractéristiques ==
OpenAI annonce, sur son blog, GPT-4 comme étant « plus fiable, créatif et capable de gérer des instructions beaucoup plus nuancées que GPT-3.5 ». L'organisation a produit deux versions de GPT-4 avec des fenêtres contextuelles de 8 192 et 32 768 jetons, une amélioration significative par rapport à GPT-3.5 et GPT-3, qui étaient limités à 4 096 et 2 048 jetons respectivement. Contrairement à son prédécesseur, GPT-4 peut prendre des images ainsi que du texte comme entrées.
OpenAI adopte une approche fermée en ce qui concerne les détails techniques de GPT-4 ; le rapport technique s'est explicitement abstenu de spécifier la taille, l'architecture, le matériel ou la méthode de formation du modèle. Le rapport affirme que « le paysage concurrentie

In [14]:
# Split the loaded PDF with chunk_data's defaults (chunk_size=256, chunk_overlap=0).
# NOTE: insert_or_fetch_embeddings reads this notebook-level `chunks` when it creates an index.
chunks = chunk_data(data)
print(len(chunks))
# Peek at one arbitrary chunk to sanity-check the split.
print(chunks[10].page_content)

247
Jersey four, Pennsylvania eight, Delaw are one, Maryland  
six, Virginia ten, North Carolina five, South Carolina five, 
and Georgia three.  
When vacancies happen in the Representation from any


In [15]:
# Token count and estimated embedding cost for the chunks prepared above.
print_embedding_cost(chunks)

Total Tokens: 12912
Embedding Cost in USD: 0.005165


In [17]:
# WARNING: called without arguments, this deletes every index returned by pinecone.list_indexes().
delete_pinecone_index()

Deleting all indexes ... 
Done


In [18]:
# Connects to the index if it already exists; otherwise creates it from the notebook-level `chunks`.
index_name = 'askadocument'
vector_store = insert_or_fetch_embeddings(index_name) 

Creating index askadocument and embeddings ... Done


In [19]:
# One-off question against the vector store built above.
# NOTE(review): the recorded answer reports that no document was provided; the retriever
# only supplies the top 3 chunks by default, which may explain it -- confirm.
query = 'What is the whole document about?'
answer = ask_and_get_answer(vector_store, query)
print(answer)

I'm sorry, but there is no document provided for me to review and share details about. Could you please provide more information?


In [20]:
import time

# Interactive Q&A: keep prompting until the user types "quit" or "exit" (any case).
i = 1
print('Write Quit or Exit to quit.')
while True:
    query = input(f'Question #{i}: ')
    i += 1  # the counter advances for every prompt, including the final quit/exit
    if query.lower() not in ('quit', 'exit'):
        answer = ask_and_get_answer(vector_store, query)
        print(f'\nAnswer: {answer}')
        print(f'\n {"-" * 50} \n')
        continue

    print('Bye Bye ... see you later!')
    time.sleep(2)
    break

Write Quit or Exit to quit.
Question #1: Describe the Bill of Rights

Answer: The Bill of Rights refers to the first 10 amendments to the Constitution of the United States. These were ratified on December 15, 1791. These amendments outline and guarantee certain rights and freedoms for the citizens of the United States, such as freedom of speech, protection from unreasonable searches and seizures, and the right to a speedy and public trial, among others. The purpose of the Bill of Rights is to provide a clear constitutional check on governmental power, protecting individuals and the rights of states.

 -------------------------------------------------- 

Question #2: What is the first amendment of the US Constitution

Answer: The first amendment of the US Constitution states: "Congress shall make no law respecting an establishment of religion, or prohibiting the free exercise thereof; or abridging the freedom of speech, or of the press; or the right of the people peaceably to assemble, 

In [22]:
# WARNING: called without arguments, this deletes every index returned by pinecone.list_indexes().
delete_pinecone_index()

Deleting all indexes ... 
Done


In [23]:
# Switch the demo to a Spanish Wikipedia article ('es' is the `lang` argument).
# NOTE: this rebinds `data`, `chunks`, `index_name` and `vector_store`; every later
# cell that uses `vector_store` now works against the 'chatgpt' index.
data = load_from_wikipedia('ChatGPT', 'es')
chunks = chunk_data(data)
index_name = 'chatgpt'
vector_store = insert_or_fetch_embeddings(index_name)

Creating index chatgpt and embeddings ... Done


In [24]:
# Alternative Spanish queries to try (roughly "What is ChatGPT" / "When was GPT launched"):
# query = "Qué es el chat gpt"
# query = "Cuándo se lanzó gpt"
# Active query, asked in Spanish against the Spanish-language index.
query = "Qué es InstructGPT"
answer = ask_and_get_answer(vector_store, query)
print(answer)

InstructGPT es un modelo de inteligencia artificial desarrollado por OpenAI. Este modelo es entrenado para seguir instrucciones escritas en un contexto conversacional y generar respuestas basadas en la información que ha aprendido. Utiliza una técnica conocida como "transformador" para interpretar y procesar el lenguaje natural.


In [21]:
# asking with memory
# NOTE(review): the execution counts show this cell was run (In [21]) before `vector_store`
# was rebound to the 'chatgpt' index (In [23]). On a top-to-bottom run it would query that
# index instead of the constitution one -- confirm and rebuild the intended index first.
# Start a new conversation with an empty history.
chat_history = []
query = "What is the last bill of rights in the US Constitution?"
result, chat_history = ask_with_memory(vector_store, query, chat_history)
print(result['answer'])
print(chat_history)

The last amendment in the Bill of Rights is the Tenth Amendment.
[('What is the last bill of rights in the US Constitution?', 'The last amendment in the Bill of Rights is the Tenth Amendment.')]


In [22]:
# Follow-up turn: passing `chat_history` back in gives the chain the previous exchange.
query = "How many amendments are in the US Constitution"
result, chat_history = ask_with_memory(vector_store, query, chat_history)
print(result['answer'])
print(chat_history)

Retrying langchain.chat_models.openai.ChatOpenAI.completion_with_retry.<locals>._completion_with_retry in 4.0 seconds as it raised ServiceUnavailableError: The server is overloaded or not ready yet..


There are currently 27 amendments in the US Constitution.
[('What is the last bill of rights in the US Constitution?', 'The last amendment in the Bill of Rights is the Tenth Amendment.'), ('How many amendments are in the US Constitution', 'There are currently 27 amendments in the US Constitution.')]


In [23]:
# "that number" can only be resolved from the accumulated chat history passed along here.
query = "Multiply that number by 2"
result, chat_history = ask_with_memory(vector_store, query, chat_history)
print(result['answer'])
print(chat_history)

The number of amendments in the US Constitution is 27. Multiplying this by 2 would give a result of 54.
[('What is the last bill of rights in the US Constitution?', 'The last amendment in the Bill of Rights is the Tenth Amendment.'), ('How many amendments are in the US Constitution', 'There are currently 27 amendments in the US Constitution.'), ('Multiply that number by 2', 'The number of amendments in the US Constitution is 27. Multiplying this by 2 would give a result of 54.')]
