In [2]:
# `os` is used by later cells to read credentials from the environment.
import os
from dotenv import load_dotenv, find_dotenv
# Load the variables of the located .env file into the process environment.
# override=True lets values from the file replace variables that are already
# set. Later cells read PINECONE_API_KEY and PINECONE_ENV via os.environ.
load_dotenv(find_dotenv(),override=True)

True

In [5]:
pip install pypdf -q

Note: you may need to restart the kernel to use updated packages.


In [6]:
pip install docx2txt -q

Note: you may need to restart the kernel to use updated packages.


In [7]:
# NOTE: The loader imports live inside the function on purpose: each optional
# dependency is imported only when a file of that type is actually loaded, and
# the function stays self-contained if it is later moved into a module.
def load_document(file):
    """Load a PDF or Word document through the matching LangChain loader.

    Args:
        file: Path to the document. The loader is picked from the file
            extension (compared case-insensitively); ``.pdf`` and ``.docx``
            are supported.

    Returns:
        Whatever the selected loader's ``load()`` returns, or ``None`` when
        the extension is not supported.
    """
    import os
    _, extension = os.path.splitext(file)
    # Compare in lower case so that 'REPORT.PDF' or 'notes.Docx' also match.
    normalized_extension = extension.lower()

    print(f'Loading {file}...')
    if normalized_extension == ".pdf":
        from langchain.document_loaders import PyPDFLoader
        loader = PyPDFLoader(file)
    elif normalized_extension == ".docx":
        from langchain.document_loaders import Docx2txtLoader
        loader = Docx2txtLoader(file)
    else:
        # Return early: falling through to loader.load() with `loader`
        # unbound raised UnboundLocalError right after this message.
        print(f'Document extension {extension} is not supported.')
        return None

    return loader.load()

### Running Code

In [36]:
# Load the Constitution PDF; `data` holds whatever load_document returns.
data = load_document('./documents/us-constitution.pdf')
# Optional spot checks of individual entries (left disabled):
# print(data[1].page_content)
# print(data[10].metadata)
print(f'You have {len(data)} pages in your data.')

Loading ./documents/us-constitution.pdf...
You have 19 pages in your data.


In [11]:
# Demonstrate the .docx branch of load_document.
# NOTE: this rebinds `data`, replacing whatever an earlier cell stored in it.
data = load_document('./documents/word-data.docx')
# Optional spot checks (left disabled):
# print(data[1].page_content)
# print(data[10].metadata)
# print(f'You have {len(data)} pages in your data.')
# Show the text of the first returned document.
print(data[0].page_content)

Loading ./documents/word-data.docx...


### Get data from Wikipedia

In [15]:
pip install wikipedia

In [16]:
def load_from_wikipedia(query, lang='en', load_max_docs=2):
    """Fetch documents for ``query`` through LangChain's ``WikipediaLoader``.

    Args:
        query: Search text handed to the loader.
        lang: Language code handed to the loader (default ``'en'``).
        load_max_docs: Document limit handed to the loader (default 2).

    Returns:
        The result of the loader's ``load()`` call.
    """
    from langchain.document_loaders import WikipediaLoader

    loader_options = {
        'query': query,
        'lang': lang,
        'load_max_docs': load_max_docs,
    }
    return WikipediaLoader(**loader_options).load()

In [17]:
# Request at most one English result for 'Google BARD'.
# NOTE: this rebinds `data`, replacing whatever an earlier cell stored in it.
data = load_from_wikipedia('Google BARD','en',1)
# Optional: show the fetched text (left disabled).
#print(data[0].page_content)
# TODO(review): prints a constant; looks like leftover debugging output.
print(1)

1


In [18]:
def chunk_data(data, chunk_size=256, chunk_overlap=0):
    """Split documents into chunks with ``RecursiveCharacterTextSplitter``.

    Args:
        data: Documents to split (e.g. the result of ``load_document``).
        chunk_size: Forwarded to the splitter as ``chunk_size`` (default 256).
        chunk_overlap: Forwarded to the splitter as ``chunk_overlap``.
            Defaults to 0, the value that used to be hard-coded.

    Returns:
        The result of the splitter's ``split_documents(data)`` call.
    """
    from langchain.text_splitter import RecursiveCharacterTextSplitter

    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
    )
    return text_splitter.split_documents(data)
    

In [37]:
# `data` is rebound by the .docx and Wikipedia demo cells earlier in the
# notebook, so reload the PDF here: the chunks then always come from the
# Constitution, whatever order the cells were executed in.
data = load_document('./documents/us-constitution.pdf')
chunks = chunk_data(data)
print(len(chunks))
# Peek at one chunk as a sanity check of the split.
print(chunks[2].page_content)

247
the general  Welfare, and secure the Blessings of Liberty to 
ourselves  and our Posterity,  do ordain  and establish  this 
Constitution for the United States of America  
 
 
Article.   I. 
SECTION.  1


### Calculating the Cost

In [38]:
def print_embedding_cost(texts, price_per_1k_tokens=0.0004):
    """Print the token count of ``texts`` and an estimated embedding cost.

    Tokens are counted with the tiktoken encoding for
    ``text-embedding-ada-002``.

    Args:
        texts: Iterable of objects exposing a ``page_content`` string.
        price_per_1k_tokens: USD price per 1000 tokens. Defaults to 0.0004,
            the value that used to be hard-coded; prices change, so check
            the provider's current pricing before relying on the estimate.

    Returns:
        None. The totals are printed.
    """
    import tiktoken

    enc = tiktoken.encoding_for_model('text-embedding-ada-002')
    # Generator expression: no need to build an intermediate list for sum().
    total_tokens = sum(len(enc.encode(page.page_content)) for page in texts)
    print(f'Total Tokens: {total_tokens}')
    print(f'Embedding Cost in USD: {total_tokens/1000 * price_per_1k_tokens:.6f}')

# Report token count and estimated cost for the chunks before embedding them.
print_embedding_cost(chunks)

Total Tokens: 12912
Embedding Cost in USD: 0.005165


### Embedding and Uploading to a Vector Database (Pinecone)

In [39]:
def insert_of_fetch_embeddings(index_name, documents=None):
    """Return a Pinecone vector store for ``index_name``, creating it if needed.

    If the index already exists it is opened as-is. Otherwise a new
    1536-dimension cosine index is created and ``documents`` are embedded
    with ``OpenAIEmbeddings`` and uploaded into it.

    The function keeps its original spelling ("of" rather than "or") so
    that existing callers keep working.

    Args:
        index_name: Name of the Pinecone index to load or create.
        documents: Documents to embed when the index has to be created.
            When omitted, the notebook-level ``chunks`` variable is used,
            which is what the function did before this parameter existed.

    Returns:
        The LangChain ``Pinecone`` vector store bound to the index.
    """
    # NOTE: two different "pinecone" names are in play here:
    #   1) pinecone (lower-case p, imported directly): the client library
    #   2) Pinecone (upper-case P, from langchain.vectorstores): a class
    import os
    import pinecone
    from langchain.vectorstores import Pinecone
    from langchain.embeddings.openai import OpenAIEmbeddings

    # No API key is passed explicitly; the original note says the key is
    # picked up automatically once the .env file has been loaded.
    embeddings = OpenAIEmbeddings()

    pinecone.init(api_key=os.environ.get('PINECONE_API_KEY'),
                  environment=os.environ.get('PINECONE_ENV'))

    if index_name in pinecone.list_indexes():
        print(f'Index {index_name} already exists. Loadings Embeddings...',
              end='')
        vector_store = Pinecone.from_existing_index(index_name, embeddings)
        print('Done')
    else:
        # Resolve the documents BEFORE creating the index, so a missing
        # input fails early instead of leaving an empty index behind.
        if documents is None:
            documents = chunks  # legacy fallback to the notebook global

        # Go to https://docs.pinecone.io/reference/create_index/ for
        # documentation of the create_index method.
        print(f'Creating Index {index_name} and embeddings ...', end='')

        # NOTE(review): the dimension must match the size of the vectors
        # produced by `embeddings` - confirm if the model is ever changed.
        pinecone.create_index(index_name, dimension=1536, metric='cosine')

        vector_store = Pinecone.from_documents(documents,
                                               embeddings,
                                               index_name=index_name)
        print('Done')

    return vector_store

In [40]:
def delete_pinecone_index(index_name='all'):
    """Delete one Pinecone index, or every listed index.

    Args:
        index_name: Name of the index to delete. The default ``'all'``
            deletes every index returned by ``pinecone.list_indexes()``.

    Returns:
        None. Progress messages are printed.
    """
    import pinecone

    pinecone.init(
        api_key=os.environ.get('PINECONE_API_KEY'),
        environment=os.environ.get('PINECONE_ENV'),
    )

    if index_name != 'all':
        print(f'Deleting index {index_name} ...')
        doomed = [index_name]
    else:
        doomed = pinecone.list_indexes()
        print('Deleting all indexes ...')

    for target in doomed:
        pinecone.delete_index(target)

    print('Done')


In [41]:
# WARNING: called without an argument this uses index_name='all' and deletes
# EVERY index returned by pinecone.list_indexes() for the configured
# credentials, not only the index used in this notebook.
delete_pinecone_index()

Deleting all indexes ...
Done


In [42]:
# Name of the Pinecone index that backs this notebook's question answering.
index_name = 'askadocument'
# Reuses the index if it already exists; otherwise creates it and uploads
# embeddings of the notebook-level `chunks`.
vector_store = insert_of_fetch_embeddings(index_name)

Creating Index askadocument and embeddings ...Done


### Asking & Getting Answers

In [44]:
def ask_and_get_answer(vector_store, q):
    """Answer question ``q`` with a RetrievalQA chain over ``vector_store``.

    Args:
        vector_store: Object exposing ``as_retriever``; it is queried with
            similarity search and ``k=3``.
        q: The question text.

    Returns:
        The value returned by the chain's ``run(q)`` call.
    """
    from langchain.chains import RetrievalQA
    from langchain.chat_models import ChatOpenAI

    qa_chain = RetrievalQA.from_chain_type(
        llm=ChatOpenAI(model='gpt-3.5-turbo', temperature=1),
        chain_type='stuff',
        retriever=vector_store.as_retriever(
            search_type='similarity',
            search_kwargs={'k': 3},
        ),
    )
    return qa_chain.run(q)


In [46]:
# Broad smoke-test question against the indexed document.
q='What is the content about?'
answer = ask_and_get_answer(vector_store,q)
print(answer)

The content is about the National Constitution Center in Philadelphia, PA and its mission to promote awareness and understanding of the United States Constitution, its history, and its relevance to people's daily lives.


In [48]:
# Second sample question; reuses the same vector store.
q='what country is this about?'
answer = ask_and_get_answer(vector_store,q)
print(answer)

This text is about the United States.


In [50]:
import time

# Interactive question loop: keeps prompting until the user types
# "quit" or "exit" (any letter case, surrounding whitespace ignored).
i = 1
print('Write Quit or Exit to quit.')
while True:
    # Strip so that inputs such as "quit " are still recognised as commands.
    q = input(f'Question #{i}: ').strip()

    if not q:
        # Nothing typed: prompt again instead of sending an empty question
        # (and paying for an LLM call).
        continue

    if q.lower() in ('quit', 'exit'):
        print('Thanks for stopping by')
        time.sleep(2)
        break

    # Count only real questions, so the numbering has no gaps.
    i += 1

    answer = ask_and_get_answer(vector_store, q)
    print(f'\nAnswer:{answer}')
    print(f'\n {"-" *60} \n')
    

Write Quit or Exit to quit.
Question #1: Q1: What is bill or rights? Q2: How does this document address presidential succession? Answer both questions

Answer:Q1: The Bill of Rights refers to the first ten amendments to the Constitution of the United States. These amendments were ratified by the states to ensure certain fundamental rights and freedoms of the people, such as freedom of speech, religion, and the right to bear arms.

Q2: The document does not directly address presidential succession. It mentions the eligibility criteria for the Office of President, stating that only natural born citizens or citizens at the time of the Constitution's adoption can hold the office. However, the specific process of presidential succession is outlined in the Presidential Succession Act of 1947, which establishes the order of succession in case the President is unable to perform their duties.

 ------------------------------------------------------------ 

Question #2: Can explain this document