## Install packages

In [None]:
%pip install langchain
%pip install openai
%pip install -qU pinecone-client pandas

In [None]:
%pip install unstructured

In [None]:
%pip install unstructured[local-inference]

In [None]:
%pip install tiktoken

## Load Data - pdf

In [None]:
from langchain.document_loaders import UnstructuredPDFLoader, OnlinePDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter

In [None]:
loader = UnstructuredPDFLoader("./budget_english_speech_2022.pdf")

In [None]:
data = loader.load()

In [None]:
print (f'You have {len(data)} document(s) in your data')
print (f'There are {len(data[0].page_content)} characters in your document')

### Chunk your data up into smaller documents

In [None]:
text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=0)
texts = text_splitter.split_documents(data)

In [None]:
print (f'Now you have {len(texts)} documents')

In [None]:
texts[12].page_content

## Create embeddings of your documents to get ready for semantic search

In [None]:
from langchain.vectorstores import Chroma, Pinecone
from langchain.embeddings.openai import OpenAIEmbeddings
import pinecone

In [None]:
import os

# Credentials are read from the process environment so that no secret is ever
# stored in the notebook or committed to version control. Export
# OPENAI_API_KEY and PINECONE_API_KEY before starting the kernel; a variable
# that is not set yields None here.
# NOTE: any key that was previously hard-coded in this cell must be treated
# as leaked and rotated in the provider's console.
OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")
PINECONE_API_KEY = os.getenv("PINECONE_API_KEY")

# The Pinecone environment (region) is not a secret; keep the project's
# region as the default but allow an override via PINECONE_API_ENV.
PINECONE_API_ENV = os.getenv("PINECONE_API_ENV", "northamerica-northeast1-gcp")

In [None]:
# Index name: budget-speech
# Dimensions: NOTE(review) - the original note left this blank; presumably the
# Pinecone index dimension has to match the embedding model's output size.
# Confirm the value in the Pinecone console.

# OpenAI embedding client, authenticated with the key defined above.
embeddings = OpenAIEmbeddings(openai_api_key=OPENAI_API_KEY)

In [None]:
# Name of the Pinecone index used for uploads and queries in this notebook.
index_name = "budget-speech"

# Configure the Pinecone client.
#   api_key     - find at app.pinecone.io
#   environment - shown next to the api key in the console
pinecone.init(environment=PINECONE_API_ENV, api_key=PINECONE_API_KEY)

In [None]:
docsearch = Pinecone.from_texts([t.page_content for t in texts], embeddings, index_name=index_name)

In [None]:
query = "What are some key highlights of budget?"
docs = docsearch.similarity_search(query, include_metadata=True)

In [None]:
docs

## Query using Index in Pinecone

In [None]:
# Show which class the `docsearch` object is an instance of.
type(docsearch)

In [None]:
from langchain.llms import OpenAI
from langchain.chains.question_answering import load_qa_chain

In [None]:
llm = OpenAI(temperature=0.9, openai_api_key=OPENAI_API_KEY)
chain = load_qa_chain(llm, chain_type="map_reduce") #stuff 0.06

In [None]:
p_vectorstore = Pinecone.from_existing_index(index_name, embeddings)


In [None]:
query = "What are some key highlights of budget?"
docs = p_vectorstore.similarity_search(query, include_metadata=True)

In [None]:
chain.run(input_documents=docs, question=query)

In [None]:
query = "In what context Narendra Modi was mentioned?"
docs = p_vectorstore.similarity_search(query, include_metadata=True)
chain.run(input_documents=docs, question=query, verbose=True)

In [None]:
query = "In what context Prime Minister was mentioned? List few of them"
docs = p_vectorstore.similarity_search(query, include_metadata=True)
chain.run(input_documents=docs, question=query, verbose=True)

In [None]:
# with chain_type ="stuff"
# Build the "stuff" chain explicitly in this cell. Previously the cell reused
# the shared `chain`, which is created with chain_type="map_reduce", so a
# top-to-bottom run never exercised the "stuff" strategy this cell is meant
# to demonstrate.
stuff_chain = load_qa_chain(llm, chain_type="stuff")

query = "What did Prime Minister say during Independence Day?"
docs = p_vectorstore.similarity_search(query, include_metadata=True)
stuff_chain.run(input_documents=docs, question=query, verbose=True)

In [None]:
# with chain_type ="map_reduce"
query = "What did Prime Minister say during Independence Day?"
docs = p_vectorstore.similarity_search(query, include_metadata=True)
chain.run(input_documents=docs, question=query, verbose=True)

In [None]:
query = "What did Narendra Modi say during Independence Day?"
docs = p_vectorstore.similarity_search(query, include_metadata=True)
chain.run(input_documents=docs, question=query, verbose=True)

In [None]:
query = "What was spoken about veterans or millitary personnels? Did they get any benefit?"
docs = p_vectorstore.similarity_search(query, include_metadata=True)
chain.run(input_documents=docs, question=query, verbose=True)

In [None]:
query = "What was spoken about veterans or millitary personnels? Did they get any benefit?"
docs = p_vectorstore.similarity_search(query, include_metadata=True, k=10)
chain.run(input_documents=docs, question=query, verbose=True)