<a href="https://colab.research.google.com/github/nehaa28/llm/blob/main/PDF_Chat_with_Pinecone.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Installation

In [None]:
%%writefile requirements.txt
# Runtime dependencies for the PDF question-answering notebook.
unstructured
tiktoken
pinecone-client
pypdf
openai
langchain
# Provides the loaders behind `langchain.document_loaders` (PyPDFDirectoryLoader).
langchain-community
# Provides `langchain_pinecone.PineconeVectorStore`, imported by this notebook.
langchain-pinecone
pandas
numpy
python-dotenv
accelerate
transformers
langchain-huggingface

Overwriting requirements.txt


In [None]:
!pip install -r requirements.txt

Collecting langchain-huggingface (from -r requirements.txt (line 12))
  Downloading langchain_huggingface-0.0.3-py3-none-any.whl.metadata (1.2 kB)
Downloading langchain_huggingface-0.0.3-py3-none-any.whl (17 kB)
Installing collected packages: langchain-huggingface
Successfully installed langchain-huggingface-0.0.3


# Imports

In [None]:
import langchain
from pinecone import Pinecone, ServerlessSpec
from langchain.document_loaders import PyPDFDirectoryLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from pinecone import ServerlessSpec
# from langchain_pinecone import Pinecone
# from pinecone.grpc import PineconeGRPC as Pinecone
# from langchain.vectorstores import Pinecone
from langchain_pinecone import PineconeVectorStore
from langchain_huggingface import HuggingFaceEmbeddings, HuggingFacePipeline
from langchain.chains.question_answering import load_qa_chain

In [None]:
import os
from google.colab import userdata

# Publish the value stored under the Colab user-data entry 'PINECONE_KEY' as an
# environment variable of the same name, so later cells read it from os.environ.
os.environ.update(PINECONE_KEY=userdata.get('PINECONE_KEY'))


In [None]:

## Read the PDF documents
def read_doc(directory):
    """Load the PDF files found in a directory.

    Args:
        directory: Path of the folder handed to ``PyPDFDirectoryLoader``.

    Returns:
        Whatever ``PyPDFDirectoryLoader(directory).load()`` produces
        (a list of loaded documents).
    """
    return PyPDFDirectoryLoader(directory).load()

# Load the PDFs from the local 'documents/' folder through read_doc.
doc = read_doc('documents/')
# Display how many items were loaded.
len(doc)

2

In [None]:
def chunk_data(docs,chunk_size=800,chunk_overlap=50):
    """Split loaded documents into smaller, overlapping chunks.

    Args:
        docs: Documents to split (e.g. the list returned by ``read_doc``).
        chunk_size: Maximum size of each chunk, forwarded to
            ``RecursiveCharacterTextSplitter``.
        chunk_overlap: Overlap between consecutive chunks, forwarded to
            ``RecursiveCharacterTextSplitter``.

    Returns:
        The list of chunks produced by ``split_documents``.
    """
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
    )
    # Return the splitter's output. The previous version returned the untouched
    # input `docs`, so no chunking ever reached the caller.
    return text_splitter.split_documents(docs)


# Run chunk_data over the loaded documents using its default chunk_size and
# chunk_overlap.
documents=chunk_data(docs=doc)
# Display how many items chunk_data returned.
len(documents)

2

In [None]:
# Build the embedding model used both for indexing and for querying.
embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")
# Embed a sample string and display the vector length (384 in the recorded run).
# NOTE(review): the Pinecone index presumably has to use this same dimension - confirm.
query_result = embeddings.embed_query("Hello world")
len(query_result)

384

In [None]:
# Create the Pinecone client from the API key stored in the PINECONE_KEY
# environment variable (set in an earlier cell).
pc = Pinecone(api_key=os.environ['PINECONE_KEY'])

In [None]:
index_name = "pdf-qa-index"

# Create the index on first run so the notebook works from a fresh Pinecone
# project; an existing index with this name is reused untouched.
# The dimension is taken from the embedding model so the two cannot drift apart.
# NOTE: cloud/region below are defaults chosen here - adjust to match your plan.
if index_name not in pc.list_indexes().names():
    pc.create_index(
        name=index_name,
        dimension=len(embeddings.embed_query("dimension probe")),
        metric="cosine",
        spec=ServerlessSpec(cloud="aws", region="us-east-1"),
    )

# Handle to the index plus the LangChain vector-store wrapper that embeds text
# with `embeddings` before reading from / writing to the index.
index = pc.Index(index_name)
vector_store = PineconeVectorStore(index=index, embedding=embeddings)

In [None]:
# Upload the chunked documents (`documents`), not the raw loader output (`doc`),
# so retrieval works on the chunks produced by chunk_data.
# NOTE(review): no ids are passed, so re-running this cell presumably inserts the
# same content again under new ids - confirm before re-executing.
vector_store.add_documents(documents)

['47142fb9-f55c-4b5c-9a3b-11bf2a59a832',
 'a2d026bc-8898-4010-b42e-f2ad0d008d6c']

In [None]:
def retrieve_query(query,k=2):
    """Fetch the stored documents most similar to a query.

    Args:
        query: Text to search for in the module-level ``vector_store``.
        k: Number of results requested from ``similarity_search``.

    Returns:
        The result of ``vector_store.similarity_search(query, k=k)``.
    """
    return vector_store.similarity_search(query, k=k)

In [None]:
# Smoke-test retrieval through the retrieve_query helper defined above, instead
# of repeating its similarity_search call here.
results = retrieve_query("What is the main topic of this document?", k=2)

# Print the text of each retrieved document as a bullet.
for res in results:
    print(f"* {res.page_content}")

In [None]:
# Build the LLM with from_model_id so the transformers pipeline is actually
# constructed; calling HuggingFacePipeline(model_id=...) directly only stores the
# id and leaves the wrapper without a pipeline to run.
# T0_3B is a sequence-to-sequence model, hence the "text2text-generation" task.
# Generation settings go in pipeline_kwargs; do_sample=True is required for the
# temperature setting to have any effect.
llm = HuggingFacePipeline.from_model_id(
    model_id="bigscience/T0_3B",
    task="text2text-generation",
    pipeline_kwargs={"temperature": 0.1, "max_length": 512, "do_sample": True},
)
# "stuff" places all retrieved documents into a single prompt for the LLM.
chain = load_qa_chain(llm, chain_type="stuff")


stuff: https://python.langchain.com/v0.2/docs/versions/migrating_chains/stuff_docs_chain
map_reduce: https://python.langchain.com/v0.2/docs/versions/migrating_chains/map_reduce_chain
refine: https://python.langchain.com/v0.2/docs/versions/migrating_chains/refine_chain
map_rerank: https://python.langchain.com/v0.2/docs/versions/migrating_chains/map_rerank_docs_chain

See also guides on retrieval and question-answering here: https://python.langchain.com/v0.2/docs/how_to/#qa-with-rag
  chain = load_qa_chain(llm, chain_type="stuff")


In [None]:
query = "What is the document about?"
# Retrieve the documents most relevant to the question.
doc_search=retrieve_query(query)
print(doc_search)
# Chain.run is deprecated; invoke() takes the same inputs as a dict and returns
# a dict whose "output_text" entry holds the generated answer.
response = chain.invoke({"input_documents": doc_search, "question": query})["output_text"]
# Leave the answer as the cell's last expression so it is displayed.
response
