<a href="https://colab.research.google.com/github/navaneethsanil/Pdf-Chat/blob/main/pdf_chat.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [2]:
import os
import getpass
import gradio as gr

from langchain_community.document_loaders import PyPDFLoader
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_huggingface import HuggingFaceEmbeddings
from langchain_pinecone import PineconeVectorStore
from langchain_mistralai import ChatMistralAI
from langchain import hub
from langchain.chains.combine_documents import create_stuff_documents_chain
from langchain.chains.retrieval import create_retrieval_chain

In [3]:
# Load variables from a local .env file (if one exists) into the process
# environment; the cell output shows load_dotenv()'s return value.
from dotenv import load_dotenv
load_dotenv()

True

In [None]:
# Collect the API keys and settings that the rest of the notebook reads from
# os.environ. Values that are already set (e.g. exported in the shell or loaded
# from a .env file) are kept, so re-running this cell does not prompt again.

# Secrets are read with getpass so they are never echoed into the cell output.
for secret_name in ("MISTRAL_API_KEY", "PINECONE_API_KEY"):
  if not os.environ.get(secret_name):
    os.environ[secret_name] = getpass.getpass(f"Enter {secret_name}: ")

# The index name is not a secret, so use a visible prompt to make typos obvious.
if not os.environ.get("INDEX_NAME"):
  os.environ["INDEX_NAME"] = input("Enter INDEX_NAME: ").strip()

if not os.environ.get("LANGSMITH_API_KEY"):
  os.environ["LANGSMITH_API_KEY"] = getpass.getpass("Enter LANGSMITH_API_KEY: ")

# Tracing is switched on through the environment; a plain Python variable on
# its own is never seen by LangChain/LangSmith, so export it as well.
LANGCHAIN_TRACING_V2 = True
os.environ["LANGCHAIN_TRACING_V2"] = str(LANGCHAIN_TRACING_V2).lower()

# **Upload the document to the Pinecone vector database**

In [8]:
# Sentence-transformers checkpoint used to embed both the document chunks
# (at upload time) and the user queries (at retrieval time).
EMBEDDING_MODEL_NAME = "sentence-transformers/all-mpnet-base-v2"
embedding_model = HuggingFaceEmbeddings(model_name=EMBEDDING_MODEL_NAME)

In [None]:
def upload_document(doc_path: str):
  """Load a PDF, split it into overlapping chunks and ingest them into Pinecone.

  Args:
    doc_path: Location of the PDF handed to ``PyPDFLoader``.

  Returns:
    ``"Document uploaded successfully"`` when the chunks were ingested, or
    ``None`` when any step failed (the error is printed, not raised).
  """
  try:
    # Fail early with a clear message instead of handing ``None`` to Pinecone.
    index_name = os.getenv('INDEX_NAME')
    if not index_name:
      raise ValueError("INDEX_NAME environment variable is not set")

    # Load the document page by page
    pages = list(PyPDFLoader(doc_path).lazy_load())

    # Split the pages into 500-character chunks with a 100-character overlap
    text_splitter = RecursiveCharacterTextSplitter(
      chunk_size=500,
      chunk_overlap=100,
      add_start_index=True
    )
    text_chunks = text_splitter.split_documents(pages)

    # Nothing to ingest (e.g. a PDF without extractable text): reporting
    # success here would be misleading.
    if not text_chunks:
      raise ValueError(f"No text could be extracted from {doc_path!r}")

    # Pinecone setup (embed the chunks and ingest them into the index)
    PineconeVectorStore.from_documents(
      text_chunks,
      embedding_model,
      index_name=index_name,
    )

    return "Document uploaded successfully"
  except Exception as e:
    # Best-effort contract: report the problem and signal failure with None
    # rather than raising.
    print(f"Document upload failed, error: {e}")
    return None

In [None]:
# Ingest the sample PDF; the status string returned on success is shown as the cell output.
upload_document("human_nutrition.pdf")

'Document uploaded successfully'

# **Retrieval and Generation**

In [21]:
def query_document(query: str):
  """Answer a question using the chunks stored in the Pinecone index.

  Builds a retrieval chain (Pinecone retriever + Mistral chat model + the
  ``langchain-ai/retrieval-qa-chat`` prompt pulled from the hub) and runs
  the query through it.

  Args:
    query: The question to ask; must contain non-whitespace text.

  Returns:
    The ``'answer'`` entry of the chain result.

  Raises:
    ValueError: If ``query`` is blank or the ``INDEX_NAME`` environment
      variable is not set.
  """
  # Validate the inputs before any client is built or network call is made.
  if not query or not query.strip():
    raise ValueError("query must be a non-empty string")

  index_name = os.getenv("INDEX_NAME")
  if not index_name:
    raise ValueError("INDEX_NAME environment variable is not set")

  llm = ChatMistralAI(
    model="mistral-large-latest",
    temperature=0,
    max_retries=2
  )

  vectorstore = PineconeVectorStore(index_name=index_name, embedding=embedding_model)

  retrieval_qa_chat_prompt = hub.pull("langchain-ai/retrieval-qa-chat")
  combine_docs_chain = create_stuff_documents_chain(llm, retrieval_qa_chat_prompt)

  retrieval_chain = create_retrieval_chain(
    retriever=vectorstore.as_retriever(), combine_docs_chain=combine_docs_chain
  )

  result = retrieval_chain.invoke(input={"input": query})
  print("Answer: ")
  return result['answer']

In [22]:
# Smoke-test the retrieval chain with a question the uploaded PDF can answer.
query_document(query="What are macronutrients?")

Answer: 


'Macronutrients are nutrients that are needed in large amounts. There are three classes of macronutrients: carbohydrates, lipids, and proteins. These can be metabolically processed into cellular energy. The energy from macronutrients comes from their chemical bonds, which is then converted into cellular energy that is utilized to perform work, allowing our bodies to conduct their basic functions.'