In [1]:
%pwd

'c:\\Projects\\Rag-Chatbot\\research'

In [3]:
import os
os.chdir("../")

In [24]:
%pwd

'c:\\Projects\\Rag-Chatbot'

In [25]:
import langchain
import pinecone
from langchain.document_loaders import PyPDFLoader, DirectoryLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.embeddings.openai import OpenAIEmbeddings
from langchain.vectorstores import Pinecone
from langchain.llms import OpenAI

In [26]:
#Extract data from pdf
def load_pdf_file(data):
    loader = DirectoryLoader(data, glob="*.pdf", loader_cls=PyPDFLoader)
    documents = loader.load()
    return documents

In [27]:
extracted_data = load_pdf_file(data='Data/')

In [28]:
extracted_data

[Document(metadata={'producer': 'pdfTeX-1.40.18', 'creator': "LaTeX with 'moderncv' package", 'creationdate': '2023-04-04T05:04:00+00:00', 'author': 'Monisha Jegadeesan gray', 'keywords': 'Monisha Jegadeesan gray , curriculum vitæ, resumé', 'moddate': '2023-04-04T05:04:00+00:00', 'ptex.fullbanner': 'This is pdfTeX, Version 3.14159265-2.6-1.40.18 (TeX Live 2017) kpathsea version 6.2.3', 'subject': 'Resumé of Monisha Jegadeesan gray', 'title': 'Monisha Jegadeesan gray  –  Software Engineer, Google', 'trapped': '/False', 'source': 'Data\\cv.pdf', 'total_pages': 3, 'page': 0, 'page_label': '1'}, page_content='Monisha Jegadeesan\nSoftware Engineer, Google\nH +91 9035212894\nB monishaj.65@gmail.com\nÍ monisha-jega.github.io\nmonisha-jega\nmonisha-jegadeesan\nEducation\n2015-2020 Dual Degree (B.Tech + M.Tech) in Computer Science and Engineering\nIndian Institute of Technology Madras, Chennai, India CGPA: 8.78\n2015 XII - Karnataka Board,KLE Society’s Independent PU College, Bangalore 97.30 %\

In [29]:
def chunk_data(extracted_data,chunk_size=300,chunk_overlap=20):
    text_splitter=RecursiveCharacterTextSplitter(chunk_size=chunk_size,chunk_overlap=chunk_overlap)
    text_chunks=text_splitter.split_documents(extracted_data)
    return text_chunks

In [30]:
text_chunks = chunk_data(extracted_data=extracted_data)
len(text_chunks)

40

In [14]:
#continue from here

In [31]:
from langchain.embeddings import HuggingFaceEmbeddings

In [32]:
def download_hugging_face_embeddings():
    embeddings = HuggingFaceEmbeddings(model_name='sentence-transformers/all-MiniLM-L6-v2')
    return embeddings

In [33]:
embeddings = download_hugging_face_embeddings()

In [34]:
query_result = embeddings.embed_query("Hello!!")
len(query_result)

384

In [35]:
from dotenv import load_dotenv
load_dotenv()

PINECONE_API_KEY = os.getenv("pinecone_API_KEY")
OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")

In [36]:
from pinecone import Pinecone
from pinecone import ServerlessSpec

pc = Pinecone(api_key=PINECONE_API_KEY)

index_name = "rag-chatbot"

if not pc.has_index(index_name):
    pc.create_index(
        name=index_name,
        vector_type="dense",
        dimension=384,
        metric="cosine",
        spec=ServerlessSpec(
            cloud="aws",
            region="us-east-1"
        ),
        deletion_protection="disabled",
        tags={
            "environment": "development"
        }
    )

In [37]:
from langchain_pinecone import PineconeVectorStore

docsearch = PineconeVectorStore.from_documents(
    documents=text_chunks,
    index_name=index_name,
    embedding=embeddings
)

In [47]:
"""docsearch = PineconeVectorStore.from_existing_index(
    index_name=index_name,
    embedding=embeddings
)"""

'docsearch = PineconeVectorStore.from_existing_index(\n    index_name=index_name,\n    embedding=embeddings\n)'

In [38]:
retriever = docsearch.as_retriever(search_type="similarity", search_kwargs={"k":3})

In [39]:
retrived_docs = retriever.invoke("when did monisha create breakout game??")

In [40]:
retrived_docs

[Document(id='4ba4da4c-42a0-4718-a7f5-ee0ec8e2040f', metadata={'author': 'Monisha Jegadeesan gray', 'creationdate': '2023-04-04T05:04:00+00:00', 'creator': "LaTeX with 'moderncv' package", 'keywords': 'Monisha Jegadeesan gray , curriculum vitæ, resumé', 'moddate': '2023-04-04T05:04:00+00:00', 'page': 1.0, 'page_label': '2', 'producer': 'pdfTeX-1.40.18', 'ptex.fullbanner': 'This is pdfTeX, Version 3.14159265-2.6-1.40.18 (TeX Live 2017) kpathsea version 6.2.3', 'source': 'Data\\cv.pdf', 'subject': 'Resumé of Monisha Jegadeesan gray', 'title': 'Monisha Jegadeesan gray  –  Software Engineer, Google', 'total_pages': 3.0, 'trapped': '/False'}, page_content='Developed an Android application for the Breakout game with basic playing and scoring features.\nTeaching Experience\nJan 2020 -\nMay 2020\nNatural Language Processing - Course Teaching Assistant,Indian Institute of Technology Madras'),
 Document(id='803a4e8b-b93d-48ab-afd8-71ce73d7c263', metadata={'author': 'Monisha Jegadeesan gray', 'cr

In [41]:
from langchain.llms import OpenAI
llm = OpenAI(temperature=0.4, max_tokens=500)

  llm = OpenAI(temperature=0.4, max_tokens=500)


In [42]:
from langchain.chains import create_retrieval_chain
from langchain.chains.combine_documents import create_stuff_documents_chain
from langchain_core.prompts import ChatPromptTemplate

system_prompt = (
    "you are an assistant in question-answering tasks."
    "use the following pieces of retrieved context to answer"
    "the question. If you dont know the answer, politely say that "
    "you dont know the answer and give a generic response based on your intelligence."
    "{context}"
)

prompt = ChatPromptTemplate.from_messages(
    [
        ("system", system_prompt),
        ("human", "{input}")
    ]
)

In [43]:
question_answer_chain = create_stuff_documents_chain(llm, prompt)
rag_chain = create_retrieval_chain(retriever, question_answer_chain)

In [44]:
response = rag_chain.invoke({"input": "When did monisha create the breakout game as her project?" })
print(response["answer"])



System: Monisha created the Breakout game as her project during her time as a student at Indian Institute of Technology Madras, from 2015 to 2020.
