In [1]:
from langchain.document_loaders import PyPDFLoader, DirectoryLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter

In [2]:
#Extract Data From the PDF File
def load_pdf_file(data):
    """Load the PDF files matching ``*.pdf`` in the ``data`` directory.

    Args:
        data: Directory path handed to ``DirectoryLoader``.

    Returns:
        The documents returned by ``DirectoryLoader.load()``, with each
        matched file read through ``PyPDFLoader``.
    """
    pdf_loader = DirectoryLoader(data, glob="*.pdf", loader_cls=PyPDFLoader)
    return pdf_loader.load()

In [3]:
# NOTE(review): the directory path is an empty string; presumably this resolves
# to the current working directory — confirm, or point it at the PDF folder.
extracted_data = load_pdf_file(data='')

In [4]:
#Split the Data into Text Chunks
def text_split(extracted_data):
    """Split documents into text chunks.

    Args:
        extracted_data: Documents to split; passed straight to
            ``RecursiveCharacterTextSplitter.split_documents``.

    Returns:
        The chunks produced with ``chunk_size=500`` and ``chunk_overlap=20``.
    """
    splitter = RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=20)
    return splitter.split_documents(extracted_data)


In [5]:
# Chunk the loaded documents so each piece can be embedded separately.
text_chunks = text_split(extracted_data)

In [6]:
# Sanity check: report how many chunks the splitter produced.
print("Length of Text Chunks", len(text_chunks))

Length of Text Chunks 4


In [7]:
from langchain.embeddings import HuggingFaceEmbeddings

#Download the Embeddings from Hugging Face
def download_hugging_face_embeddings():
    """Create the Hugging Face embedding wrapper used by this notebook.

    Returns:
        A ``HuggingFaceEmbeddings`` instance configured with the
        ``sentence-transformers/all-MiniLM-L6-v2`` model.
    """
    model_id = 'sentence-transformers/all-MiniLM-L6-v2'
    return HuggingFaceEmbeddings(model_name=model_id)


In [9]:
# import sentence_transformers
# Instantiate the embedding model (the recorded output below shows the model
# files being downloaded during this run).
embeddings = download_hugging_face_embeddings()

Downloading: 100%|██████████| 90.3M/90.3M [00:10<00:00, 8.92MB/s]
Downloading: 100%|██████████| 211k/211k [00:00<00:00, 497kB/s]
Downloading: 100%|██████████| 22.9M/22.9M [00:02<00:00, 8.35MB/s]
Downloading: 100%|██████████| 368k/368k [00:00<00:00, 821kB/s]
Downloading: 100%|██████████| 90.9M/90.9M [00:12<00:00, 7.03MB/s]
Downloading: 100%|██████████| 53.0/53.0 [00:00<?, ?B/s]
Downloading: 100%|██████████| 112/112 [00:00<?, ?B/s] 
Downloading: 100%|██████████| 466k/466k [00:00<00:00, 979kB/s]
Downloading: 100%|██████████| 350/350 [00:00<?, ?B/s] 
Downloading: 100%|██████████| 13.2k/13.2k [00:00<?, ?B/s]
Downloading: 100%|██████████| 232k/232k [00:00<00:00, 982kB/s]
Downloading: 100%|██████████| 349/349 [00:00<?, ?B/s] 
  return torch.load(checkpoint_file, map_location="cpu")


In [10]:
from dotenv import load_dotenv
# Load settings via python-dotenv; the value displayed below is the call's
# return value. Presumably this reads a local .env file — confirm its location.
load_dotenv()

True

In [11]:
import os

# Read the Pinecone API key from the process environment; the value is None
# when the variable is not set.
PINECONE_API_KEY = os.environ.get("PINECONE_API_KEY")

In [None]:
# import os
# os.environ["PINECONE_API_KEY"] = PINECONE_API_KEY

In [15]:
from pinecone.grpc import PineconeGRPC as Pinecone
from pinecone import ServerlessSpec

In [16]:
# Create the Pinecone client, authenticating with the key read from the environment.
pc = Pinecone(
    api_key=PINECONE_API_KEY,
)

In [17]:
index_name = "testbot"

# Creating an index that already exists raises an error, so only create it when
# it is missing. This keeps the cell safe to re-run (Restart & Run All).
if index_name not in pc.list_indexes().names():
    pc.create_index(
        name=index_name,
        # Must equal the embedding size: all-MiniLM-L6-v2 produces 384-dim vectors.
        dimension=384,
        metric="cosine",
        spec=ServerlessSpec(
            cloud="aws",
            region="us-east-1",
        ),
    )


In [18]:
# Embed each chunk and upsert the embeddings into your Pinecone index.
# The vector store class is imported under an alias so it does not rebind the
# name `Pinecone`, which an earlier cell uses for the Pinecone client class;
# without the alias, re-running the client cell would call the wrong class.
# NOTE(review): re-running this cell likely upserts the same chunks again —
# confirm, and prefer loading the existing index once it has been populated.
from langchain_community.vectorstores import Pinecone as PineconeVectorStore

docsearch = PineconeVectorStore.from_documents(
    documents=text_chunks,
    index_name=index_name,
    embedding=embeddings,
)

In [20]:
# Load the existing index as a vector store; this call is given only the index
# name and the embedding model.
# The class is imported under an alias so it does not rebind the name
# `Pinecone`, which an earlier cell uses for the Pinecone client class.
from langchain_community.vectorstores import Pinecone as PineconeVectorStore

docsearch = PineconeVectorStore.from_existing_index(
    index_name=index_name,
    embedding=embeddings,
)

In [22]:
# Similarity-search retriever configured with k=3.
retriever = docsearch.as_retriever(
    search_type="similarity",
    search_kwargs={"k": 3},
)

In [23]:
from transformers import T5Tokenizer, T5ForConditionalGeneration
# Step 1: Load the tokenizer and the model weights from the same checkpoint
# (the recorded output below shows a multi-gigabyte download during this run).
model_name = "google/flan-t5-large"  # You can use "base", "large", or "xl" for bigger models
tokenizer = T5Tokenizer.from_pretrained(model_name)
model = T5ForConditionalGeneration.from_pretrained(model_name)


Downloading: 100%|██████████| 3.13G/3.13G [07:22<00:00, 7.08MB/s]
To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to see activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development
  return torch.load(checkpoint_file, map_location="cpu")


In [24]:
# Example query run against the retriever built from the Pinecone vector store
query = "What is pem sherpa qualification?"
# `get_relevant_documents` is deprecated (it triggered a warning in this
# notebook's recorded output); `invoke` is the replacement entry point.
retrieved_docs = retriever.invoke(query)

  retrieved_docs = retriever.get_relevant_documents(query)


In [25]:


# Join the text of every retrieved document, separated by single spaces
context = " ".join(doc.page_content for doc in retrieved_docs)
print("Retrieved Context:", context)

Retrieved Context: CV
Name:PemChettarSherpaGender:Male
📍Address=Kathmandu|Contact=+9779864257084|📧Email=pemshera@gmail.com🔗LinkedIn|🔗GitHub|🔗Portfolio
PersonalSummary
AhighlymotivatedandambitiousCSITstudentwithastrongpassionfortechnologyandasolidfoundationincomputerscience. Education
● SecondaryEducationExamination(SEE)BishnuMemorialSecondarySchool,Dharan(Apr2018–Apr2019)● HigherSecondary(+2)BishnuMemorialSecondarySchool,Dharan(Apr2019–Mar2022)● BachelorofScienceinComputerScienceandInformationTechnology(BScCSIT)AmritScienceCampus,Kathmandu(Apr2022–Present)
Projects
1. DataAnalysisProjects○ Repository:PythonLearning2. MachineLearningProjects○ Repository:MachineLearning
Certifications ● Python101forDataScience○ IssuedbyIBMCognitiveClass|Certificate:PY0101EN● DataAnalysiswithPython○ IssuedbyIBMCognitiveClass|Certificate:DA0101EN● IntermediateMachineLearning○ IssuedbyKaggle|Certificate:IntrotoMachineLearning
Awards
● AsianHack2022(HackathonCertificate)○ OrganizedinKathmandu|Certificate:Vie

In [26]:
# Combine query and context into a single prompt.
# The tokenizer truncates on the right by default, so an over-long context would
# cut off the trailing "Question: ... Answer:" part of the prompt. Trim the
# context to the remaining token budget first so the question always survives.
max_input_tokens = 512
question_suffix = f" \n\nQuestion: {query} \n\nAnswer:"
# Tokens used by everything except the context (plus any special tokens the
# tokenizer appends).
fixed_token_count = len(tokenizer("Context: " + question_suffix)["input_ids"])
context_budget = max(max_input_tokens - fixed_token_count, 0)
context_ids = tokenizer(context, add_special_tokens=False)["input_ids"]
if len(context_ids) > context_budget:
    # Keep only the leading part of the context that fits the budget.
    prompt_context = tokenizer.decode(context_ids[:context_budget], skip_special_tokens=True)
else:
    prompt_context = context
input_text = f"Context: {prompt_context}{question_suffix}"
# Final truncation stays as a safety net in case re-tokenizing the trimmed
# context shifts the token count slightly.
inputs = tokenizer(input_text, return_tensors="pt", max_length=max_input_tokens, truncation=True)

In [27]:
# Generate an answer from the model
# Generation is capped at max_length=100; only the first returned sequence is
# decoded, with special tokens stripped.
output = model.generate(**inputs, max_length=100)
answer = tokenizer.decode(output[0], skip_special_tokens=True)

print("Answer:", answer)


Answer: Bachelor of Science in Computer Science and Information Technology
