In [1]:
import os
from langchain_community.document_loaders import PyPDFLoader
from langchain_huggingface import HuggingFaceEmbeddings
from langchain_community.vectorstores import Chroma
from langchain_text_splitters import RecursiveCharacterTextSplitter

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Path of the PDF to index. The original location stays the default, but it can
# be overridden with the PDF_PATH environment variable so the notebook is not
# tied to one machine's drive layout. An empty PDF_PATH falls back to the default.
file_path = os.getenv("PDF_PATH") or "E:/CV.pdf"

In [3]:
# Create the PyPDFLoader for the PDF located at `file_path`.
loader = PyPDFLoader(file_path=file_path)

In [4]:
# Read the PDF through the loader (presumably one Document per page — confirm).
load = loader.load()

In [5]:
# Break the loaded documents into small overlapping chunks (size 100, overlap 20)
# so that each chunk can be embedded on its own.
text_splitter = RecursiveCharacterTextSplitter(chunk_size=100, chunk_overlap=20)
splitted_text = text_splitter.split_documents(load)

In [6]:
# Peek at the first chunk to sanity-check the split and its metadata.
splitted_text[:1] 

[Document(metadata={'producer': 'pdfTeX-1.40.27', 'creator': 'LaTeX with hyperref', 'creationdate': '2025-11-15T23:15:52+00:00', 'author': '', 'keywords': '', 'moddate': '2025-11-15T23:15:52+00:00', 'ptex.fullbanner': 'This is pdfTeX, Version 3.141592653-2.6-1.40.27 (TeX Live 2025) kpathsea version 6.4.1', 'subject': '', 'title': '', 'trapped': '/False', 'source': 'E:/CV.pdf', 'total_pages': 2, 'page': 0, 'page_label': '1'}, page_content='Muhammad Haris Imtiaz')]

In [7]:
# Embeddings: choose the model and build the Chroma vector store

In [8]:
# Identifier of the embedding model handed to HuggingFaceEmbeddings.
embedding_model = "sentence-transformers/all-MiniLM-L6-v2"

In [9]:
# Instantiate the embedding function for the model named in `embedding_model`.
embed = HuggingFaceEmbeddings(model=embedding_model)

In [10]:
# Embed every chunk and store it in the persistent "pdfs" Chroma collection.
#
# Deterministic ids (source path + chunk index) are supplied so that re-running
# this cell writes to the same records. Without explicit ids each run is given
# fresh random ids, so every re-run appends another full copy of the document
# to the on-disk collection and retrieval starts returning duplicates.
# NOTE(review): copies already stored under random ids are not removed by this;
# delete the persist directory once if the collection count looks inflated.
chunk_ids = [
    f"{doc.metadata.get('source', 'document')}-{idx}"
    for idx, doc in enumerate(splitted_text)
]
final_embed = Chroma.from_documents(
    documents=splitted_text,
    embedding=embed,
    ids=chunk_ids,
    collection_name="pdfs",
    persist_directory="E:/PDF_ChatBot/chroma_langchain_db"
)

In [11]:
# Number of records currently held by the vector store.
len(final_embed)

584

In [12]:
# Grab the collection object behind the LangChain wrapper to inspect what was stored.
# NOTE(review): `_collection` is a private attribute — confirm it is stable across library versions.
collection = final_embed._collection
# Fetch the stored records, asking for embeddings and document texts to be included.
results = collection.get(include=["embeddings", "documents"])

In [13]:
# Count the document texts returned by the collection query.
len(results['documents'])

584

In [14]:
# List the fields present in the collection's response.
results.keys()

dict_keys(['ids', 'embeddings', 'documents', 'uris', 'included', 'data', 'metadatas'])

#### Reusable helper functions for later use

In [15]:
def load_pdf(file_path):
    """Load a PDF through ``PyPDFLoader``.

    Args:
        file_path: Location of the PDF, forwarded to ``PyPDFLoader``.

    Returns:
        Whatever ``PyPDFLoader(...).load()`` produces for that file.
    """
    return PyPDFLoader(file_path=file_path).load()

In [16]:
def splitting(load, chunk_size=100, chunk_overlap=20):
    """Split loaded documents into overlapping chunks.

    Args:
        load: Documents to split, e.g. the value returned by ``load_pdf``.
        chunk_size: Maximum chunk size handed to the splitter. Defaults to
            100, the value that used to be hard-coded.
        chunk_overlap: Overlap between consecutive chunks. Defaults to 20,
            the value that used to be hard-coded.

    Returns:
        Whatever ``RecursiveCharacterTextSplitter.split_documents`` returns
        for ``load``.
    """
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
    )
    return text_splitter.split_documents(load)

In [17]:
def embeds(
    load,
    embedding_model="sentence-transformers/all-MiniLM-L6-v2",
    collection_name="pdfs",
    persist_directory="E:/PDF_ChatBot/chroma_langchain_db",
):
    """Embed documents and store them in a persistent Chroma collection.

    The documents passed in as ``load`` are the ones embedded. The previous
    version ignored this argument and silently read the notebook-level
    ``splitted_text`` variable instead, so it only worked after the
    exploratory cells had been run and always indexed that one document.

    Args:
        load: Documents (typically the chunks produced by ``splitting``) to
            embed and store.
        embedding_model: Model identifier passed to ``HuggingFaceEmbeddings``.
        collection_name: Name of the Chroma collection to write to.
        persist_directory: Directory in which Chroma persists the collection.

    Returns:
        The vector store returned by ``Chroma.from_documents``.
    """
    embed = HuggingFaceEmbeddings(model=embedding_model)

    return Chroma.from_documents(
        documents=load,
        embedding=embed,
        collection_name=collection_name,
        persist_directory=persist_directory,
    )
    
    

In [18]:
def custom_embeds(load):
    """Embed documents with a model whose name is typed in by the user.

    The documents passed in as ``load`` are the ones embedded. The previous
    version ignored this argument and read the notebook-level
    ``splitted_text`` variable instead.

    Args:
        load: Documents (typically the chunks produced by ``splitting``) to
            embed and store.

    Returns:
        The vector store returned by ``Chroma.from_documents``, written to the
        "pdfs" collection in the notebook's persist directory.

    Raises:
        ValueError: If the user enters nothing but whitespace.
    """
    # Surrounding whitespace is a common copy/paste artefact and is never part
    # of a model identifier.
    embedding_model = input(
        "Please provide the model for embedding which you want to load: "
    ).strip()
    if not embedding_model:
        # Fail here with a clear message instead of deep inside model loading.
        raise ValueError("No embedding model name was provided.")

    embed = HuggingFaceEmbeddings(model=embedding_model)

    return Chroma.from_documents(
        documents=load,
        embedding=embed,
        collection_name="pdfs",
        persist_directory="E:/PDF_ChatBot/chroma_langchain_db",
    )
    
    

In [19]:
from dotenv import load_dotenv
import streamlit as st

In [20]:
# Load variables from a .env file into the environment so os.getenv can see them.
load_dotenv()

True

In [21]:
# Abort early when the Google API key is missing, instead of failing later
# inside the first LLM call.
if not os.getenv("GOOGLE_API_KEY"):
    st.error("GOOGLE_API_KEY is not set. Add it to your environment or .env file.")
    # st.stop has to be *called*; the bare attribute reference did nothing.
    st.stop()

In [22]:
from langchain_google_genai import ChatGoogleGenerativeAI
from langchain_classic.chains import RetrievalQA
from langchain_core.prompts import PromptTemplate

In [23]:
# Setting up the LLM (the "brain" of the chatbot)

In [24]:
# Gemini chat model used to generate the answers.
# The API key is deliberately not passed here: keep it out of the notebook and
# supply it through the GOOGLE_API_KEY environment variable / .env file.
# NOTE(review): a literal key used to sit in a commented-out argument of this
# call — treat that key as compromised and rotate it.
llm = ChatGoogleGenerativeAI(
    model="gemini-2.5-flash",
    temperature=1,
    max_tokens=None,
    timeout=None,
    max_retries=2
)

In [25]:
# Retriever: fetch the most relevant chunks for each question

In [26]:
# Expose the vector store as a retriever limited to 2 results per query.
# NOTE(review): with 100-sized chunks, k=2 gives the LLM very little context —
# confirm this is enough for summary-style questions.
ret = final_embed.as_retriever(
    search_kwargs={"k": 2},
)

In [27]:
# Display the retriever to confirm its configuration.
ret

VectorStoreRetriever(tags=['Chroma', 'HuggingFaceEmbeddings'], vectorstore=<langchain_community.vectorstores.chroma.Chroma object at 0x0000018C049C8A50>, search_kwargs={'k': 2})

In [28]:
# Connecting the brain (LLM) and the retriever to generate responses

In [29]:
# Prompt text for the QA chain; {context} and {question} are the two
# placeholders declared as input variables on the PromptTemplate built from it.
# NOTE(review): the opening "ONLY on the following context" instruction conflicts
# with guidelines 1 and 5 (job portals / internet search) — confirm which
# behaviour is intended.
template = """
You are an expert and professional in reading PDFs. Your task is to answer questions based ONLY on the following context provided from a PDF document.

Guidelines:
1. Suggest him about job portals as well.
2. Keep your answer clear, structured, and easy to read.
3. Use bullet points if listing multiple items.
4. Do not make up information.
5. You can also search information from internet and give him suggestion

Context:
{context}

Question:
{question}

Helpful Answer:

"""

In [30]:
# Wrap the raw template text in a PromptTemplate that expects the
# "context" and "question" variables.
my_prompt = PromptTemplate(input_variables=["context", "question"], template=template)

In [31]:
# Assemble the question-answering chain: the retriever supplies context, the
# custom prompt frames it, and the LLM writes the answer. Source documents are
# returned alongside the result.
qa_chain = RetrievalQA.from_chain_type(
    llm=llm,
    retriever=ret,
    chain_type="stuff",
    chain_type_kwargs={"prompt": my_prompt},
    return_source_documents=True,
)


In [32]:
# Run the chain on a single question; the response is kept in `x` for the
# cell that prints it.
x = qa_chain.invoke(
    {"query": "summarize it and suggest him what type of job he needs to apply "}
)

In [33]:
# Print only the generated answer text from the chain's response.
print(x["result"])

Based on the context provided:

The individual is currently serving as a **GenAI Intern**, a role they started in September 2025 and is ongoing.

Given this experience, the individual should focus on applying for roles that leverage their Generative AI and broader AI/Machine Learning skills.

**Suggested Job Types to Apply For:**

*   **Entry-Level / Junior Generative AI Engineer/Developer:** Roles specifically focused on developing, deploying, and maintaining Generative AI models and applications.
*   **Junior Machine Learning Engineer:** Positions involving the broader application of machine learning principles, model training, and data pipeline development.
*   **AI/ML Research Assistant:** If interested in further research and development within academic or corporate labs.
*   **Data Scientist (with AI/ML focus):** Roles that combine data analysis with the implementation of AI and machine learning solutions.
*   **AI Prompt Engineer:** Specializing in designing and optimizing promp