In [1]:
import os
from langchain_community.embeddings import HuggingFaceBgeEmbeddings
from langchain_community.document_loaders import PyPDFDirectoryLoader
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_community.vectorstores import FAISS, Chroma
from langchain_classic.chains import RetrievalQA
from langchain_classic.prompts import PromptTemplate
import gradio as gr
import time

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
loader = PyPDFDirectoryLoader("Documents")
doc_before_split = loader.load()

In [3]:
print(len(doc_before_split))

24


In [4]:
len(doc_before_split[1].page_content)

219

In [5]:
for doc in doc_before_split:
    print(len(doc.page_content))

1176
219
771
704
705
651
613
600
633
629
620
632
871
0
1395
1053
691
661
646
555
1047
97
809
802


In [6]:
docs_before_split = [doc for doc in doc_before_split if len(doc.page_content) > 0]

In [7]:
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size = 1000,
    chunk_overlap = 100,
)

docs_after_split = text_splitter.split_documents(doc_before_split)

In [8]:
print(len(docs_after_split))
print(len(docs_after_split[1].page_content))

27
290


In [9]:
avg_doc_length = lambda docs: sum([len(doc.page_content) for doc in docs]) / len(docs)
print(f"Average document length before split: {avg_doc_length(doc_before_split)}")
print(f"Average document length after split: {avg_doc_length(docs_after_split)}")

Average document length before split: 690.8333333333334
Average document length after split: 624.1851851851852


In [10]:
huggingface_embedding = HuggingFaceBgeEmbeddings(
    model_name = "sentence-transformers/all-MiniLM-L6-v2",
    model_kwargs = {"device": "cpu"},
    encode_kwargs = {"normalize_embeddings": True},
)

  huggingface_embedding = HuggingFaceBgeEmbeddings(


In [11]:
faiss_vectorizer = FAISS.from_documents(docs_after_split, huggingface_embedding)

chroma_vectorizer = Chroma.from_documents(
    documents = docs_after_split, 
    embedding = huggingface_embedding, 
    persist_directory = "chroma_db")

In [12]:
query = "Find candidates with more than 2 years of experience in data science and machine learning"

In [13]:
faiss_retriever = faiss_vectorizer.as_retriever(search_type="similarity", search_kwargs={"k":4})
chroma_retriever = chroma_vectorizer.as_retriever(search_type="similarity", search_kwargs={"k":4})

In [None]:
if "GOOGLE_API_KEY" not in os.environ:
    os.environ["GOOGLE_API_KEY"] = "xxxxxxxxxxxxxxxxxxxx" # put you api key here

from langchain_google_genai import ChatGoogleGenerativeAI

model = ChatGoogleGenerativeAI(
    model="gemini-3-flash-preview",
    temperature=1.0,
    max_tokens=None,
    timeout=None,
    max_retries=2,
)

In [15]:
prompt_template = """You are an AI assistant specialized in analyzing resumes. 
Use the following pieces of context (resume content) to answer the question at the end. 
Follow these rules carefully:

1. If the answer is not present in the context, do NOT guess. Say: "I couldn't find this information in the resumes provided."
2. Extract skills, experience, and relevant qualifications concisely.
3. Provide the answer in a maximum of 5 sentences.
4. Whenever possible, include the name of the candidate and the source resume file.

Resume Content:
{context}

Query: {question}

Answer (concise and structured):
"""

PROMPT = PromptTemplate(
    template = prompt_template, input_variables = ["context", "question"]
)


In [16]:
faiss_retrievalQA = RetrievalQA.from_chain_type(
    llm = model, 
    chain_type = "stuff",
    retriever = faiss_retriever,
    return_source_documents = True,
    chain_type_kwargs = {"prompt" : PROMPT}
)

In [17]:
chroma_retrievalQA = RetrievalQA.from_chain_type(
    llm = model, 
    chain_type = "stuff",
    retriever = chroma_retriever,
    return_source_documents = True,
    chain_type_kwargs = {"prompt" : PROMPT}
)

In [18]:
faiss_result = faiss_retrievalQA.invoke({"query" : query})
print(faiss_result)

{'query': 'Find candidates with more than 2 years of experience in data science and machine learning', 'result': 'Based on the resumes provided, all four candidates meet the criteria:\n\n1. **Sara Haddad** has 3 years of experience as a Data Scientist specializing in machine learning models (XGBoost, Random Forest) and statistical analysis.\n2. **Fadi Karam** brings 7 years of experience as a Senior Data Engineer, focusing on designing AI pipelines and infrastructure to support machine learning workflows.\n3. **Nour El Din** has 3 years of experience as an AI Software Engineer, with expertise in integrating machine learning models into scalable APIs and backend systems.\n4. **Reem Khalaf** possesses 4 years of experience as an AI Healthcare Analyst, applying machine learning and predictive modeling to medical datasets.\n5. All candidates demonstrate proficiency in Python and SQL across their respective data science and AI-focused roles.', 'source_documents': [Document(id='e5a420ea-46e0

In [19]:
faiss_result['result']

'Based on the resumes provided, all four candidates meet the criteria:\n\n1. **Sara Haddad** has 3 years of experience as a Data Scientist specializing in machine learning models (XGBoost, Random Forest) and statistical analysis.\n2. **Fadi Karam** brings 7 years of experience as a Senior Data Engineer, focusing on designing AI pipelines and infrastructure to support machine learning workflows.\n3. **Nour El Din** has 3 years of experience as an AI Software Engineer, with expertise in integrating machine learning models into scalable APIs and backend systems.\n4. **Reem Khalaf** possesses 4 years of experience as an AI Healthcare Analyst, applying machine learning and predictive modeling to medical datasets.\n5. All candidates demonstrate proficiency in Python and SQL across their respective data science and AI-focused roles.'

In [20]:
chroma_result = chroma_retrievalQA.invoke({"query" : query})
print(chroma_result)

{'query': 'Find candidates with more than 2 years of experience in data science and machine learning', 'result': 'Sara Haddad (Data Scientist resume) has 3 years of experience in data science and machine learning, specializing in statistical analysis and predictive modeling. Her qualifications include a BSc in Computer Science and expertise in Python, SQL, Scikit-learn, and Power BI. She has successfully implemented machine learning projects using XGBoost, Random Forest, and KMeans clustering for sales forecasting and customer segmentation. Currently, she works as a Data Scientist at Insight Analytics, where she focuses on transforming raw data into actionable insights.', 'source_documents': [Document(metadata={'producer': 'www.ilovepdf.com', 'source': 'Documents\\Resume 2.pdf', 'moddate': '2026-01-04T12:33:45+00:00', 'page': 0, 'page_label': '1', 'total_pages': 2, 'creationdate': '2026-01-04T12:33:45+00:00', 'creator': 'Microsoft® Word 2016', 'author': 'Youssef Obeid'}, page_content='

In [21]:
chroma_result['result']

'Sara Haddad (Data Scientist resume) has 3 years of experience in data science and machine learning, specializing in statistical analysis and predictive modeling. Her qualifications include a BSc in Computer Science and expertise in Python, SQL, Scikit-learn, and Power BI. She has successfully implemented machine learning projects using XGBoost, Random Forest, and KMeans clustering for sales forecasting and customer segmentation. Currently, she works as a Data Scientist at Insight Analytics, where she focuses on transforming raw data into actionable insights.'

In [22]:
faiss_relevant_docs = faiss_result['source_documents']
print(f'There are {len(faiss_relevant_docs)} documents retrieved for FAISS:')
print("*" * 100)

for i, doc in enumerate(faiss_relevant_docs):
    print(f"Relevant Document #{i+1}")
    print(f"Source file: {doc.metadata.get('source', 'Unknown')}, Page: {doc.metadata.get('page', 'N/A')}")
    print(f"Content (first 500 chars):\n{doc.page_content[:500]}")
    print("-"*100)

There are 4 documents retrieved for FAISS:
****************************************************************************************************
Relevant Document #1
Source file: Documents\Resume 2.pdf, Page: 0
Content (first 500 chars):
Sara Haddad 
Data Scientist 
Tripoli, Lebanon 
Email: sara.haddad.ds@gmail.com 
SUMMARY 
Data Scientist with 3 years of experience transforming raw data into actionable 
insights using statistical analysis, machine learning models, and visualization 
techniques. 
AREAS OF EXPERTISE 
Data Analysis, Statistics, Machine Learning, Data Visualization, Python, SQL, EDA, 
Feature Engineering 
EDUCATION 
University of Balamand 
BSc in Computer Science 
2019 – 2022 
PROJECTS 
Sales Forecasting System 
 
----------------------------------------------------------------------------------------------------
Relevant Document #2
Source file: Documents\Resume 9.pdf, Page: 0
Content (first 500 chars):
Fadi Karam 
Data Engineer (AI Pipelines) 
Beirut, Lebanon 
Email: fa

In [23]:
chroma_relevant_docs = chroma_result['source_documents']
print(f'There are {len(chroma_relevant_docs)} documents retrieved for Chroma:')
print("*" * 100)

for i, doc in enumerate(chroma_relevant_docs):
    print(f"Relevant Document #{i+1}")
    print(f"Source file: {doc.metadata.get('source', 'Unknown')}, Page: {doc.metadata.get('page', 'N/A')}")
    print(f"Content (first 500 chars):\n{doc.page_content[:500]}")
    print("-"*100)


There are 4 documents retrieved for Chroma:
****************************************************************************************************
Relevant Document #1
Source file: Documents\Resume 2.pdf, Page: 0
Content (first 500 chars):
Sara Haddad 
Data Scientist 
Tripoli, Lebanon 
Email: sara.haddad.ds@gmail.com 
SUMMARY 
Data Scientist with 3 years of experience transforming raw data into actionable 
insights using statistical analysis, machine learning models, and visualization 
techniques. 
AREAS OF EXPERTISE 
Data Analysis, Statistics, Machine Learning, Data Visualization, Python, SQL, EDA, 
Feature Engineering 
EDUCATION 
University of Balamand 
BSc in Computer Science 
2019 – 2022 
PROJECTS 
Sales Forecasting System 
 
----------------------------------------------------------------------------------------------------
Relevant Document #2
Source file: Documents\Resume 2.pdf, Page: 0
Content (first 500 chars):
Sara Haddad 
Data Scientist 
Tripoli, Lebanon 
Email: sara.haddad.d

In [25]:
def answer_query(query):
    """
    Takes a user query, returns the answer from FAISS RetrievalQA and the resumes used.
    """
    # Run the RAG model
    result = faiss_retrievalQA.invoke({"query": query})
    
    # Extract the answer text
    answer = result['result']
    
    # Extract the resumes used
    docs = result['source_documents']
    sources = ", ".join([doc.metadata.get("source", "Unknown") for doc in docs])
    
    return answer, sources


In [26]:
iface = gr.Interface(
    fn=answer_query,  # The function to run
    inputs=gr.Textbox(
        lines=2, 
        placeholder="Ask about candidate experience..."
    ), 
    outputs=[
        gr.Textbox(label="Answer", lines=10),
        gr.Textbox(label="Resumes Used", lines=10)
    ],
    title="RAG-Powered Resume Assistant",
    description="Ask questions about resumes. The AI retrieves relevant resumes and answers based on them."
)


In [27]:
iface.launch(debug=True)

* Running on local URL:  http://127.0.0.1:7860
* To create a public link, set `share=True` in `launch()`.


Keyboard interruption in main thread... closing server.


