In [8]:
import os
from langchain.document_loaders import PyPDFLoader


# Raw string literal: the Windows path contains backslash sequences (\G, \A,
# \R, \d, \P) that are invalid escapes in a normal string and trigger a
# SyntaxWarning on Python 3.12+. The resulting path value is unchanged.
# NOTE(review): absolute, machine-specific path — consider a configurable
# data directory so the notebook runs on other machines.
loader = PyPDFLoader(r"Y:\GitHub\AI Cloud\RAG\data\Pavan_Bairu_Resume.pdf")

# Read the PDF into a list of Document objects (page_content + metadata).
documents = loader.load()
documents

[Document(metadata={'producer': 'Pdftools SDK', 'creator': 'PyPDF', 'creationdate': '', 'moddate': '2025-07-28T13:05:15+00:00', 'source': 'Y:\\GitHub\\AI Cloud\\RAG\\data\\Pavan_Bairu_Resume.pdf', 'total_pages': 2, 'page': 0, 'page_label': '1'}, page_content='Pavan Bairu \n+918886261654           pavan.bairu1@gmail.com         Warangal, India-506003 \n \nProfessional Summary \nSoftware Developer with 4.6 years of experience in SQL-based backend development for banking and insurance \ndomains. Skilled in writing and optimizing SQL queries, and building APIs using Python and FastAPI. Also upskilled \nand hands-on on AI/ML model development and deployment.\n \nTechnical Skills \n• Programming: Python \n• Databases: SQL, DB2 \n• Cloud: AWS (S3, EC2, ECS, ECR) \n• MLOps & Deployment: Docker, CI/CD Pipelines, Flask, FastAPI, GitHub Actions \n• Data Visualization: Matplotlib, Seaborn \n \nWork Experience \nSystem Engineer | TCS | Feb 2023 – Present \nIndustry: Insurance, Pensions & Investment

In [37]:
from langchain.text_splitter import RecursiveCharacterTextSplitter

# Chunking parameters, pulled out of the call so they are easy to tune.
CHUNK_SIZE = 300
CHUNK_OVERLAP = 50

text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=CHUNK_SIZE,
    chunk_overlap=CHUNK_OVERLAP,
)
# Backward-compatible alias for the original (misspelled) variable name, in
# case other cells still refer to it.
text_spiltter = text_splitter

# Split the loaded documents into overlapping chunks for embedding.
chunks = text_splitter.split_documents(documents)

chunks

[Document(metadata={'producer': 'Pdftools SDK', 'creator': 'PyPDF', 'creationdate': '', 'moddate': '2025-07-28T13:05:15+00:00', 'source': 'Y:\\GitHub\\AI Cloud\\RAG\\data\\Pavan_Bairu_Resume.pdf', 'total_pages': 2, 'page': 0, 'page_label': '1'}, page_content='Pavan Bairu \n+918886261654           pavan.bairu1@gmail.com         Warangal, India-506003 \n \nProfessional Summary \nSoftware Developer with 4.6 years of experience in SQL-based backend development for banking and insurance'),
 Document(metadata={'producer': 'Pdftools SDK', 'creator': 'PyPDF', 'creationdate': '', 'moddate': '2025-07-28T13:05:15+00:00', 'source': 'Y:\\GitHub\\AI Cloud\\RAG\\data\\Pavan_Bairu_Resume.pdf', 'total_pages': 2, 'page': 0, 'page_label': '1'}, page_content='domains. Skilled in writing and optimizing SQL queries, and building APIs using Python and FastAPI. Also upskilled \nand hands-on on AI/ML model development and deployment.\n \nTechnical Skills \n• Programming: Python \n• Databases: SQL, DB2 \n• Clou

In [38]:
from langchain_ollama.embeddings import OllamaEmbeddings

# Ollama model tag used to embed the chunks.
# NOTE(review): this is the same tag used for the generative LLM further down;
# a dedicated embedding model may give better retrieval — confirm.
EMBEDDING_MODEL = "llama3.2:1b"
embeddings = OllamaEmbeddings(model=EMBEDDING_MODEL)


In [39]:
from langchain.vectorstores import Chroma

# Embed every chunk and index it in a Chroma vector store. No persist
# directory is passed, so nothing is explicitly configured to be saved to disk.
vector_db = Chroma.from_documents(chunks, embedding=embeddings)



In [40]:
# Wrap the vector store in a retriever, using its default search settings.
retriever = vector_db.as_retriever()

In [41]:
from langchain_ollama.llms import OllamaLLM

# Local Ollama model that generates the answers.
# temperature=0 removes sampling randomness so that the structured extraction
# answers are as repeatable as the backend allows between runs.
llm = OllamaLLM(model="llama3.2:1b", temperature=0)

In [42]:
from langchain.prompts import PromptTemplate

# Prompt text for the QA chain. {context} and {question} are substituted at
# run time; the doubled braces render as a literal "{}" in the final prompt.
RESUME_QA_TEMPLATE = """
You are a highly intelligent resume parsing assistant.

Given the resume content below:
{context}

Your task is to:
- Carefully read the question and extract only the **relevant** information.
- Focus on **specific sections** mentioned in the question (e.g., TCS, Capgemini, Projects).
- If the question is **multi-part**, extract and structure all parts clearly.
- Always respond in a **valid Python dictionary** format.
- Use **nested dictionaries or lists** if necessary.
- Do not include unrelated information, even if found elsewhere in the resume.
- If the information is not found, return an empty dictionary: {{}}

Now answer the following question:
"{question}"
"""

prompt_template = PromptTemplate(
    template=RESUME_QA_TEMPLATE,
    input_variables=["context", "question"],
)

In [43]:
from langchain.chains import RetrievalQA

# Settings for the retrieval-augmented QA chain:
#   - "stuff" chain type, driven by the custom resume prompt
#   - source documents are returned alongside the answer
qa_chain_options = dict(
    llm=llm,
    retriever=retriever,
    chain_type="stuff",
    chain_type_kwargs={"prompt": prompt_template},
    return_source_documents=True,
)
retrieval_qa = RetrievalQA.from_chain_type(**qa_chain_options)

In [44]:
question = "what are all the skills does pavan have?"
# Use .invoke(): calling the chain object directly goes through the
# deprecated Chain.__call__ path.
result = retrieval_qa.invoke({"query": question})
# The generated answer is stored under the "result" key.
print(result['result'])

## Skills
```
{
    "Technical": [
        "Data Analysis",
        "Machine Learning",
        "Python",
        "R",
        "SQL"
    ],
    "Programming Languages": [
        "Python 3.8",
        "NumPy"
    ]
}
```


In [45]:
# Use .invoke(): calling the chain object directly goes through the
# deprecated Chain.__call__ path.
result = retrieval_qa.invoke({"query": "What backend system did the candidate build at TCS and what technologies were used?"})
# The generated answer is stored under the "result" key.
print(result['result'])

{
    "background": "",
    "education": {},
    "experience": {
        "TCS": {
            "system_engineer": [
                {
                    "backend_system": "AWS",
                    "cloud": "S3, EC2, ECS, ECR"
                }
            ]
        },
        "personal_projects": {
            "Credit Card Transactions Fraud Detection": {
                "tech_stack": [
                    "Python", 
                    "FastAPI", 
                    "Scikit-learn", 
                    "GitHub Actions", 
                    "Docker", 
                    "AWS EC2", 
                    "AWS S3"
                ],
                "problem_statement": "Detecting fraud in highly imbalanced credit card transactions (99% legit, 1% fraud) while minimizing false negatives.",
                "tech_stack_used": []
            }
        }
    }
}


In [46]:
# Use .invoke(): calling the chain object directly goes through the
# deprecated Chain.__call__ path.
result = retrieval_qa.invoke({"query": "Summarize the responsibilities held at Capgemini?"})
# The generated answer is stored under the "result" key.
print(result['result'])

{
    "Company": {
        "Name": "Capgemini",
        "Industry": "Software Engineering"
    },
    "Job Title": "System Engineer",
    "Time Period": [
        "2023 - Present"
    ],
    "Experience": [
        {
            "Company": "TCS (System Engineer)",
            "Date": "Feb 2023 - Present",
            "Industry": "Insurance, Pensions & Investment",
            "Client": "ACD Project (LBG Client)"
        }
    ]
}


In [47]:
# Use .invoke(): calling the chain object directly goes through the
# deprecated Chain.__call__ path.
result = retrieval_qa.invoke({"query": "What are the main goals of the SQL-LLM-Chatbot project and how was it deployed?"})
# The generated answer is stored under the "result" key.
print(result['result'])

{
    "Project Goals": [
        "Develop a conversational AI chatbot that converts natural language inputs to SQL queries",
        "Integrate an LLM-powered assistant with AWS ECS/ECR for database query execution"
    ],
    "Deployment Overview": [
        "AWS ECS/ECR containerized infrastructure for the chatbot",
        "GitHub Actions workflow file for continuous integration and deployment"
    ]
}


In [49]:
# Use .invoke(): calling the chain object directly goes through the
# deprecated Chain.__call__ path.
result = retrieval_qa.invoke({"query": "fetch the personal details of the candidate"})
# The generated answer is stored under the "result" key.
# NOTE(review): this output contains personal contact data — clear it before
# sharing the notebook.
print(result['result'])

Here's the parsed response:

```
{
    "Professional Summary": {
        "Personal Details": {
            "Name": "Pavan Bairu",
            "Contact Information": {
                "Email": "pavan.bairu1@gmail.com",
                "Phone Number": "+918886261654"
            },
            "Location": "Warangal, India - 506003"
        }
    },
    "Education": {
        "Academic History": {
            "Degree": "B.Tech in Electronics & Communications Engineering",
            "Campus": "Guru Nanak Institutions Technical Campus",
            "Institution": "Hyderabad",
            "Year of Graduation": 2019
        }
    },
    "Certifications": {},
    "Achievements and Awards": {
        "Personal Achievements": {
            "RISING STAR Certificate from Capgemini"
        },
        "Professional Certifications": {
            "AWS Cloud Practitioner CLF - C02"
        }
    }
}
```

Note: The 'TCS' section was not explicitly mentioned, but based on the context, it can be infer