In [13]:
import fitz  # PyMuPDF

def extract_text_from_pdf(file_path):
    doc = fitz.open(file_path)
    return "\n".join([page.get_text() for page in doc])


In [14]:
resume_text = extract_text_from_pdf("Mohd_Humaid_Resume_Updated.pdf")
print(resume_text)

Mohd Humaid 
Jaipur, Rajasthan, India 
 +91 6388792121 
 mohdhumaid65@gmail.com 
 Linkedin 
 Github 
EXPERIENCE 
 
Robotic Process Automation Developer 
July 2022 – Present 
AU Small Finance Bank 
Jaipur, India 
• 
UiPath and Python to automate complex processes, improve 
efficiency, and reduce errors. 
• Expert in RPA technologies. Proficient in UiPath and Python, capable of designing, developing, and deploying robust 
automation solutions. 
• Extensively utilized SQL for data manipulation, retrieval, and integration within RPA workflows, ensuring robust data 
consistency and reliability.  
• Collaborated closely with cross-functional teams to design and deploy RPA solutions aligned with compliance and 
business goals. 
• Implemented advanced error handling, exception management, and data validation, reducing workflow failures by 
• Automated complex data tasks with Excel, reducing manual work by up to 90%. 
Internship – Frontend Developer 
April 2022 – July 2022 
Web Student 
Jaipur,

In [15]:
def load_job_descriptions(file_path):
    with open(file_path, "r", encoding="utf-8") as file:
        content = file.read()

    jobs_raw = content.split('---')
    jobs = []

    for job in jobs_raw:
        job = job.strip()
        if job:
            lines = job.splitlines()
            title = ""
            company = ""
            description = ""
            for line in lines:
                if line.lower().startswith("title:"):
                    title = line.split(":", 1)[1].strip()
                elif line.lower().startswith("company:"):
                    company = line.split(":", 1)[1].strip()
                elif line.lower().startswith("description:"):
                    description = line.split(":", 1)[1].strip()
                else:
                    description += " " + line.strip()

            jobs.append({
                "title": title,
                "company": company,
                "description": description
            })
    return jobs

In [6]:
job_description = load_job_descriptions("job_descriptions.txt")
jobs = job_description
for job in jobs:
    print(f"Title: {job['title']}")
    print(f"Company: {job['company']}")
    print(f"Description: {job['description'][:100]}...\n")

Title: 
Company: 
Description:  Job ID: 1...

Title: Data Scientist
Company: ABC Corp
Description: We are looking for a Data Scientist with experience in Python, machine learning, data visualization,...

Title: 
Company: 
Description:  Job ID: 2...

Title: Machine Learning Engineer
Company: XYZ Inc
Description: Responsibilities include building and deploying ML models, optimizing performance, and working with ...

Title: 
Company: 
Description:  Job ID: 3...

Title: Analyst Robotic Process Automation (RPA)
Company: Deloitte
Description: We are seeking a skilled RPA Developer to join our dynamic RPA team. The ideal candidate will be res...



In [19]:
from sentence_transformers import SentenceTransformer
model = SentenceTransformer('all-MiniLM-L6-v2')
resume_embed = model.encode(resume_text)
jd_embed_list = model.encode([job["description"] for job in jobs], convert_to_numpy=True)

jd_embed = model.encode(job_description)


In [20]:
from sklearn.metrics.pairwise import cosine_similarity

similarity = cosine_similarity([resume_embed], jd_embed_list)
top_matches = sorted(zip(similarity[0], job), reverse=True)[:5]


In [44]:
common_skills = {
    # Programming Languages
    "python", "r", "java", "c++", "c#", "scala", "javascript", "typescript",
    
    # Data Manipulation & Analysis
    "pandas", "numpy", "matplotlib", "seaborn", "scipy", "dplyr", "tidyverse", "statsmodels",

    # Data Visualization
    "power bi", "tableau", "excel", "lookml", "qlikview", "ggplot2", "dash", "plotly", "looker",

    # Machine Learning
    "scikit-learn", "xgboost", "lightgbm", "catboost", "tensorflow", "keras", "pytorch",
    "machine learning", "supervised learning", "unsupervised learning", "reinforcement learning",
    "model evaluation", "cross-validation", "hyperparameter tuning", "model deployment",

    # Deep Learning
    "cnn", "rnn", "transformers", "autoencoders", "bert", "gpt", "llm", "deep learning",

    # NLP
    "nlp", "text mining", "spacy", "nltk", "text classification", "named entity recognition",
    "sentiment analysis", "language modeling", "topic modeling", "tf-idf", "word2vec",

    # SQL / Databases
    "sql", "mysql", "postgresql", "mssql", "oracle", "nosql", "mongodb", "redshift", "bigquery",
    "hive", "snowflake", "dynamodb", "database design", "joins", "window functions",

    # Cloud & DevOps
    "aws", "azure", "gcp", "docker", "kubernetes", "lambda", "sagemaker", "ec2", "cloud storage",
    "mlops", "ci/cd", "jenkins", "airflow", "databricks", "terraform",

    # Big Data & ETL
    "hadoop", "spark", "pyspark", "kafka", "flink", "hdfs", "etl", "data pipeline", "data warehouse",

    # Web Frameworks
    "flask", "django", "fastapi", "streamlit", "gradio", "html", "css", "api development",

    # Data Engineering / Processing
    "data cleaning", "data wrangling", "feature engineering", "data preprocessing",
    "data modeling", "data governance", "data validation", "schema design",

    # Version Control & Tools
    "git", "github", "bitbucket", "jira", "notion", "postman", "linux", "bash", "shell scripting",

    # RPA Specific
    "uipath", "automation anywhere", "blue prism", "workflow automation", "robotic process automation",
    "orchestrator", "reframework", "selectors", "citrix automation", "mainframe automation",
    
    # Soft Skills
    "communication", "problem solving", "teamwork", "leadership", "collaboration",
    "time management", "critical thinking", "adaptability", "attention to detail",
    "presentation skills", "stakeholder management", "business acumen", "project management"
}


In [45]:
import spacy

nlp = spacy.load("en_core_web_sm")

def extract_skills_local(text):
    doc = nlp(text.lower())
    extracted = set()

    for chunk in doc.noun_chunks:
        skill = chunk.text.strip()
        if skill in common_skills:
            extracted.add(skill)

    for token in doc:
        if token.text in common_skills:
            extracted.add(token.text)

    return extracted


In [46]:
resume_skills = extract_skills_local(resume_text)

job_skills_list = []
for jd in job_description:
    jd_text = jd["description"]  # ✅ extract the string
    jd_skills = extract_skills_local(jd_text)
    job_skills_list.append(jd_skills)


In [47]:
for jd, jd_skills in zip(job_description, job_skills_list):
    matched = resume_skills.intersection(jd_skills)
    missing = jd_skills.difference(resume_skills)

    skill_match_results.append({
        "job_title": jd["title"],
        "company": jd["company"],
        "job_description": jd["description"][:100] + "...",
        "matched_skills": matched,
        "missing_skills": missing,
        "match_count": len(matched),
        "total_required": len(jd_skills),
        "skill_score": round(len(matched) / len(jd_skills) * 100, 2) if jd_skills else 0
    })


In [48]:
skill_match_results[1]["missing_skills"]

{'pandas', 'scikit-learn'}

In [50]:
similarity[0][1]

np.float32(0.35151204)

In [52]:
print(type(similarity))


<class 'numpy.ndarray'>


In [24]:
prompt = f"Extract top 10 technical and soft skills from the following text:\n{resume_text}"
similarity

array([[0.2702284 , 0.35151204, 0.25057712, 0.4337449 , 0.2548904 ,
        0.60043   ]], dtype=float32)

In [8]:
jobs = load_job_descriptions("job_descriptions.txt")


In [9]:
from sentence_transformers import SentenceTransformer
import numpy as np

# Load sentence transformer model
model = SentenceTransformer("all-MiniLM-L6-v2")

# Extract descriptions
descriptions = [job["description"] for job in jobs]

# Generate embeddings (vectors)
embeddings = model.encode(descriptions, convert_to_numpy=True)


In [10]:
import faiss

# Dimension of embeddings
dim = embeddings.shape[1]

# Create FAISS index (L2 similarity; you can also use cosine)
index = faiss.IndexFlatL2(dim)

# Add embeddings to index
index.add(embeddings)

# Optional: Keep mapping from index to job metadata
id_to_job = {i: job for i, job in enumerate(jobs)}


In [11]:
def find_similar_jobs(query_text, top_k=3):
    query_embedding = model.encode([query_text], convert_to_numpy=True)
    distances, indices = index.search(query_embedding, top_k)
    return [(id_to_job[i], distances[0][rank]) for rank, i in enumerate(indices[0])]


In [12]:
resume_text = "I have 3 years experience in Python, SQL, and building ML models using scikit-learn."
matches = find_similar_jobs(resume_text)

for job, score in matches:
    print(f"Title: {job['title']} ({job['company']})")
    print(f"Similarity Score: {score:.2f}")
    print(f"Description: {job['description'][:100]}...\n")


Title: Data Scientist (ABC Corp)
Similarity Score: 0.68
Description: We are looking for a Data Scientist with experience in Python, machine learning, data visualization,...

Title: Machine Learning Engineer (XYZ Inc)
Similarity Score: 1.07
Description: Responsibilities include building and deploying ML models, optimizing performance, and working with ...

Title: Analyst Robotic Process Automation (RPA) (Deloitte)
Similarity Score: 1.52
Description: We are seeking a skilled RPA Developer to join our dynamic RPA team. The ideal candidate will be res...



In [None]:
import streamlit as st

st.title("Resume Analyzer & Job Matcher")
resume = st.file_uploader("Upload your resume (PDF)", type=["pdf"])
if resume:
    text = extract_text_from_pdf(resume)
    # do matching, display results
