In [1]:
!pip install pandas scikit-learn sentence-transformers kagglehub




In [2]:
import pandas as pd
import kagglehub

# Download the job description dataset
job_path = kagglehub.dataset_download("ravindrasinghrana/job-description-dataset")
print("✅ Job dataset path:", job_path)

# Download the resume dataset
resume_path = kagglehub.dataset_download("snehaanbhawal/resume-dataset")
print("✅ Resume dataset path:", resume_path)


Using Colab cache for faster access to the 'job-description-dataset' dataset.
✅ Job dataset path: /kaggle/input/job-description-dataset
Using Colab cache for faster access to the 'resume-dataset' dataset.
✅ Resume dataset path: /kaggle/input/resume-dataset


In [3]:
# Check files in each dataset
import os

print("Job Dataset Files:")
print(os.listdir(job_path))

print("Resume Dataset Files:")
print(os.listdir(resume_path))


Job Dataset Files:
['job_descriptions.csv']
Resume Dataset Files:
['Resume', 'data']


In [4]:
import os

print("Inside Resume Folder:")
print(os.listdir(f"{resume_path}/Resume"))

print("Inside Data Folder:")
print(os.listdir(f"{resume_path}/data"))


Inside Resume Folder:
['Resume.csv']
Inside Data Folder:
['data']


In [5]:
import pandas as pd

# Load the job descriptions
jobs = pd.read_csv(f"{job_path}/job_descriptions.csv")

# Load the resume dataset
resumes = pd.read_csv(f"{resume_path}/Resume/Resume.csv")

print("Job Descriptions Sample:")
display(jobs.head())

print("Resumes Sample:")
display(resumes.head())


Job Descriptions Sample:


Unnamed: 0,Job Id,Experience,Qualifications,Salary Range,location,Country,latitude,longitude,Work Type,Company Size,...,Contact,Job Title,Role,Job Portal,Job Description,Benefits,skills,Responsibilities,Company,Company Profile
0,1089843540111562,5 to 15 Years,M.Tech,$59K-$99K,Douglas,Isle of Man,54.2361,-4.5481,Intern,26801,...,001-381-930-7517x737,Digital Marketing Specialist,Social Media Manager,Snagajob,Social Media Managers oversee an organizations...,"{'Flexible Spending Accounts (FSAs), Relocatio...","Social media platforms (e.g., Facebook, Twitte...","Manage and grow social media accounts, create ...",Icahn Enterprises,"{""Sector"":""Diversified"",""Industry"":""Diversifie..."
1,398454096642776,2 to 12 Years,BCA,$56K-$116K,Ashgabat,Turkmenistan,38.9697,59.5563,Intern,100340,...,461-509-4216,Web Developer,Frontend Web Developer,Idealist,Frontend Web Developers design and implement u...,"{'Health Insurance, Retirement Plans, Paid Tim...","HTML, CSS, JavaScript Frontend frameworks (e.g...","Design and code user interfaces for websites, ...",PNC Financial Services Group,"{""Sector"":""Financial Services"",""Industry"":""Com..."
2,481640072963533,0 to 12 Years,PhD,$61K-$104K,Macao,"Macao SAR, China",22.1987,113.5439,Temporary,84525,...,9687619505,Operations Manager,Quality Control Manager,Jobs2Careers,Quality Control Managers establish and enforce...,"{'Legal Assistance, Bonuses and Incentive Prog...",Quality control processes and methodologies St...,Establish and enforce quality control standard...,United Services Automobile Assn.,"{""Sector"":""Insurance"",""Industry"":""Insurance: P..."
3,688192671473044,4 to 11 Years,PhD,$65K-$91K,Porto-Novo,Benin,9.3077,2.3158,Full-Time,129896,...,+1-820-643-5431x47576,Network Engineer,Wireless Network Engineer,FlexJobs,"Wireless Network Engineers design, implement, ...","{'Transportation Benefits, Professional Develo...",Wireless network design and architecture Wi-Fi...,"Design, configure, and optimize wireless netwo...",Hess,"{""Sector"":""Energy"",""Industry"":""Mining, Crude-O..."
4,117057806156508,1 to 12 Years,MBA,$64K-$87K,Santiago,Chile,-35.6751,-71.5429,Intern,53944,...,343.975.4702x9340,Event Manager,Conference Manager,Jobs2Careers,A Conference Manager coordinates and manages c...,"{'Flexible Spending Accounts (FSAs), Relocatio...",Event planning Conference logistics Budget man...,Specialize in conference and convention planni...,Cairn Energy,"{""Sector"":""Energy"",""Industry"":""Energy - Oil & ..."


Resumes Sample:


Unnamed: 0,ID,Resume_str,Resume_html,Category
0,16852973,HR ADMINISTRATOR/MARKETING ASSOCIATE\...,"<div class=""fontsize fontface vmargins hmargin...",HR
1,22323967,"HR SPECIALIST, US HR OPERATIONS ...","<div class=""fontsize fontface vmargins hmargin...",HR
2,33176873,HR DIRECTOR Summary Over 2...,"<div class=""fontsize fontface vmargins hmargin...",HR
3,27018550,HR SPECIALIST Summary Dedica...,"<div class=""fontsize fontface vmargins hmargin...",HR
4,17812897,HR MANAGER Skill Highlights ...,"<div class=""fontsize fontface vmargins hmargin...",HR


In [6]:
#check columns names
print("Job dataset columns:", jobs.columns.tolist())
print("Resume dataset columns:", resumes.columns.tolist())


Job dataset columns: ['Job Id', 'Experience', 'Qualifications', 'Salary Range', 'location', 'Country', 'latitude', 'longitude', 'Work Type', 'Company Size', 'Job Posting Date', 'Preference', 'Contact Person', 'Contact', 'Job Title', 'Role', 'Job Portal', 'Job Description', 'Benefits', 'skills', 'Responsibilities', 'Company', 'Company Profile']
Resume dataset columns: ['ID', 'Resume_str', 'Resume_html', 'Category']


In [7]:
#clean steps
def clean_text(text):
    if isinstance(text, str):
        return " ".join(text.lower().split())
    return ""

# Rename columns if necessary
jobs.rename(columns={'Job Title': 'Job_Title', 'Job Description': 'Job_Description'}, inplace=True)
resumes.rename(columns={'Category': 'Candidate_Name', 'Resume_str': 'Resume'}, inplace=True)

# Apply text cleaning
jobs['clean_text'] = jobs['Job_Description'].apply(clean_text)
resumes['clean_text'] = resumes['Resume'].apply(clean_text)

print("Cleaned and standardized columns")
display(jobs[['Job_Title', 'clean_text']].head(2))
display(resumes[['Candidate_Name', 'clean_text']].head(2))


Cleaned and standardized columns


Unnamed: 0,Job_Title,clean_text
0,Digital Marketing Specialist,social media managers oversee an organizations...
1,Web Developer,frontend web developers design and implement u...


Unnamed: 0,Candidate_Name,clean_text
0,HR,hr administrator/marketing associate hr admini...
1,HR,"hr specialist, us hr operations summary versat..."


In [9]:
# Step 7 — Generate Text Embeddings


from sentence_transformers import SentenceTransformer
import torch
from sklearn.metrics.pairwise import cosine_similarity

# 1. Detect GPU
device = "cuda" if torch.cuda.is_available() else "cpu"
print(f"🚀 Using device: {device.upper()}")



#  2. Load a lightweight sentence transformer model
# You can switch to 'all-MiniLM-L6-v2' for more accuracy but slower speed
model = SentenceTransformer('paraphrase-MiniLM-L3-v2', device=device)
print("✅ Model loaded successfully!")

# 3. Limit data for fast testing (increase later if needed)
resume_sample = resumes['clean_text'].tolist()[:300]
job_sample = jobs['clean_text'].tolist()[:30]

# 4. Generate embeddings efficiently with batching
print("Generating embeddings... this should take ~2–3 minutes on GPU.")

resume_embeddings = model.encode(
    resume_sample,
    batch_size=64,
    show_progress_bar=False,
    convert_to_tensor=True
)

job_embeddings = model.encode(
    job_sample,
    batch_size=64,
    show_progress_bar=False,
    convert_to_tensor=True
)

print("Embedding generation complete!")
print(f"Resumes embedded: {len(resume_sample)} | Jobs embedded: {len(job_sample)}")


torch.save(resume_embeddings, "resume_embeddings.pt")
torch.save(job_embeddings, "job_embeddings.pt")
print("Embeddings saved for reuse!")

🚀 Using device: CUDA
✅ Model loaded successfully!
Generating embeddings... this should take ~2–3 minutes on GPU.
Embedding generation complete!
Resumes embedded: 300 | Jobs embedded: 30
Embeddings saved for reuse!


In [12]:
# Calculate similarity scores
similarity_matrix = cosine_similarity(job_embeddings, resume_embeddings)

similarity_df = pd.DataFrame(
    similarity_matrix,
    index=jobs['Job_Title'].iloc[:len(job_embeddings)], # Ensure indices match the sample size
    columns=resumes['Candidate_Name'].iloc[:len(resume_embeddings)] # Ensure columns match the sample size
)

print("✅ Similarity matrix created!")
display(similarity_df.head())

TypeError: can't convert cuda:0 device type tensor to numpy. Use Tensor.cpu() to copy the tensor to host memory first.

In [13]:
#calculate similarity scoress
similarity_matrix = cosine_similarity(job_embeddings.cpu(), resume_embeddings.cpu())

similarity_df = pd.DataFrame(
    similarity_matrix,
    index=jobs['Job_Title'].iloc[:len(job_embeddings)], # Ensure indices match the sample size
    columns=resumes['Candidate_Name'].iloc[:len(resume_embeddings)] # Ensure columns match the sample size
)

print("✅ Similarity matrix created!")
display(similarity_df.head())

✅ Similarity matrix created!


Candidate_Name,HR,HR,HR,HR,HR,HR,HR,HR,HR,HR,...,INFORMATION-TECHNOLOGY,INFORMATION-TECHNOLOGY,INFORMATION-TECHNOLOGY,INFORMATION-TECHNOLOGY,INFORMATION-TECHNOLOGY,INFORMATION-TECHNOLOGY,INFORMATION-TECHNOLOGY,INFORMATION-TECHNOLOGY,INFORMATION-TECHNOLOGY,INFORMATION-TECHNOLOGY
Job_Title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Digital Marketing Specialist,0.246356,0.309667,0.209817,0.220604,0.263136,0.231136,0.282722,0.251091,0.202161,0.187805,...,0.108552,0.168255,0.27198,0.203843,0.181682,0.241525,0.100752,0.128909,0.110812,0.26138
Web Developer,0.223946,0.28295,0.227708,0.255999,0.204081,0.174056,0.248784,0.281809,0.22756,0.235969,...,0.182502,0.330677,0.155785,0.212731,0.229232,0.204315,0.130517,0.272166,0.255404,0.202277
Operations Manager,0.160333,0.172569,0.125263,0.280957,0.199446,0.185348,0.216556,0.249713,0.228621,0.274125,...,0.196973,0.260968,0.218886,0.215617,0.316762,0.190217,0.193224,0.209697,0.034302,0.184794
Network Engineer,0.107395,0.159488,0.087591,0.164714,0.163042,0.087249,0.110253,0.130046,0.128492,0.160095,...,0.206845,0.103827,0.102137,0.123681,0.26107,0.132382,0.179662,0.130204,0.268092,0.06834
Event Manager,0.28795,0.283232,0.345245,0.280703,0.300737,0.307282,0.36966,0.303152,0.272378,0.298527,...,0.293442,0.190606,0.321988,0.353641,0.337773,0.395523,0.125052,0.178784,0.174726,0.254856


In [21]:
#rank the top 3

import pandas as pd

# Check DataFrame structure
print("Similarity DataFrame shape:", similarity_df.shape)
print("Columns:", similarity_df.columns.tolist()[:5], "...")

# Rank top 3 resumes for each job
for job in similarity_df.index:
    print(f"\nJob: {job}")

    # Extract similarity scores for the current job
    job_scores = similarity_df.loc[job]

    # If result is a DataFrame (e.g., 1-row DataFrame), flatten it
    if isinstance(job_scores, pd.DataFrame):
        job_scores = job_scores.squeeze()

    # Ensure we now have a Series (candidate_name → similarity_score)
    if isinstance(job_scores, pd.Series):
        top_matches = job_scores.sort_values(ascending=False).head(3)

        # Display top 3 matches
        for candidate_name, score in top_matches.items():
            print(f"Candidate: {candidate_name} — Match Score: {score * 100:.2f}%")
    else:
        print("Unexpected data format — skipping this job.")


Similarity DataFrame shape: (30, 300)
Columns: ['HR', 'HR', 'HR', 'HR', 'HR'] ...

Job: Digital Marketing Specialist
Candidate: INFORMATION-TECHNOLOGY — Match Score: 38.97%
Candidate: INFORMATION-TECHNOLOGY — Match Score: 36.40%
Candidate: HR — Match Score: 33.99%

Job: Web Developer
Candidate: DESIGNER — Match Score: 48.94%
Candidate: DESIGNER — Match Score: 48.52%
Candidate: DESIGNER — Match Score: 46.50%

Job: Operations Manager
Candidate: INFORMATION-TECHNOLOGY — Match Score: 44.15%
Candidate: DESIGNER — Match Score: 41.61%
Candidate: INFORMATION-TECHNOLOGY — Match Score: 40.76%

Job: Network Engineer
Candidate: INFORMATION-TECHNOLOGY — Match Score: 34.35%
Candidate: INFORMATION-TECHNOLOGY — Match Score: 32.12%
Candidate: INFORMATION-TECHNOLOGY — Match Score: 31.09%

Job: Event Manager
Candidate: INFORMATION-TECHNOLOGY — Match Score: 50.13%
Candidate: INFORMATION-TECHNOLOGY — Match Score: 49.01%
Candidate: INFORMATION-TECHNOLOGY — Match Score: 46.44%

Job: Software Tester
Candidate

In [23]:
#Store Top 3 Matches in a DataFrame


results = []

for job in similarity_df.index:
    job_scores = similarity_df.loc[job]
    if isinstance(job_scores, pd.DataFrame):
        job_scores = job_scores.squeeze()
    if isinstance(job_scores, pd.Series):
        top_matches = job_scores.sort_values(ascending=False).head(3)
        for candidate_name, score in top_matches.items():
            results.append({
                "Job": job,
                "Candidate": candidate_name,
                "Match_Score": round(score * 100, 2)
            })

results_df = pd.DataFrame(results)
results_df.reset_index(drop=True, inplace=True)
results_df.head(10)


Unnamed: 0,Job,Candidate,Match_Score
0,Digital Marketing Specialist,INFORMATION-TECHNOLOGY,38.97
1,Digital Marketing Specialist,INFORMATION-TECHNOLOGY,36.4
2,Digital Marketing Specialist,HR,33.99
3,Web Developer,DESIGNER,48.94
4,Web Developer,DESIGNER,48.52
5,Web Developer,DESIGNER,46.5
6,Operations Manager,INFORMATION-TECHNOLOGY,44.15
7,Operations Manager,DESIGNER,41.61
8,Operations Manager,INFORMATION-TECHNOLOGY,40.76
9,Network Engineer,INFORMATION-TECHNOLOGY,34.35


In [27]:
# bonus task
!pip install gradio PyPDF2 --quiet
import gradio as gr
from PyPDF2 import PdfReader
import torch

#pdf extraxtion (text)
def extract_text_from_pdf(file):
    text = ""
    try:
        reader = PdfReader(file)
        for page in reader.pages:
            text += page.extract_text() or ""
    except Exception as e:
        text = f"Error reading PDF: {e}"
    return text.strip()

# matching logic
def match_resume_from_pdf(pdf_file):
    resume_text = extract_text_from_pdf(pdf_file)
    if not resume_text or len(resume_text) < 100:
        return "<p style='color:red;'>Could not read enough text from the PDF. Try another resume.</p>"

    # embedding for uploaded resume
    resume_emb = model.encode([resume_text], convert_to_tensor=True)

    # cosine similarity
    scores = torch.nn.functional.cosine_similarity(resume_emb, job_embeddings)
    top_scores, top_indices = torch.topk(scores, k=3)

    # html front end
    results_html = "<div style='font-family: Poppins, sans-serif;'>"
    results_html += "<h3 style='color:#0078ff;'>Top Matching Jobs</h3>"
    for rank, (score, idx) in enumerate(zip(top_scores, top_indices), start=1):
        job_preview = job_sample[idx][:220].replace('\n', ' ') + "..."
        results_html += f"""
        <div style='background:#f5f8ff; border-radius:12px; padding:15px; margin-bottom:12px;
                    box-shadow:0 2px 8px rgba(0,0,0,0.1); transition:0.3s'>
            <h4 style='margin:0; color:#222;'>#{rank} — Job_{idx+1}</h4>
            <p style='margin:6px 0; font-size:15px; color:#333;'>
                <b>Match Score:</b> {score.item()*100:.2f}%
            </p>
            <p style='margin:0; font-size:14px; color:#555;'>{job_preview}</p>
        </div>
        """
    results_html += "</div>"
    return results_html

# css
css = """
#component-0 {
    background: linear-gradient(135deg, #d9eaff, #ffffff);
    padding: 30px 0 !important;
}
footer {display:none !important;}
"""

demo = gr.Interface(
    fn=match_resume_from_pdf,
    inputs=gr.File(label="📤 Upload Your Resume (PDF)", file_types=[".pdf"]),
    outputs=gr.HTML(label="Results"),
    title="your Resume Matcher",
    description=(
        "<div style='font-family:Poppins, sans-serif; color:#222;'>"
        "<h2 style='margin-bottom:10px;'>Find Your Perfect Job Match</h2>"
        "<p style='font-size:16px;'>Upload your resume, and our AI will instantly analyze your profile "
        "and show the top 3 jobs that fit you best. </p>"
        "</div>"
    ),
    theme="default",
    css=css
)

demo.launch()


It looks like you are running Gradio on a hosted Jupyter notebook, which requires `share=True`. Automatically setting `share=True` (you can turn this off by setting `share=False` in `launch()` explicitly).

Colab notebook detected. To show errors in colab notebook, set debug=True in launch()
* Running on public URL: https://6ab822db448a3e9446.gradio.live

This share link expires in 1 week. For free permanent hosting and GPU upgrades, run `gradio deploy` from the terminal in the working directory to deploy to Hugging Face Spaces (https://huggingface.co/spaces)


