In [None]:
# Download datasets
!kaggle datasets download -d ravindrasinghrana/job-description-dataset
!kaggle datasets download -d snehaanbhawal/resume-dataset

# Unzip datasets
!unzip /content/job-description-dataset.zip -d /content/job_description_data
!unzip /content/resume-dataset.zip "data/*" -d /content/resume_data

Dataset URL: https://www.kaggle.com/datasets/ravindrasinghrana/job-description-dataset
License(s): CC0-1.0
Downloading job-description-dataset.zip to /content
 96% 441M/457M [00:03<00:00, 137MB/s]
100% 457M/457M [00:03<00:00, 152MB/s]
Dataset URL: https://www.kaggle.com/datasets/snehaanbhawal/resume-dataset
License(s): CC0-1.0
Downloading resume-dataset.zip to /content
 88% 55.0M/62.5M [00:00<00:00, 77.8MB/s]
100% 62.5M/62.5M [00:00<00:00, 82.3MB/s]
Archive:  /content/job-description-dataset.zip
  inflating: /content/job_description_data/job_descriptions.csv  
Archive:  /content/resume-dataset.zip
  inflating: /content/resume_data/data/data/ACCOUNTANT/10554236.pdf  
  inflating: /content/resume_data/data/data/ACCOUNTANT/10674770.pdf  
  inflating: /content/resume_data/data/data/ACCOUNTANT/11163645.pdf  
  inflating: /content/resume_data/data/data/ACCOUNTANT/11759079.pdf  
  inflating: /content/resume_data/data/data/ACCOUNTANT/12065211.pdf  
  inflating: /content/resume_data/data/data/A

In [None]:
import nltk
# Fetch the 'punkt_tab' resource into the local NLTK data directory.
# Presumably needed by nltk.word_tokenize, which the text-cleaning function calls — confirm
# against the installed NLTK version.
nltk.download('punkt_tab')

[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


True

In [None]:
!pip install pdfplumber # openpyxl sentence-transformers
import pandas as pd
import numpy as np
import nltk
import spacy
import pdfplumber
import os
import re
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split

# Fetch the NLTK resources (stop-word lists, Punkt tokenizer data)
for nltk_resource in ('stopwords', 'punkt'):
    nltk.download(nltk_resource)

# Small English spaCy pipeline
nlp = spacy.load("en_core_web_sm")

# English stop words as a set, for fast membership tests during text cleaning
english_stop_word_list = nltk.corpus.stopwords.words('english')
stop_words = set(english_stop_word_list)


Collecting pdfplumber
  Downloading pdfplumber-0.11.5-py3-none-any.whl.metadata (42 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m42.5/42.5 kB[0m [31m2.6 MB/s[0m eta [36m0:00:00[0m
Collecting pdfminer.six==20231228 (from pdfplumber)
  Downloading pdfminer.six-20231228-py3-none-any.whl.metadata (4.2 kB)
Collecting pypdfium2>=4.18.0 (from pdfplumber)
  Downloading pypdfium2-4.30.1-py3-none-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (48 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m48.2/48.2 kB[0m [31m3.2 MB/s[0m eta [36m0:00:00[0m
Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch>=1.11.0->sentence-transformers)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch>=1.11.0->sentence-transformers)
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


In [None]:
# Load Job Dataset: keep a seeded random sample of 5000 postings to speed up processing
job_df = (
    pd.read_csv("/content/job_description_data/job_descriptions.csv")
    .sample(n=5000, random_state=42)
)

# Build one free-text field per posting: title, description, skills and responsibilities,
# separated by single spaces (missing values contribute an empty string)
combined_text = job_df['Job Title'].fillna('')
for text_column in ('Job Description', 'skills', 'Responsibilities'):
    combined_text = combined_text + " " + job_df[text_column].fillna('')
job_df['combined_text'] = combined_text

# Parse the posting date so jobs can be sorted by recency; unparseable values become NaT
posting_dates = pd.to_datetime(job_df['Job Posting Date'], errors='coerce')
job_df['Job Posting Date'] = posting_dates

print(f"✅ Loaded {len(job_df)} Random Job Descriptions!")


✅ Loaded 5000 Random Job Descriptions!


In [None]:
from tqdm import tqdm
def clean_text(text):
    """Normalize free text into a space-separated string of informative tokens.

    The text is lower-cased, every non-letter character is replaced by a space,
    the result is tokenized with ``nltk.word_tokenize``, and tokens that are in
    the module-level ``stop_words`` set or have two characters or fewer are dropped.

    Parameters
    ----------
    text : str or null-like
        Raw text; a value for which ``pd.isnull`` is true yields ``""``.

    Returns
    -------
    str
        The surviving tokens joined by single spaces.
    """
    if pd.isnull(text):
        return ""

    letters_only = re.sub(r'[^a-zA-Z]', ' ', text.lower())  # Remove special characters

    kept_tokens = []
    for token in nltk.word_tokenize(letters_only):
        if len(token) <= 2 or token in stop_words:
            continue
        kept_tokens.append(token)
    return ' '.join(kept_tokens)

# Run clean_text over every combined job text, with a progress bar while it works
tqdm.pandas(desc="🔥 Cleaning Job Descriptions...")
combined_job_texts = job_df['combined_text']
job_df['cleaned_text'] = combined_job_texts.progress_apply(clean_text)

print("✅ Job Descriptions Cleaned & Processed!")


🔥 Cleaning Job Descriptions...: 100%|██████████| 5000/5000 [00:01<00:00, 3351.76it/s]

✅ Job Descriptions Cleaned & Processed!





In [None]:
resume_folder = "/content/resume_data/data/data"
MAX_RESUMES_PER_CATEGORY = 5
resume_texts = []  # (filename, cleaned text, category) tuples; category = folder name

# Every sub-folder is one category. Plain files are skipped (os.listdir on them would fail)
# and the names are sorted because os.listdir returns entries in arbitrary order.
category_names = sorted(
    entry for entry in os.listdir(resume_folder)
    if os.path.isdir(os.path.join(resume_folder, entry))
)

for category in category_names:
    category_path = os.path.join(resume_folder, category)

    # Take at most MAX_RESUMES_PER_CATEGORY PDFs; sorting makes the selection reproducible
    resume_files = sorted(f for f in os.listdir(category_path) if f.endswith(".pdf"))

    for filename in resume_files[:MAX_RESUMES_PER_CATEGORY]:
        pdf_path = os.path.join(category_path, filename)
        with pdfplumber.open(pdf_path) as pdf:
            # Extract each page exactly once (the extraction is the expensive step)
            page_texts = [page.extract_text() for page in pdf.pages]

        # Join with a space so the last word of one page is not fused with the
        # first word of the next; pages without extractable text are skipped
        text = ' '.join(page_text for page_text in page_texts if page_text)
        resume_texts.append((filename, clean_text(text), category))  # Label = Folder Name

print(f"✅ Selected {len(resume_texts)} resumes from {len(category_names)} folders!")


✅ Selected 120 resumes from 24 folders!


In [None]:
# Load SBERT Model (Pretrained Transformer)
from sentence_transformers import SentenceTransformer
sbert_model = SentenceTransformer('all-MiniLM-L6-v2')

# Convert Job Descriptions to SBERT Embeddings.
# The whole list is passed to encode() so the model processes the texts in batches
# instead of running one forward pass per row; encode() draws its own progress bar.
job_embedding_matrix = sbert_model.encode(
    job_df['combined_text'].tolist(),
    batch_size=64,
    show_progress_bar=True,
)

# Store one 1-D vector per row. A plain list is assigned positionally, so the vectors
# stay in the same order as the rows they were computed from.
job_df['embedding'] = list(job_embedding_matrix)

print("✅ Job Descriptions Converted to SBERT Embeddings!")


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/10.5k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

🔥 Encoding Job Descriptions with SBERT...: 100%|██████████| 5000/5000 [03:03<00:00, 27.24it/s]

✅ Job Descriptions Converted to SBERT Embeddings!





In [None]:
def extract_text_from_pdf(pdf_path):
    """Return the text of every page of a PDF as one string.

    Each page's text is followed by a single space; pages for which
    ``extract_text()`` returns nothing are skipped. Leading and trailing
    whitespace is stripped from the final result.

    Parameters
    ----------
    pdf_path : path accepted by ``pdfplumber.open``

    Returns
    -------
    str
        The concatenated page texts, or ``""`` when no page yields text.
    """
    with pdfplumber.open(pdf_path) as pdf:
        page_texts = (page.extract_text() for page in pdf.pages)
        joined = "".join(page_text + " " for page_text in page_texts if page_text)
    return joined.strip()

# Read the uploaded resume PDF and show the extracted text
uploaded_resume_path = "/content/Mohamed_Asif_Resume _1634.pdf"
resume_text = extract_text_from_pdf(uploaded_resume_path)
print(resume_text)

MOHAMED ASIF M
(cid:131) +91 9444202524 # asifoned@gmail.com (cid:239) LinkedIn § GitHub LeetCode
SELF-PRESENTATION
Iamapre-finalyearComputerScienceEngineeringstudentwithastronginterestinDataScience, MachineLearning,
and Full-Stack Development. I am proficient in Python, SQL, Next.js, and AWS, and I consistently enhance my
problem-solving skills on LeetCode.
EDUCATION
Vellore Institute of Technology 2022 – Present
Bachelor of Technology in Computer Science CGPA:7.92 Chennai, Tamil Nadu, India
Chettinad Vidyashram (Schooling) 2010 – 2022
Central Board of Secondary Education (CBSE) 10th Board Percentage: 84 Chennai, Tamil Nadu, India
Chettinad Vidyashram
Central Board of Secondary Education (CBSE) 12th Board Percentage: 91
PROJECTS
Detecting Oil Spills in Marine Environment using AIS Satellite Datasets | React Native, CNN GitHub
• Developed a system to preemptively detect oil spills using satellite imagery and ship tracking data.
• Built a React Native front-end and trained a CNN model t

In [None]:
# Embed the uploaded resume with the same SBERT model that encoded the job postings,
# so the resume vector and the job vectors can be compared directly.
# NOTE(review): the full extracted text is passed in one call; the model presumably
# truncates inputs beyond its maximum sequence length — confirm for long resumes.
resume_embedding = sbert_model.encode(resume_text)

In [None]:
# Compute Cosine Similarity Between Resume & Jobs
N_RECOMMENDATIONS = 5      # maximum number of distinct job types to show
CANDIDATE_POOL_SIZE = 50   # only the best-scoring postings are considered at all

job_embeddings = np.stack(job_df['embedding'].values)  # Convert list of arrays to 2D numpy array
similarity_scores = cosine_similarity([resume_embedding], job_embeddings)[0]
job_df['Similarity Score'] = similarity_scores

# Best matches first; ties on similarity are broken by the most recent posting date
top_jobs = job_df.sort_values(
    by=['Similarity Score', 'Job Posting Date'],
    ascending=[False, False],
).head(CANDIDATE_POOL_SIZE)

# A job's "category" is the first two words of its lower-cased title
# (e.g. "software engineer", "data scientist"). Missing titles become '' instead of
# raising on .lower(), so they all fall into one shared empty category.
job_categories = (
    top_jobs['Job Title']
    .fillna('')
    .astype(str)
    .str.lower()
    .str.split()
    .str[:2]
    .str.join(' ')
)

# Keep only the best-ranked job of each category, then cap the list. The mask is applied
# positionally so it does not depend on the index labels of job_df.
is_first_of_category = ~job_categories.duplicated().to_numpy()
final_jobs_df = top_jobs[is_first_of_category].head(N_RECOMMENDATIONS)

# Print Selected Jobs. The header reports how many jobs were actually found, which can be
# fewer than N_RECOMMENDATIONS when the candidate pool holds fewer distinct categories.
print(f"\n✅ Top {len(final_jobs_df)} Matching Jobs (Different Domains):")
if final_jobs_df.empty:
    print("❌ No jobs found with enough diversity. Try adjusting the resume.")
else:
    for _, row in final_jobs_df.iterrows():
        print(f"🔹 {row['Job Title']} - {row['Company']} ({row['Job Posting Date'].date()})")
        print(f"⭐ Similarity: {round(row['Similarity Score'] * 100, 2)}%")
        print("-" * 50)



✅ Top 5 Matching Jobs (Different Domains):
🔹 Data Scientist - Lockheed Martin (2023-05-26)
⭐ Similarity: 43.62%
--------------------------------------------------
🔹 Data Analyst - Reinsurance Group of America (2023-08-06)
⭐ Similarity: 42.01%
--------------------------------------------------
🔹 Data Engineer - Fresenius SE & Co. KGaA (2023-06-26)
⭐ Similarity: 36.36%
--------------------------------------------------
🔹 Network Engineer - Maruti Suzuki India (2023-08-26)
⭐ Similarity: 33.46%
--------------------------------------------------


In [None]:
import time
import datetime

# One entry per resume section, in the order the feedback is printed:
# (points awarded, header spellings searched for, message when found, advice when missing).
# Matching is a case-sensitive substring search, so each section lists both its title-case
# and upper-case header spellings. The weights add up to 127, so the score is not a percentage.
RESUME_SECTION_CHECKS = [
    # High Priority Sections (ATS-Compatible)
    (10, ['OBJECTIVE', 'Objective', 'SUMMARY', 'Summary'],
     "[+] Awesome! You have added Objective/Summary",
     "[-] Please add your career objective or summary. It gives recruiters a clear idea of your career goals."),
    (20, ['EDUCATION', 'Education', 'School', 'College', 'Degree'],
     "[+] Awesome! You have added Education Details",
     "[-] Please add Education. It is critical for recruiters to understand your qualifications."),
    (25, ['EXPERIENCE', 'Experience', 'WORK HISTORY', 'Work History'],
     "[+] Awesome! You have added Experience",
     "[-] Please add Experience. It helps you stand out from the crowd."),
    (12, ['INTERNSHIPS', 'Internships', 'INTERNSHIP', 'Internship'],
     "[+] Awesome! You have added Internships",
     "[-] Please add Internships. It demonstrates practical experience."),
    (16, ['PROJECTS', 'Projects', 'PROJECT', 'Project'],
     "[+] Awesome! You have added your Projects",
     "[-] Please add Projects. It shows your hands-on experience."),
    (12, ['SKILLS', 'Skills', 'SKILL', 'Skill'],
     "[+] Awesome! You have added Skills",
     "[-] Please add Skills. It highlights your technical and soft skills."),
    # Medium Priority Sections
    (12, ['CERTIFICATIONS', 'CERTIFICATION', 'Certifications', 'Certification'],
     "[+] Awesome! You have added your Certifications",
     "[-] Please add Certifications. It shows specialized knowledge."),
    (12, ['ACHIEVEMENTS', 'Achievements'],
     "[+] Awesome! You have added your Achievements",
     "[-] Please add Achievements. It demonstrates your accomplishments."),
    # Low Priority Sections
    (4, ['HOBBIES', 'Hobbies'],
     "[+] Awesome! You have added your Hobbies",
     "[-] Optional: Add Hobbies to show your personality."),
    (4, ['INTERESTS', 'Interests'],
     "[+] Awesome! You have added your Interests",
     "[-] Optional: Add Interests to show your passions outside of work."),
]

resume_score = 0

if not resume_text:
    # Nothing to score: report the problem up front instead of printing ten
    # "please add ..." tips and a score of 0 for a resume that was never read.
    print("❌ No file uploaded. Please upload a resume file.")
else:
    print("\nResume Tips & Ideas 🥂\n")

    for section_points, section_headers, found_message, missing_message in RESUME_SECTION_CHECKS:
        if any(header in resume_text for header in section_headers):
            resume_score += section_points
            print(found_message)
        else:
            print(missing_message)

    # Display the final resume score
    print("\n📄 Resume Score 📝")
    print(f"Your Resume Writing Score: {resume_score}")
    print("Note: This score is calculated based on the content that you have in your Resume.")

# Get current date and time (both strings come from the same instant)
ts = time.time()
scored_at = datetime.datetime.fromtimestamp(ts)
cur_date = scored_at.strftime('%Y-%m-%d')
cur_time = scored_at.strftime('%H:%M:%S')
timestamp = f"{cur_date}_{cur_time}"



Resume Tips & Ideas 🥂

[-] Please add your career objective or summary. It gives recruiters a clear idea of your career goals.
[+] Awesome! You have added Education Details
[-] Please add Experience. It helps you stand out from the crowd.
[-] Please add Internships. It demonstrates practical experience.
[+] Awesome! You have added your Projects
[+] Awesome! You have added Skills
[-] Please add Certifications. It shows specialized knowledge.
[+] Awesome! You have added your Achievements
[-] Optional: Add Hobbies to show your personality.
[-] Optional: Add Interests to show your passions outside of work.

📄 Resume Score 📝
Your Resume Writing Score: 60
Note: This score is calculated based on the content that you have in your Resume.
