# Resume Screening Automation


In [1]:
# Install dependencies


In [2]:
import pandas as pd
import re
from collections import Counter
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

print('Imports ready')


Imports ready


In [3]:
# Load dataset
df = pd.read_csv("AI_Resume_Screening.csv")
df.shape
df.head()


Unnamed: 0,Resume_ID,Name,Skills,Experience (Years),Education,Certifications,Job Role,Recruiter Decision,Salary Expectation ($),Projects Count,AI Score (0-100)
0,1,Ashley Ali,"TensorFlow, NLP, Pytorch",10,B.Sc,,AI Researcher,Hire,104895,8,100
1,2,Wesley Roman,"Deep Learning, Machine Learning, Python, SQL",10,MBA,Google ML,Data Scientist,Hire,113002,1,100
2,3,Corey Sanchez,"Ethical Hacking, Cybersecurity, Linux",1,MBA,Deep Learning Specialization,Cybersecurity Analyst,Hire,71766,7,70
3,4,Elizabeth Carney,"Python, Pytorch, TensorFlow",7,B.Tech,AWS Certified,AI Researcher,Hire,46848,0,95
4,5,Julie Hill,"SQL, React, Java",4,PhD,,Software Engineer,Hire,87441,9,100


In [4]:
# Preprocess skills and years
def split_skills(s):
    if pd.isna(s):
        return []
    s = str(s)
    parts = re.split(r'[;,\|]', s)
    parts = [p.strip().lower() for p in parts if p and p.strip()!='']
    return parts

df['skills_list'] = df['Skills'].apply(split_skills)
def parse_years(x):
    try:
        return float(x)
    except:
        m = re.search(r'(\d+\.?\d*)', str(x))
        if m:
            return float(m.group(1))
    return 0.0

df['years_exp'] = df['Experience (Years)'].apply(parse_years)
df.head()


Unnamed: 0,Resume_ID,Name,Skills,Experience (Years),Education,Certifications,Job Role,Recruiter Decision,Salary Expectation ($),Projects Count,AI Score (0-100),skills_list,years_exp
0,1,Ashley Ali,"TensorFlow, NLP, Pytorch",10,B.Sc,,AI Researcher,Hire,104895,8,100,"[tensorflow, nlp, pytorch]",10.0
1,2,Wesley Roman,"Deep Learning, Machine Learning, Python, SQL",10,MBA,Google ML,Data Scientist,Hire,113002,1,100,"[deep learning, machine learning, python, sql]",10.0
2,3,Corey Sanchez,"Ethical Hacking, Cybersecurity, Linux",1,MBA,Deep Learning Specialization,Cybersecurity Analyst,Hire,71766,7,70,"[ethical hacking, cybersecurity, linux]",1.0
3,4,Elizabeth Carney,"Python, Pytorch, TensorFlow",7,B.Tech,AWS Certified,AI Researcher,Hire,46848,0,95,"[python, pytorch, tensorflow]",7.0
4,5,Julie Hill,"SQL, React, Java",4,PhD,,Software Engineer,Hire,87441,9,100,"[sql, react, java]",4.0


In [None]:
# Build resume text
def build_resume_text(row):
    parts = []
    if row.get('skills_list'):
        parts.append(' '.join(row['skills_list']))
    if pd.notna(row.get('Education')) and str(row.get('Education')).strip()!='':
        parts.append(str(row.get('Education')))
    if pd.notna(row.get('Certifications')) and str(row.get('Certifications')).strip()!='':
        parts.append(str(row.get('Certifications')))
    if pd.notna(row.get('Projects Count')):
        parts.append('projects '+str(row.get('Projects Count')))
    return ' '.join(parts).lower()

df['resume_text'] = df.apply(build_resume_text, axis=1)
df[['Resume_ID','Name','resume_text']].head()


In [5]:
# Derive job descriptions from applicants' skills
job_roles = df['Job Role'].fillna('Unknown').unique().tolist()
job_items = []
for role in job_roles:
    subset = df[df['Job Role']==role]
    all_skills = [s for skills in subset['skills_list'] for s in skills]
    top_skills = [s for s,_ in Counter(all_skills).most_common(8)]
    if len(top_skills)==0:
        jd_text = f"{role} (no specific skills listed)"
    else:
        jd_text = f"{role} requires " + ', '.join(top_skills)
    job_items.append({'job_role': role, 'job_description': jd_text})

jobs_df = pd.DataFrame(job_items)
jobs_df.head()


Unnamed: 0,job_role,job_description
0,AI Researcher,"AI Researcher requires tensorflow, nlp, python..."
1,Data Scientist,"Data Scientist requires machine learning, pyth..."
2,Cybersecurity Analyst,Cybersecurity Analyst requires ethical hacking...
3,Software Engineer,"Software Engineer requires java, sql, c++, react"


In [7]:
# Create a combined resume text column for TF-IDF
df['resume_text'] = (
    df['Skills'].fillna('') + ' ' +
    df['Education'].fillna('') + ' ' +
    df['Certifications'].fillna('') + ' ' +
    df['Projects Count'].astype(str)
)

# Also clean skills into a list for later
df['skills_list'] = df['Skills'].fillna('').apply(lambda x: [s.strip().lower() for s in x.split(',') if s.strip()])

# Parse years of experience
df['years_exp'] = pd.to_numeric(df['Experience (Years)'], errors='coerce').fillna(0).astype(int)

df[['Resume_ID', 'Name', 'resume_text']].head()


Unnamed: 0,Resume_ID,Name,resume_text
0,1,Ashley Ali,"TensorFlow, NLP, Pytorch B.Sc 8"
1,2,Wesley Roman,"Deep Learning, Machine Learning, Python, SQL M..."
2,3,Corey Sanchez,"Ethical Hacking, Cybersecurity, Linux MBA Deep..."
3,4,Elizabeth Carney,"Python, Pytorch, TensorFlow B.Tech AWS Certifi..."
4,5,Julie Hill,"SQL, React, Java PhD 9"


In [8]:
# TF-IDF matching
resume_texts = df['resume_text'].fillna('').tolist()
job_texts = jobs_df['job_description'].fillna('').tolist()
vectorizer = TfidfVectorizer(ngram_range=(1,2), min_df=1, stop_words='english')
corpus = resume_texts + job_texts
X = vectorizer.fit_transform(corpus)
n = len(resume_texts)
R = X[:n]
J = X[n:]
sims = cosine_similarity(J, R)

rows = []
TOP_K = 5
for j_idx, role in enumerate(jobs_df['job_role']):
    sim_row = sims[j_idx]
    top_idx = sim_row.argsort()[::-1][:TOP_K]
    for rank, r_idx in enumerate(top_idx, start=1):
        rows.append({
            'job_role': role,
            'rank': rank,
            'Resume_ID': df.iloc[r_idx]['Resume_ID'],
            'Name': df.iloc[r_idx]['Name'],
            'similarity': float(sim_row[r_idx]),
            'candidate_skills': ','.join(df.iloc[r_idx]['skills_list']),
            'candidate_years': df.iloc[r_idx]['years_exp']
        })

results_df = pd.DataFrame(rows)
results_df.to_csv('matching_results.csv', index=False)
print('Saved matching_results.csv')
results_df.head()


Saved matching_results.csv


Unnamed: 0,job_role,rank,Resume_ID,Name,similarity,candidate_skills,candidate_years
0,AI Researcher,1,630,Craig Lewis,0.372921,"tensorflow,nlp,python,pytorch",3
1,AI Researcher,2,898,Patricia Dunn,0.33224,"tensorflow,nlp,python,pytorch",8
2,AI Researcher,3,619,Larry Williams,0.328726,"tensorflow,nlp,python,pytorch",6
3,AI Researcher,4,135,Paul Castro,0.321179,"tensorflow,nlp,python,pytorch",6
4,AI Researcher,5,164,Steven Johnson,0.304662,"tensorflow,nlp,python",1


# Extract Skills

In [9]:
import spacy
from spacy.matcher import PhraseMatcher

# Load English model
nlp = spacy.load("en_core_web_sm")

# Define a skill vocabulary (you can expand this list)
skill_phrases = [
    "python", "java", "c++", "sql", "excel", "pandas", "numpy",
    "scikit-learn", "tensorflow", "keras", "aws", "azure", "nlp", "machine learning"
]

# Create PhraseMatcher
matcher = PhraseMatcher(nlp.vocab, attr="LOWER")
patterns = [nlp.make_doc(skill) for skill in skill_phrases]
matcher.add("SKILLS", patterns)

def extract_skills_spacy(text):
    doc = nlp(text)
    matches = matcher(doc)
    return list(set([doc[start:end].text.lower() for match_id, start, end in matches]))

# Example: apply on resume_text
df['extracted_skills_spacy'] = df['resume_text'].apply(extract_skills_spacy)

df[['Name', 'resume_text', 'extracted_skills_spacy']].head()


Unnamed: 0,Name,resume_text,extracted_skills_spacy
0,Ashley Ali,"TensorFlow, NLP, Pytorch B.Sc 8","[nlp, tensorflow]"
1,Wesley Roman,"Deep Learning, Machine Learning, Python, SQL M...","[sql, machine learning, python]"
2,Corey Sanchez,"Ethical Hacking, Cybersecurity, Linux MBA Deep...",[]
3,Elizabeth Carney,"Python, Pytorch, TensorFlow B.Tech AWS Certifi...","[aws, python, tensorflow]"
4,Julie Hill,"SQL, React, Java PhD 9","[sql, java]"
