In [2]:
import numpy as numpy
import pandas as pd
import re
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity


In [3]:
# Job description (JD): the reference text every resume is compared against.
jd_text = """
We are looking for a Data Science Intern with skills in Python,
Machine Learning, SQL, and Data Analysis. Experience with pandas,
scikit-learn, and basic statistics is preferred.
"""

# Sample resume 1: student profile (Python, SQL, pandas, basic ML).
resume1 = """
I am a student with strong Python and SQL skills. I have done projects
in data analysis using pandas and matplotlib. I know basic machine learning.
"""

# Sample resume 2: web-developer profile with little overlap with the JD.
resume2 = """
I am a web developer with experience in HTML, CSS, JavaScript and React.
I have little experience in Python and no machine learning experience.
"""

# Sample resume 3: data-science profile covering most of the JD's keywords.
resume3 = """
I have experience in Python, Machine Learning and Data Science.
I worked with scikit-learn, pandas, numpy and built models for prediction.
Also familiar with SQL and statistics.
"""


In [4]:
def clean_text(text: str) -> str:
    """Normalize free text into lowercase, single-space-separated tokens.

    The input is lowercased, every character other than ``a-z``, ``0-9``
    or whitespace is replaced by a space (so ``scikit-learn`` becomes
    ``scikit learn``), and runs of whitespace are collapsed to one space
    with leading/trailing whitespace removed.

    Args:
        text: Raw text to normalize.

    Returns:
        The normalized string, or ``""`` when ``text`` is not a ``str``.
    """
    if isinstance(text, str):
        lowered = text.lower()
        # Punctuation and any non-ASCII letters turn into token separators.
        alnum_only = re.sub(r"[^a-z0-9\s]", " ", lowered)
        return re.sub(r"\s+", " ", alnum_only).strip()
    # Non-string input (None, numbers, ...) is treated as empty text.
    return ""

# Normalize the job description and every resume with the same cleaner so
# that they are compared on equal footing.
jd_clean = clean_text(jd_text)
resumes_clean = [clean_text(raw) for raw in (resume1, resume2, resume3)]

# Preview: the full cleaned JD and the first 100 characters of resume 1.
jd_clean, resumes_clean[0][:100]


('we are looking for a data science intern with skills in python machine learning sql and data analysis experience with pandas scikit learn and basic statistics is preferred',
 'i am a student with strong python and sql skills i have done projects in data analysis using pandas ')

In [5]:
# Row 0 of the corpus is the job description; the remaining rows are resumes.
corpus = [jd_clean, *resumes_clean]

# Fit one TF-IDF vocabulary over JD + resumes so all rows share a feature space.
vectorizer = TfidfVectorizer()
tfidf_matrix = vectorizer.fit_transform(corpus)

# Slice (rather than index) so the JD stays a 2-D, single-row matrix.
jd_vector = tfidf_matrix[:1]
resume_vectors = tfidf_matrix[1:]

# cosine_similarity returns a 2-D array; its only row holds JD-vs-resume scores.
similarities = cosine_similarity(jd_vector, resume_vectors)[0]

# Express each similarity as a percentage rounded to two decimals.
match_scores = (similarities * 100).round(2)
match_scores


array([40.83, 25.53, 52.9 ])

In [6]:
# Labels are positional: entry i names the resume behind match_scores[i].
resume_names = ["resume1", "resume2", "resume3"]

# Pair each label with its score and rank the best match first.
df = pd.DataFrame(
    {"resume_name": resume_names, "match_score": match_scores}
).sort_values("match_score", ascending=False)

df


Unnamed: 0,resume_name,match_score
2,resume3,52.9
0,resume1,40.83
1,resume2,25.53
