In [27]:
import numpy, pandas, sklearn, spacy, transformers, flask, streamlit
print("✅ All core libraries imported successfully!")

✅ All core libraries imported successfully!


In [28]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

# Sample job description and resumes
job_desc = "Looking for a data scientist skilled in Python, NLP, and machine learning."
resumes = [
    "Experienced data scientist with strong Python and NLP background.",
    "Web developer with expertise in React and JavaScript.",
    "Machine learning engineer skilled in Python, deep learning, and NLP."
]

# Convert text to vectors
vectorizer = TfidfVectorizer()
vectors = vectorizer.fit_transform([job_desc] + resumes)

# Compute cosine similarity of each resume with job description
similarities = cosine_similarity(vectors[0:1], vectors[1:]).flatten()

# Rank resumes by similarity
ranked = sorted(zip(resumes, similarities), key=lambda x: x[1], reverse=True)

for resume, score in ranked:
    print(f"Resume: {resume} | Match Score: {score:.2f}")


Resume: Machine learning engineer skilled in Python, deep learning, and NLP. | Match Score: 0.58
Resume: Experienced data scientist with strong Python and NLP background. | Match Score: 0.37
Resume: Web developer with expertise in React and JavaScript. | Match Score: 0.11


In [29]:
import pandas as pd

# Load dataset (change filename if needed)
df = pd.read_csv("UpdatedResumeDataSet.csv")

# Show first 5 rows
print(df.head())

# Show column names
print("\nColumns in dataset:", df.columns)

# Show dataset shape
print("\nDataset shape:", df.shape)


       Category                                             Resume
0  Data Science  Skills * Programming Languages: Python (pandas...
1  Data Science  Education Details \r\nMay 2013 to May 2017 B.E...
2  Data Science  Areas of Interest Deep Learning, Control Syste...
3  Data Science  Skills â¢ R â¢ Python â¢ SAP HANA â¢ Table...
4  Data Science  Education Details \r\n MCA   YMCAUST,  Faridab...

Columns in dataset: Index(['Category', 'Resume'], dtype='object')

Dataset shape: (962, 2)


In [30]:
import re

# Function to clean text
def clean_text(text):
    text = re.sub(r'http\S+', '', text)  # remove links
    text = re.sub(r'\d+', '', text)      # remove numbers
    text = re.sub(r'[^\w\s]', '', text)  # remove punctuation
    text = text.lower().strip()          # lowercase + trim
    return text

# Apply cleaning on resume column
df['cleaned_resume'] = df['Resume'].apply(clean_text)

# Check result
print(df[['Resume', 'cleaned_resume']].head())


                                              Resume  \
0  Skills * Programming Languages: Python (pandas...   
1  Education Details \r\nMay 2013 to May 2017 B.E...   
2  Areas of Interest Deep Learning, Control Syste...   
3  Skills â¢ R â¢ Python â¢ SAP HANA â¢ Table...   
4  Education Details \r\n MCA   YMCAUST,  Faridab...   

                                      cleaned_resume  
0  skills  programming languages python pandas nu...  
1  education details \r\nmay  to may  be   uitrgp...  
2  areas of interest deep learning control system...  
3  skills â r â python â sap hana â tableau â sap...  
4  education details \r\n mca   ymcaust  faridaba...  


In [31]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Create TF-IDF vectorizer
vectorizer = TfidfVectorizer(stop_words='english', max_features=5000)

# Fit and transform resumes
X = vectorizer.fit_transform(df['cleaned_resume'])

print("TF-IDF shape:", X.shape)


TF-IDF shape: (962, 5000)


In [32]:
## Sample resume----

job_description = """
We are looking for a Data Scientist with experience in Python, Machine Learning, and NLP. 
Candidates should have strong skills in Pandas, Scikit-learn, and data visualization.
"""

jd_cleaned = clean_text(job_description)


In [33]:
jd_vector = vectorizer.transform([jd_cleaned])

In [34]:
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np

# Calculate similarity scores
similarity_scores = cosine_similarity(jd_vector, X).flatten()

# Rank candidates (higher = better match)
ranked_indices = np.argsort(similarity_scores)[::-1]

# Show top 5 candidates
top_n = 5
for i in range(top_n):
    idx = ranked_indices[i]
    print(f"\nCandidate {i+1} (Score: {similarity_scores[idx]:.4f})")
    print(df['Resume'][idx][:300], "...")  # print first 300 chars of resume



Candidate 1 (Score: 0.3728)
Skills â¢ Python â¢ Tableau â¢ Data Visualization â¢ R Studio â¢ Machine Learning â¢ Statistics IABAC Certified Data Scientist with versatile experience over 1+ years in managing business, data science consulting and leading innovation projects, bringing business ideas to working real world so ...

Candidate 2 (Score: 0.3728)
Skills â¢ Python â¢ Tableau â¢ Data Visualization â¢ R Studio â¢ Machine Learning â¢ Statistics IABAC Certified Data Scientist with versatile experience over 1+ years in managing business, data science consulting and leading innovation projects, bringing business ideas to working real world so ...

Candidate 3 (Score: 0.3728)
Skills â¢ Python â¢ Tableau â¢ Data Visualization â¢ R Studio â¢ Machine Learning â¢ Statistics IABAC Certified Data Scientist with versatile experience over 1+ years in managing business, data science consulting and leading innovation projects, bringing business ideas to working real world so ..

In [35]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

# Suppose df["Resume"] has your resumes
vectorizer = TfidfVectorizer(stop_words="english")

# Fit on resumes + job description
all_texts = df["Resume"].tolist() + [job_desc]
tfidf_matrix = vectorizer.fit_transform(all_texts)

# Compare job description with each resume
cosine_similarities = cosine_similarity(
    tfidf_matrix[-1],  # last one is job desc
    tfidf_matrix[:-1]  # all resumes
).flatten()


In [36]:
results = pd.DataFrame({
    "Resume": df["Resume"],
    "Category": df["Category"],
    "Match_Score": cosine_similarities
})

# Sort in descending order
results = results.sort_values(by="Match_Score", ascending=False)

print(results.head())


                                               Resume      Category  \
17  Education Details \r\n B.Tech   Rayat and Bahr...  Data Science   
7   Education Details \r\n B.Tech   Rayat and Bahr...  Data Science   
27  Education Details \r\n B.Tech   Rayat and Bahr...  Data Science   
37  Education Details \r\n B.Tech   Rayat and Bahr...  Data Science   
28  Personal Skills â¢ Ability to quickly grasp t...  Data Science   

    Match_Score  
17     0.292761  
7      0.292761  
27     0.292761  
37     0.292761  
28     0.226729  


In [37]:
def rank_candidates(job_desc, resumes, categories):
    vectorizer = TfidfVectorizer(stop_words="english")
    all_texts = resumes.tolist() + [job_desc]
    tfidf_matrix = vectorizer.fit_transform(all_texts)

    cosine_similarities = cosine_similarity(
        tfidf_matrix[-1], tfidf_matrix[:-1]
    ).flatten()

    results = pd.DataFrame({
        "Resume": resumes,
        "Category": categories,
        "Match_Score": cosine_similarities
    })

    results = results.sort_values(by="Match_Score", ascending=False).reset_index(drop=True)
    return results


In [38]:
job_desc = "Looking for a Python developer with ML experience"
results = rank_candidates(job_desc, df["Resume"], df["Category"])
print(results.head())


                                              Resume          Category  \
0  TECHNICAL PROFICIENCIES Platform: Ubuntu/Fedor...  Python Developer   
1  TECHNICAL PROFICIENCIES Platform: Ubuntu/Fedor...  Python Developer   
2  TECHNICAL PROFICIENCIES Platform: Ubuntu/Fedor...  Python Developer   
3  TECHNICAL PROFICIENCIES Platform: Ubuntu/Fedor...  Python Developer   
4  TECHNICAL PROFICIENCIES Platform: Ubuntu/Fedor...  Python Developer   

   Match_Score  
0     0.158097  
1     0.158097  
2     0.158097  
3     0.158097  
4     0.158097  


In [39]:
import numpy
print(numpy.__version__)
print(numpy.arange(5))


2.3.2
[0 1 2 3 4]
