In [73]:
## STEP 1
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import re
import random
import warnings
warnings.filterwarnings('ignore')

random.seed(42)
np.random.seed(42)


from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

## STEP 2  - STOPWORD ko define karege

In [None]:


import nltk
nltk.download('stopwords')

from nltk.corpus import stopwords


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [11]:
stop_words = stopwords.words('english')

len(stop_words)

198

In [12]:
list(stop_words)[:10]

['a', 'about', 'above', 'after', 'again', 'against', 'ain', 'all', 'am', 'an']

## STEP 3 - Data Template - INput Data (Random data)

In [14]:
# Job Description ke parts
job_description_parts = {
    "role": [
        "Senior Data Scientist",
        "Machine Learning Engineer",
        "Data Analyst",
        "NLP Engineer",
        "ML Operations Engineer"
    ],

    "responsibilities": [
        [
            "develop machine learning models for predictive analytics",
            "analyze large datasets to drive business insights",
            "implement data pipelines using Python and SQL",
            "collaborate with cross-functional teams on data initiatives"
        ],
        [
            "design and build deep learning systems for production",
            "optimize neural network architectures for inference",
            "conduct research on state-of-the-art NLP techniques",
            "deploy models using Docker and Kubernetes"
        ],
        [
            "perform exploratory data analysis on business metrics",
            "create dashboards for stakeholder reporting",
            "develop statistical models for customer behavior prediction",
            "maintain data quality and governance standards"
        ],
        [
            "build NLP pipelines for text classification and entity recognition",
            "fine-tune transformer models for specific use cases",
            "implement named entity recognition systems",
            "develop chatbots using sequence-to-sequence models"
        ]
    ],

    "requirements": [
        [
            "5+ years experience in data science and machine learning",
            "proficiency in Python, R, and SQL",
            "strong understanding of statistical concepts",
            "experience with scikit-learn, TensorFlow, or PyTorch"
        ],
        [
            "masters degree in computer science or related field",
            "3+ years building production ML systems",
            "expertise in deep learning frameworks",
            "familiarity with cloud platforms like AWS or GCP"
        ],
        [
            "proficiency in Python and SQL",
            "knowledge of Tableau or PowerBI for visualization",
            "understanding of business metrics and KPIs",
            "experience with data warehousing tools"
        ]
    ]
}

# Resume ke parts
resume_parts = {
    "experience": [
        [
            "developed machine learning models using TensorFlow and PyTorch",
            "optimized data pipelines reducing processing time by 60%",
            "deployed Python Flask applications on AWS EC2 instances",
            "implemented scikit-learn models achieving 95% accuracy"
        ],
        [
            "built neural networks for image classification tasks",
            "performed feature engineering on large-scale datasets",
            "collaborated with product teams on model implementation",
            "reduced model inference latency from 500ms to 50ms"
        ],
        [
            "analyzed customer behavior using statistical methods",
            "created dashboards in Tableau for executive reporting",
            "performed SQL queries on petabyte-scale databases",
            "improved data quality by implementing validation frameworks"
        ],
        [
            "implemented NLP models for text classification",
            "built transformers for named entity recognition",
            "developed chatbots using sequence models",
            "fine-tuned BERT for domain-specific applications"
        ]
    ],

    "skills": [
        ["Python", "Machine Learning", "TensorFlow", "SQL", "Statistics"],
        ["Deep Learning", "PyTorch", "NLP", "Transformers", "BERT"],
        ["Data Analysis", "Python", "SQL", "Tableau", "Excel"],
        ["NLP", "Python", "Hugging Face", "Transformers", "Information Extraction"]
    ],

    "education": [
        "B.Tech Computer Science from IIT",
        "M.S. Data Science from top university",
        "B.E. Electronics and Communication",
        "M.Tech Machine Learning from premiere institute"
    ]
}

In [16]:
print(f"   - Job role types: {len(job_description_parts['role'])}")
print(f"   - Resume types: {len(resume_parts['experience'])}")

   - Job role types: 5
   - Resume types: 4


#### Gemerate the sample data

In [43]:
def generate_job_desc(n_jobs=3):
  

    jobs = []

    for i in range(n_jobs):
        # Cycle through roles (first 3 get different roles)
        role_idx = i % len(job_description_parts['role'])
        role = job_description_parts['role'][role_idx]

        # Pick corresponding responsibilities and requirements
        responsibilities = job_description_parts['responsibilities'][role_idx]
        requirements = job_description_parts['requirements'][role_idx]

        # Create full JD text
        description = f"""
        JOB DESCRIPTION: {role}

        Responsibilities:
        {'. '.join(responsibilities)}.

        Requirements:
        {'. '.join(requirements)}.

        We are looking for talented professionals to join our team.
        """

        jobs.append({
            'job_id': f'JD_{i+1}',
            'title': role,
            'description': description
        })

    return jobs


In [29]:
def generate_resumes(n_resumes=10):
   

    resumes = []

    for i in range(n_resumes):
        # Cycle through experience types
        exp_idx = i % len(resume_parts['experience'])
        experience = resume_parts['experience'][exp_idx]
        skills = resume_parts['skills'][exp_idx]
        education = random.choice(resume_parts['education'])
        years_exp = random.randint(1, 10)

        candidate_name = f"Candidate_{i+1}"

        resume_text = f"""
        RESUME: {candidate_name}

        Education:
        {education}

        Years of Experience: {years_exp} years

        Professional Experience:
        {'. '.join(experience)}.

        Technical Skills:
        {', '.join(skills)}

        Contact: {candidate_name.lower()}@email.com | +91-9XXXXXXXXX
        """

        resumes.append({
            'resume_id': f'RES_{i+1}',
            'candidate_name': candidate_name,
            'years_experience': years_exp,
            'resume_text': resume_text
        })

    return resumes



In [44]:
jobs = generate_job_desc(3)
resumes = generate_resumes(12)

In [45]:
len(jobs), len(resumes)

(3, 12)

## Step 4 - Text Preprocessing

1. Lowercase
2. Remove Special Character
3. Remove extra spaces
4. Tokenization
5. Remove stopwords
6. Filter Short Tokens

In [52]:
def clean_text(text):
    text = text.lower()
    
    text = re.sub(r'[^a-z0-9\s]', ' ', text)
    text = re.sub(r'\s+', ' ', text).strip()
    
    tokens = text.split()
    
    tokens = [
        token for token in tokens if token not in stop_words and len(token)>1
    ]
    
    cleanedText = ' '.join(tokens)
    
    return cleanedText

In [56]:
clean_text('I am a good programmer, and i love to teach and code ;;jfdlkj2lkj     dkkjad;flkj  ')

'good programmer love teach code jfdlkj2lkj dkkjad flkj'

In [65]:
print("CLEANING TEXT.....")
print("-"*80)

cleaned_jobs = []
for job in jobs:
    cleaned_desc = clean_text(job['description'])
    cleaned_jobs.append({
        'job_id': job['job_id'],
        'title': job['title'],
        'original_text': job['description'],
        'cleaned_text': cleaned_desc
    })

print("BEFORE CLEANING")
print(jobs[0]['description'][:100])
print("-"*80)

print("After CLEANING")
print(cleaned_jobs[0]['cleaned_text'][:100])
print("-"*80)

CLEANING TEXT.....
--------------------------------------------------------------------------------
BEFORE CLEANING

        JOB DESCRIPTION: Senior Data Scientist

        Responsibilities:
        develop machine l
--------------------------------------------------------------------------------
After CLEANING
job description senior data scientist responsibilities develop machine learning models predictive an
--------------------------------------------------------------------------------


In [67]:
cleaned_resumes = []
for resume in resumes:
    cleaned_res = clean_text(resume['resume_text'])
    cleaned_resumes.append({
        'resume_id': resume['resume_id'],
        'candidate_name': resume['candidate_name'],
        'years_experience': resume['years_experience'],
        'original_text': resume['resume_text'],
        'cleaned_text': cleaned_res
    })
    
    

print("BEFORE CLEANING")
print(resumes[0]['resume_text'][:100])
print("-"*80)

print("After CLEANING")
print(cleaned_resumes[0]['cleaned_text'][:100])
print("-"*80)

BEFORE CLEANING

        RESUME: Candidate_1

        Education:
        M.Tech Machine Learning from premiere insti
--------------------------------------------------------------------------------
After CLEANING
resume candidate education tech machine learning premiere institute years experience 10 years profes
--------------------------------------------------------------------------------


## Vectorizationj - TFIDF

In [72]:
all_cleaned_text = []
text_sources = []


for job in cleaned_jobs:
    all_cleaned_text.append(job['cleaned_text'])
    text_sources.append({'type': 'job', 'id': job['job_id'], 'title': job['title']})
    

for resume in cleaned_resumes:
    all_cleaned_text.append(resume['cleaned_text'])
    text_sources.append({'type': 'resume', 'id': resume['resume_id'], 'name':resume['candidate_name']})
    
print(f"‚úÖ Total texts to vectorize: {len(all_cleaned_text)}")
print(f"   - Jobs: {len(cleaned_jobs)}")
print(f"   - Resumes: {len(cleaned_resumes)}\n")
    

‚úÖ Total texts to vectorize: 15
   - Jobs: 3
   - Resumes: 12



In [74]:
# TFIDF
vectorizer = TfidfVectorizer(
    max_features = 5000,
    ngram_range= (1,2),
    min_df = 2,
    max_df = 0.8, 
    lowercase=True,
    stop_words = 'english'
)

In [75]:
tfidf_matrix = vectorizer.fit_transform(all_cleaned_text)

In [76]:

print(f"‚úÖ Vectorization complete!\n")
print(f"Matrix Details:")
print(f"   Shape: {tfidf_matrix.shape}")
print(f"   ‚Üí {tfidf_matrix.shape[0]} documents (rows)")
print(f"   ‚Üí {tfidf_matrix.shape[1]} features (columns)")
print(f"   ‚Üí Sparsity: {(1 - tfidf_matrix.nnz / (tfidf_matrix.shape[0] * tfidf_matrix.shape[1])) * 100:.2f}%")
print(f"      (High sparsity = memory efficient! Mostly zeros)\n")


‚úÖ Vectorization complete!

Matrix Details:
   Shape: (15, 284)
   ‚Üí 15 documents (rows)
   ‚Üí 284 features (columns)
   ‚Üí Sparsity: 72.54%
      (High sparsity = memory efficient! Mostly zeros)



In [77]:
# Show vocabulary sample
feature_names = vectorizer.get_feature_names_out()
print(f"üìö Vocabulary size: {len(feature_names)}")
print(f"   Sample terms: {list(feature_names[:15])}\n")

# Separate job and resume vectors
n_jobs = len(cleaned_jobs)
n_resumes = len(cleaned_resumes)

job_vectors = tfidf_matrix[:n_jobs]
resume_vectors = tfidf_matrix[n_jobs:]

print(f"‚úÖ Vectors separated:")
print(f"   Job vectors: {job_vectors.shape}")
print(f"   Resume vectors: {resume_vectors.shape}")

üìö Vocabulary size: 284
   Sample terms: ['10', '500ms', '500ms 50ms', '50ms', '50ms technical', '60', '60 deployed', '91', '91 9xxxxxxxxx', '95', '95 accuracy', '9xxxxxxxxx', 'accuracy', 'accuracy technical', 'achieving']

‚úÖ Vectors separated:
   Job vectors: (3, 284)
   Resume vectors: (12, 284)


## Cosine Similarity

In [90]:
all_results = []

for job_id, job in enumerate(cleaned_jobs):
    print(f"\nüìã JOB: {job['title']} (ID: {job['job_id']})")
    print("-" * 70)
    # print(job_vectors[job_id])
    job_vector = job_vectors[job_id]
    
    similarities = cosine_similarity(job_vector, resume_vectors)
    # print(similarities)
    
    #flat to 1d array
    flat_similarities = similarities.flatten()
    
    
    #store the result
    
    for resume_idx, resume in enumerate(cleaned_resumes):
        score = flat_similarities[resume_idx]
        
        all_results.append({
            'job_id': job['job_id'],
            'job_title': job['title'],
            'resume_id': resume['resume_id'],
            'candidate_name': resume['candidate_name'],
            'years_experience': resume['years_experience'],
            'similarity_score': score
        })
        
    # Print top 3 matches for this job
    top_k = 3
    top_indices = np.argsort(flat_similarities)[::-1][:top_k]

    print(f"\nüèÜ Top {top_k} Matching Candidates:")
    print()

    for rank, resume_idx in enumerate(top_indices, 1):
        score = flat_similarities[resume_idx]
        candidate = cleaned_resumes[resume_idx]
        score_percent = score * 100

        # Visual bar
        bar_length = int(score_percent / 5)
        bar = "‚ñà" * bar_length + "‚ñë" * (20 - bar_length)

        print(f"{rank}. {candidate['candidate_name']} ({candidate['years_experience']} yrs)")
        print(f"   Score: {score:.4f} ({score_percent:.1f}%)")
        print(f"   {bar}")
        print()
        
        
        
    


üìã JOB: Senior Data Scientist (ID: JD_1)
----------------------------------------------------------------------

üèÜ Top 3 Matching Candidates:

1. Candidate_1 (10 yrs)
   Score: 0.3039 (30.4%)
   ‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñë‚ñë‚ñë‚ñë‚ñë‚ñë‚ñë‚ñë‚ñë‚ñë‚ñë‚ñë‚ñë‚ñë

2. Candidate_9 (9 yrs)
   Score: 0.2801 (28.0%)
   ‚ñà‚ñà‚ñà‚ñà‚ñà‚ñë‚ñë‚ñë‚ñë‚ñë‚ñë‚ñë‚ñë‚ñë‚ñë‚ñë‚ñë‚ñë‚ñë‚ñë

3. Candidate_5 (6 yrs)
   Score: 0.2801 (28.0%)
   ‚ñà‚ñà‚ñà‚ñà‚ñà‚ñë‚ñë‚ñë‚ñë‚ñë‚ñë‚ñë‚ñë‚ñë‚ñë‚ñë‚ñë‚ñë‚ñë‚ñë


üìã JOB: Machine Learning Engineer (ID: JD_2)
----------------------------------------------------------------------

üèÜ Top 3 Matching Candidates:

1. Candidate_6 (5 yrs)
   Score: 0.1810 (18.1%)
   ‚ñà‚ñà‚ñà‚ñë‚ñë‚ñë‚ñë‚ñë‚ñë‚ñë‚ñë‚ñë‚ñë‚ñë‚ñë‚ñë‚ñë‚ñë‚ñë‚ñë

2. Candidate_2 (9 yrs)
   Score: 0.1743 (17.4%)
   ‚ñà‚ñà‚ñà‚ñë‚ñë‚ñë‚ñë‚ñë‚ñë‚ñë‚ñë‚ñë‚ñë‚ñë‚ñë‚ñë‚ñë‚ñë‚ñë‚ñë

3. Candidate_1 (10 yrs)
   Score: 0.1612 (16.1%)
   ‚ñà‚ñà‚ñà‚ñë‚ñë‚ñë‚ñë‚ñë‚ñë‚ñë‚ñë‚ñë‚ñë‚ñë‚ñë‚ñë‚ñë‚ñë‚ñë‚ñë


üìã JOB: Data An

In [91]:

results_df = pd.DataFrame(all_results)

print("üìä COMPLETE MATCHING RESULTS")
print("="*100)
print()

# Sort by job and score
results_df_sorted = results_df.sort_values(
    by=['job_id', 'similarity_score'],
    ascending=[True, False]
)

# Display
print(results_df_sorted.to_string(index=False))

print("\n" + "="*100)

# Key statistics
print("\nüìà KEY STATISTICS:")
print("-" * 100)

print("\n1. Average match scores by job:")
for job_id in results_df['job_id'].unique():
    job_data = results_df[results_df['job_id'] == job_id]
    job_title = job_data['job_title'].iloc[0]
    avg_score = job_data['similarity_score'].mean()
    max_score = job_data['similarity_score'].max()

    print(f"\n   {job_title} ({job_id}):")
    print(f"      Average: {avg_score:.4f}")
    print(f"      Best: {max_score:.4f}")

print("\n2. Top candidates (highest average across all jobs):")
candidate_avg = results_df.groupby('candidate_name')['similarity_score'].mean().sort_values(ascending=False)
print()
for candidate, score in candidate_avg.head(5).items():
    print(f"   {candidate}: {score:.4f}")


üìä COMPLETE MATCHING RESULTS

job_id                 job_title resume_id candidate_name  years_experience  similarity_score
  JD_1     Senior Data Scientist     RES_1    Candidate_1                10          0.303868
  JD_1     Senior Data Scientist     RES_5    Candidate_5                 6          0.280105
  JD_1     Senior Data Scientist     RES_9    Candidate_9                 9          0.280105
  JD_1     Senior Data Scientist     RES_7    Candidate_7                 3          0.165742
  JD_1     Senior Data Scientist    RES_11   Candidate_11                 5          0.134090
  JD_1     Senior Data Scientist     RES_3    Candidate_3                 9          0.126774
  JD_1     Senior Data Scientist    RES_10   Candidate_10                 9          0.106250
  JD_1     Senior Data Scientist     RES_2    Candidate_2                 9          0.101572
  JD_1     Senior Data Scientist    RES_12   Candidate_12                 3          0.077467
  JD_1     Senior Data Scien