<a href="https://colab.research.google.com/github/paoins/Job-Recommendation/blob/main/Recommendation_system.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import kagglehub
# Fetch the LinkedIn jobs dataset through kagglehub; `path` is reused by the
# next cell as the source directory for the copy into the working folder.
path = kagglehub.dataset_download("asaniczka/1-3m-linkedin-jobs-and-skills-2024")
print(path)

In [None]:

import shutil
import os

# Copy the downloaded dataset into a fixed working directory so later cells
# can read it from a stable location.
dataset_src = path
dataset_working = '/content/dataset'

# Remove if exists
# (copytree is given a fresh destination; ignore_errors makes the first run a no-op)
shutil.rmtree(dataset_working, ignore_errors=True)

shutil.copytree(dataset_src, dataset_working)
print("Dataset copied to /content/dataset")


# Project Overview

## What Are We Building?
- A job recommendation system that suggests relevant data science jobs to candidates based on:

  - Their skills
  - Experience level
  - Location preferences
  - Similar candidates' behavior (collaborative filtering)


## Why This Approach?
- **Real-world problem:** Job boards show thousands of irrelevant jobs. We want to rank jobs by relevance.

- **Our solution:** Three recommendation approaches:

    1. **Content-Based:** Match based on skills/experience (like a smart filter)
    2. **Collaborative Filtering:** Learn from patterns (like "people who liked X also liked Y")
    3. **Hybrid:** Combine both for best results


# 1. Data Loading & Preprocessing
Loads 1.3 million LinkedIn jobs, filters for data science roles, cleans the data, and prepares it for modeling.


In [None]:
import pandas as pd
import numpy as np
from pathlib import Path
import matplotlib.pyplot as plt
import seaborn as sns
import re
import ast
import random
from scipy.sparse import csr_matrix


- We load only specific columns from the CSV. Why: the dataset has 30+ columns; we only need 8, which saves memory.

**Columns explained:**

- **job_link**: Unique identifier for each job
- **job_title**: "Senior Data Scientist", "ML Engineer", etc.
- **company**: "Google", "Meta", etc.
- **job_location**: "San Francisco, CA" or "Remote"
- **got_summary**: Boolean: does this job have a description?
- **got_ner**: Boolean: was Named Entity Recognition applied?


In [None]:
# Load Job Posting
# Only the columns listed here are read from the postings CSV.
usecols = [
            'job_link', 'job_title', 'company', 'job_location',
            'first_seen', 'search_city', 'got_summary', 'got_ner'
        ]

# Reads from the working copy of the dataset created in the copy cell.
job_postings = pd.read_csv(
            "/content/dataset/linkedin_job_postings.csv",
            usecols=usecols
        )

In [None]:
# Report how many postings were read (`:,` adds thousands separators).
print(f" Loaded {len(job_postings):,} job postings")


- Load job summaries (full descriptions) and job skills (required skills like "Python", "SQL")


In [None]:
# Loading Job Summaries
# All columns are read; later cells join on `job_link` and use `job_summary`.
job_summaries = pd.read_csv(
            "/content/dataset/job_summary.csv"
        )
print(f"Loaded {len(job_summaries):,} job summaries")



In [None]:
# Loading skills
# All columns are read; later cells join on `job_link` and parse `job_skills`.
job_skills = pd.read_csv(
            "/content/dataset/job_skills.csv"
        )
print(f'Loaded {len(job_skills):,} job skills')

In [None]:
# Data Check
# Counts missing titles/locations and repeated job links in the postings table.
print(f"\nData Quality Check:")
print(f"  - Missing job titles: {job_postings['job_title'].isna().sum()}")
print(f"  - Missing locations: {job_postings['job_location'].isna().sum()}")
# duplicated() flags every occurrence after the first, so this is the number of extra rows
print(f"  - Duplicate job links: {job_postings['job_link'].duplicated().sum()}")


- Combines all 3 datasets using `job_link` as the key

In [None]:
# Merge the 3 datasets
# Both joins are left joins keyed on `job_link`, so every posting is kept even
# when it has no summary or no skills (those columns are then NaN).
# NOTE(review): if a right-hand table repeats a job_link, the left join
# multiplies rows — compare the printed counts with the postings count.

# Merge postings with summaries
merged = job_postings.merge(job_summaries, on='job_link', how='left')
print(f"After merging summaries: {len(merged):,} rows")

# Merge with skills
df = merged.merge(job_skills, on='job_link', how='left')
print(f"After merging skills: {len(df):,} rows")


- `parse_skills`: Convert skills to list
- `extract_experience_level`: Extracts experience level from job title

In [None]:
# Turn a comma-separated skills string into a list of lowercase skill names
def parse_skills(skills):
    """Return the skills in ``skills`` as a list of trimmed, lowercase strings.

    A missing value (anything ``pd.isna`` reports as NA) gives an empty list,
    and fragments that are empty after trimming are dropped.
    """
    if pd.isna(skills):
        return []

    parsed = []
    for fragment in skills.split(','):
        name = fragment.strip()
        if name:
            parsed.append(name.lower())
    return parsed

# Infers seniority from Title
def extract_experience_level(title):
    """Infer a coarse experience level from a job title.

    Keywords are matched as whole words (an optional plural "s" is allowed),
    so "International Analyst" is not mistaken for an internship and
    "Leading" is not mistaken for "Lead".

    Args:
        title: Job title; any value is accepted and converted with ``str``.

    Returns:
        One of 'Senior', 'Junior', 'Manager' or 'Mid'. Groups are checked in
        that order and the first hit wins; 'Mid' is the fallback when no
        keyword is present.
    """
    title = str(title).lower()

    # Order matters: e.g. "Senior Manager" is reported as 'Senior'
    keyword_groups = (
        ('Senior', ('senior', 'sr', 'lead', 'principal')),
        ('Junior', ('junior', 'jr', 'entry', 'graduate', 'intern', 'internship')),
        ('Manager', ('manager', 'head', 'director')),
    )
    for level, keywords in keyword_groups:
        pattern = r'\b(?:' + '|'.join(keywords) + r')s?\b'
        if re.search(pattern, title):
            return level
    return 'Mid'  # If no keyword assume mid

- Removes jobs without skills or summaries
- Creates new columns
  - `skills_list`: Parsed skills (list format)
  - `experience_level`: Extracted from title
  - `job_id`: Unique ID like 'JOB_000000', 'JOB_000001' (numbering starts at zero)


In [None]:
# Track the loss
initial_rows = len(df)

# Remove the job without skills
df = df[df['job_skills'].notna()].copy()
print(f"Removed {initial_rows - len(df):,} jobs without skills")

# Remove jobs without summaries
df = df[df['job_summary'].notna()].copy()
print(f"Kept {len(df):,} jobs with both skills and summaries")

# Handle missing locations safely
# (a missing location is labelled 'Remote', so it later counts as a remote job)
df['job_location'] = df['job_location'].fillna('Remote')

# Parse skills into a usable format
df['skills_list'] = df['job_skills'].apply(parse_skills)

# Infer experience level from title
df['experience_level'] = df['job_title'].apply(extract_experience_level)

# Generate stable job IDs
# IDs are positional (JOB_000000, JOB_000001, ...) in the current row order,
# so they only stay the same while the filtered input and its order do.
df['job_id'] = ['JOB_' + str(i).zfill(6) for i in range(len(df))]

- Filters for only data science jobs

In [None]:
# Filter for data science related jobs

# Keywords for data science jobs
ds_keywords = [
            'data scientist', 'data science', 'machine learning', 'ml engineer',
            'data analyst', 'data engineer', 'ai engineer', 'analytics',
            'business intelligence', 'deep learning', 'nlp engineer',
            'research scientist', 'applied scientist'
        ]

# Filter based on job title
# The keywords are OR-ed into one regex alternation and matched against the
# lowercased title; titles that are NaN are treated as non-matches (na=False).
mask = df['job_title'].str.lower().str.contains('|'.join(ds_keywords), na=False)
filtered_df = df[mask].copy()

print(f" Found {len(filtered_df):,} data science related jobs")



In [None]:
from pathlib import Path

# 1. Ensure the directory exists
output_dir = Path("data/processed")
output_dir.mkdir(parents=True, exist_ok=True)

# 2. Save the dataset to CSV
# NOTE: list-valued columns such as `skills_list` are written as their string
# representation and need parsing again if this file is reloaded.
output_path = output_dir / "data_science_jobs.csv"
filtered_df.to_csv(output_path, index=False)

print(f" Saved {len(filtered_df):,} Data Science jobs to {output_path}")


In [None]:
# Gather every skill mention from every filtered job into one flat list
all_skills = [
    skill
    for job_skill_list in filtered_df['skills_list']
    for skill in job_skill_list
]

# Frequency table, most common skill first
skill_counts = pd.Series(all_skills).value_counts()

print(f"Found {len(skill_counts):,} unique skills")
print("\nTop 20 most common skills:")
print(skill_counts.head(20))

# 2. Create Candidate
- Creates 1,000 realistic fake candidate profiles with skills, experience, education, and salary expectations.

- **Problem**: We don't have real candidate data (privacy concerns)

* **Solution**: Simulate realistic candidates so we can:

    - Test the recommendation system
    - Train the collaborative filtering model
    - Show the app working with sample data


1. We loop through every job's skill list
2. Add all skills to one big list (with duplicates)
3. `set()` removes duplicates to keep unique skills
4. Count how often each skill appears

In [None]:
# Extract Unique skills from job data
def extract_skill_pool(job_skills_df):
    """Collect the unique skills and their frequencies from the job data.

    Args:
        job_skills_df: DataFrame with a ``skills_list`` column whose cells are
            lists of skill names.

    Returns:
        Tuple ``(skill_pool, skill_frequencies)``: ``skill_pool`` is the list
        of unique skills in alphabetical order, and ``skill_frequencies`` is a
        Series of occurrence counts, most frequent first.
    """
    all_skills = [
        skill
        for skills_list in job_skills_df['skills_list']
        for skill in skills_list
    ]

    # set() removes duplicates; sorting gives the pool a deterministic order,
    # otherwise it depends on string hashing and seeded sampling from the pool
    # would not be reproducible between sessions.
    skill_pool = sorted(set(all_skills))
    # dtype=object keeps the empty case well-defined
    skill_frequencies = pd.Series(all_skills, dtype=object).value_counts()

    print(f" Extracted {len(skill_pool)} unique skills from job data")
    return skill_pool, skill_frequencies


In [None]:
def create_single_candidate(idx, skill_pool, skill_frequencies):
    """Build one synthetic candidate profile.

    Args:
        idx: Zero-based candidate number; the ID is ``CAND_`` followed by
            ``idx + 1`` zero-padded to six digits.
        skill_pool: Collection of skill names candidates may be given.
        skill_frequencies: Kept for interface compatibility; it is not used
            when building the profile.

    Returns:
        dict with the keys ``candidate_id``, ``experience_years``,
        ``experience_level``, ``domain``, ``skills_list``, ``skills``,
        ``education``, ``desired_salary_min``, ``desired_salary_max``,
        ``preferred_locations``, ``open_to_remote`` and
        ``willing_to_relocate``.

    Randomness comes from both ``random`` and ``np.random``; seed both for
    reproducible output.
    """
    # Years of experience 0-15; the weights favour early-career candidates
    weights = np.array([0.10, 0.10, 0.10, 0.10, 0.10, 0.10,
                        0.07, 0.07, 0.07, 0.07, 0.07,
                        0.03, 0.03, 0.03, 0.03, 0.03])
    experience_years = np.random.choice(range(16), p=weights / weights.sum())

    # (max years inclusive, level, skill-count bounds: low inclusive, high exclusive)
    level_bands = (
        (0, 'Internship', (3, 8)),
        (2, 'Entry Level', (5, 12)),
        (5, 'Mid Level', (8, 18)),
        (10, 'Senior', (12, 25)),
        (15, 'Leadership', (15, 30)),
    )
    exp_level, (low, high) = next(
        (level, bounds)
        for max_years, level, bounds in level_bands
        if experience_years <= max_years
    )
    n_skills = np.random.randint(low, high)

    # Gives each candidate a "specialization" and not just random skills
    SKILL_CLUSTERS = {
        "ML": ["machine learning", "deep learning", "pytorch", "tensorflow", "nlp", "computer vision"],
        "Data": ["python", "sql", "pandas", "numpy", "data analysis", "statistics"],
        "DE": ["spark", "hadoop", "airflow", "aws", "gcp", "etl", "data engineering"],
        "BI": ["tableau", "power bi", "excel", "data visualization", "business intelligence"],
        "Backend": ["java", "scala", "apis", "microservices"]
    }

    cluster = random.choice(list(SKILL_CLUSTERS.keys()))

    # Build the set once: membership tests against a list are linear in the pool size
    pool_set = set(skill_pool)
    cluster_skills = [s for s in SKILL_CLUSTERS[cluster] if s in pool_set]
    other_skills = list(pool_set - set(cluster_skills))

    # Picks skills for the candidate (60% from their specialization cluster + 40% from other areas);
    # each share is capped by how many skills are actually available.
    n_cluster = max(1, int(n_skills * 0.6))
    n_other = n_skills - n_cluster
    selected_skills = (
        random.sample(cluster_skills, min(len(cluster_skills), n_cluster)) +
        random.sample(other_skills, min(len(other_skills), n_other))
    )

    selected_skills = list(set(selected_skills))[:n_skills]

    # Assigns education based on experience level
    EDU_BY_LEVEL = {
        'Internship': ["Bachelor's in Computer Science", "Bachelor's in Data Science"],
        'Entry Level': ["Bachelor's in Computer Science", "Bachelor's in Data Science"],
        'Mid Level': ["Master's in Computer Science", "Bachelor's in Engineering"],
        'Senior': ["Master's in Computer Science", "Master's in Applied Mathematics"],
        'Leadership': ["Master's in Computer Science", "PhD in Machine Learning"]
    }

    # Salary expectation bounds per level
    salary_ranges = {
        'Internship': (40000, 80000),
        'Entry Level': (60000, 100000),
        'Mid Level': (90000, 150000),
        'Senior': (130000, 200000),
        'Leadership': (160000, 280000)
    }
    min_sal, max_sal = salary_ranges[exp_level]

    locations = [
        'Remote', 'New York, NY', 'San Francisco, CA', 'Seattle, WA',
        'Austin, TX', 'Boston, MA', 'Chicago, IL'
    ]

    # Randomly picks 1-3 preferred cities
    preferred_locations = random.sample(locations, random.randint(1, 3))
    # Leave at least 20000 of headroom below the level's upper bound
    salary_min = np.random.randint(min_sal, max_sal - 20000)

    return {
        'candidate_id': f'CAND_{idx+1:06d}',
        'experience_years': int(experience_years),
        'experience_level': exp_level,
        'domain': cluster,
        'skills_list': selected_skills,
        'skills': ', '.join(selected_skills),
        'education': random.choice(EDU_BY_LEVEL[exp_level]),
        'desired_salary_min': int(salary_min),
        'desired_salary_max': int(salary_min + np.random.randint(20000, 40000)),
        'preferred_locations': ', '.join(preferred_locations),
        'open_to_remote': 'Remote' in preferred_locations,
        'willing_to_relocate': random.random() > 0.5
    }

In [None]:
def generate_candidates(skill_pool, skill_frequencies, n_candidates=1000, seed=None):
    """Generate ``n_candidates`` synthetic candidate profiles.

    Args:
        skill_pool: Skills candidates may be given; passed through to
            ``create_single_candidate``.
        skill_frequencies: Passed through to ``create_single_candidate``.
        n_candidates: Number of profiles to create.
        seed: Optional integer. When given, both ``random`` and ``np.random``
            are seeded with it so the generated profiles are repeatable for
            the same ``skill_pool``; ``None`` leaves the generators untouched.

    Returns:
        DataFrame with one row per candidate.
    """
    if seed is not None:
        # create_single_candidate draws from both generators
        random.seed(seed)
        np.random.seed(seed)

    print(f"\n Generating {n_candidates} candidate profiles...")
    candidates = []

    for i in range(n_candidates):
        candidates.append(
            create_single_candidate(i, skill_pool, skill_frequencies)
        )

        # Progress message every 200 profiles
        if (i + 1) % 200 == 0:
            print(f"   Generated {i+1}/{n_candidates} candidates...")

    df = pd.DataFrame(candidates)
    print(f" Generated {len(df)} candidates")
    return df


In [None]:
def save_candidates(df, filename="candidates.csv"):
    """Write ``df`` as CSV (without the index) to ``data/processed/<filename>``.

    The ``data/processed`` directory is created when it does not exist yet.
    """
    destination_dir = Path("data/processed")
    destination_dir.mkdir(parents=True, exist_ok=True)

    destination = destination_dir.joinpath(filename)
    df.to_csv(destination, index=False)
    print(f"\n Saved {len(df)} candidates to {destination}")


In [None]:
# Driver cell: build the skill pool from the filtered jobs, generate the
# synthetic candidates and persist them to data/processed/candidates.csv.
print("="*60)
print("   Candidate Profile Generation ")
print("="*60)

skill_pool, skill_frequencies = extract_skill_pool(filtered_df)
candidates_df = generate_candidates(skill_pool, skill_frequencies, n_candidates=1000)
save_candidates(candidates_df)


# 3. Feature Engineering
- Converts text and categorical data into numbers that machine learning algorithms can understand.

In [None]:
# Preview the first rows of the filtered data-science jobs
filtered_df.head()

In [None]:
# Normalizes a skill name and rejects entries that do not look like a skill
def clean_skill(skill):
    """Return the lowercased, trimmed skill, or ``None`` when it is rejected.

    A skill is rejected when it contains a digit, contains any of the
    characters ``$ + * ( )``, or is longer than three words.
    """
    normalized = skill.lower().strip()
    rejected = (
        any(ch.isdigit() for ch in normalized)
        or re.search(r'[\$\+\*\(\)]', normalized) is not None
        or len(normalized.split()) > 3
    )
    return None if rejected else normalized


In [None]:
# Keep only the skills that survive clean_skill (None and empty results are dropped)
filtered_df['skills_list'] = filtered_df['skills_list'].apply(
    lambda skills: [cleaned for cleaned in map(clean_skill, skills) if cleaned]
)

In [None]:
# Create feature matrix for jobs
from sklearn.feature_extraction.text import TfidfVectorizer
import gc
from sklearn.preprocessing import MultiLabelBinarizer

# Text vectorizer for the job summaries: up to 500 uni-/bi-gram features,
# English stop words removed, terms must occur in at least 2 documents.
tfidf_vectorizer = TfidfVectorizer(
    max_features=500,
    stop_words='english',
    ngram_range=(1, 2),
    min_df=2
)

# 1. Free memory before building the dense feature frames
gc.collect()

# 2. Identify the Top 1000 skills
all_skills_flat = [s for sublist in filtered_df['skills_list'] for s in sublist]
top_1000_skills = pd.Series(all_skills_flat).value_counts().head(1000).index.tolist()

print(f" Selected top 1000 skills out of {len(set(all_skills_flat))} total")

# 3. Create the encoder using ONLY those 1000 skills
# (this is the only encoder instance; the candidate cell reuses it via transform)
skill_encoder = MultiLabelBinarizer(classes=top_1000_skills)

print("Encoding skills...")
skill_matrix = skill_encoder.fit_transform(filtered_df['skills_list'])
skill_feature_names = top_1000_skills

# 4. Process Text, Experience, and Remote
print("Vectorizing job descriptions...")
summaries = filtered_df['job_summary'].fillna('')
tfidf_matrix = tfidf_vectorizer.fit_transform(summaries)

print("Encoding experience and location...")
exp_level_dummies = pd.get_dummies(filtered_df['experience_level'], prefix='exp')
# A job counts as remote when its location text contains "remote" (case-insensitive)
filtered_df['is_remote'] = filtered_df['job_location'].str.contains('remote', case=False, na=False).astype(int)

# 5. Combine everything
print("Combining features...")

skill_df = pd.DataFrame(
    skill_matrix,
    columns=[f'skill_{s}' for s in skill_feature_names]
)

# TF-IDF columns are named by position (tfidf_0, tfidf_1, ...), not by term
tfidf_df = pd.DataFrame(
    tfidf_matrix.toarray(),
    columns=[f'tfidf_{i}' for i in range(tfidf_matrix.shape[1])]
)

# Indexes are reset so the pieces line up row-by-row with the freshly built frames
job_feature_df = pd.concat([
    filtered_df[['job_id', 'is_remote']].reset_index(drop=True),
    exp_level_dummies.reset_index(drop=True),
    skill_df,
    tfidf_df
], axis=1)
job_feature_df.to_csv("data/processed/job_features.csv", index=False)

print(f" Created feature matrix: {job_feature_df.shape}")

In [None]:
# Create feature matrix for candidates using fitted encoders
candidates_df = pd.read_csv("data/processed/candidates.csv")
# The CSV stores each skills list as its Python literal text; parse it back into a list
candidates_df['skills_list'] = candidates_df['skills_list'].apply(ast.literal_eval)

print(f"Loaded {len(candidates_df)} candidates")

# Validation
print("\n  Data Validation:")
print(f"  Skills list type: {type(candidates_df['skills_list'].iloc[0])}")
print(f"  Sample skills: {candidates_df['skills_list'].iloc[0][:3]}")

assert isinstance(candidates_df['skills_list'].iloc[0], list), \
    "ERROR: skills_list should be a list, not a string!"


# 1. Skill-based features (using same encoder as jobs)
print("   Encoding candidate skills...")
skill_matrix = skill_encoder.transform(candidates_df['skills_list'])
skill_feature_names = skill_encoder.classes_

# 2. Experience level encoding
print("   Encoding experience levels...")
exp_level_dummies = pd.get_dummies(
    candidates_df['experience_level'],
    prefix='exp'
)

# Give the candidate experience columns a fixed schema and order, adding an
# all-zero column for any level that happens to be absent from this sample.
# NOTE: these are the candidate-side level names; the job feature matrix uses
# the labels produced from job titles, so the two sets of exp_* columns are
# not the same.
candidate_exp_columns = ['exp_Entry Level', 'exp_Internship', 'exp_Leadership',
                         'exp_Mid Level', 'exp_Senior']
extra_columns = [c for c in exp_level_dummies.columns if c not in candidate_exp_columns]
exp_level_dummies = exp_level_dummies.reindex(
    columns=candidate_exp_columns + extra_columns,
    fill_value=0
)

# 3. Remote preference
print("   Processing preferences...")
candidates_df['is_remote'] = candidates_df['open_to_remote'].astype(int)

# Combine features
skill_df = pd.DataFrame(
    skill_matrix,
    columns=[f'skill_{s}' for s in skill_feature_names]
)

candidate_feature_df = pd.concat([
    candidates_df[['candidate_id', 'is_remote']].reset_index(drop=True),
    exp_level_dummies.reset_index(drop=True),
    skill_df
], axis=1)

candidate_feature_df.to_csv("data/processed/candidate_features.csv", index=False)
print(f"Created candidate feature matrix: {candidate_feature_df.shape}")



In [None]:
print(f"\n Feature Summary:")
print(f"  Total features: {candidate_feature_df.shape[1]}")
print(f"  - Skill features: {skill_matrix.shape[1]}")
print(f"  - Experience features: {len(exp_level_dummies.columns)}")

# 4. Content-Based Recommender
- Recommends jobs by directly comparing candidate profile to job requirements

In [None]:
# Scores the overlap between a candidate's skills and a job's required skills
def calculate_skill_match_score(candidate_skills, job_skills):
    """Return a 0-1 skill match score.

    The score blends two ratios computed on the de-duplicated skill sets:
    coverage (shared skills / skills the job asks for, weight 0.6) and
    Jaccard similarity (shared skills / all distinct skills, weight 0.4).
    A job without any skills scores 0.0.
    """
    owned = set(candidate_skills)
    required = set(job_skills)
    if len(required) == 0:
        return 0.0

    shared = len(owned & required)

    # The job has at least one skill here, so neither denominator can be zero
    jaccard_score = shared / len(owned | required)
    coverage_score = shared / len(required)

    return (0.6 * coverage_score) + (0.4 * jaccard_score)


# Scores how well candidate's experience matches job
def calculate_experience_match(candidate_exp, job_exp):
    """Score how close a candidate's experience level is to a job's level.

    Both the candidate-side labels ('Internship', 'Entry Level', 'Mid Level',
    'Senior', 'Leadership') and the labels inferred from job titles
    ('Junior', 'Mid', 'Senior', 'Manager') are understood. Any other label is
    treated as mid level.

    Returns:
        1.0 for the same level, 0.7 for one level apart, 0.4 for two levels
        apart and 0.2 otherwise.
    """
    hierarchy = {
        'Internship': 0,
        'Entry Level': 1, 'Junior': 1,
        'Mid Level': 2, 'Mid': 2,
        'Senior': 3,
        'Leadership': 4, 'Manager': 4,
    }
    cand_level = hierarchy.get(candidate_exp, 2)
    job_level = hierarchy.get(job_exp, 2)
    diff = abs(cand_level - job_level)

    mapping = {0: 1.0, 1: 0.7, 2: 0.4}
    return mapping.get(diff, 0.2)

# Scores location compatibility
def calculate_location_match(candidate_locations, job_location, candidate_remote):
    """Score how well a job's location fits a candidate's preferences.

    Args:
        candidate_locations: Comma-separated preferred locations, e.g.
            ``"Seattle, WA, Remote"``. Missing values mean "no preference
            given".
        job_location: Location text of the job; missing or blank means
            unknown.
        candidate_remote: Truthy when the candidate is open to remote work.

    Returns:
        0.5 when the job location is unknown; 1.0 for a remote job and a
        remote-friendly candidate, or when a preferred location part appears
        in the job location (or vice versa) as whole words; otherwise 0.5 for
        remote-friendly candidates and 0.3 for everyone else.
    """
    if pd.isna(job_location):
        return 0.5
    job_loc = str(job_location).strip().lower()
    if not job_loc:
        return 0.5

    if 'remote' in job_loc and candidate_remote:
        return 1.0

    if isinstance(candidate_locations, str):
        raw_locations = candidate_locations
    elif pd.api.types.is_scalar(candidate_locations) and pd.isna(candidate_locations):
        raw_locations = ''
    else:
        raw_locations = str(candidate_locations)

    # Empty parts (blank input, trailing commas) are dropped: an empty string
    # is a substring of everything and would match every job.
    candidate_locs = [part.strip().lower() for part in raw_locations.split(',')]
    candidate_locs = [part for part in candidate_locs if part]

    def contains_whole(needle, haystack):
        # Whole-word containment, so a state code like "ca" does not match "chicago"
        return re.search(r'\b' + re.escape(needle) + r'\b', haystack) is not None

    for loc in candidate_locs:
        if contains_whole(loc, job_loc) or contains_whole(job_loc, loc):
            return 1.0

    return 0.5 if candidate_remote else 0.3

In [None]:
def calculate_salary_match(candidate_salary, job_salary=None):
    """Score whether a job's salary meets the candidate's expectation.

    Args:
        candidate_salary: Salary the candidate asks for.
        job_salary: Salary offered by the job, if known.

    Returns:
        0.7 (neutral) when either value is missing (``None`` or NaN), 1.0
        when the job pays at least what the candidate asks for, 0.4
        otherwise.
    """
    # A NaN salary would otherwise compare as "not enough" and a None
    # candidate salary would raise on the comparison below.
    if job_salary is None or pd.isna(job_salary):
        return 0.7
    if candidate_salary is None or pd.isna(candidate_salary):
        return 0.7
    return 1.0 if candidate_salary <= job_salary else 0.4

In [None]:
def get_recommendations(candidate_id, df_jobs, df_candidates, top_k=5):
    """Rank jobs for one candidate with the content-based score.

    The overall score is a weighted sum: 0.45 skills, 0.25 experience,
    0.15 location and 0.15 salary.

    Args:
        candidate_id: ID to look up in ``df_candidates['candidate_id']``.
        df_jobs: Jobs with ``job_id``, ``job_title``, ``company``,
            ``job_location``, ``experience_level`` and ``skills_list``; a
            ``salary`` column is used when present.
        df_candidates: Candidates with ``candidate_id``, ``skills_list``,
            ``experience_level``, ``preferred_locations`` and
            ``open_to_remote``; ``desired_salary_min`` is used when present.
        top_k: Number of rows to return.

    Returns:
        DataFrame with ``job_id``, ``job_title``, ``company``,
        ``overall_score``, ``matching_skills`` and ``missing_skills``, best
        match first. It is empty (with those columns) when ``df_jobs`` has
        no rows.

    Raises:
        ValueError: If the candidate is not in ``df_candidates``.
    """
    def as_skill_set(value):
        # Skill lists reloaded from CSV arrive as their string representation
        if isinstance(value, str):
            value = ast.literal_eval(value)
        return set(value)

    # Validate candidate exists
    candidate_match = df_candidates[df_candidates['candidate_id'] == candidate_id]
    if len(candidate_match) == 0:
        raise ValueError(f"Candidate {candidate_id} not found!")

    candidate = candidate_match.iloc[0]
    # Built once; it is the same for every job
    candidate_skills = as_skill_set(candidate['skills_list'])

    columns = ['job_id', 'job_title', 'company', 'overall_score',
               'matching_skills', 'missing_skills']
    results = []

    for _, job in df_jobs.iterrows():
        job_skills = as_skill_set(job['skills_list'])

        # Get individual scores
        s_score = calculate_skill_match_score(candidate_skills, job_skills)
        e_score = calculate_experience_match(candidate['experience_level'], job['experience_level'])
        l_score = calculate_location_match(candidate['preferred_locations'], job['job_location'], candidate['open_to_remote'])
        sal_score = calculate_salary_match(candidate.get('desired_salary_min'), job.get('salary'))

        # Weighted Total
        total = 0.45 * s_score + 0.25 * e_score + 0.15 * l_score + 0.15 * sal_score

        results.append({
            'job_id': job['job_id'],
            'job_title': job['job_title'],
            'company': job['company'],
            'overall_score': total,
            'matching_skills': list(candidate_skills & job_skills),
            'missing_skills': list(job_skills - candidate_skills)
        })

    # Explicit columns keep sort_values working when there are no jobs at all
    ranked = pd.DataFrame(results, columns=columns)
    return ranked.sort_values('overall_score', ascending=False).head(top_k)

In [None]:
# 1. Pick a candidate
# (the first row of the candidates table)
my_cand_id = candidates_df.iloc[0]['candidate_id']

# 2. Run the recommender
# top_k is left at its default of 5
top_jobs = get_recommendations(my_cand_id, filtered_df, candidates_df)

# 3. Look at the results
print(f"Top matches for {my_cand_id}:")
display(top_jobs)

In [None]:
# Pretty-print the content-based recommendations for the chosen candidate
top_k = len(top_jobs)
# Look the candidate up once instead of filtering the table for every field
profile = candidates_df[candidates_df['candidate_id'] == my_cand_id].iloc[0]

print(f"\n  Top {top_k} Job Recommendations for {my_cand_id}")
print(f"   Candidate Profile: {profile['experience_level']}")
print(f"   Domain: {profile['domain']}\n")

# Number the jobs by rank: after sorting, the DataFrame index is the job's
# original position, not its place in the ranking.
for rank, (_, row) in enumerate(top_jobs.iterrows(), 1):
    print(f"{rank}. {row['job_title']} at {row['company']}")
    print(f"   Match Score: {row['overall_score']:.2%}")
    print(f"    Matching Skills: {', '.join(row['matching_skills'][:5])}")
    print(f"    Missing Skills: {', '.join(row['missing_skills'][:3])}\n")


# 5. Collaborative Filtering
- Learns patterns from how candidates interact with jobs to make recommendations

### Simulating user interactions
- We will simulate which candidates applied to which jobs. To make it more realistic, candidates apply to jobs that they match well with.

In [None]:
def simulate_interactions(candidates_df, jobs_df):
    """Simulate job applications and ratings for a sample of candidates.

    Up to 500 candidates and 2000 jobs are sampled. Each sampled candidate
    applies to between 5 and 14 of the jobs whose skill match is at least
    0.25, chosen with probability proportional to
    ``0.7 * skill score + 0.3 * experience score``. The rating is
    ``1 + 4 * skill score`` plus Gaussian noise (std 0.3), clipped to [1, 5];
    it is a continuous value, not a whole number of stars.

    Args:
        candidates_df: Candidates with ``candidate_id``, ``skills_list`` and
            ``experience_level``.
        jobs_df: Jobs with ``job_id``, ``skills_list`` and
            ``experience_level``.
        In both frames ``skills_list`` may hold lists or their string
        representation (as read back from CSV).

    Returns:
        DataFrame with ``candidate_id``, ``job_id``, ``rating`` and
        ``applied`` (always 1). It has these columns even when no
        interaction was generated.
    """
    columns = ['candidate_id', 'job_id', 'rating', 'applied']

    # Nothing to simulate; also avoids indexing into an empty frame below
    if len(candidates_df) == 0 or len(jobs_df) == 0:
        return pd.DataFrame(columns=columns)

    # Parse skills if they're strings
    if isinstance(candidates_df['skills_list'].iloc[0], str):
        candidates_df = candidates_df.copy()
        candidates_df['skills_list'] = candidates_df['skills_list'].apply(ast.literal_eval)

    if isinstance(jobs_df['skills_list'].iloc[0], str):
        jobs_df = jobs_df.copy()
        jobs_df['skills_list'] = jobs_df['skills_list'].apply(ast.literal_eval)

    # Simulates candidates applying to jobs
    interactions = []

    sample_candidates = candidates_df.sample(min(500, len(candidates_df)))
    sample_jobs = jobs_df.sample(min(2000, len(jobs_df)))

    # Extract the job fields once; walking the frame with iterrows for every
    # candidate is by far the slowest part of the simulation.
    job_records = list(zip(
        sample_jobs['job_id'],
        sample_jobs['skills_list'],
        sample_jobs['experience_level']
    ))
    candidate_records = zip(
        sample_candidates['candidate_id'],
        sample_candidates['skills_list'],
        sample_candidates['experience_level']
    )

    for cand_id, cand_skills, cand_level in candidate_records:
        n_applications = np.random.randint(5, 15)

        # (job_id, skill_score, total_score) for every realistic job
        eligible_jobs = []

        for job_id, job_skill_list, job_level in job_records:
            skill_score = calculate_skill_match_score(cand_skills, job_skill_list)

            # Candidates only apply to jobs with at least a 0.25 skill match
            if skill_score < 0.25:
                continue

            exp_score = calculate_experience_match(cand_level, job_level)
            eligible_jobs.append(
                (job_id, skill_score, 0.7 * skill_score + 0.3 * exp_score)
            )

        if len(eligible_jobs) == 0:
            continue

        # Sample applications, better matches being more likely
        scores = np.array([total for _, _, total in eligible_jobs])
        probs = scores / scores.sum()

        selected_indices = np.random.choice(
            len(eligible_jobs),
            size=min(n_applications, len(eligible_jobs)),
            replace=False,
            p=probs
        )

        # Generates ratings (1-5) correlated with skill match
        for idx in selected_indices:
            job_id, skill_score, _ = eligible_jobs[idx]

            rating = np.clip(
                1 + 4 * skill_score + np.random.normal(0, 0.3),
                1, 5
            )

            interactions.append({
                'candidate_id': cand_id,
                'job_id': job_id,
                'rating': rating,
                'applied': 1
            })

    return pd.DataFrame(interactions, columns=columns)


In [None]:
print("Generating candidate-job interactions...")
interactions_df = simulate_interactions(candidates_df, filtered_df)

print(f"\nInteraction Statistics:")
print(f"  Total interactions: {len(interactions_df):,}")

if interactions_df.empty:
    # Without rows the per-candidate average would divide by zero
    print("  No interactions were generated")
else:
    n_active_candidates = interactions_df['candidate_id'].nunique()
    print(f"  Unique candidates: {n_active_candidates}")
    print(f"  Unique jobs: {interactions_df['job_id'].nunique()}")
    print(f"  Avg applications per candidate: {len(interactions_df) / n_active_candidates:.1f}")
    # Ratings are continuous, so bucket them to the nearest whole star first;
    # counting the raw values would list almost every rating exactly once.
    star_counts = interactions_df['rating'].round().astype(int).value_counts().sort_index()
    print(f"  Rating distribution:\n{star_counts}")

# Save interactions
interactions_df.to_csv('data/processed/interactions.csv', index=False)


- Creates a collaborative filtering model

In [None]:
from sklearn.decomposition import TruncatedSVD

class CollaborativeRecommender:
    """
    Matrix factorization based collaborative filtering.

    A candidate x job rating matrix is factorized with truncated SVD; the
    predicted affinity of a candidate for a job is the dot product of their
    latent factor vectors.
    """

    def __init__(self, n_factors=50):
        """
        Create an unfitted recommender.

        n_factors: requested number of latent factors (SVD components).
        """
        self.n_factors = n_factors
        self.svd = TruncatedSVD(n_components=n_factors, random_state=42)
        # Set by fit()
        self.user_factors = None   # one row of factors per candidate
        self.item_factors = None   # one row of factors per job
        self.user_to_idx = None    # candidate_id -> matrix row
        self.job_to_idx = None     # job_id -> matrix column
        self.idx_to_job = None     # matrix column -> job_id

    def fit(self, interactions_df):
        """
        Fit the collaborative filtering model.

        interactions_df needs the columns candidate_id, job_id and rating.
        Repeated (candidate, job) pairs are averaged. When more factors were
        requested than there are jobs, the number of components is reduced
        to the number of jobs.

        Returns self. Raises ValueError when interactions_df has no rows.
        """
        print(" Training collaborative filtering model...")

        if len(interactions_df) == 0:
            raise ValueError("Cannot fit the collaborative model without interactions")

        # One rating per (candidate, job): the sparse constructor would add
        # duplicates together and inflate the rating.
        ratings = (
            interactions_df
            .groupby(['candidate_id', 'job_id'], sort=False)['rating']
            .mean()
            .reset_index()
        )

        # Create user-item matrix
        self.user_to_idx = {user: idx for idx, user in
                            enumerate(ratings['candidate_id'].unique())}
        self.job_to_idx = {job: idx for idx, job in
                           enumerate(ratings['job_id'].unique())}
        self.idx_to_job = {idx: job for job, idx in self.job_to_idx.items()}

        n_users = len(self.user_to_idx)
        n_jobs = len(self.job_to_idx)

        print(f"  Matrix size: {n_users} candidates × {n_jobs} jobs")

        # Create sparse matrix
        rows = ratings['candidate_id'].map(self.user_to_idx).to_numpy()
        cols = ratings['job_id'].map(self.job_to_idx).to_numpy()
        data = ratings['rating'].to_numpy()

        user_item_matrix = csr_matrix((data, (rows, cols)), shape=(n_users, n_jobs))

        # The solver cannot produce more components than there are job columns
        n_components = min(self.n_factors, n_jobs)
        if n_components != self.n_factors:
            print(f"  Reducing factors from {self.n_factors} to {n_components} (only {n_jobs} jobs)")
            self.svd.set_params(n_components=n_components)

        # Apply SVD
        print(f"  Applying SVD with {n_components} factors...")
        self.user_factors = self.svd.fit_transform(user_item_matrix)
        self.item_factors = self.svd.components_.T

        print(f" Model trained!")
        print(f"  Explained variance: {self.svd.explained_variance_ratio_.sum():.2%}")

        return self

    def predict(self, candidate_id, job_id):
        """
        Predict rating for candidate-job pair.

        Returns 0.0 when the candidate or the job was not seen during fit.
        """
        if candidate_id not in self.user_to_idx or job_id not in self.job_to_idx:
            return 0.0

        user_idx = self.user_to_idx[candidate_id]
        job_idx = self.job_to_idx[job_id]

        prediction = np.dot(self.user_factors[user_idx], self.item_factors[job_idx])
        return prediction

    def recommend_jobs(self, candidate_id, jobs_df, top_k=10, exclude_applied=None):
        """
        Recommend jobs for a candidate using CF.

        jobs_df supplies job_id, job_title, company, job_location and
        experience_level for the result; exclude_applied is an optional
        collection of job IDs to leave out.

        Returns the top_k jobs by cf_score (min-max scaled over all jobs the
        model knows), or an empty DataFrame for a candidate that was not seen
        during fit (cold start).
        """
        if candidate_id not in self.user_to_idx:
            return pd.DataFrame()  # Cold start

        user_idx = self.user_to_idx[candidate_id]

        # Calculate scores for all jobs
        scores = np.dot(self.user_factors[user_idx], self.item_factors.T)
        # Min-max scale; the epsilon avoids 0/0 when every score is equal
        scores = (scores - scores.min()) / (scores.max() - scores.min() + 1e-10)

        # Get job IDs in order
        job_ids = [self.idx_to_job[i] for i in range(len(scores))]

        # Create recommendations dataframe
        recs = pd.DataFrame({
            'job_id': job_ids,
            'cf_score': scores
        })

        # Merge with job details (jobs missing from jobs_df are dropped)
        recs = recs.merge(
            jobs_df[['job_id', 'job_title', 'company', 'job_location', 'experience_level']],
            on='job_id'
        )

        # Exclude already applied jobs
        if exclude_applied is not None:
            recs = recs[~recs['job_id'].isin(exclude_applied)]

        # Sort and return top K
        return recs.sort_values('cf_score', ascending=False).head(top_k)

# Train the model
# Use at most 30 factors, and always fewer than the number of distinct jobs
# in the interaction data.
# NOTE(review): with a single distinct job this evaluates to 0 factors — confirm
# the interaction data always contains at least two jobs.
n_factors = min(30, interactions_df['job_id'].nunique() - 1)

cf_recommender = CollaborativeRecommender(n_factors=n_factors)
cf_recommender.fit(interactions_df)


print("\n Collaborative filtering model ready")


In [None]:
print("\n Sample Recommendations:")
sample_candidate = candidates_df.sample(1).iloc[0]
print(f"Candidate: {sample_candidate['candidate_id']}")
print(f"  Experience: {sample_candidate['experience_level']}")
print(f"  Domain: {sample_candidate['domain']}")

# Get their applied jobs
applied_jobs = interactions_df[
    interactions_df['candidate_id'] == sample_candidate['candidate_id']
]['job_id'].tolist()

# Get CF recommendations
recs = cf_recommender.recommend_jobs(
    sample_candidate['candidate_id'],
    filtered_df,
    top_k=5,
    exclude_applied=applied_jobs
)

# A candidate without simulated interactions is unknown to the model, which
# then returns an empty frame without columns; selecting columns from it
# would raise a KeyError.
if len(recs) == 0:
    print(" No CF recommendations (cold start)")
else:
    print("\nTop 5 Recommended Jobs:")
    print(recs[['job_title', 'company', 'cf_score']].to_string(index=False))


In [None]:
import pickle
import os

# Make sure the output folder exists (no error if it is already there)
os.makedirs("models", exist_ok=True)


# Save collaborative model
# NOTE: pickle files must only be loaded from trusted sources — unpickling
# can execute arbitrary code.
with open("models/cf_model.pkl", "wb") as f:
    pickle.dump(cf_recommender, f)


## Hybrid

In [None]:
def safe_minmax(series):
    """Min-max scale ``series`` to the 0-1 range.

    When every value is the same the range is zero, so an array of zeros of
    the same length is returned instead of dividing by zero.
    """
    highest = series.max()
    lowest = series.min()
    if highest == lowest:
        return np.zeros(len(series))
    return (series - lowest) / (highest - lowest)

In [None]:
def hybrid_recommend(candidate_id, content_weight=0.6, cf_weight=0.4, top_k=10):
    """
    Combine content-based and collaborative filtering

    The 100 best content-based matches are re-ranked with
    ``content_weight * content score + cf_weight * CF score``, both scores
    min-max scaled to 0-1. Jobs the CF model cannot score get a CF score
    of 0. For a candidate unknown to the CF model (cold start) the
    content-based ranking is returned on its own.

    Parameters:
    - candidate_id: Candidate to recommend for
    - content_weight: Weight for content-based score (0-1)
    - cf_weight: Weight for collaborative filtering score (0-1)
    - top_k: Number of recommendations to return

    Returns a DataFrame sorted by hybrid_score, best first.
    """

    # Get content-based recommendations
    content_recs = get_recommendations(candidate_id, filtered_df, candidates_df, top_k=100)

    # Get CF recommendations (if candidate has interactions)
    candidate_interactions = interactions_df[
        interactions_df['candidate_id'] == candidate_id
    ]['job_id'].tolist()

    # Ask for every job the CF model can score, not just its top 100: the CF
    # top 100 rarely overlaps the content-based top 100, which would leave
    # most merged rows with a CF score of 0.
    cf_recs = cf_recommender.recommend_jobs(
        candidate_id,
        filtered_df,
        top_k=len(filtered_df),
        exclude_applied=candidate_interactions
    )

    # If no CF recommendations (cold start), use only content-based
    if len(cf_recs) == 0:
        print(f" Cold start for {candidate_id} - using content-based only")

        content_recs = content_recs.head(top_k).copy()
        best_score = content_recs['overall_score'].max()
        if best_score > 0:
            content_recs['content_score_norm'] = content_recs['overall_score'] / best_score
        else:
            # No positive score to scale by (all zero, or no rows)
            content_recs['content_score_norm'] = 0.0
        content_recs['cf_score_norm'] = 0.0
        content_recs['hybrid_score'] = content_recs['content_score_norm']

        return content_recs

    # Normalize scores to 0-1 range
    content_recs['content_score_norm'] = safe_minmax(content_recs['overall_score'])
    cf_recs = cf_recs.copy()  # recommend_jobs may hand back a filtered slice
    cf_recs['cf_score_norm'] = safe_minmax(cf_recs['cf_score'])

    # Attach the CF score to each content-based recommendation by job_id
    merged = content_recs.merge(
        cf_recs[['job_id', 'cf_score_norm']],
        on='job_id',
        how='left'
    )

    # Fill NaN CF scores with 0 (jobs not in CF model)
    merged['cf_score_norm'] = merged['cf_score_norm'].fillna(0)

    # Calculate hybrid score
    merged['hybrid_score'] = (
        content_weight * merged['content_score_norm'] +
        cf_weight * merged['cf_score_norm']
    )

    # Sort by hybrid score
    result = merged.sort_values('hybrid_score', ascending=False).head(top_k)

    return result[[
        'job_id','job_title', 'company', 'hybrid_score',
        'content_score_norm', 'cf_score_norm',
        'matching_skills', 'missing_skills'
    ]]

# Test hybrid recommender
# Runs the hybrid ranking for the first candidate in the table and displays it.
print("="*60)
print("Testing Hybrid Recommender")
print("="*60)

test_candidate = candidates_df.iloc[0]['candidate_id']
print(f"\nCandidate: {test_candidate}")

hybrid_recs = hybrid_recommend(test_candidate, content_weight=0.6, cf_weight=0.4, top_k=10)

print("\n Top 10 Hybrid Recommendations:")
display(hybrid_recs)


In [None]:
def compare_recommenders(candidate_id, top_k=5):
    """
    Print a side-by-side view of the three recommenders for one candidate.

    Shows the candidate's profile, then the top_k content-based,
    collaborative-filtering and hybrid recommendations in that order.
    Nothing is returned; all output goes to stdout.
    """
    heavy_rule = "=" * 80
    light_rule = "─" * 80

    def show_section(title):
        # Blank line, rule, section title, rule.
        print("\n" + light_rule)
        print(title)
        print(light_rule)

    print(heavy_rule)
    print(f"Recommendation Comparison for {candidate_id}")
    print(heavy_rule)

    # Profile of the candidate (first row matching the id).
    profile_mask = candidates_df['candidate_id'] == candidate_id
    candidate = candidates_df[profile_mask].iloc[0]
    print("\nCandidate Profile:")
    print(f"  Experience: {candidate['experience_level']} ({candidate['experience_years']} years)")
    print(f"  Top Skills: {candidate['skills_list'][:5]}")
    print(f"  Preferred Locations: {candidate['preferred_locations']}")

    # 1. Content-Based
    show_section(" CONTENT-BASED RECOMMENDATIONS")
    content_recs = get_recommendations(candidate_id, filtered_df, candidates_df, top_k=top_k)
    for rank, (_, job) in enumerate(content_recs.iterrows(), start=1):
        print(f"{rank}. {job['job_title']} at {job['company']}")
        print(f"   Score: {job['overall_score']:.3f}")
        n_matching = len(job['matching_skills'])
        n_missing = len(job['missing_skills'])
        print(f"   Matching: {n_matching} skills | Missing: {n_missing} skills")

    # 2. Collaborative Filtering
    show_section(" COLLABORATIVE FILTERING RECOMMENDATIONS")

    # Jobs this candidate already interacted with are passed as exclusions.
    already_seen = interactions_df.loc[
        interactions_df['candidate_id'] == candidate_id, 'job_id'
    ].tolist()

    cf_recs = cf_recommender.recommend_jobs(
        candidate_id,
        filtered_df,
        top_k=top_k,
        exclude_applied=already_seen,
    )

    if len(cf_recs) == 0:
        print(" No CF recommendations (cold start)")
    else:
        for rank, (_, job) in enumerate(cf_recs.iterrows(), start=1):
            print(f"{rank}. {job['job_title']} at {job['company']}")
            print(f"   CF Score: {job['cf_score']:.3f}")

    # 3. Hybrid
    show_section(" HYBRID RECOMMENDATIONS (Best of Both)")
    hybrid_recs = hybrid_recommend(candidate_id, top_k=top_k)
    for rank, (_, job) in enumerate(hybrid_recs.iterrows(), start=1):
        print(f"{rank}. {job['job_title']} at {job['company']}")
        print(
            f"   Hybrid: {job['hybrid_score']:.3f} | "
            f"Content: {job['content_score_norm']:.3f} | "
            f"CF: {job['cf_score_norm']:.3f}"
        )

    print("\n" + heavy_rule)

# Run the three-way comparison (content-based, collaborative filtering,
# hybrid) for a single candidate picked by row position in candidates_df.
test_candidate = candidates_df.iloc[5]['candidate_id']  # Change the index to try a different candidate
compare_recommenders(test_candidate, top_k=5)


In [None]:
from sklearn.metrics import ndcg_score, precision_score, recall_score

def evaluate_recommender(interactions_test, recommender_func, k=10, max_candidates=100):
    """
    Evaluate recommendation quality on a set of held-out interactions.

    A job is "relevant" for a candidate when that candidate rated it 4.0 or
    higher in ``interactions_test``. Candidates without any relevant job, or
    for whom the recommender returns nothing (or raises), are skipped and
    only lower the coverage figure.

    Metrics:
    - precision@k: What % of the top-k recommendations are relevant?
    - recall@k: What % of relevant items are recommended?
    - coverage: What % of the sampled candidates could be scored?

    Args:
        interactions_test: DataFrame with 'candidate_id', 'job_id' and
            'rating' columns.
        recommender_func: Callable invoked as
            ``recommender_func(candidate_id, top_k=k)``. It must return an
            object that supports ``len()`` and whose ``['job_id']`` column
            has ``.tolist()`` (assumed to be ordered best-first).
        k: Cut-off rank for the metrics.
        max_candidates: Evaluate only the first ``max_candidates`` unique
            candidates (in order of first appearance). ``None`` evaluates
            all of them.

    Returns:
        dict with keys 'precision@k', 'recall@k' and 'coverage'. Each value
        is 0 when nothing could be evaluated.
    """
    import warnings

    # Unique candidates in order of first appearance; optionally sampled.
    test_candidates = interactions_test['candidate_id'].unique()
    if max_candidates is None:
        sampled = test_candidates
    else:
        sampled = test_candidates[:max_candidates]

    # Build the candidate -> relevant-job-set lookup once instead of
    # re-filtering the whole frame for every candidate. Using sets also
    # keeps duplicate rows from inflating the recall denominator.
    relevant_rows = interactions_test[interactions_test['rating'] >= 4.0]
    relevant_by_candidate = {}
    for cid, job_id in zip(relevant_rows['candidate_id'], relevant_rows['job_id']):
        relevant_by_candidate.setdefault(cid, set()).add(job_id)

    precisions = []
    recalls = []
    failures = 0
    last_error = None

    for candidate_id in sampled:
        actual_relevant = relevant_by_candidate.get(candidate_id)
        if not actual_relevant:
            continue

        # Best-effort: one failing candidate must not abort the whole run,
        # but only ordinary exceptions are swallowed (not KeyboardInterrupt).
        try:
            recs = recommender_func(candidate_id, top_k=k)
            if len(recs) == 0:
                continue
            # Only the first k rows count, even if the recommender
            # returned more, so precision@k can never exceed 1.
            recommended_jobs = recs['job_id'].tolist()[:max(k, 0)]
        except Exception as err:
            failures += 1
            last_error = err
            continue

        hits = len(set(recommended_jobs) & actual_relevant)
        precisions.append(hits / k if k > 0 else 0)
        recalls.append(hits / len(actual_relevant))

    if failures:
        warnings.warn(
            f"{failures} candidate(s) skipped because the recommender "
            f"raised; last error: {last_error!r}",
            RuntimeWarning,
            stacklevel=2,
        )

    # Coverage is relative to the candidates actually attempted, not to
    # every candidate in the test set.
    n_sampled = len(sampled)
    return {
        'precision@k': np.mean(precisions) if precisions else 0,
        'recall@k': np.mean(recalls) if recalls else 0,
        'coverage': len(precisions) / n_sampled if n_sampled else 0
    }

# Split interactions into train/test (80% / 20%); random_state is fixed so
# the split is the same on every run.
from sklearn.model_selection import train_test_split

# NOTE(review): train_interactions is not used anywhere in this cell. If the
# recommenders were fitted on the full interactions_df, the test rows are not
# truly held out -- confirm, and refit on train_interactions if so.
train_interactions, test_interactions = train_test_split(
    interactions_df,
    test_size=0.2,
    random_state=42
)

print(" Evaluation Results (on test set)")
print("="*60)

# Evaluate content-based
# evaluate_recommender calls recommender_func(candidate_id, top_k=k), so the
# lambda adapts get_recommendations to that two-argument shape.
print("\n1. Content-Based Recommender:")
content_metrics = evaluate_recommender(
    test_interactions,
    lambda cid, top_k: get_recommendations(cid, filtered_df, candidates_df, top_k),
    k=10
)
for metric, value in content_metrics.items():
    print(f"   {metric}: {value:.4f}")

# Evaluate hybrid
# NOTE(review): hybrid_recommend appears to exclude jobs the candidate has
# already interacted with; if that list is built from the full
# interactions_df, held-out test jobs can never be recommended and these
# metrics will be understated -- confirm.
print("\n2. Hybrid Recommender:")
hybrid_metrics = evaluate_recommender(
    test_interactions,
    lambda cid, top_k: hybrid_recommend(cid, top_k=top_k),
    k=10
)
for metric, value in hybrid_metrics.items():
    print(f"   {metric}: {value:.4f}")

print("\n Evaluation complete!")
