In [1]:
"""
📘 Notebook: 2_data_preprocessing.ipynb
--------------------------------------------------
Purpose:
- Clean and preprocess structured and text data.
- Feature engineering: skill overlap, salary mismatch, experience gap, text lengths.
- Encode categorical features.
- Train/test split (50/50 split).
- Output clean datasets for modeling.

Inputs:
- data/raw/ai_hiring_dataset.csv or data/processed/eda_augmented.csv

Outputs:
- data/processed/train.csv
- data/processed/test.csv
"""

import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

# Load data
df = pd.read_csv('/Users/nikhilraj/Documents/HiringAiModel/data/ai_hiring_assignment_dataset_5000.csv')

# Handle missing certifications (fill with 'None')
df['certifications'] = df['certifications'].fillna("None")

# --- Feature Engineering ---

# Skill overlap between candidate and required skills
def compute_skill_overlap(row):
    candidate = set(row['candidate_skills'].lower().split(','))
    required = set(row['required_skills'].lower().split(','))
    return len(candidate & required), len(required), len(candidate)

df[['skill_overlap', 'required_skill_count', 'candidate_skill_count']] = df.apply(
    lambda row: pd.Series(compute_skill_overlap(row)), axis=1
)
df['skill_match_ratio'] = df['skill_overlap'] / df['required_skill_count']

# Salary mismatch
avg_budgeted_salary = (df['budgeted_salary_min'] + df['budgeted_salary_max']) / 2
df['salary_diff'] = df['expected_salary'] - avg_budgeted_salary

# Experience gap
df['experience_gap'] = df['years_experience'] - df['min_experience']

# Text lengths
df['job_desc_len'] = df['job_description'].str.len()
df['past_titles_len'] = df['past_job_titles'].str.len()

# Encode categorical variables
label_cols = ['education_level', 'candidate_location', 'job_location']
label_encoders = {}
for col in label_cols:
    le = LabelEncoder()
    df[col] = le.fit_transform(df[col])
    label_encoders[col] = le

# Final feature columns
feature_cols = [
    'years_experience', 'expected_salary', 'min_experience',
    'budgeted_salary_min', 'budgeted_salary_max',
    'education_level', 'candidate_location', 'job_location', 'job_title',
    'candidate_skills', 'past_job_titles', 'certifications',
    'required_skills', 'job_description',
    'skill_overlap', 'required_skill_count', 'candidate_skill_count',
    'skill_match_ratio', 'salary_diff', 'experience_gap',
    'job_desc_len', 'past_titles_len'
]

X = df[feature_cols]
y = df['is_fit']

# --- Split data: 50% train, 50% test ---
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state=42, stratify=y)

# Save splits
X_train.to_csv("../data/processed/train.csv", index=False)
y_train.to_csv("../data/processed/train_labels.csv", index=False)
X_test.to_csv("../data/processed/test.csv", index=False)
y_test.to_csv("../data/processed/test_labels.csv", index=False)

# Save label encoders (if needed later for inference)
import joblib
for col, le in label_encoders.items():
    joblib.dump(le, f"../models/{col}_label_encoder.pkl")

print("✅ Preprocessing complete. Train/test (50/50) splits saved to /data/processed")


✅ Preprocessing complete. Train/test (50/50) splits saved to /data/processed


In [2]:
# Save feature column list for inference
import json

features_info = {
    "feature_cols": feature_cols,
    "label_encoded_cols": label_cols,
    "text_cols": ['candidate_skills', 'past_job_titles', 'certifications', 'required_skills', 'job_description']
}

with open("../models/features_metadata.json", "w") as f:
    json.dump(features_info, f)
