In [None]:
# main.ipynb
# Inference notebook for structured + text-based candidate-job fit
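# Choose model: load either the XGBoost or the Logistic Regression pipeline in the
# "Load ONE model below" section (the XGBoost pipeline is the one currently loaded)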

import pandas as pd
import numpy as np
import joblib

# --- Load New Data ---
# Read the rows to score from the project's data directory.
dataset_path = "../data/ai_hiring_assignment_dataset_5000.csv"
new_data = pd.read_csv(dataset_path)
print(f"Loaded {len(new_data)} new rows")

# --- Reproduce Feature Engineering ---
# Missing certification values are replaced with the literal string "None".
certifications = new_data['certifications']
new_data['certifications'] = certifications.fillna("None")

# Skill overlap
def compute_skill_overlap(row):
    """Count shared and distinct skills for one candidate/job row.

    Both skill fields are lower-cased and split on commas, then compared as
    sets, so a skill repeated within one field is counted once. The pieces
    are not stripped of surrounding whitespace, so "a, b" yields "a" and
    " b". NOTE(review): this presumably has to match the feature code used
    at training time; confirm before changing the tokenisation.

    Args:
        row: Mapping-like object (for example a DataFrame row) holding
            strings under 'candidate_skills' and 'required_skills'.

    Returns:
        A tuple ``(shared, required_total, candidate_total)``: the number of
        skills present in both fields, the number of distinct required
        skills, and the number of distinct candidate skills.
    """
    candidate_tokens = row['candidate_skills'].lower().split(',')
    candidate_set = set(candidate_tokens)
    required_tokens = row['required_skills'].lower().split(',')
    required_set = set(required_tokens)
    shared = candidate_set.intersection(required_set)
    return len(shared), len(required_set), len(candidate_set)

# Expand the per-row (overlap, required count, candidate count) tuple
# returned by compute_skill_overlap into three new columns.
skill_count_cols = ['skill_overlap', 'required_skill_count', 'candidate_skill_count']
new_data[skill_count_cols] = new_data.apply(
    lambda record: pd.Series(compute_skill_overlap(record)), axis=1
)
# Share of the required skills that the candidate also lists.
new_data['skill_match_ratio'] = new_data['skill_overlap'].div(new_data['required_skill_count'])

# Salary mismatch: expected salary minus the midpoint of the budgeted range
avg_budgeted_salary = new_data['budgeted_salary_min'].add(new_data['budgeted_salary_max']).div(2)
new_data['salary_diff'] = new_data['expected_salary'].sub(avg_budgeted_salary)

# Experience gap: candidate's years of experience minus the job's minimum
new_data['experience_gap'] = new_data['years_experience'].sub(new_data['min_experience'])

# Text lengths: character count of each free-text field
for source_col, length_col in (
    ('job_description', 'job_desc_len'),
    ('past_job_titles', 'past_titles_len'),
):
    new_data[length_col] = new_data[source_col].str.len()

# Label Encoding for categorical features
# Each listed column is overwritten with the output of the encoder stored for
# it under ../models/ (presumably the encoders fitted at training time).
# NOTE: joblib.load unpickles the file, so only load trusted artifacts.
label_cols = ['education_level', 'candidate_location', 'job_location']
for col in label_cols:
    encoder_path = f"../models/{col}_label_encoder.pkl"
    encoder = joblib.load(encoder_path)
    encoded_values = encoder.transform(new_data[col])
    new_data[col] = encoded_values

# Combine text columns
text_cols = [
    'candidate_skills',
    'past_job_titles',
    'certifications',
    'required_skills',
    'job_description',
]


def combine_text(df):
    """Join the text columns of each row into one space-separated string.

    Args:
        df: DataFrame containing every column named in ``text_cols``.

    Returns:
        A Series aligned with ``df`` whose values are the ``text_cols``
        entries of each row joined with single spaces, in ``text_cols``
        order. Missing values are treated as empty strings.
    """
    text_frame = df[text_cols].fillna("")
    return text_frame.apply(" ".join, axis=1)
# Single free-text field built from the text columns of each row.
combined_text = combine_text(new_data)
new_data["combined_text"] = combined_text

# --- Final Inference Features ---
# Columns handed to the saved pipeline, in this order.
feature_cols = [
    # columns used as loaded from the CSV
    'years_experience',
    'expected_salary',
    'min_experience',
    'budgeted_salary_min',
    'budgeted_salary_max',
    # label-encoded columns, plus job_title (not encoded in this notebook)
    'education_level',
    'candidate_location',
    'job_location',
    'job_title',
    # text columns
    'candidate_skills',
    'past_job_titles',
    'certifications',
    'required_skills',
    'job_description',
    # engineered features
    'skill_overlap',
    'required_skill_count',
    'candidate_skill_count',
    'skill_match_ratio',
    'salary_diff',
    'experience_gap',
    'job_desc_len',
    'past_titles_len',
    'combined_text',
]
X_new = new_data.loc[:, feature_cols]

# -------------------------
# Load ONE model below
# -------------------------
# NOTE(security): joblib.load unpickles the file and can run arbitrary code
# while doing so; only load pipeline artifacts that come from a trusted source.
model = joblib.load("../models/xgb_pipeline.joblib")

# Class predictions, plus the probability column at index 1
# (presumably the positive "fit" class; confirm against model.classes_).
y_pred = model.predict(X_new)
y_proba = model.predict_proba(X_new)[:, 1]

# Attach both outputs to the scored rows.
new_data["fit_prediction"] = y_pred
new_data["fit_probability"] = y_proba

# One path drives both the write and the message, so the printed location
# always matches the file that was written. The message previously named
# ../results/inference_results.csv while the file went to the path below.
output_path = "../inference_results.csv"
new_data.to_csv(output_path, index=False)
print(f"Results saved to {output_path}")


✅ Loaded 5000 new rows
📁 Results saved to ../results/inference_results.csv


https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations
https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations
https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations
https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations
https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations
https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations
