# Job Recommendation System


## Data Loading and Exploration

In [None]:
import json
import pandas as pd

# Load job ads and queries from JSONL files
def _read_jsonl(path):
    """Parse a JSON Lines file into a list with one decoded object per line.

    The file is opened in a ``with`` block so the handle is closed as soon as
    parsing finishes, and it is decoded as UTF-8 explicitly instead of relying
    on the platform default encoding. Whitespace-only lines (for example a
    trailing blank line) are skipped rather than handed to ``json.loads``.

    Args:
        path: Location of the ``.jsonl`` file.

    Returns:
        List of decoded JSON values in file order.
    """
    with open(path, "r", encoding="utf-8") as handle:
        return [json.loads(line) for line in handle if line.strip()]


job_ads = _read_jsonl("ad_detail_v1.jsonl")
job_ads_df = pd.DataFrame(job_ads)

queries = _read_jsonl("qry_rel_v1.jsonl")
queries_df = pd.DataFrame(queries)

print(f"Job ads: {len(job_ads_df)} Queries: {len(queries_df)}")

In [None]:
# Preview the first rows of the queries table
queries_df.head()

In [None]:
# Calculating the mean and standard deviation of relevance counts per query
import numpy as np
import matplotlib.pyplot as plt

# Relevance grades to summarise, listed from highest to lowest
relevance_levels = [4, 3, 2, 1]

# For every grade, count how often it occurs in each query's relevance list
counts_by_level = [
    queries_df['relevance'].apply(lambda labels, level=level: labels.count(level))
    for level in relevance_levels
]
means = [level_counts.mean() for level_counts in counts_by_level]
stds = [level_counts.std() for level_counts in counts_by_level]

# Bar height is the mean count per query; the error bar is the standard deviation
fig, ax = plt.subplots(figsize=(6, 4))
ax.bar(relevance_levels, means, yerr=stds, capsize=8, color="#3b5348", edgecolor='black')
ax.set_xlabel('Relevance')
ax.set_ylabel('Average Count per Query')
ax.set_title('Mean and Std of Relevance Counts per Query')
ax.set_xticks(relevance_levels)
plt.show()

In [None]:
# Number of labelled job ads per query (the length of each relevance list)
total_relevance_per_query = queries_df['relevance'].apply(len)

fig, ax = plt.subplots(figsize=(8, 5))
ax.hist(total_relevance_per_query, bins=20, color="#3b5348", edgecolor='black')
ax.set_xlabel('Job count per Query')
ax.set_ylabel('Number of Queries')
ax.set_title('Job count per Query')

# Place an x-tick every 50 jobs, starting at the smallest observed count
min_x = total_relevance_per_query.min()
max_x = total_relevance_per_query.max()
ax.set_xticks(np.arange(min_x, max_x + 1, step=50))

plt.show()

In [None]:
# Preview the first rows of the job ads table
job_ads_df.head()

In [None]:
# Union of the keys of every metadata dict; entries that are not dicts are ignored
unique_keys = set().union(
    *(meta for meta in job_ads_df['metadata'] if isinstance(meta, dict))
)
unique_keys


## Combining Preprocessed Job Data for Vectorization
#### (Preprocessing includes cleaning, normalization, preservation of special terms, and stemming.) Salary data are excluded for now.

In [None]:
from clean_text import clean_text

# Clean each text field and join them, in this order, into one document per job ad
combined_text = job_ads_df['title'].apply(clean_text)
for field in ('abstract', 'content', 'metadata'):
    combined_text = combined_text + ' ' + job_ads_df[field].apply(clean_text)

# Collapse any run of whitespace into a single space
job_ads_df['clean_jobs'] = combined_text.apply(lambda text: ' '.join(text.split()))

# Search queries go through the same cleaning function as the job ads
queries_df['clean_query'] = queries_df['query_keywords'].apply(clean_text)

print(f"updated job_ads_df: {list(job_ads_df.columns)}")

In [None]:
# Preview the job ads table, which now includes the combined 'clean_jobs' column
job_ads_df.head()

# TF-IDF Vectorization

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

# The vectorizer is fitted on the job ad texts only; queries are then
# transformed with that same fitted vectorizer, so both matrices share one
# feature space (one row per job ad / per query).
tfidf_vectorizer = TfidfVectorizer()
tfidf_job_matrix = tfidf_vectorizer.fit_transform(job_ads_df['clean_jobs'])
tfidf_query_matrix = tfidf_vectorizer.transform(queries_df['clean_query'])

In [None]:
# Inspect the cleaned query strings
queries_df['clean_query']

In [None]:
# Print the TF-IDF matrix built from the job ads
print(tfidf_job_matrix)

# Data balancing
#### (undersampling using the smallest ad_ids count across queries)

In [None]:
# Length of every query's ad_ids list, computed once and reused for all three statistics
ad_ids_lengths = queries_df['ad_ids'].apply(len)

min_ad_ids_len = ad_ids_lengths.min()
max_ad_ids_len = ad_ids_lengths.max()
mean_ad_ids_len = ad_ids_lengths.mean()

print(f"Min ad_ids length: {min_ad_ids_len}")
print(f"Max ad_ids length: {max_ad_ids_len}")
print(f"Mean ad_ids length: {mean_ad_ids_len}")


In [None]:
# Undersampling: keep only the first `min_ad_ids_len` entries of each query's
# ad_ids and relevance lists, so both stay aligned and equally long.
# NOTE(review): this keeps the leading entries in stored order rather than a
# random sample - confirm the lists are not ordered by relevance.
for source_col in ('ad_ids', 'relevance'):
    queries_df[f'{source_col}_balanced'] = queries_df[source_col].apply(
        lambda values: values[:min_ad_ids_len]
    )

In [None]:
# Preview the queries table with the new *_balanced columns
queries_df.head()   

# Adding zeros
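#### (Job ads not mentioned in a query are treated as having relevance '0'. For each query we sample `min_ad_ids_len` such jobs and add them with relevance '0'; that is 5 jobs only when the smallest `ad_ids` list has 5 entries.)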

In [None]:
# Add unlabeled jobs with zero relevance
import random

# Every ad id present in the job ads table; the pool for zero-relevance sampling
unique_job_ads = set(job_ads_df['ad_id'])
    
def add_zero_relevance(row):
    """Append randomly sampled unlabeled job ads to a query row with relevance 0.

    Job ads that do not appear in the query's full ``ad_ids`` list are treated
    as irrelevant. Up to ``min_ad_ids_len`` of them are drawn without
    replacement and appended to ``ad_ids_balanced``; the same number of ``0``
    labels is appended to ``relevance_balanced`` so both lists stay aligned.

    Args:
        row: A mapping-like query record (e.g. one row of ``queries_df``) with
            ``ad_ids``, ``ad_ids_balanced`` and ``relevance_balanced`` entries.

    Returns:
        The same ``row`` object, updated in place. It is returned unchanged
        when every known job ad is already listed in ``ad_ids``.
    """
    miss_ad_ids = list(unique_job_ads - set(row['ad_ids']))
    if miss_ad_ids:
        # random.sample raises ValueError when asked for more items than the
        # population holds, so cap the request at the number of unlabeled ads.
        sample_size = min(min_ad_ids_len, len(miss_ad_ids))
        sampled_ids = random.sample(miss_ad_ids, sample_size)
        row['ad_ids_balanced'] = list(row['ad_ids_balanced']) + sampled_ids
        # One zero label per sampled ad keeps ids and labels the same length.
        row['relevance_balanced'] = list(row['relevance_balanced']) + [0] * len(sampled_ids)
    return row

# Run add_zero_relevance on every query row (axis=1 passes one row per call)
queries_df = queries_df.apply(add_zero_relevance, axis=1)        

In [None]:
# Show the last 10 ad ids and labels of every query to check the appended zero-relevance entries
print(queries_df['ad_ids_balanced'].apply(lambda x: x[-10:]))
print(queries_df['relevance_balanced'].apply(lambda x: x[-10:]))

# Train-test split

In [None]:
from sklearn.model_selection import train_test_split

# Split whole queries (not individual query-job pairs): 20% of the queries are
# held out for testing, with a fixed random_state so the split is repeatable.
train_queries_df, test_queries_df = train_test_split(queries_df, test_size=0.2, random_state=42)
print(f"Train queries: {len(train_queries_df)}, Test queries: {len(test_queries_df)}")

In [None]:
# Preview the first rows of the training queries
train_queries_df.head()

In [None]:
# Preview the job ads table again before building query-job pairs
job_ads_df.head()

# Creating query-job pairs for training and test data sets 

In [None]:

def build_pair_df(sample_queries_df, job_ads_df):
    """Expand each query's balanced ad list into one row per (query, job ad) pair.

    Args:
        sample_queries_df: Query frame with list-valued ``ad_ids_balanced`` and
            ``relevance_balanced`` columns whose entries line up one to one.
        job_ads_df: Job ads frame with an ``ad_id`` column.

    Returns:
        DataFrame with the columns ``query_idx`` (index label of the query
        row), ``job_id`` (the ad id), ``job_idx`` (index label of the first
        job ad row carrying that ad id) and ``relevance``. Ad ids that do not
        occur in ``job_ads_df`` are skipped. The four columns are present even
        when no pair is produced.
    """
    columns = ['query_idx', 'job_id', 'job_idx', 'relevance']

    # Build the ad_id -> row label lookup once instead of filtering the whole
    # job ads frame for every single pair. setdefault keeps the first row
    # label when an ad id occurs more than once.
    job_idx_by_ad_id = {}
    for job_idx, ad_id in zip(job_ads_df.index, job_ads_df['ad_id']):
        job_idx_by_ad_id.setdefault(ad_id, job_idx)

    data = []
    for query_idx, query_row in sample_queries_df.iterrows():
        ad_ids_balanced = query_row['ad_ids_balanced']
        relevance_balanced = query_row['relevance_balanced']

        for ad_id, relevance in zip(ad_ids_balanced, relevance_balanced):
            # Pairs whose ad is unknown to the job ads table are dropped.
            if ad_id not in job_idx_by_ad_id:
                continue
            data.append({
                'query_idx': query_idx,
                'job_id': ad_id,
                'job_idx': job_idx_by_ad_id[ad_id],
                'relevance': relevance,
            })

    # Passing the column list keeps the schema stable for an empty result.
    return pd.DataFrame(data, columns=columns)

# Build the query-job pair tables for the training and the test queries
training_data_df = build_pair_df(train_queries_df, job_ads_df)
test_data_df = build_pair_df(test_queries_df, job_ads_df)

# Pair counts per relevance grade in the training set, highest grade first
relevance_distribution = (
    training_data_df['relevance'].value_counts().sort_index(ascending=False)
)

print("Training_labeled_df:", list(training_data_df.columns))
print(f"Number of query-job pairs: {len(training_data_df)}")
print(f"\nRelevance distribution: {relevance_distribution}")

In [None]:
# Preview the first training query-job pairs
training_data_df.head()

In [None]:
# Preview the first test query-job pairs
test_data_df.head()

# Building feature matrix and labels array

In [None]:
from scipy.sparse import vstack, hstack

def build_feature_matrix(sample_data_df):
    """Assemble the sparse feature matrix and label vector for query-job pairs.

    Each output row is the TF-IDF vector of the pair's job ad followed by the
    TF-IDF vector of its query, in the row order of ``sample_data_df``.

    Args:
        sample_data_df: Pair frame with ``job_idx`` and ``query_idx`` columns
            (used as row indices into ``tfidf_job_matrix`` and
            ``tfidf_query_matrix``) and a ``relevance`` column.

    Returns:
        Tuple ``(X, y)``: ``X`` is a sparse matrix with one row per pair and
        ``y`` is a NumPy array holding the matching relevance labels.
    """
    # Read the index columns as integer arrays directly. Iterating with
    # iterrows() would upcast a whole row to float as soon as one column is
    # float, and a float cannot be used to index a sparse matrix.
    job_rows = sample_data_df['job_idx'].to_numpy(dtype=np.intp)
    query_rows = sample_data_df['query_idx'].to_numpy(dtype=np.intp)

    # Select all needed rows with one indexing call per matrix and join the
    # two selections side by side, instead of stacking one pair at a time.
    X = hstack([tfidf_job_matrix[job_rows], tfidf_query_matrix[query_rows]])

    # Copy so the labels are independent of the source frame.
    y = sample_data_df['relevance'].to_numpy(copy=True)

    return X, y


# Build features and labels for both splits, then report their shapes
X_train, y_train = build_feature_matrix(training_data_df)
X_test, y_test = build_feature_matrix(test_data_df)

print(f"X_train shape: {X_train.shape}, y_train shape: {y_train.shape}")
print(f"X_test shape: {X_test.shape}, y_test shape: {y_test.shape}")

In [None]:
# Print the training feature matrix
print(X_train)

# Training the model

In [None]:
from xgboost import XGBRegressor

# Initialize and train the model
# The relevance grade is fitted as a numeric target with a squared-error
# objective, so predictions are continuous scores rather than discrete grades.
xgb_model = XGBRegressor(
    objective='reg:squarederror',
    n_estimators=100,
    verbosity=2,
)
xgb_model.fit(X_train, y_train)

# Predict relevance
# Predictions on the training pairs are kept as well so train and test
# metrics can be compared afterwards.
y_train_pred = xgb_model.predict(X_train)
y_test_pred = xgb_model.predict(X_test)


# Metrics

In [None]:
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error, ndcg_score


def _mean_ndcg_per_query(y_true, y_pred, query_ids):
    """Return NDCG averaged over queries, ranking each query's job ads separately.

    Pooling the pairs of all queries into one ranking compares job ads that
    belong to different queries, which is not what a per-query recommender is
    judged on. Here the pairs are grouped by query id, NDCG is computed inside
    every group, and the group scores are averaged.

    Args:
        y_true: True relevance labels, one per query-job pair.
        y_pred: Predicted scores, aligned with ``y_true``.
        query_ids: Query identifier of every pair, aligned with ``y_true``.

    Returns:
        Mean per-query NDCG as a float, or NaN when no query has at least two
        pairs.
    """
    y_true = np.asarray(y_true)
    y_pred = np.asarray(y_pred)
    query_ids = np.asarray(query_ids)

    scores = []
    for query_id in np.unique(query_ids):
        in_query = query_ids == query_id
        # A ranking of a single document is not meaningful (recent
        # scikit-learn versions reject it), so such queries are skipped.
        if in_query.sum() < 2:
            continue
        scores.append(ndcg_score([y_true[in_query]], [y_pred[in_query]]))
    return float(np.mean(scores)) if scores else float('nan')


# y_train / y_train_pred follow the row order of training_data_df, so its
# query_idx column identifies the query of every prediction.
rmse = np.sqrt(mean_squared_error(y_train, y_train_pred))
r2 = r2_score(y_train, y_train_pred)
mae = mean_absolute_error(y_train, y_train_pred)
ndcg = _mean_ndcg_per_query(y_train, y_train_pred, training_data_df['query_idx'].to_numpy())

print(f"Train RMSE: {rmse:.4f}")
print(f"Train R^2: {r2:.4f}")
print(f"Train MAE: {mae:.4f}")
print(f"Train NDCG: {ndcg:.4f}")

# Same metrics on the held-out queries; y_test follows test_data_df's row order.
rmse = np.sqrt(mean_squared_error(y_test, y_test_pred))
r2 = r2_score(y_test, y_test_pred)
mae = mean_absolute_error(y_test, y_test_pred)
ndcg = _mean_ndcg_per_query(y_test, y_test_pred, test_data_df['query_idx'].to_numpy())

print(f"\nTest RMSE: {rmse:.4f}")
print(f"Test R^2: {r2:.4f}")
print(f"Test MAE: {mae:.4f}")
print(f"Test NDCG: {ndcg:.4f}")

# Saving Model and Vectorizer 

In [None]:
import joblib
import os

# Create model directory if it doesn't exist
os.makedirs("model", exist_ok=True)

# Save XGBoost model
xgb_model.save_model("model/xgb_model.json")

# Save TF-IDF vectorizer
# The fitted vectorizer is stored so new text can be transformed with the
# vocabulary learned above.
joblib.dump(tfidf_vectorizer, "model/tfidf_vectorizer.joblib")

# Save TF-IDF job matrix
# NOTE(review): the matrix rows follow the row order of job_ads_df, but the
# ad ids are not saved here - confirm the consumer can map rows back to ads.
joblib.dump(tfidf_job_matrix, "model/tfidf_job_matrix.joblib")