In [None]:
import pandas as pd
import numpy as np
import joblib
import os
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from xgboost import XGBClassifier
from sklearn.metrics import classification_report, accuracy_score, f1_score, log_loss
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity
import torch

In [None]:
# --- Configuration ---
# Single seed handed to every data split and to the classifier.
RANDOM_SEED = 42
# Identifier of the sentence-transformers checkpoint used for embeddings.
EMBEDDING_MODEL_NAME = 'all-MiniLM-L6-v2'
# Destination folder for serialized artifacts; created here if it is missing.
MODEL_DIR = 'models/'
os.makedirs(MODEL_DIR, exist_ok=True)
print(f"Ensured directory '{MODEL_DIR}' exists.")

In [None]:
# --- Initialize Models (Run once) ---
# Build the SentenceTransformer named by EMBEDDING_MODEL_NAME and keep it in a
# module-level variable so that later cells reuse this same instance.
print("Loading SBERT Model...")
SBERT_MODEL = SentenceTransformer(EMBEDDING_MODEL_NAME)
# Width of the vectors this model produces, as reported by the model itself.
EMBEDDING_DIM = SBERT_MODEL.get_sentence_embedding_dimension()
print(f"SBERT Model Loaded. Embedding Dimension: {EMBEDDING_DIM}")

In [None]:
# --- Hybrid Feature Extraction ---

def get_semantic_embedding(text):
    """Encode text with the module-level SBERT model and return NumPy output.

    A lone string is wrapped in a one-element list before encoding, so the
    encoder is always handed a batch; any non-string argument (e.g. a list of
    strings) is forwarded unchanged.
    """
    batch = [text] if isinstance(text, str) else text
    return SBERT_MODEL.encode(batch, convert_to_numpy=True)

In [None]:
def _rowwise_cosine(left, right):
    """Cosine similarity between row i of `left` and row i of `right`, for every i.

    Both inputs must be 2-D with the same shape; each may be a SciPy sparse
    matrix or anything `np.asarray` can turn into a dense array. A row whose
    norm is zero gets similarity 0.0, the same value the previous per-row
    `cosine_similarity` calls produced for all-zero vectors.

    Returns a 1-D float64 array with one similarity per row.
    """
    # Local import: SciPy is not imported at file level in this notebook.
    from scipy.sparse import issparse

    def _row_dots(a, b):
        # Row-wise dot products, staying sparse whenever either side is sparse.
        if issparse(a):
            return np.asarray(a.multiply(b).sum(axis=1), dtype=np.float64).ravel()
        if issparse(b):
            return np.asarray(b.multiply(a).sum(axis=1), dtype=np.float64).ravel()
        return np.einsum('ij,ij->i', a, b)

    if not issparse(left):
        left = np.asarray(left, dtype=np.float64)
    if not issparse(right):
        right = np.asarray(right, dtype=np.float64)

    dots = _row_dots(left, right)
    norms = np.sqrt(_row_dots(left, left) * _row_dots(right, right))

    similarities = np.zeros_like(dots)
    # Only divide where the denominator is non-zero; other rows stay at 0.0.
    np.divide(dots, norms, out=similarities, where=norms > 0)
    return similarities


def combine_hybrid_features(df, fitted_tfidf, sbert_model):
    """
    Transforms the JD-Resume pairs into the hybrid numerical feature matrix (X).

    Parameters
    ----------
    df : DataFrame with 'job_description' and 'resume_text' columns; each row
        is one JD/resume pair.
    fitted_tfidf : already-fitted vectorizer exposing `transform(list_of_str)`.
    sbert_model : encoder exposing `encode(list_of_str, convert_to_numpy=True)`.

    Returns
    -------
    np.ndarray of shape (len(df), 2) and dtype float64 with columns
    [TFIDF_Similarity, SBERT_Similarity]. An empty `df` yields a (0, 2) array
    without calling either model.

    The similarities are computed for all rows at once instead of calling
    `cosine_similarity` twice per row; values agree with the per-row version
    up to floating-point rounding.
    """
    if len(df) == 0:
        return np.empty((0, 2), dtype=np.float64)

    jd_texts = df['job_description'].tolist()
    resume_texts = df['resume_text'].tolist()

    # Get embeddings for all JD and Resume texts in one batch (much faster)
    all_jd_emb = sbert_model.encode(jd_texts, convert_to_numpy=True)
    all_res_emb = sbert_model.encode(resume_texts, convert_to_numpy=True)

    # Get TF-IDF vectors for all texts in one batch
    all_jd_tfidf = fitted_tfidf.transform(jd_texts)
    all_res_tfidf = fitted_tfidf.transform(resume_texts)

    # 1. Lexical feature: cosine similarity of each pair's TF-IDF vectors.
    tfidf_similarity = _rowwise_cosine(all_jd_tfidf, all_res_tfidf)
    # 2. Semantic feature: cosine similarity of each pair's SBERT embeddings.
    semantic_similarity = _rowwise_cosine(all_jd_emb, all_res_emb)

    # 3. One row per pair, two columns, ready for XGBoost.
    return np.column_stack([tfidf_similarity, semantic_similarity])

In [None]:
# Read the labelled job-description / resume pairs from the CSV on disk.
df = pd.read_csv("job_resume_pairs.csv")

# --- Use the explicit columns ---
# Target vector: the explicit label column.
y = df['label']
# Model inputs: the two free-text columns, kept together as a two-column frame.
text_columns = ['job_description', 'resume_text']
X_text = df[text_columns]

# Class-balance summary (the label sum is reported as the count of "Fit" rows).
print(f"Dataset Size: {len(df)} samples")
print(f"Fit (1) examples: {df['label'].sum()}")
print(f"No Fit (0) examples: {len(df) - df['label'].sum()}")

In [None]:
# Stage 1: hold out 20% of all pairs as the final test set; the remaining 80%
# is the pool that is divided again below. Stratified on the label.
X_pool, X_test, y_pool, y_test = train_test_split(
    X_text,
    y,
    test_size=0.2,
    stratify=y,
    random_state=RANDOM_SEED,
)

# Stage 2: carve 25% of the pool off as the validation set, leaving 75% of the
# pool for training. Stratified on the pool's labels.
X_train, X_val, y_train, y_val = train_test_split(
    X_pool,
    y_pool,
    test_size=0.25,
    stratify=y_pool,
    random_state=RANDOM_SEED,
)

# Report the size of each partition between two separator rules.
separator = "-" * 30
print(separator)
print(f"Training Set Size: {len(X_train)}")
print(f"Validation Set Size: {len(X_val)}")
print(f"Test Set Size: {len(X_test)}")
print(separator)

In [None]:
# --- 4. Feature Transformation and XGBoost Training ---

# Boosting stops once the validation log-loss has not improved for this many
# consecutive rounds.
EARLY_STOPPING_ROUNDS = 50

# --- STEP 1: TRAIN AND SAVE TF-IDF VECTORIZER (Lexical Model) ---
print("Fitting TF-IDF Vectorizer on ALL Training Text...")
# Concatenate JD and Resume text to form the corpus for TF-IDF fitting
train_corpus = X_train['job_description'].tolist() + X_train['resume_text'].tolist()

tfidf_vectorizer = TfidfVectorizer(stop_words='english')
# FIT ONLY ON TRAINING DATA's corpus! (validation/test text must not leak in)
tfidf_vectorizer.fit(train_corpus)

# Save the fitted TF-IDF model
joblib.dump(tfidf_vectorizer, os.path.join(MODEL_DIR, 'tfidf_vectorizer.pkl'))
print(f"TF-IDF Model saved to {MODEL_DIR}...")


# --- STEP 2: HYBRID FEATURE EXTRACTION ---
print("Extracting Hybrid Features for all three sets...")
# Transform all three sets using the *fitted* TF-IDF and SBERT
X_train_hybrid = combine_hybrid_features(X_train, tfidf_vectorizer, SBERT_MODEL)
X_val_hybrid = combine_hybrid_features(X_val, tfidf_vectorizer, SBERT_MODEL)
X_test_hybrid = combine_hybrid_features(X_test, tfidf_vectorizer, SBERT_MODEL)


# --- STEP 3: TRAIN XGBOOST CLASSIFIER (Balanced Data) ---
print("Training XGBoost Classifier...")
xgb_model = XGBClassifier(
    # `use_label_encoder` is no longer passed: recent XGBoost releases ignore
    # it and only emit a warning.
    eval_metric='logloss',
    random_state=RANDOM_SEED,
    # Upper bound on the number of trees; early stopping picks the real count.
    n_estimators=1000,
    # In current XGBoost this is a constructor argument, not a fit() argument.
    early_stopping_rounds=EARLY_STOPPING_ROUNDS,
    # scale_pos_weight is omitted because the data is balanced (1:1)
)

# Train on the training features while monitoring log-loss on the validation
# features, so the held-out validation split actually guards against overfitting.
xgb_model.fit(
    X_train_hybrid, y_train,
    eval_set=[(X_val_hybrid, y_val)],
    verbose=False
)
print(f"Best iteration: {xgb_model.best_iteration}")

# Save the trained XGBoost model
joblib.dump(xgb_model, os.path.join(MODEL_DIR, 'xgb_classifier.pkl'))
print(f"XGBoost Model saved to {MODEL_DIR}...")

In [None]:
# Banner announcing the held-out evaluation.
rule = "=" * 40
print("\n" + rule)
print("  FINAL MODEL EVALUATION (TEST SET)")
print(rule)

# Score the unseen test features: probability of class 1 (second column of
# predict_proba) and the hard class decision.
y_proba = xgb_model.predict_proba(X_test_hybrid)[:, 1]
y_pred = xgb_model.predict(X_test_hybrid)

# Headline metrics: accuracy and F1 use the hard labels, log loss uses the
# class-1 probabilities.
accuracy = accuracy_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)
logloss = log_loss(y_test, y_proba)

print(f"Accuracy on Test Set: {accuracy:.4f}")
print(f"F1-Score on Test Set: {f1:.4f}")
print(f"Log Loss on Test Set: {logloss:.4f}")

# Per-class precision / recall / F1 table for the thesis write-up.
print("\nClassification Report (ISO-IEC 25010 Metrics):")
class_names = ['No Fit (0)', 'Fit (1)']
print(classification_report(y_test, y_pred, target_names=class_names))