In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import classification_report, confusion_matrix, average_precision_score, precision_score, recall_score
from sklearn.decomposition import PCA
from imblearn.over_sampling import SMOTE
import matplotlib.pyplot as plt
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from tensorflow.keras.layers import Input
from tensorflow.keras.layers import BatchNormalization
from tensorflow.keras.layers import Dropout
from tensorflow.keras.optimizers.legacy import Adam
from imblearn.under_sampling import NearMiss
from tensorflow.keras.metrics import Precision
from tensorflow.keras.metrics import Recall
from tensorflow.keras.metrics import AUC
from tensorflow.keras.callbacks import EarlyStopping, ReduceLROnPlateau
from sklearn.utils.class_weight import compute_class_weight
from sklearn.metrics import precision_recall_curve
import keras_tuner as kt
import joblib


In [2]:
original_df = pd.read_csv('pairs_with_embeddings.csv')

In [3]:
df=original_df

In [4]:
need_emb_cols = [f'need_emb_{i}' for i in range(3072)]
res_emb_cols = [f'res_emb_{i}' for i in range(3072)]

# Convert to NumPy arrays
need_embs = df[need_emb_cols].to_numpy()
res_embs = df[res_emb_cols].to_numpy()

# Compute cosine similarity row-wise
cos_sim = np.einsum('ij,ij->i', need_embs, res_embs) / (
    np.linalg.norm(need_embs, axis=1) * np.linalg.norm(res_embs, axis=1) + 1e-9
)

df['cosine_similarity'] = cos_sim

In [5]:
# Create richer features
df['l2_distance'] = np.linalg.norm(need_embs - res_embs, axis=1)
df['dot_product'] = np.einsum('ij,ij->i', need_embs, res_embs)

# Optional: elementwise interaction features
interaction = need_embs * res_embs
interaction_mean = interaction.mean(axis=1)
df['interaction_mean'] = interaction_mean

In [6]:
clusters = pd.read_csv('clusters.csv')
clusters.head()

Unnamed: 0,academic_resource_id,name
0,391,Cluster 1
1,321,Sociología_Prácticas
2,640,Cluster 2
3,225,Cluster 3
4,253,Cluster 4


In [7]:
merged = df.merge(clusters, on="academic_resource_id", how="left")
merged.head()

Unnamed: 0,need_id,need_name,need_description,need_expiration_date,need_created_at,need_internship,offer_name,offer_description,offer_semester,offer_company_year,...,res_emb_3067,res_emb_3068,res_emb_3069,res_emb_3070,res_emb_3071,cosine_similarity,l2_distance,dot_product,interaction_mean,name
0,1456,Investigación sobre los factores que inciden e...,De acuerdo a la temáticas sociales y en especi...,2019-08-04,2019-02-04 14:56:21.388586,True,Pasantía de College 1-2019,"Pasantía cocurricular de al menos 160 horas, d...",0.0,2019,...,0.048212,0.003851,-0.013416,-0.028706,-0.004475,0.411339,1.085045,0.411339,0.000134,College CCNN_Pasantías
1,5168,Apoyo y elaboración de plan de marketing para ...,La Fábrica de Renca es una entidad privada sin...,2024-03-20,2023-09-20 20:05:59.124840,True,Práctica Profesional 2-2023,Práctica profesional obligatoria de 320 horas ...,1.0,2023,...,0.039514,-0.000184,-0.006662,-0.018958,-0.012061,0.486173,1.013733,0.486173,0.000158,Comercial_Prácticas
2,5152,Taller de Intervención para Programa de Calle ...,El curso taller de intervención tiene como pro...,2024-03-06,2023-09-06 19:51:21.818464,True,Taller de Intervención 2-2023,Taller de intervención dirigido a estudiantes ...,1.0,2023,...,0.049481,-0.002994,-0.004206,-0.003688,0.003104,0.782764,0.659146,0.782764,0.000255,Trabajo Social_Práctica Inicial
3,3993,Diagnóstico de la población migrante en Renca ...,Desde el departamento de inclusión de la munic...,2022-09-22,2022-03-22 13:54:44.014323,True,Pasantía Verano-2021,"Pasantía cocurricular de al menos 160 horas, d...",2.0,2021,...,0.045297,-0.006526,-0.019708,-0.01277,-0.00596,0.520189,0.979603,0.520189,0.000169,Sociología_Prácticas
4,1907,Práctica Inicial Trabajo Social (II) - Unidad ...,Dentro de los objetivos de la Unidad Técnica d...,2020-02-19,2019-08-19 14:00:49.570249,True,Práctica Inicial II 2-2019,Segunda parte del taller de intervención dirig...,1.0,2019,...,0.046884,-0.001429,-0.001713,-0.005516,0.009988,0.396248,1.098865,0.396248,0.000129,Trabajo Social_Práctica Inicial


In [8]:
df = merged
df = df.drop(columns=['academic_resource_description','need_id','need_name','need_description','need_expiration_date','need_created_at','offer_description','offer_name','offer_semester','offer_company_year','offer_expiration_date','offer_academic_resource_id','offer_created_at','academic_resource_name'])


In [9]:
df["need_internship"] = df["need_internship"].astype(int)

In [10]:
df[df.isnull().any(axis=1)].shape

(0, 6154)

In [11]:
df = df.dropna()
df.head()

Unnamed: 0,need_internship,academic_resource_id,academic_resource_level,academic_resource_type_id,has_match,need_emb_0,need_emb_1,need_emb_2,need_emb_3,need_emb_4,...,res_emb_3067,res_emb_3068,res_emb_3069,res_emb_3070,res_emb_3071,cosine_similarity,l2_distance,dot_product,interaction_mean,name
0,1,250,1,24,1,-0.011038,0.009111,-0.007517,-0.016986,0.014604,...,0.048212,0.003851,-0.013416,-0.028706,-0.004475,0.411339,1.085045,0.411339,0.000134,College CCNN_Pasantías
1,1,205,2,21,1,-0.008638,0.036582,-0.013364,-0.018138,0.011307,...,0.039514,-0.000184,-0.006662,-0.018958,-0.012061,0.486173,1.013733,0.486173,0.000158,Comercial_Prácticas
2,1,582,1,21,1,0.010936,0.034407,-0.001854,0.008706,0.020503,...,0.049481,-0.002994,-0.004206,-0.003688,0.003104,0.782764,0.659146,0.782764,0.000255,Trabajo Social_Práctica Inicial
3,1,330,2,24,1,-0.007967,0.022626,-0.01308,-0.032534,0.038485,...,0.045297,-0.006526,-0.019708,-0.01277,-0.00596,0.520189,0.979603,0.520189,0.000169,Sociología_Prácticas
4,1,286,1,21,1,0.002075,-0.005712,-0.003436,-0.047204,0.015465,...,0.046884,-0.001429,-0.001713,-0.005516,0.009988,0.396248,1.098865,0.396248,0.000129,Trabajo Social_Práctica Inicial


In [12]:
# dummy categories columns
categorical_cols = ['academic_resource_id','academic_resource_level','academic_resource_type_id','name']
df_dummies = pd.get_dummies(df, columns=categorical_cols)
df_dummies
df = df_dummies

In [13]:
X = df.drop(columns=['has_match'])
y = df['has_match']

In [14]:
# 6. Split the data
X_train, X_temp, y_train, y_temp = train_test_split(X, y, test_size=0.2, random_state=42)
X_val, X_test, y_val, y_test = train_test_split(X_temp, y_temp, test_size=0.5, random_state=42)

In [15]:
# 5. Normalize the features
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
# Use the same scaler to transform the validation and test sets
X_val_scaled = scaler.transform(X_val)
X_test_scaled = scaler.transform(X_test)

In [16]:
smote = SMOTE(random_state=42)
X_train_resampled, y_train_resampled = smote.fit_resample(X_train_scaled, y_train)

print("Shape of original training data:", X_train_scaled.shape)
print("Shape of resampled training data:", X_train_resampled.shape)

Shape of original training data: (68108, 6664)
Shape of resampled training data: (129362, 6664)


In [17]:
callbacks = [
    EarlyStopping(monitor="val_pr_auc", mode="max", patience=5, restore_best_weights=True),
    ReduceLROnPlateau(monitor="val_pr_auc", mode="max", factor=0.5, patience=2, verbose=1),
]

In [18]:
# 7. Build the model
model = Sequential([
    Input(shape=(X_train_resampled.shape[1],)),

    Dense(96, activation='relu'),
    BatchNormalization(),
    Dropout(0.4),

    Dense(64, activation='relu'),
    BatchNormalization(),
    Dropout(0.3),

    Dense(1, activation='sigmoid')  # clasificación binaria
])

In [19]:
pr_auc = AUC(curve="PR", name="pr_auc")
model.compile(
    optimizer=Adam(learning_rate=1e-3),
    loss="binary_crossentropy",
    metrics=[Precision(name="precision"), Recall(name="recall"), AUC(name="auc"), pr_auc],
)
model.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 dense (Dense)               (None, 96)                639840    
                                                                 
 batch_normalization (Batch  (None, 96)                384       
 Normalization)                                                  
                                                                 
 dropout (Dropout)           (None, 96)                0         
                                                                 
 dense_1 (Dense)             (None, 64)                6208      
                                                                 
 batch_normalization_1 (Bat  (None, 64)                256       
 chNormalization)                                                
                                                                 
 dropout_1 (Dropout)         (None, 64)                0

In [20]:
# 8. Train the model
history = model.fit(
    X_train_resampled, y_train_resampled,
    validation_data=(X_val_scaled, y_val),
    epochs=100,
    batch_size=32,
    callbacks=callbacks,
    # class_weight=class_weight_dict
)

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 5: ReduceLROnPlateau reducing learning rate to 0.0005000000237487257.
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 10: ReduceLROnPlateau reducing learning rate to 0.0002500000118743628.
Epoch 11/100
Epoch 12/100
Epoch 12: ReduceLROnPlateau reducing learning rate to 0.0001250000059371814.
Epoch 13/100


In [36]:
model=best_model

In [37]:
# 9. Evaluate the model
metrics = model.evaluate(X_test_scaled, y_test)
print(metrics)

[0.09717106819152832, 0.7047492265701294]


In [35]:
# 10. Plot training history
plt.figure(figsize=(12, 5))

# Precision
plt.plot(history.history['precision'], label='Train Precision')
plt.plot(history.history['val_precision'], label='Val Precision')

# Recall
plt.plot(history.history['recall'], label='Train Recall')
plt.plot(history.history['val_recall'], label='Val Recall')

# PR-AUC
plt.plot(history.history['pr_auc'], label='Train PR-AUC')
plt.plot(history.history['val_pr_auc'], label='Val PR-AUC')

plt.xlabel('Epoch')
plt.ylabel('Score')
plt.title('Training History (Precision, Recall, PR-AUC)')
plt.legend()
plt.grid(True)
plt.show()

KeyError: 'precision'

<Figure size 1200x500 with 0 Axes>

In [38]:
y_probs = model.predict(X_val_scaled)
prec, rec, thresholds = precision_recall_curve(y_val, y_probs)
f2_scores = (5 * prec * rec) / (4 * prec + rec + 1e-9)
best_idx = np.argmax(f2_scores)
best_threshold = thresholds[best_idx]
print("Best threshold for F2:", best_threshold)
print("Precision:", prec[best_idx], "Recall:", rec[best_idx], "F2:", f2_scores[best_idx])

Best threshold for F2: 0.209748
Precision: 0.6015625 Recall: 0.7530562347188264 F2: 0.7169459960486737


In [39]:
for thr in np.arange(0, 1.0, 0.01):
    y_pred = (y_probs >= thr).astype(int)
    precision = precision_score(y_val, y_pred)
    recall = recall_score(y_val, y_pred)
    pr_auc = average_precision_score(y_val, y_probs)
    print(f"Threshold: {thr:.2f}, Precision: {precision:.2f}, Recall: {recall:.2f}, PR AUC: {pr_auc:.2f}")

Threshold: 0.00, Precision: 0.05, Recall: 1.00, PR AUC: 0.70
Threshold: 0.01, Precision: 0.28, Recall: 0.89, PR AUC: 0.70
Threshold: 0.02, Precision: 0.33, Recall: 0.86, PR AUC: 0.70
Threshold: 0.03, Precision: 0.37, Recall: 0.85, PR AUC: 0.70
Threshold: 0.04, Precision: 0.39, Recall: 0.84, PR AUC: 0.70
Threshold: 0.05, Precision: 0.41, Recall: 0.82, PR AUC: 0.70
Threshold: 0.06, Precision: 0.42, Recall: 0.81, PR AUC: 0.70
Threshold: 0.07, Precision: 0.44, Recall: 0.80, PR AUC: 0.70
Threshold: 0.08, Precision: 0.46, Recall: 0.79, PR AUC: 0.70
Threshold: 0.09, Precision: 0.47, Recall: 0.79, PR AUC: 0.70
Threshold: 0.10, Precision: 0.48, Recall: 0.79, PR AUC: 0.70
Threshold: 0.11, Precision: 0.50, Recall: 0.79, PR AUC: 0.70
Threshold: 0.12, Precision: 0.51, Recall: 0.78, PR AUC: 0.70
Threshold: 0.13, Precision: 0.52, Recall: 0.77, PR AUC: 0.70
Threshold: 0.14, Precision: 0.53, Recall: 0.77, PR AUC: 0.70
Threshold: 0.15, Precision: 0.54, Recall: 0.76, PR AUC: 0.70
Threshold: 0.16, Precisi

In [40]:
y_probs = model.predict(X_test_scaled)
y_pred = (y_probs > 0.05).astype("int32")
print(confusion_matrix(y_test, y_pred))
print(classification_report(y_test, y_pred))
pr_auc = average_precision_score(y_test, y_probs)
print("PR AUC:", pr_auc)

[[7582  499]
 [  63  370]]
              precision    recall  f1-score   support

           0       0.99      0.94      0.96      8081
           1       0.43      0.85      0.57       433

    accuracy                           0.93      8514
   macro avg       0.71      0.90      0.77      8514
weighted avg       0.96      0.93      0.94      8514

PR AUC: 0.7087053164710877


In [41]:
test_precision = precision_score(y_test, y_pred)
test_recall = recall_score(y_test, y_pred)

print(f"Test Set Metrics for Filename -> Precision: {test_precision:.3f}, Recall: {test_recall:.3f}")

Test Set Metrics for Filename -> Precision: 0.426, Recall: 0.855


In [43]:
base_filename = (
    f"prauc_{pr_auc:.3f}_"
    f"thresh_{best_threshold:.3f}_"
    f"prec_{test_precision:.3f}_"
    f"rec_{test_recall:.3f}"
)

model_filename = f"model_{base_filename}.keras"
scaler_filename = f"scaler_{base_filename}.pkl"

print(f"Saving model to: {model_filename}")
model.save(model_filename)
print(f"Saving scaler to: {scaler_filename}")
joblib.dump(scaler, scaler_filename)


Saving model to: model_prauc_0.709_thresh_0.210_prec_0.426_rec_0.855.keras
Saving scaler to: scaler_prauc_0.709_thresh_0.210_prec_0.426_rec_0.855.pkl


['scaler_prauc_0.709_thresh_0.210_prec_0.426_rec_0.855.pkl']

In [30]:
import keras_tuner as kt

# 1. Create a model-building function
def build_model(hp):
    model = Sequential()
    model.add(Input(shape=(X_train_scaled.shape[1],)))

    # Tune the number of units in the first Dense layer
    hp_units_1 = hp.Int('units_1', min_value=32, max_value=1024, step=32)
    model.add(Dense(units=hp_units_1, activation='relu'))
    model.add(BatchNormalization())
    # Tune the dropout rate
    hp_dropout_1 = hp.Float('dropout_1', min_value=0.2, max_value=0.5, step=0.1)
    model.add(Dropout(hp_dropout_1))

    # Add another tunable hidden layer
    hp_units_2 = hp.Int('units_2', min_value=32, max_value=512, step=32)
    model.add(Dense(units=hp_units_2, activation='relu'))
    model.add(BatchNormalization())
    hp_dropout_2 = hp.Float('dropout_2', min_value=0.2, max_value=0.5, step=0.1)
    model.add(Dropout(hp_dropout_2))

    model.add(Dense(1, activation='sigmoid'))

    # Tune the learning rate for the optimizer
    hp_learning_rate = hp.Choice('learning_rate', values=[1e-3, 1e-4, 1e-5])
    
    pr_auc = AUC(curve="PR", name="pr_auc")
    model.compile(optimizer=Adam(learning_rate=hp_learning_rate),
                  loss='binary_crossentropy',
                  metrics=[pr_auc])
    return model

# 2. Instantiate the tuner
# We'll use Hyperband, an efficient algorithm for finding good hyperparameters
tuner = kt.Hyperband(build_model,
                     objective=kt.Objective("val_pr_auc", direction="max"), # Your key metric
                     max_epochs=20,
                     factor=3,
                     directory='models',
                     project_name='match_tuning')

# Define an early stopping callback to prevent wasting time on bad trials
stop_early = EarlyStopping(monitor='val_pr_auc', mode='max', patience=5)

# 3. Run the search
tuner.search(X_train_scaled, y_train,
             epochs=50,
             validation_data=(X_val_scaled, y_val),
             callbacks=[stop_early])

# Get the optimal hyperparameters
best_hps = tuner.get_best_hyperparameters(num_trials=1)[0]

Reloading Tuner from models/match_tuning/tuner0.json


In [31]:
print(f"""
The hyperparameter search is complete.
Optimal units in first layer: {best_hps.get('units_1')}
Optimal dropout in first layer: {best_hps.get('dropout_1')}
Optimal units in second layer: {best_hps.get('units_2')}
Optimal dropout in second layer: {best_hps.get('dropout_2')}
Optimal learning rate: {best_hps.get('learning_rate')}
""")


The hyperparameter search is complete.
Optimal units in first layer: 96
Optimal dropout in first layer: 0.4
Optimal units in second layer: 64
Optimal dropout in second layer: 0.30000000000000004
Optimal learning rate: 0.001



In [32]:
# Build the model with the optimal hyperparameters and train it
best_model = tuner.hypermodel.build(best_hps)
history = best_model.fit(X_train_scaled, y_train, epochs=50, validation_data=(X_val_scaled, y_val), callbacks=[stop_early])

Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50


In [51]:
# save the models hyperparameters on a .txt with the same base_filename
hyperparams = f"""
The hyperparameter search is complete.
Optimal units in first layer: {best_hps.get('units_1')}
Optimal dropout in first layer: {best_hps.get('dropout_1')}
Optimal units in second layer: {best_hps.get('units_2')}
Optimal dropout in second layer: {best_hps.get('dropout_2')}
Optimal learning rate: {best_hps.get('learning_rate')}
"""
with open(f"hyperparams_{base_filename}.txt", "w") as f:
    f.write(hyperparams)

In [66]:
import lightgbm as lgb
from sklearn.metrics import classification_report, confusion_matrix

neg = (y_train == 0).sum()
pos = (y_train == 1).sum()
scale_pos_weight = neg / pos

# 1. Initialize and train the LightGBM model
# Use is_unbalanced=True to let the model handle the class imbalance.
# Train on the original (but scaled) training data, NOT the SMOTE'd data.
lgbm = lgb.LGBMClassifier(objective='binary',
                          n_estimators=1000,
                          learning_rate=0.005,
                          random_state=42,
    scale_pos_weight=scale_pos_weight)

# Train the model
lgbm.fit(X_train, y_train,
         eval_set=[(X_val, y_val)],
         eval_metric="average_precision",
         callbacks=[
             lgb.early_stopping(stopping_rounds=10), # Added stopping_rounds for clarity
             lgb.log_evaluation(period=10)            # This is the line that fixes the error
         ])


# 2. Find the optimal threshold for LightGBM on the validation set (same as you did for the NN)
y_probs_lgbm_val = lgbm.predict_proba(X_val)[:, 1]
prec_lgbm, rec_lgbm, thresholds_lgbm = precision_recall_curve(y_val, y_probs_lgbm_val)
f2_scores_lgbm = (5 * prec_lgbm * rec_lgbm) / (4 * prec_lgbm + rec_lgbm + 1e-9)
best_threshold_lgbm = thresholds_lgbm[np.argmax(f2_scores_lgbm)]

print(f"\nBest Threshold for LightGBM (F2-score): {best_threshold_lgbm:.4f}")

# Evaluate on the test set with the optimized threshold
y_probs_lgbm_test = lgbm.predict_proba(X_test)[:, 1]
y_pred_lgbm_test = (y_probs_lgbm_test >= best_threshold_lgbm).astype(int)

print("\n--- LightGBM Performance on Test Set ---")
print(confusion_matrix(y_test, y_pred_lgbm_test))
print(classification_report(y_test, y_pred_lgbm_test))
print("PR AUC:", average_precision_score(y_test, y_probs_lgbm_test))

[LightGBM] [Info] Number of positive: 3427, number of negative: 64681
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 1.462660 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1497459
[LightGBM] [Info] Number of data points in the train set: 68108, number of used features: 6629
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.050317 -> initscore=-2.937782
[LightGBM] [Info] Start training from score -2.937782
Training until validation scores don't improve for 10 rounds
[10]	valid_0's average_precision: 0.271302	valid_0's binary_logloss: 0.183753
[20]	valid_0's average_precision: 0.282293	valid_0's binary_logloss: 0.182261
Early stopping, best iteration is:
[18]	valid_0's average_precision: 0.281686	valid_0's binary_logloss: 0.182231

Best Threshold for LightGBM (F2-score): 0.0983

--- LightGBM Performance on Test Set ---
[[7184  897]
 [ 139  294]]
              precision    recall  f1-score   support



In [67]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix, precision_recall_curve

# 1. Initialize and train the Random Forest model
print("--- Training Random Forest ---")
rf = RandomForestClassifier(
    n_estimators=200,          # The number of trees in the forest
    class_weight='balanced',   # Handles class imbalance
    random_state=42,
    n_jobs=-1                  # Use all available CPU cores
)

# Train on the original (un-SMOTEd) data
rf.fit(X_train, y_train)

# 2. Find the optimal threshold on the validation set
y_probs_rf_val = rf.predict_proba(X_val)[:, 1]
prec_rf, rec_rf, thresholds_rf = precision_recall_curve(y_val, y_probs_rf_val)
f2_scores_rf = (5 * prec_rf * rec_rf) / (4 * prec_rf + rec_rf + 1e-9)
best_threshold_rf = thresholds_rf[np.argmax(f2_scores_rf)]

print(f"\nBest Threshold for Random Forest (F2-score): {best_threshold_rf:.4f}")

# 3. Evaluate on the test set with the optimized threshold
y_probs_rf_test = rf.predict_proba(X_test)[:, 1]
y_pred_rf_test = (y_probs_rf_test >= best_threshold_rf).astype(int)

print("\n--- Random Forest Performance on Test Set ---")
print(confusion_matrix(y_test, y_pred_rf_test))
print(classification_report(y_test, y_pred_rf_test))
print("PR AUC:", average_precision_score(y_test, y_probs_rf_test))

--- Training Random Forest ---

Best Threshold for Random Forest (F2-score): 0.1000

--- Random Forest Performance on Test Set ---
[[7592  489]
 [ 133  300]]
              precision    recall  f1-score   support

           0       0.98      0.94      0.96      8081
           1       0.38      0.69      0.49       433

    accuracy                           0.93      8514
   macro avg       0.68      0.82      0.73      8514
weighted avg       0.95      0.93      0.94      8514

PR AUC: 0.44253688325846163


In [70]:
from xgboost import XGBClassifier

# 1. Calculate the weight for the positive class
# ratio of negative samples to positive samples
scale_pos_weight = y_train.value_counts()[0] / y_train.value_counts()[1]

# 2. Initialize and train the XGBoost model
print("\n--- Training XGBoost ---")
xgb = XGBClassifier(
    objective='binary:logistic',
    eval_metric='aucpr',
    scale_pos_weight=scale_pos_weight, # Handles class imbalance
    n_estimators=2000,
    learning_rate=0.05,
    random_state=42,
    n_jobs=-1
)

# Train the model with early stopping
xgb.fit(
    X_train, y_train,
    eval_set=[(X_val, y_val)],
    verbose=False  # Set to True to see training progress
)

# 3. Find the optimal threshold on the validation set
y_probs_xgb_val = xgb.predict_proba(X_val)[:, 1]
prec_xgb, rec_xgb, thresholds_xgb = precision_recall_curve(y_val, y_probs_xgb_val)
f2_scores_xgb = (5 * prec_xgb * rec_xgb) / (4 * prec_xgb + rec_xgb + 1e-9)
best_threshold_xgb = thresholds_xgb[np.argmax(f2_scores_xgb)]

print(f"\nBest Threshold for XGBoost (F2-score): {best_threshold_xgb:.4f}")

# 4. Evaluate on the test set with the optimized threshold
y_probs_xgb_test = xgb.predict_proba(X_test)[:, 1]
y_pred_xgb_test = (y_probs_xgb_test >= best_threshold_xgb).astype(int)

print("\n--- XGBoost Performance on Test Set ---")
print(confusion_matrix(y_test, y_pred_xgb_test))
print(classification_report(y_test, y_pred_xgb_test))
print("PR AUC:", average_precision_score(y_test, y_probs_xgb_test))


--- Training XGBoost ---

Best Threshold for XGBoost (F2-score): 0.0496

--- XGBoost Performance on Test Set ---
[[7692  389]
 [ 116  317]]
              precision    recall  f1-score   support

           0       0.99      0.95      0.97      8081
           1       0.45      0.73      0.56       433

    accuracy                           0.94      8514
   macro avg       0.72      0.84      0.76      8514
weighted avg       0.96      0.94      0.95      8514

PR AUC: 0.5720201585437346


In [53]:
# --- 1. Preparation: Create the Results DataFrame (Run this once) ---
# This example uses your champion Neural Network model.
# You can swap 'model' with 'xgb' or another trained model to see its predictions.

# Get predictions on the test set using your best model and threshold
y_probs_test = model.predict(X_test_scaled)
y_pred_test = (y_probs_test > best_threshold).astype(int) # Use the threshold you found for the NN

# Create a DataFrame with predictions linked back to the original data
results_df = merged.loc[X_test.index].copy()
results_df['has_match'] = y_test
results_df['probability'] = y_probs_test
results_df['prediction'] = y_pred_test

# Select the most relevant columns for a clean view
view_cols = ['need_id', 'need_name', 'offer_name', 'name', 'has_match', 'prediction', 'probability']
final_results = results_df[view_cols]

print("Prediction results have been prepared.")


# --- 2. The Interactive Function ---
def display_need_predictions(need_id, results_df):
    """
    Displays a summary of model predictions for a specific need_id,
    sorted by probability.
    """
    # Filter the results for the specified need_id
    group = results_df[results_df['need_id'] == need_id]

    # Check if the need_id exists in the test set results
    if group.empty:
        print(f"Sorry, Need ID: {need_id} was not found in the test set.")
        return

    # Sort the results by the model's predicted probability
    group = group.sort_values('probability', ascending=False)
    
    need_name = group['need_name'].iloc[0]
    
    print(f"\n--- Predictions for Need ID: {need_id} ({need_name[:50]}...) ---")
    
    # Calculate summary stats for this need
    true_matches = group['has_match'].sum()
    predicted_matches = group['prediction'].sum()
    correctly_predicted = group[(group['has_match'] == 1) & (group['prediction'] == 1)].shape[0]

    print(f"Summary: The model found {correctly_predicted} of the {true_matches} true matches for this need.")
    print("-" * 80)
    
    # Print the ranked list of potential offers for this need
    print(group[['offer_name', 'name', 'has_match', 'prediction', 'probability']].to_string(index=False))
    print("-" * 80)

# --- 3. How to Use the Function ---

# First, find some interesting need_ids to inspect from your test set
# (e.g., ones that we know have at least one true match)
needs_with_matches_in_test = final_results[final_results['has_match'] == 1]['need_id'].unique()
print("\nYou can inspect the following need_ids from the test set:")
print(needs_with_matches_in_test)


Prediction results have been prepared.

You can inspect the following need_ids from the test set:
[ 4618  8288   326 12015   765  4208   931  5552  4698   367  3493  5112
 11388  1419  1684  5444  1906 12418 11355  3877  4116   325  3202 10531
 12609  5056   163 14622 17955  4231   531 11558 12081  4657  2103  4822
  1003  6867 17493 18087  3909   964  1972  1167 14919  5278  3680   607
  1126  2762   949  2757   644   514  4325 18149   331  3326   610  4550
  2302  3047  4151  4040  5514   932  5352  2316 16569  5189  5001  4175
  4898 10497  1925   630  3225  1192  2525 17757  5371  5621  3169   577
  8716   370  2432  5641  2190  5353  1127   849  4485 16404  5163  3522
 17597  5201 12807 17692  1461  4063 10333  1747  4447  2593  5072  3824
  3128   995  2752  4190  1797    36  2590 12840 14688  1838  3875  5354
 12775  4604  4742  5094  3726   407  5159  5522  3408  3434  5466 17988
  3293 17889  5095   113  3427   551  2591   467  3882 13116 16075  4658
   994  7165  4563  5745  

In [56]:

# Now, call the function with a need_id of your choice
# Example:
display_need_predictions(4179, final_results)
display_need_predictions(1717, final_results)


--- Predictions for Need ID: 4179 (Identificación y reconocimiento histórico de áreas...) ---
Summary: The model found 0 of the 1 true matches for this need.
--------------------------------------------------------------------------------
                   offer_name                name  has_match  prediction  probability
              Pasantía 2-2022 Geografía_Prácticas          1           0 8.184763e-04
Gestión de Operaciones 2-2022          Cluster 54          0           0 1.882446e-08
             Ergonomia 1-2023          Cluster 74          0           0 8.812553e-09
--------------------------------------------------------------------------------

--- Predictions for Need ID: 1717 (Gestión del proceso de provisión de bienes y servi...) ---
Summary: The model found 0 of the 1 true matches for this need.
--------------------------------------------------------------------------------
                                                    offer_name                       name  has_

In [55]:
# Filter for rows that are False Negatives (True Match = 1, Predicted = 0)
missed_matches_df = final_results[(final_results['has_match'] == 1) & (final_results['prediction'] == 0)]

# Get a unique list of the need_ids from that filtered DataFrame
needs_with_missed_matches = missed_matches_df['need_id'].unique()

print("The following need_ids had at least one true match that the model failed to predict:")
print(needs_with_missed_matches)

The following need_ids had at least one true match that the model failed to predict:
[ 3493  1419  4116   325  5056   531  2103  4822  6867  1126   610  2302
  3225  1192  5641  2190  1127   849  4485 12807    36  2590 12775  4742
  5522  5466   113  5297  1278   888  2201   476   613  1352  2855    63
   198  3510 11563  1483   179  4179  1604  2186  1044  3120  2944  1290
  3608  1180  1568   794  1717  4323  1924  2769  2683  4642  4648  4927
  1049 11555  3054  9015  1115   249  5192  2597  5366   779  3532  5432
  2448  1651 11125   127  4293   125  1010  5643  5433]


In [57]:
missed_matches_df.head()

Unnamed: 0,need_id,need_name,offer_name,name,has_match,prediction,probability
274,3493,Propuestas para aumentar movilidad sostenible ...,Movilidad Sostenible y Ciclo-Inclusión Como Re...,Cluster 136,1,0,0.008588
379,1419,Diseño del departamento municipal de la inter...,Proyecto de titulación Magister CP 1-2019,Cluster 142,1,0,0.024479
3742,4116,Talleres Cuidado y Bienestar Vocal para Educad...,Pasantía de Fonoaudiología 2-2022,Cluster 111,1,0,0.004254
848,325,Modelo de costeo de prestaciones para prioriza...,Práctica Profesional Verano-2017,Comercial_Prácticas,1,0,0.028921
419,5056,Diagnóstico para la implementación de sistemas...,Sistemas de Información 1-2023,Cluster 80,1,0,0.000306


# 