In [1]:
# Libraries
import pandas as pd
import numpy as np
import sklearn
from sklearn.preprocessing import LabelEncoder
import requests
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import accuracy_score, log_loss, roc_curve, recall_score, confusion_matrix, classification_report, roc_auc_score, precision_recall_curve, auc
from imblearn.under_sampling import RandomUnderSampler
from imblearn.over_sampling import RandomOverSampler
import xgboost as xgb
from sklearn.dummy import DummyClassifier

from datasets import load_dataset

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Load the dataset
data = load_dataset('criteo/Fairjob')
df = data['train'].to_pandas()

In [9]:
import numpy as np
import matplotlib.pyplot as plt
from sklearn.metrics import roc_curve, auc

# Utils

def click_rank_utility(df, y_pred):
    """
    Implémente la fonction Click-Rank Utility U(f).
    """
    df_test = df.loc[y_pred.index].copy()
    df_test = df_test.assign(pred=y_pred)

    # Evite le biais de position
    df_test = df_test[df_test['displayrandom'] == 1].copy()

    # Trier les produits affichés dans chaque impression par score de prédiction croissant
    df_test = df_test.sort_values(by=['impression_id', 'pred'], ascending=[True, True])  

    # Calcul du rank
    df_test['rank_order'] = df_test.groupby('impression_id')['pred'].rank(method='min', ascending=True)

    # I(Y_D = 1) * rank_J f(X_J, D)
    df_test['utility'] = df_test['click'] * df_test['rank_order']

    # Moyenne de U(f) sur chaque impression’
    click_rank_per_impression = df_test.groupby('impression_id')['utility'].mean()

    # Moyenne finale sur toutes les bannières
    return click_rank_per_impression.mean()

def nllh(y_true, y_pred):
    y_pred_np = np.array(y_pred) + 1e-15
    y_true_np = np.array(y_true)
    
    epsilon = 1e-15

    y_pred_clip = np.clip(y_pred_np, epsilon, 1 - epsilon)

    log_loss_np = y_true_np * np.log(y_pred_clip) + (1 - y_true_np) * np.log(1 - y_pred_clip)
    return -np.mean(log_loss_np)

def DP(df, y_pred : pd.Series):

    df_test = df.loc[y_pred.index].copy()
    df_test['y_pred'] = y_pred

    df_senior = df_test [df_test['senior'] == 1]
    df_senior_male = df_senior[df_senior['protected_attribute'] == 1]
    df_senior_female = df_senior[df_senior['protected_attribute'] == 0]
    return np.mean(df_senior_male['y_pred']) - np.mean(df_senior_female['y_pred'])

def AUC(y_true, y_pred_prob):
    fpr, tpr, _ = roc_curve(y_true, y_pred_prob)
    return auc(fpr, tpr)

def report_results(df, y_pred_prob, y_pred):

    if not isinstance(y_pred, pd.Series):
        raise ValueError("y_pred must be a pandas Series")
    
    df_test = df.loc[y_pred.index].copy()
    y_true = df_test['click']

    click_rank_value = click_rank_utility(df, y_pred)
    nllh_value = nllh(y_true, y_pred_prob)
    auc_value = AUC(y_true, y_pred_prob)
    dp_value = DP(df, y_pred)

    return {
    'Click Rank Utility': round(click_rank_value, 5),
    'Negative Log-Likelihood': round(nllh_value, 5),
    'AUC': round(auc_value, 5),
    'Demographic Parity': round(dp_value, 5)
}


In [10]:
#Test des metriques

# Séparer les features et la target
X = df.drop(columns=["click"])
y = df["click"]

# Splitter les données en train/test
X_train1, X_test1, y_train1, y_test1 = train_test_split(X, y, test_size=0.2, random_state=42)

# Convertir les données au format DMatrix (format optimisé pour XGBoost)
dtrain1 = xgb.DMatrix(X_train1, label=y_train1)
dtest1 = xgb.DMatrix(X_test1, label=y_test1)

# Définir les paramètres du modèle
params = {
    'objective': 'binary:logistic',  # Classification binaire
    'eval_metric': 'auc',            # Utiliser l'AUC comme métrique
    'max_depth': 6,                  # Profondeur maximale des arbres
    'eta': 0.3,                      # Taux d'apprentissage
    'subsample': 0.8,                # Sous-échantillonnage des lignes
    'colsample_bytree': 0.8          # Sous-échantillonnage des colonnes
}

# Entraîner le modèle XGBoost
model1 = xgb.train(params, dtrain1, num_boost_round=100)

# Générer les prédictions de probabilités
y_pred_prob1 = model1.predict(dtest1)

# Générer les prédictions binaires avec un seuil de 0.5
y_pred1 = (y_pred_prob1 >= 0.5).astype(int)

# Transformer les prédictions en Series avec les bons index
y_pred_prob1 = pd.Series(y_pred_prob1, index=X_test1.index)
y_pred1 = pd.Series(y_pred1, index=X_test1.index)

# Afficher les métriques
report_results(df, y_pred_prob1, y_pred1)


{'Click Rank Utility': np.float64(0.00853),
 'Negative Log-Likelihood': np.float64(0.03439),
 'AUC': np.float64(0.83964),
 'Demographic Parity': np.float64(-0.00023)}

In [11]:
# Entrainement d'un XGBoost unfair

import optuna
import xgboost as xgb
from sklearn.model_selection import train_test_split, cross_val_score
import pandas as pd
import numpy as np


X = df.drop(columns=["click"])
y = df["click"]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

def objective(trial):
    """ Fonction d'optimisation pour choisir les meilleurs hyperparamètres """
    params = {
        'max_depth': trial.suggest_int('max_depth', 3, 10),
        'subsample': trial.suggest_float('subsample', 0.5, 1.0),
        'n_estimators': trial.suggest_int('n_estimators', 50, 300),
        'subsample': trial.suggest_float('subsample', 0.5, 1.0),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.5, 1.0),
        'objective': 'binary:logistic',
        'eval_metric': 'auc'
    }
    
    model = xgb.XGBClassifier(**params)
    score = cross_val_score(model, X_train, y_train, scoring='roc_auc', cv=3).mean()

    #print(f"Trial {trial.number}: AUC = {score:.5f}, Params = {params}")
    
    return score

# Effectuer N essais pour le tuning
N_trials = 50
study = optuna.create_study(direction='maximize')
study.optimize(objective, n_trials=N_trials)

best_params = study.best_params
print("Best parameters:", best_params)

# Entraîner le modèle final avec les meilleurs paramètres
best_model = xgb.XGBClassifier(**best_params)
best_model.fit(X_train, y_train)

y_pred_prob1 = best_model.predict_proba(X_test)[:, 1]
y_pred1 = (y_pred_prob1 >= 0.5).astype(int)

# Transformer les prédictions en Series avec les bons index
y_pred_prob1 = pd.Series(y_pred_prob1, index=X_test.index)
y_pred1 = pd.Series(y_pred1, index=X_test.index)

# Afficher les métriques
xgb_unfair_results = report_results(df, y_pred_prob1, y_pred1)
xgb_unfair_results

[I 2025-02-12 15:33:34,626] A new study created in memory with name: no-name-3a6234bb-ba9a-45e6-a354-8d23df0420cd
[I 2025-02-12 15:33:57,787] Trial 0 finished with value: 0.8079871086135838 and parameters: {'max_depth': 9, 'subsample': 0.7934362249624916, 'n_estimators': 129, 'colsample_bytree': 0.8846381704202704}. Best is trial 0 with value: 0.8079871086135838.


Trial 0: AUC = 0.80799, Params = {'max_depth': 9, 'subsample': 0.7934362249624916, 'n_estimators': 129, 'colsample_bytree': 0.8846381704202704, 'objective': 'binary:logistic', 'eval_metric': 'auc'}


[I 2025-02-12 15:34:33,102] Trial 1 finished with value: 0.7970314490310524 and parameters: {'max_depth': 8, 'subsample': 0.7009062773765784, 'n_estimators': 235, 'colsample_bytree': 0.9477720814985668}. Best is trial 0 with value: 0.8079871086135838.


Trial 1: AUC = 0.79703, Params = {'max_depth': 8, 'subsample': 0.7009062773765784, 'n_estimators': 235, 'colsample_bytree': 0.9477720814985668, 'objective': 'binary:logistic', 'eval_metric': 'auc'}


[I 2025-02-12 15:34:46,997] Trial 2 finished with value: 0.8421657757049799 and parameters: {'max_depth': 4, 'subsample': 0.9610563947950373, 'n_estimators': 151, 'colsample_bytree': 0.8050024121346935}. Best is trial 2 with value: 0.8421657757049799.


Trial 2: AUC = 0.84217, Params = {'max_depth': 4, 'subsample': 0.9610563947950373, 'n_estimators': 151, 'colsample_bytree': 0.8050024121346935, 'objective': 'binary:logistic', 'eval_metric': 'auc'}


[I 2025-02-12 15:35:14,738] Trial 3 finished with value: 0.8160186018553395 and parameters: {'max_depth': 6, 'subsample': 0.8828689622263044, 'n_estimators': 255, 'colsample_bytree': 0.9048788275031903}. Best is trial 2 with value: 0.8421657757049799.


Trial 3: AUC = 0.81602, Params = {'max_depth': 6, 'subsample': 0.8828689622263044, 'n_estimators': 255, 'colsample_bytree': 0.9048788275031903, 'objective': 'binary:logistic', 'eval_metric': 'auc'}


[I 2025-02-12 15:35:30,788] Trial 4 finished with value: 0.8340041549520266 and parameters: {'max_depth': 5, 'subsample': 0.97754818230519, 'n_estimators': 162, 'colsample_bytree': 0.8774965860895325}. Best is trial 2 with value: 0.8421657757049799.


Trial 4: AUC = 0.83400, Params = {'max_depth': 5, 'subsample': 0.97754818230519, 'n_estimators': 162, 'colsample_bytree': 0.8774965860895325, 'objective': 'binary:logistic', 'eval_metric': 'auc'}


[I 2025-02-12 15:35:52,267] Trial 5 finished with value: 0.8126961529131812 and parameters: {'max_depth': 6, 'subsample': 0.6685557168978757, 'n_estimators': 186, 'colsample_bytree': 0.8969628024847198}. Best is trial 2 with value: 0.8421657757049799.


Trial 5: AUC = 0.81270, Params = {'max_depth': 6, 'subsample': 0.6685557168978757, 'n_estimators': 186, 'colsample_bytree': 0.8969628024847198, 'objective': 'binary:logistic', 'eval_metric': 'auc'}


[I 2025-02-12 15:36:07,827] Trial 6 finished with value: 0.8225369835599029 and parameters: {'max_depth': 7, 'subsample': 0.8010874160377506, 'n_estimators': 115, 'colsample_bytree': 0.7192993618825296}. Best is trial 2 with value: 0.8421657757049799.


Trial 6: AUC = 0.82254, Params = {'max_depth': 7, 'subsample': 0.8010874160377506, 'n_estimators': 115, 'colsample_bytree': 0.7192993618825296, 'objective': 'binary:logistic', 'eval_metric': 'auc'}


[I 2025-02-12 15:36:28,063] Trial 7 finished with value: 0.8123145703543285 and parameters: {'max_depth': 7, 'subsample': 0.6229621975688122, 'n_estimators': 151, 'colsample_bytree': 0.6001299524003234}. Best is trial 2 with value: 0.8421657757049799.


Trial 7: AUC = 0.81231, Params = {'max_depth': 7, 'subsample': 0.6229621975688122, 'n_estimators': 151, 'colsample_bytree': 0.6001299524003234, 'objective': 'binary:logistic', 'eval_metric': 'auc'}


[I 2025-02-12 15:36:37,405] Trial 8 finished with value: 0.8380399352513387 and parameters: {'max_depth': 7, 'subsample': 0.8603426402113679, 'n_estimators': 55, 'colsample_bytree': 0.7900143710972038}. Best is trial 2 with value: 0.8421657757049799.


Trial 8: AUC = 0.83804, Params = {'max_depth': 7, 'subsample': 0.8603426402113679, 'n_estimators': 55, 'colsample_bytree': 0.7900143710972038, 'objective': 'binary:logistic', 'eval_metric': 'auc'}


[I 2025-02-12 15:37:08,008] Trial 9 finished with value: 0.8027607111384015 and parameters: {'max_depth': 8, 'subsample': 0.7278086363573799, 'n_estimators': 211, 'colsample_bytree': 0.6910946354478098}. Best is trial 2 with value: 0.8421657757049799.


Trial 9: AUC = 0.80276, Params = {'max_depth': 8, 'subsample': 0.7278086363573799, 'n_estimators': 211, 'colsample_bytree': 0.6910946354478098, 'objective': 'binary:logistic', 'eval_metric': 'auc'}


[I 2025-02-12 15:37:29,182] Trial 10 finished with value: 0.8363955230030401 and parameters: {'max_depth': 3, 'subsample': 0.5114907792628439, 'n_estimators': 285, 'colsample_bytree': 0.6080154277718297}. Best is trial 2 with value: 0.8421657757049799.


Trial 10: AUC = 0.83640, Params = {'max_depth': 3, 'subsample': 0.5114907792628439, 'n_estimators': 285, 'colsample_bytree': 0.6080154277718297, 'objective': 'binary:logistic', 'eval_metric': 'auc'}


[I 2025-02-12 15:37:35,744] Trial 11 finished with value: 0.8435656736131713 and parameters: {'max_depth': 3, 'subsample': 0.9856508428448265, 'n_estimators': 52, 'colsample_bytree': 0.7884426027274445}. Best is trial 11 with value: 0.8435656736131713.


Trial 11: AUC = 0.84357, Params = {'max_depth': 3, 'subsample': 0.9856508428448265, 'n_estimators': 52, 'colsample_bytree': 0.7884426027274445, 'objective': 'binary:logistic', 'eval_metric': 'auc'}


[I 2025-02-12 15:37:42,083] Trial 12 finished with value: 0.8429921408054319 and parameters: {'max_depth': 3, 'subsample': 0.9909669461182908, 'n_estimators': 51, 'colsample_bytree': 0.7972438621712953}. Best is trial 11 with value: 0.8435656736131713.


Trial 12: AUC = 0.84299, Params = {'max_depth': 3, 'subsample': 0.9909669461182908, 'n_estimators': 51, 'colsample_bytree': 0.7972438621712953, 'objective': 'binary:logistic', 'eval_metric': 'auc'}


[I 2025-02-12 15:37:48,773] Trial 13 finished with value: 0.8437587610354519 and parameters: {'max_depth': 3, 'subsample': 0.9145571999127025, 'n_estimators': 52, 'colsample_bytree': 0.5151215566171842}. Best is trial 13 with value: 0.8437587610354519.


Trial 13: AUC = 0.84376, Params = {'max_depth': 3, 'subsample': 0.9145571999127025, 'n_estimators': 52, 'colsample_bytree': 0.5151215566171842, 'objective': 'binary:logistic', 'eval_metric': 'auc'}


[I 2025-02-12 15:37:58,400] Trial 14 finished with value: 0.8454302582897961 and parameters: {'max_depth': 4, 'subsample': 0.9074263204923084, 'n_estimators': 90, 'colsample_bytree': 0.5024414469525368}. Best is trial 14 with value: 0.8454302582897961.


Trial 14: AUC = 0.84543, Params = {'max_depth': 4, 'subsample': 0.9074263204923084, 'n_estimators': 90, 'colsample_bytree': 0.5024414469525368, 'objective': 'binary:logistic', 'eval_metric': 'auc'}


[I 2025-02-12 15:38:08,292] Trial 15 finished with value: 0.8446617846093244 and parameters: {'max_depth': 4, 'subsample': 0.8944706881292999, 'n_estimators': 93, 'colsample_bytree': 0.5042593765224104}. Best is trial 14 with value: 0.8454302582897961.


Trial 15: AUC = 0.84466, Params = {'max_depth': 4, 'subsample': 0.8944706881292999, 'n_estimators': 93, 'colsample_bytree': 0.5042593765224104, 'objective': 'binary:logistic', 'eval_metric': 'auc'}


[I 2025-02-12 15:38:19,505] Trial 16 finished with value: 0.8412027396304165 and parameters: {'max_depth': 5, 'subsample': 0.8329707514533755, 'n_estimators': 95, 'colsample_bytree': 0.5027769467265135}. Best is trial 14 with value: 0.8454302582897961.


Trial 16: AUC = 0.84120, Params = {'max_depth': 5, 'subsample': 0.8329707514533755, 'n_estimators': 95, 'colsample_bytree': 0.5027769467265135, 'objective': 'binary:logistic', 'eval_metric': 'auc'}


[I 2025-02-12 15:38:30,675] Trial 17 finished with value: 0.8407813594752521 and parameters: {'max_depth': 5, 'subsample': 0.8998849944760523, 'n_estimators': 97, 'colsample_bytree': 0.5793998220459394}. Best is trial 14 with value: 0.8454302582897961.


Trial 17: AUC = 0.84078, Params = {'max_depth': 5, 'subsample': 0.8998849944760523, 'n_estimators': 97, 'colsample_bytree': 0.5793998220459394, 'objective': 'binary:logistic', 'eval_metric': 'auc'}


[I 2025-02-12 15:38:40,430] Trial 18 finished with value: 0.8450076782740975 and parameters: {'max_depth': 4, 'subsample': 0.7813771069168447, 'n_estimators': 87, 'colsample_bytree': 0.6422840889324822}. Best is trial 14 with value: 0.8454302582897961.


Trial 18: AUC = 0.84501, Params = {'max_depth': 4, 'subsample': 0.7813771069168447, 'n_estimators': 87, 'colsample_bytree': 0.6422840889324822, 'objective': 'binary:logistic', 'eval_metric': 'auc'}


[I 2025-02-12 15:38:56,787] Trial 19 finished with value: 0.8094511644890785 and parameters: {'max_depth': 10, 'subsample': 0.7658022889938343, 'n_estimators': 83, 'colsample_bytree': 0.6617718870903014}. Best is trial 14 with value: 0.8454302582897961.


Trial 19: AUC = 0.80945, Params = {'max_depth': 10, 'subsample': 0.7658022889938343, 'n_estimators': 83, 'colsample_bytree': 0.6617718870903014, 'objective': 'binary:logistic', 'eval_metric': 'auc'}


[I 2025-02-12 15:39:09,828] Trial 20 finished with value: 0.8389811888959611 and parameters: {'max_depth': 4, 'subsample': 0.6308225768600588, 'n_estimators': 128, 'colsample_bytree': 0.6385616928753614}. Best is trial 14 with value: 0.8454302582897961.


Trial 20: AUC = 0.83898, Params = {'max_depth': 4, 'subsample': 0.6308225768600588, 'n_estimators': 128, 'colsample_bytree': 0.6385616928753614, 'objective': 'binary:logistic', 'eval_metric': 'auc'}


[I 2025-02-12 15:39:19,812] Trial 21 finished with value: 0.84489057326657 and parameters: {'max_depth': 4, 'subsample': 0.8398530339796707, 'n_estimators': 79, 'colsample_bytree': 0.5462594619970691}. Best is trial 14 with value: 0.8454302582897961.


Trial 21: AUC = 0.84489, Params = {'max_depth': 4, 'subsample': 0.8398530339796707, 'n_estimators': 79, 'colsample_bytree': 0.5462594619970691, 'objective': 'binary:logistic', 'eval_metric': 'auc'}


[I 2025-02-12 15:39:29,171] Trial 22 finished with value: 0.8437857424382352 and parameters: {'max_depth': 4, 'subsample': 0.8245492053100443, 'n_estimators': 77, 'colsample_bytree': 0.5515010958215312}. Best is trial 14 with value: 0.8454302582897961.


Trial 22: AUC = 0.84379, Params = {'max_depth': 4, 'subsample': 0.8245492053100443, 'n_estimators': 77, 'colsample_bytree': 0.5515010958215312, 'objective': 'binary:logistic', 'eval_metric': 'auc'}


[I 2025-02-12 15:39:43,305] Trial 23 finished with value: 0.8350926925347343 and parameters: {'max_depth': 5, 'subsample': 0.739662372460672, 'n_estimators': 130, 'colsample_bytree': 0.5565263986086192}. Best is trial 14 with value: 0.8454302582897961.


Trial 23: AUC = 0.83509, Params = {'max_depth': 5, 'subsample': 0.739662372460672, 'n_estimators': 130, 'colsample_bytree': 0.5565263986086192, 'objective': 'binary:logistic', 'eval_metric': 'auc'}


[I 2025-02-12 15:39:51,746] Trial 24 finished with value: 0.8461445354279159 and parameters: {'max_depth': 4, 'subsample': 0.9221847448520932, 'n_estimators': 73, 'colsample_bytree': 0.6348774818475812}. Best is trial 24 with value: 0.8461445354279159.


Trial 24: AUC = 0.84614, Params = {'max_depth': 4, 'subsample': 0.9221847448520932, 'n_estimators': 73, 'colsample_bytree': 0.6348774818475812, 'objective': 'binary:logistic', 'eval_metric': 'auc'}


[I 2025-02-12 15:40:05,358] Trial 25 finished with value: 0.834240625686231 and parameters: {'max_depth': 6, 'subsample': 0.9403773053687586, 'n_estimators': 113, 'colsample_bytree': 0.7353648357554631}. Best is trial 24 with value: 0.8461445354279159.


Trial 25: AUC = 0.83424, Params = {'max_depth': 6, 'subsample': 0.9403773053687586, 'n_estimators': 113, 'colsample_bytree': 0.7353648357554631, 'objective': 'binary:logistic', 'eval_metric': 'auc'}


[I 2025-02-12 15:40:14,300] Trial 26 finished with value: 0.8440534014503059 and parameters: {'max_depth': 5, 'subsample': 0.9336832799162459, 'n_estimators': 69, 'colsample_bytree': 0.6393054515967291}. Best is trial 24 with value: 0.8461445354279159.


Trial 26: AUC = 0.84405, Params = {'max_depth': 5, 'subsample': 0.9336832799162459, 'n_estimators': 69, 'colsample_bytree': 0.6393054515967291, 'objective': 'binary:logistic', 'eval_metric': 'auc'}


[I 2025-02-12 15:40:31,908] Trial 27 finished with value: 0.8319894900904234 and parameters: {'max_depth': 4, 'subsample': 0.5358487169099743, 'n_estimators': 193, 'colsample_bytree': 0.6864409860418567}. Best is trial 24 with value: 0.8461445354279159.


Trial 27: AUC = 0.83199, Params = {'max_depth': 4, 'subsample': 0.5358487169099743, 'n_estimators': 193, 'colsample_bytree': 0.6864409860418567, 'objective': 'binary:logistic', 'eval_metric': 'auc'}


[I 2025-02-12 15:40:41,861] Trial 28 finished with value: 0.845467420673891 and parameters: {'max_depth': 3, 'subsample': 0.864824447374703, 'n_estimators': 109, 'colsample_bytree': 0.6213715603210395}. Best is trial 24 with value: 0.8461445354279159.


Trial 28: AUC = 0.84547, Params = {'max_depth': 3, 'subsample': 0.864824447374703, 'n_estimators': 109, 'colsample_bytree': 0.6213715603210395, 'objective': 'binary:logistic', 'eval_metric': 'auc'}


[I 2025-02-12 15:40:52,562] Trial 29 finished with value: 0.8465797598569944 and parameters: {'max_depth': 3, 'subsample': 0.865015359929133, 'n_estimators': 121, 'colsample_bytree': 0.6055094773262627}. Best is trial 29 with value: 0.8465797598569944.


Trial 29: AUC = 0.84658, Params = {'max_depth': 3, 'subsample': 0.865015359929133, 'n_estimators': 121, 'colsample_bytree': 0.6055094773262627, 'objective': 'binary:logistic', 'eval_metric': 'auc'}


[I 2025-02-12 15:41:03,089] Trial 30 finished with value: 0.8465171445203948 and parameters: {'max_depth': 3, 'subsample': 0.874772771023255, 'n_estimators': 117, 'colsample_bytree': 0.5885085456546688}. Best is trial 29 with value: 0.8465797598569944.


Trial 30: AUC = 0.84652, Params = {'max_depth': 3, 'subsample': 0.874772771023255, 'n_estimators': 117, 'colsample_bytree': 0.5885085456546688, 'objective': 'binary:logistic', 'eval_metric': 'auc'}


[I 2025-02-12 15:41:13,359] Trial 31 finished with value: 0.8447924262169462 and parameters: {'max_depth': 3, 'subsample': 0.8667135331647964, 'n_estimators': 114, 'colsample_bytree': 0.6031189880678056}. Best is trial 29 with value: 0.8465797598569944.


Trial 31: AUC = 0.84479, Params = {'max_depth': 3, 'subsample': 0.8667135331647964, 'n_estimators': 114, 'colsample_bytree': 0.6031189880678056, 'objective': 'binary:logistic', 'eval_metric': 'auc'}


[I 2025-02-12 15:41:24,933] Trial 32 finished with value: 0.8450597519021175 and parameters: {'max_depth': 3, 'subsample': 0.8083673980896149, 'n_estimators': 135, 'colsample_bytree': 0.6809512937174028}. Best is trial 29 with value: 0.8465797598569944.


Trial 32: AUC = 0.84506, Params = {'max_depth': 3, 'subsample': 0.8083673980896149, 'n_estimators': 135, 'colsample_bytree': 0.6809512937174028, 'objective': 'binary:logistic', 'eval_metric': 'auc'}


[I 2025-02-12 15:41:37,366] Trial 33 finished with value: 0.8459004250423892 and parameters: {'max_depth': 3, 'subsample': 0.8503401493052128, 'n_estimators': 149, 'colsample_bytree': 0.5852593290325926}. Best is trial 29 with value: 0.8465797598569944.


Trial 33: AUC = 0.84590, Params = {'max_depth': 3, 'subsample': 0.8503401493052128, 'n_estimators': 149, 'colsample_bytree': 0.5852593290325926, 'objective': 'binary:logistic', 'eval_metric': 'auc'}


[I 2025-02-12 15:41:49,534] Trial 34 finished with value: 0.8460666460348688 and parameters: {'max_depth': 3, 'subsample': 0.9450379318051969, 'n_estimators': 149, 'colsample_bytree': 0.5793268732200696}. Best is trial 29 with value: 0.8465797598569944.


Trial 34: AUC = 0.84607, Params = {'max_depth': 3, 'subsample': 0.9450379318051969, 'n_estimators': 149, 'colsample_bytree': 0.5793268732200696, 'objective': 'binary:logistic', 'eval_metric': 'auc'}


[I 2025-02-12 15:42:02,711] Trial 35 finished with value: 0.8461693314965976 and parameters: {'max_depth': 3, 'subsample': 0.9484342152955333, 'n_estimators': 167, 'colsample_bytree': 0.9884971173357977}. Best is trial 29 with value: 0.8465797598569944.


Trial 35: AUC = 0.84617, Params = {'max_depth': 3, 'subsample': 0.9484342152955333, 'n_estimators': 167, 'colsample_bytree': 0.9884971173357977, 'objective': 'binary:logistic', 'eval_metric': 'auc'}


[I 2025-02-12 15:42:17,229] Trial 36 finished with value: 0.8412354344567463 and parameters: {'max_depth': 4, 'subsample': 0.9574932351519744, 'n_estimators': 167, 'colsample_bytree': 0.9518008325035818}. Best is trial 29 with value: 0.8465797598569944.


Trial 36: AUC = 0.84124, Params = {'max_depth': 4, 'subsample': 0.9574932351519744, 'n_estimators': 167, 'colsample_bytree': 0.9518008325035818, 'objective': 'binary:logistic', 'eval_metric': 'auc'}


[I 2025-02-12 15:42:31,456] Trial 37 finished with value: 0.8463716363411425 and parameters: {'max_depth': 3, 'subsample': 0.9246503119771963, 'n_estimators': 184, 'colsample_bytree': 0.9980185430162408}. Best is trial 29 with value: 0.8465797598569944.


Trial 37: AUC = 0.84637, Params = {'max_depth': 3, 'subsample': 0.9246503119771963, 'n_estimators': 184, 'colsample_bytree': 0.9980185430162408, 'objective': 'binary:logistic', 'eval_metric': 'auc'}


[I 2025-02-12 15:43:01,936] Trial 38 finished with value: 0.8088881611469151 and parameters: {'max_depth': 8, 'subsample': 0.9606370062956062, 'n_estimators': 217, 'colsample_bytree': 0.9899366729503001}. Best is trial 29 with value: 0.8465797598569944.


Trial 38: AUC = 0.80889, Params = {'max_depth': 8, 'subsample': 0.9606370062956062, 'n_estimators': 217, 'colsample_bytree': 0.9899366729503001, 'objective': 'binary:logistic', 'eval_metric': 'auc'}


[I 2025-02-12 15:43:32,642] Trial 39 finished with value: 0.8118523419695504 and parameters: {'max_depth': 10, 'subsample': 0.9960308754621688, 'n_estimators': 177, 'colsample_bytree': 0.8616574179043128}. Best is trial 29 with value: 0.8465797598569944.


Trial 39: AUC = 0.81185, Params = {'max_depth': 10, 'subsample': 0.9960308754621688, 'n_estimators': 177, 'colsample_bytree': 0.8616574179043128, 'objective': 'binary:logistic', 'eval_metric': 'auc'}


[I 2025-02-12 15:43:50,659] Trial 40 finished with value: 0.8433454266208192 and parameters: {'max_depth': 3, 'subsample': 0.8793646893183988, 'n_estimators': 243, 'colsample_bytree': 0.9942914757040003}. Best is trial 29 with value: 0.8465797598569944.


Trial 40: AUC = 0.84335, Params = {'max_depth': 3, 'subsample': 0.8793646893183988, 'n_estimators': 243, 'colsample_bytree': 0.9942914757040003, 'objective': 'binary:logistic', 'eval_metric': 'auc'}


[I 2025-02-12 15:44:06,258] Trial 41 finished with value: 0.8446110259889287 and parameters: {'max_depth': 3, 'subsample': 0.9191896137589441, 'n_estimators': 205, 'colsample_bytree': 0.9356212834121378}. Best is trial 29 with value: 0.8465797598569944.


Trial 41: AUC = 0.84461, Params = {'max_depth': 3, 'subsample': 0.9191896137589441, 'n_estimators': 205, 'colsample_bytree': 0.9356212834121378, 'objective': 'binary:logistic', 'eval_metric': 'auc'}


[I 2025-02-12 15:44:21,010] Trial 42 finished with value: 0.8434285767380537 and parameters: {'max_depth': 4, 'subsample': 0.9241695624315864, 'n_estimators': 165, 'colsample_bytree': 0.9681741611880061}. Best is trial 29 with value: 0.8465797598569944.


Trial 42: AUC = 0.84343, Params = {'max_depth': 4, 'subsample': 0.9241695624315864, 'n_estimators': 165, 'colsample_bytree': 0.9681741611880061, 'objective': 'binary:logistic', 'eval_metric': 'auc'}


[I 2025-02-12 15:44:35,491] Trial 43 finished with value: 0.8453164354017416 and parameters: {'max_depth': 3, 'subsample': 0.9643140579764098, 'n_estimators': 189, 'colsample_bytree': 0.8354980121259135}. Best is trial 29 with value: 0.8465797598569944.


Trial 43: AUC = 0.84532, Params = {'max_depth': 3, 'subsample': 0.9643140579764098, 'n_estimators': 189, 'colsample_bytree': 0.8354980121259135, 'objective': 'binary:logistic', 'eval_metric': 'auc'}


[I 2025-02-12 15:44:51,435] Trial 44 finished with value: 0.8299044843514736 and parameters: {'max_depth': 6, 'subsample': 0.879417064070193, 'n_estimators': 135, 'colsample_bytree': 0.9080611793970566}. Best is trial 29 with value: 0.8465797598569944.


Trial 44: AUC = 0.82990, Params = {'max_depth': 6, 'subsample': 0.879417064070193, 'n_estimators': 135, 'colsample_bytree': 0.9080611793970566, 'objective': 'binary:logistic', 'eval_metric': 'auc'}


[I 2025-02-12 15:44:59,006] Trial 45 finished with value: 0.8451090310747098 and parameters: {'max_depth': 3, 'subsample': 0.8095977624630545, 'n_estimators': 66, 'colsample_bytree': 0.7467980008610817}. Best is trial 29 with value: 0.8465797598569944.


Trial 45: AUC = 0.84511, Params = {'max_depth': 3, 'subsample': 0.8095977624630545, 'n_estimators': 66, 'colsample_bytree': 0.7467980008610817, 'objective': 'binary:logistic', 'eval_metric': 'auc'}


[I 2025-02-12 15:45:16,436] Trial 46 finished with value: 0.834322549487453 and parameters: {'max_depth': 5, 'subsample': 0.9720421565154775, 'n_estimators': 179, 'colsample_bytree': 0.9135206415135948}. Best is trial 29 with value: 0.8465797598569944.


Trial 46: AUC = 0.83432, Params = {'max_depth': 5, 'subsample': 0.9720421565154775, 'n_estimators': 179, 'colsample_bytree': 0.9135206415135948, 'objective': 'binary:logistic', 'eval_metric': 'auc'}


[I 2025-02-12 15:45:33,864] Trial 47 finished with value: 0.8433456828979607 and parameters: {'max_depth': 3, 'subsample': 0.7159853246910572, 'n_estimators': 229, 'colsample_bytree': 0.7105522890746637}. Best is trial 29 with value: 0.8465797598569944.


Trial 47: AUC = 0.84335, Params = {'max_depth': 3, 'subsample': 0.7159853246910572, 'n_estimators': 229, 'colsample_bytree': 0.7105522890746637, 'objective': 'binary:logistic', 'eval_metric': 'auc'}


[I 2025-02-12 15:46:18,173] Trial 48 finished with value: 0.8071458633389823 and parameters: {'max_depth': 9, 'subsample': 0.903880387340334, 'n_estimators': 295, 'colsample_bytree': 0.5294987272359146}. Best is trial 29 with value: 0.8465797598569944.


Trial 48: AUC = 0.80715, Params = {'max_depth': 9, 'subsample': 0.903880387340334, 'n_estimators': 295, 'colsample_bytree': 0.5294987272359146, 'objective': 'binary:logistic', 'eval_metric': 'auc'}


[I 2025-02-12 15:46:28,568] Trial 49 finished with value: 0.8446809974344692 and parameters: {'max_depth': 4, 'subsample': 0.9368256264447123, 'n_estimators': 102, 'colsample_bytree': 0.9693561061306252}. Best is trial 29 with value: 0.8465797598569944.


Trial 49: AUC = 0.84468, Params = {'max_depth': 4, 'subsample': 0.9368256264447123, 'n_estimators': 102, 'colsample_bytree': 0.9693561061306252, 'objective': 'binary:logistic', 'eval_metric': 'auc'}
Best parameters: {'max_depth': 3, 'subsample': 0.865015359929133, 'n_estimators': 121, 'colsample_bytree': 0.6055094773262627}


{'Click Rank Utility': np.float64(0.00853),
 'Negative Log-Likelihood': np.float64(0.03394),
 'AUC': np.float64(0.844),
 'Demographic Parity': np.float64(-2e-05)}

In [18]:
# Dummy Classifier

from sklearn.dummy import DummyClassifier

X = df.drop(columns=["click", 'protected_attribute'])
y = df["click"]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Entraîner un DummyClassifier qui prédit toujours la classe majoritaire
dummy_clf = DummyClassifier(strategy="most_frequent")
dummy_clf.fit(X_train, y_train)

y_pred_prob_dummy = dummy_clf.predict_proba(X_test)[:, 1]
y_pred_dummy = dummy_clf.predict(X_test)

# Transformer les prédictions en Series avec les bons index
y_pred_prob_dummy = pd.Series(y_pred_prob_dummy, index=X_test.index)
y_pred_dummy = pd.Series(y_pred_dummy, index=X_test.index)

# Afficher les métriques du DummyClassifier
dummy_unaware_results = report_results(df, y_pred_prob_dummy, y_pred_dummy)
dummy_unaware_results

{'Click Rank Utility': np.float64(0.00853),
 'Negative Log-Likelihood': np.float64(0.23177),
 'AUC': np.float64(0.5),
 'Demographic Parity': np.float64(0.0)}

In [21]:
# Entrainement d'un XGBoost qui ne prend pas en compte l'attribut protégé

import optuna
import xgboost as xgb
from sklearn.model_selection import train_test_split, cross_val_score
import pandas as pd
import numpy as np

X = df.drop(columns=["click", 'protected_attribute'])
y = df["click"]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

def objective(trial):
    """ Fonction d'optimisation pour choisir les meilleurs hyperparamètres """
    params = {
        'max_depth': trial.suggest_int('max_depth', 3, 10),
        'subsample': trial.suggest_float('subsample', 0.5, 1.0),
        'n_estimators': trial.suggest_int('n_estimators', 50, 300),
        'subsample': trial.suggest_float('subsample', 0.5, 1.0),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.5, 1.0),
        'objective': 'binary:logistic',
        'eval_metric': 'auc'
    }
    
    model = xgb.XGBClassifier(**params)
    score = cross_val_score(model, X_train, y_train, scoring='roc_auc', cv=3).mean()

    #print(f"Trial {trial.number}: AUC = {score:.5f}, Params = {params}")
    
    return score

# Effectuer 100 essais pour le tuning
N_trials = 50
study = optuna.create_study(direction='maximize')
study.optimize(objective, n_trials=N_trials)

best_params = study.best_params
print("Best parameters:", best_params)

# Entraîner le modèle final avec les meilleurs paramètres
best_model = xgb.XGBClassifier(**best_params)
best_model.fit(X_train, y_train)

y_pred_prob1 = best_model.predict_proba(X_test)[:, 1]
y_pred1 = (y_pred_prob1 >= 0.5).astype(int)

# Transformer les prédictions en Series avec les bons index
y_pred_prob1 = pd.Series(y_pred_prob1, index=X_test.index)
y_pred1 = pd.Series(y_pred1, index=X_test.index)

# Afficher les métriques
xgb_unaware_results = report_results(df, y_pred_prob1, y_pred1)
xgb_unaware_results

[I 2025-02-12 16:25:53,376] A new study created in memory with name: no-name-57d814fc-0a8e-436f-b923-d25b0a0dec73
[I 2025-02-12 16:26:37,180] Trial 0 finished with value: 0.8014781273746093 and parameters: {'max_depth': 8, 'subsample': 0.6762646802660599, 'n_estimators': 232, 'colsample_bytree': 0.644533834586019}. Best is trial 0 with value: 0.8014781273746093.
[I 2025-02-12 16:27:05,556] Trial 1 finished with value: 0.8011640912886371 and parameters: {'max_depth': 9, 'subsample': 0.6854534034252376, 'n_estimators': 129, 'colsample_bytree': 0.9485423779680141}. Best is trial 0 with value: 0.8014781273746093.
[I 2025-02-12 16:27:26,020] Trial 2 finished with value: 0.8187911738779999 and parameters: {'max_depth': 7, 'subsample': 0.941832896425137, 'n_estimators': 178, 'colsample_bytree': 0.6076202852248869}. Best is trial 2 with value: 0.8187911738779999.
[I 2025-02-12 16:27:50,907] Trial 3 finished with value: 0.8116254249199488 and parameters: {'max_depth': 7, 'subsample': 0.89887542

Best parameters: {'max_depth': 3, 'subsample': 0.8721040931502022, 'n_estimators': 149, 'colsample_bytree': 0.5169089489283786}


{'Click Rank Utility': np.float64(0.00853),
 'Negative Log-Likelihood': np.float64(0.03395),
 'AUC': np.float64(0.84448),
 'Demographic Parity': np.float64(-3e-05)}

In [22]:
# Création du DataFrame à partir des résultats
df_results = pd.DataFrame({
    "Model": ["Dummy unaware", "XGB unaware", "XGB unfair"],
    "NLLH ↓": [dummy_unaware_results["Negative Log-Likelihood"], xgb_unaware_results["Negative Log-Likelihood"], xgb_unfair_results["Negative Log-Likelihood"]],
    "AUC ↑": [dummy_unaware_results["AUC"], xgb_unaware_results["AUC"], xgb_unfair_results["AUC"]],
    "DP ↓": [dummy_unaware_results["Demographic Parity"], xgb_unaware_results["Demographic Parity"], xgb_unfair_results["Demographic Parity"]],
    "U ↑": [dummy_unaware_results["Click Rank Utility"], xgb_unaware_results["Click Rank Utility"], xgb_unfair_results["Click Rank Utility"]],
})
print(df_results)

           Model   NLLH ↓    AUC ↑     DP ↓      U ↑
0  Dummy unaware  0.23177  0.50000  0.00000  0.00853
1    XGB unaware  0.03395  0.84448 -0.00003  0.00853
2     XGB unfair  0.03394  0.84400 -0.00002  0.00853


| Model          | NLLH ↓  | AUC ↑  | DP ↓    | U ↑    |
|---------------|--------|--------|--------|--------|
| Dummy unaware | 0.23177 | 0.50000 | 0.00000 | 0.00853 |
| XGB unaware   | 0.03395 | 0.84298 | -0.00000 | 0.00853 |
| XGB unfair    | 0.03394 | 0.84400 | -0.00002 | 0.00853 |


In [28]:
from sklearn.metrics import classification_report
y_pred_prob1 = best_model.predict_proba(X_test)[:, 1]
y_pred1 = (y_pred_prob1 >= 0.5).astype(int)
print(classification_report(y_pred1, y_test))

              precision    recall  f1-score   support

           0       1.00      0.99      1.00    214443
           1       0.00      0.33      0.00         3

    accuracy                           0.99    214446
   macro avg       0.50      0.66      0.50    214446
weighted avg       1.00      0.99      1.00    214446

