In [7]:
#!pip install datasets
from datasets import load_dataset

#import warnings
#warnings.filterwarnings("ignore")

import pandas as pd
pd.plotting.register_matplotlib_converters()
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
print("Setup Complete")
from sklearn.metrics import auc, roc_curve, precision_score, recall_score, precision_recall_curve

import numpy as np

Setup Complete


In [3]:
# Load the FairJob dataset and convert its "train" split to a pandas DataFrame.
FAIRJOB_DATASET_ID = "criteo/FairJob"

ds = load_dataset(FAIRJOB_DATASET_ID)
df = ds["train"].to_pandas()

In [4]:
from sklearn.model_selection import train_test_split

# The label and the identifier columns are removed from the feature matrix;
# the label alone becomes the target vector.
TARGET_COL = 'click'
ID_COLS = ['user_id', 'impression_id', 'product_id']

y = df[TARGET_COL]
X = df.drop(columns=[TARGET_COL, *ID_COLS])

# 80/20 hold-out split, stratified on the target, with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)

In [5]:
from sklearn.metrics import log_loss
import numpy as np

def evaluate_log_loss_by_class(y_true, y_proba):
    """
    Compute the overall log loss, the log loss restricted to each true class,
    and the mean predicted probability of class 1 on the true positives.

    Parameters
    ----------
    y_true : array-like
        True labels (0 or 1).
    y_proba : array-like, shape (n_samples, 2)
        Predicted class probabilities (output of ``predict_proba``); column 1
        is read as p(y=1).

    Returns
    -------
    dict with:
        - 'log_loss_global' : log loss over all samples
        - 'log_loss_y0'     : log loss over the samples whose true label is 0
        - 'log_loss_y1'     : log loss over the samples whose true label is 1
        - 'mean_proba_y1'   : mean of p(y=1) over the samples whose true label is 1

    All values are rounded to 5 decimals. A per-class entry is NaN when
    ``y_true`` contains no sample of that class.
    """
    y_true = np.asarray(y_true)
    y_proba = np.asarray(y_proba)

    # Always declare both labels: a subset (or the whole vector) may contain
    # a single class, in which case log_loss cannot infer the label set.
    labels = [0, 1]
    nan = float("nan")

    def _masked_log_loss(mask):
        # log_loss cannot be evaluated on zero samples; report NaN instead.
        if not mask.any():
            return nan
        return round(log_loss(y_true[mask], y_proba[mask], labels=labels), 5)

    is_neg = y_true == 0
    is_pos = y_true == 1

    if is_pos.any():
        mean_proba_y1 = round(float(y_proba[is_pos, 1].mean()), 5)
    else:
        mean_proba_y1 = nan

    return {
        "log_loss_global": round(log_loss(y_true, y_proba, labels=labels), 5),
        "log_loss_y0": _masked_log_loss(is_neg),
        "log_loss_y1": _masked_log_loss(is_pos),
        "mean_proba_y1": mean_proba_y1,
    }



### Modélisation

In [10]:
import optuna
from sklearn.preprocessing import TargetEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.metrics import log_loss
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler

In [12]:
from sklearn import set_config

# Ask scikit-learn to render estimators as diagrams when they are displayed.
set_config(display='diagram')

# Column groups are picked by position in X_train.
# NOTE(review): this presumes the first three columns are the binary
# attributes, 'rank' sits at position 3 and positions 4..16 hold the
# multi-valued categorical features -- TODO confirm against X_train.columns.
cat_cols_bin = list(X_train.columns[0:3])
rank_col = ['rank']
cat_cols = list(X_train.columns[4:17])

# Target-encode the categorical columns; every other column passes through.
data_preproc = ColumnTransformer(
    [('multicat_encoding', TargetEncoder(), cat_cols)],
    remainder='passthrough',
    force_int_remainder_cols=False,
)

logreg_pipe = Pipeline([
    ('preprocessing', data_preproc),
    ('logreg', LogisticRegression()),
])

logreg_param_dict = {
    'logreg__C': 1.0,
    'logreg__penalty': "l2",
    'logreg__class_weight': "balanced",
    'logreg__max_iter': 1000,
    'logreg__solver': "lbfgs",
    'logreg__random_state': 42
}

# Apply the hyperparameters before displaying the pipeline; previously the
# dict was built but never used, so the diagram showed a default
# LogisticRegression instead of the configured one.
logreg_pipe.set_params(**logreg_param_dict)

logreg_pipe


In [21]:
# Experiment: logistic regression on target-encoded categoricals, without
# feature scaling. The label previously read "no class_weight", which
# contradicted the `class_weight="balanced"` setting applied below.
print("class_weight = balanced, no scaling")

# Column groups are picked by position in X_train.
# NOTE(review): presumes positions 0..2 are binary attributes, 3 is 'rank'
# and 4..16 are the multi-valued categoricals -- TODO confirm.
cat_cols_bin = list(X_train.columns[0:3])
rank_col = ['rank']
cat_cols = list(X_train.columns[4:17])

# Only the categorical columns are transformed; the rest pass through as-is.
data_preproc = ColumnTransformer([('multicat_encoding', TargetEncoder(), cat_cols)],
                              remainder='passthrough', force_int_remainder_cols=False)

logreg_pipe = Pipeline([('preprocessing', data_preproc),
                     ('logreg',LogisticRegression())
                     ])

logreg_param_dict = {
    'logreg__C': 1.0,
    'logreg__penalty': "l2",
    'logreg__class_weight': "balanced",
    'logreg__max_iter': 1000,
    'logreg__solver': "lbfgs",
    'logreg__random_state': 42
}

logreg_pipe.set_params(**logreg_param_dict)
logreg_pipe.fit(X_train, y_train)

# Class probabilities on the held-out split (both columns are kept because
# evaluate_log_loss_by_class indexes column 1 itself).
y_prob = logreg_pipe.predict_proba(X_test)
print("\nTraining done!")

results = evaluate_log_loss_by_class(y_test, y_prob)
print(" Résultats log loss :\n", results)


no class_weight, no scaling

Training done!
 Résultats log loss :
 {'log_loss_global': 0.55741, 'log_loss_y0': 0.55746, 'log_loss_y1': 0.55016, 'mean_proba_y1': 0.62599}


In [55]:
# Experiment: same balanced logistic regression, but with standardised inputs.
print("class_weight = balanced, scaling")

# Positional column groups; whatever is not claimed by one of the three
# groups is treated as a numeric feature.
cat_cols_bin = list(X_train.columns[:3])
rank_col = ['rank']
cat_cols = list(X_train.columns[4:17])
already_grouped = cat_cols + cat_cols_bin + rank_col
num_cols = [name for name in X_train.columns if name not in already_grouped]

# Categorical features: target encoding followed by standardisation.
cat_pipeline = make_pipeline(TargetEncoder(), StandardScaler())

# Numeric features are standardised; the remaining columns pass through.
data_preproc = ColumnTransformer(
    transformers=[
        ('cat_preproc', cat_pipeline, cat_cols),
        ('num_scaling', StandardScaler(), num_cols),
    ],
    remainder='passthrough',
    verbose_feature_names_out=False,
)

logreg_model = LogisticRegression()
logreg_pipe = Pipeline(steps=[('preprocessing', data_preproc), ('logreg', logreg_model)])

# Hyperparameters addressed through the pipeline's "<step>__<param>" names.
logreg_param_dict = dict(
    logreg__C=1.0,
    logreg__penalty="l2",
    logreg__class_weight="balanced",
    logreg__max_iter=1000,
    logreg__solver="lbfgs",
    logreg__random_state=42,
)
logreg_pipe.set_params(**logreg_param_dict)

logreg_pipe.fit(X_train, y_train)
y_prob = logreg_pipe.predict_proba(X_test)
print("\nTraining done!")

# Per-class log-loss breakdown on the held-out split.
results = evaluate_log_loss_by_class(y_test, y_prob)
print(" Résultats log loss :\n", results)


class_weight = balanced, scaling

Training done!
 Résultats log loss :
 {'log_loss_global': 0.51302, 'log_loss_y0': 0.51212, 'log_loss_y1': 0.64067, 'mean_proba_y1': 0.59398}


In [58]:
# Sanity check on the predict_proba output: (number of rows, number of columns).
y_prob.shape

(214446, 2)

In [59]:
# report_results comes from the project-local `functions` module.
from functions import report_results

# Pass only the class-1 probability column.
# NOTE(review): `df` is the full frame while `y_prob` / `y_test` cover only the
# held-out split -- confirm that report_results aligns rows itself (e.g. via
# y_test's index) rather than expecting a frame restricted to the test rows.
report_results(df, y_prob[:,1], y_test)

{'Click Rank Utility': np.float64(0.01384),
 'Negative Log-Likelihood': np.float64(0.51302),
 'AUC': np.float64(0.78448),
 'Demographic Parity': np.float64(-3e-05)}

### Optuna

In [60]:
# Fresh pipeline for the hyperparameter search; it reuses the `data_preproc`
# transformer built in an earlier cell.
logreg_pipe = Pipeline([
    ('preprocessing', data_preproc),
    ('logreg', LogisticRegression())
])

# The validation split uses a fixed seed, so it is identical for every trial:
# compute it once here instead of on each call to `objective`.
X_train_sub, X_val, y_train_sub, y_val = train_test_split(
    X_train, y_train, test_size=0.2, stratify=y_train, random_state=42
)


def objective(trial):
    """
    Optuna objective: validation log loss of the logistic-regression pipeline.

    Samples `C` (log-uniform in [1e-2, 1e2]) and `max_iter` (integer in
    [100, 2000]), fits the shared `logreg_pipe` on the training sub-split and
    returns the log loss measured on the validation sub-split.

    Parameters
    ----------
    trial : optuna.trial.Trial
        Trial object used to sample the hyperparameters.

    Returns
    -------
    float
        Log loss on (X_val, y_val); lower is better.
    """
    C = trial.suggest_float("C", 1e-2, 1e2, log=True)
    max_iter = trial.suggest_int("max_iter", 100, 2000)

    # The module-level pipeline is reconfigured in place for each trial.
    logreg_pipe.set_params(
        logreg__C=C,
        logreg__penalty="l2",
        logreg__class_weight="balanced",
        logreg__max_iter=max_iter,
        logreg__solver="lbfgs",
        logreg__random_state=42
    )

    logreg_pipe.fit(X_train_sub, y_train_sub)
    y_proba = logreg_pipe.predict_proba(X_val)

    return log_loss(y_val, y_proba)


# Seed the sampler so that re-running the notebook explores the same trials.
study = optuna.create_study(
    direction="minimize",
    sampler=optuna.samplers.TPESampler(seed=42),
)
study.optimize(objective, n_trials=20)

# Apply the best hyperparameters found by the search.
best_params = study.best_trial.params

logreg_pipe.set_params(
    logreg__C=best_params["C"],
    logreg__penalty="l2",
    logreg__class_weight="balanced",
    logreg__max_iter=best_params["max_iter"],
    logreg__solver="lbfgs",
    logreg__random_state=42
)

# Refit on the full training set and score on the held-out test split.
logreg_pipe.fit(X_train, y_train)
y_prob = logreg_pipe.predict_proba(X_test)

print("\nTraining done!")

results = evaluate_log_loss_by_class(y_test, y_prob)
print("Résultats de log loss :\n")
print(results)


[I 2025-04-13 19:35:52,907] A new study created in memory with name: no-name-c8cff204-efa1-4bc7-90ae-5af458da84a1
[I 2025-04-13 19:36:15,838] Trial 0 finished with value: 0.502858995153776 and parameters: {'C': 3.8995702850293155, 'max_iter': 365}. Best is trial 0 with value: 0.502858995153776.
[I 2025-04-13 19:36:38,906] Trial 1 finished with value: 0.5026418487340943 and parameters: {'C': 0.1082698616271391, 'max_iter': 1573}. Best is trial 1 with value: 0.5026418487340943.
[I 2025-04-13 19:37:00,920] Trial 2 finished with value: 0.502675714735436 and parameters: {'C': 0.1522584680801155, 'max_iter': 240}. Best is trial 1 with value: 0.5026418487340943.
[I 2025-04-13 19:37:22,619] Trial 3 finished with value: 0.5028578902616357 and parameters: {'C': 1.801186335369927, 'max_iter': 561}. Best is trial 1 with value: 0.5026418487340943.
[I 2025-04-13 19:37:44,289] Trial 4 finished with value: 0.502857854795234 and parameters: {'C': 1.0255581334968018, 'max_iter': 106}. Best is trial 1 wi


Training done!
Résultats de log loss :

{'log_loss_global': 0.51302, 'log_loss_y0': 0.51212, 'log_loss_y1': 0.64067, 'mean_proba_y1': 0.59398}


In [None]:
# Report the metrics for the tuned model, using the class-1 probability column.
# `report_results` is imported in an earlier cell.
# NOTE(review): `df` is the full frame while `y_prob` / `y_test` cover only the
# held-out split -- confirm report_results handles the row alignment.
report_results(df, y_prob[:,1], y_test)