# Step 4 - Hyperparameter and threshold optimization

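This notebook tunes an XGBoost classifier with Optuna against an asymmetric business cost (a false negative costs 10 times as much as a false positive), then searches for the decision threshold that minimises that cost and logs the resulting model to MLflow.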


### 1. Load libraries and data

Import the modelling, optimisation and tracking libraries, then load the pre-split train/test features and targets from Parquet files.


In [None]:
import pandas as pd
import numpy as np
import optuna
import mlflow
import mlflow.sklearn
from xgboost import XGBClassifier
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import train_test_split

# Load the pre-split feature tables (train first, then test).
X_train, X_test = (
    pd.read_parquet(path)
    for path in (
        "../data/output/X_train.parquet",
        "../data/output/X_test.parquet",
    )
)

# Load the targets and squeeze each frame down to its reduced form
# (presumably a Series from a single-column file — TODO confirm).
y_train, y_test = (
    pd.read_parquet(path).squeeze()
    for path in (
        "../data/output/y_train.parquet",
        "../data/output/y_test.parquet",
    )
)

### 2. Define the business cost function

The business cost is the weighted sum of errors: each false negative costs 10 and each false positive costs 1 by default. Lower is better.


In [None]:
def compute_business_cost(y_true, y_pred, cost_fn=10, cost_fp=1):
    """Return the total business cost of a set of binary predictions.

    The cost is ``fn * cost_fn + fp * cost_fp``, where ``fn`` and ``fp`` are
    the false-negative and false-positive counts taken from the confusion
    matrix of ``y_true`` against ``y_pred``.

    Args:
        y_true: Ground-truth labels; assumed to be encoded as 0 (negative)
            and 1 (positive).
        y_pred: Predicted labels, with the same encoding and length as
            ``y_true``.
        cost_fn: Cost charged for each false negative.
        cost_fp: Cost charged for each false positive.

    Returns:
        The weighted sum of false negatives and false positives.
    """
    # Pin the label set so the matrix is always 2x2. Without it, inputs in
    # which only one class occurs (for example an extreme threshold applied
    # to a single-class sample) produce a 1x1 matrix and the four-way
    # unpacking below raises a ValueError.
    cm = confusion_matrix(y_true, y_pred, labels=[0, 1])
    _tn, fp, fn, _tp = cm.ravel()
    return fn * cost_fn + fp * cost_fp

### 3. Define the Optuna objective

Each Optuna trial samples a set of XGBoost hyperparameters, fits a model with them, and returns the business cost of its predictions at a fixed 0.5 threshold.


In [None]:
def objective(trial):
    """Optuna objective: business cost of one sampled XGBoost configuration.

    Samples a set of hyperparameters from ``trial``, fits an
    ``XGBClassifier`` on part of the training data and scores it on a
    held-out validation split using ``compute_business_cost`` at a fixed
    0.5 decision threshold.

    Args:
        trial: The Optuna trial used to sample hyperparameters.

    Returns:
        The business cost on the validation split (lower is better).
    """
    params = {
        "n_estimators": trial.suggest_int("n_estimators", 50, 300),
        "max_depth": trial.suggest_int("max_depth", 2, 12),
        "learning_rate": trial.suggest_float("learning_rate", 0.01, 0.3),
        "subsample": trial.suggest_float("subsample", 0.5, 1.0),
        "colsample_bytree": trial.suggest_float("colsample_bytree", 0.5, 1.0),
        "eval_metric": "logloss",
    }

    # Score trials on a validation split carved out of the training data.
    # Scoring on X_test would select hyperparameters on the test set, so any
    # later test-set evaluation would be optimistically biased.
    # The fixed random_state gives every trial the same split, which keeps
    # the costs comparable across trials; stratify preserves the class ratio.
    X_fit, X_val, y_fit, y_val = train_test_split(
        X_train, y_train, test_size=0.2, stratify=y_train, random_state=42
    )

    model = XGBClassifier(**params)
    model.fit(X_fit, y_fit)

    # Probability of the positive class, thresholded at the default 0.5.
    y_proba = model.predict_proba(X_val)[:, 1]
    y_pred = (y_proba >= 0.5).astype(int)

    return compute_business_cost(y_val, y_pred)

### 4. Run Optuna optimization

Run a minimisation study over 9 trials and print the best hyperparameters together with the cost they achieved.


In [None]:
# Seed the sampler so the search proposes the same sequence of trials on
# every run; otherwise the reported best parameters change between runs.
# TPE is Optuna's default sampler, so only the seeding is new here.
study = optuna.create_study(
    direction="minimize",
    sampler=optuna.samplers.TPESampler(seed=42),
)
study.optimize(objective, n_trials=9)

print("Best params:", study.best_params)
print("Best cost:", study.best_value)

### 5. Optimize decision threshold

Refit a model with the best hyperparameters, then sweep decision thresholds from 0.10 to 0.85 in steps of 0.05 and keep the one with the lowest business cost.


In [None]:
# Refit on the full training set with the best hyperparameters.
# study.best_params only holds the sampled values, so eval_metric is passed
# again explicitly.
model = XGBClassifier(**study.best_params, eval_metric="logloss")
model.fit(X_train, y_train)
y_proba = model.predict_proba(X_test)[:, 1]

# Candidate thresholds 0.10, 0.15, ..., 0.85 (16 values).
# linspace with an explicit count fixes the number of points and the
# endpoint, which a float-step arange does not guarantee; rounding strips
# float noise such as 0.15000000000000002 from the printed/logged value.
thresholds = np.round(np.linspace(0.1, 0.85, 16), 2)

# NOTE(review): the threshold is chosen on the test set, so the minimum cost
# found here is an optimistic estimate.
costs = [
    compute_business_cost(y_test, (y_proba >= t).astype(int))
    for t in thresholds
]

# argmin returns the first minimum, i.e. the lowest threshold on ties.
optimal_threshold = thresholds[np.argmin(costs)]
print("Threshold optimal :", optimal_threshold)

### 6. Log model and threshold with MLflow

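Log the best hyperparameters, the selected threshold, its cost, and the fitted model (with a signature and an input example) to the `modele_optimisation` MLflow experiment.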


In [None]:
import mlflow.models
from mlflow.models.signature import infer_signature

# Point MLflow at the local file store and select the experiment that
# receives this run.
mlflow.set_tracking_uri("file:../mlruns")
mlflow.set_experiment("modele_optimisation")

# Model signature and a small input example, both stored with the model.
# NOTE(review): the output schema is inferred from predict_proba — confirm
# this matches the method that consumers of the logged model will call.
signature = infer_signature(X_test, model.predict_proba(X_test))
input_example = X_test.iloc[:5]

with mlflow.start_run():
    mlflow.set_tag("model_type", "XGBoost optimized with Optuna")
    mlflow.log_params(study.best_params)
    mlflow.log_metric("optimal_threshold", optimal_threshold)
    mlflow.log_metric("best_cost", min(costs))
    mlflow.sklearn.log_model(
        model,
        "model",
        signature=signature,
        input_example=input_example,
    )

### 7. Sweep thresholds and plot cost vs threshold

Repeat the threshold sweep on a finer grid (0.05 to 0.95 in steps of 0.01) and plot the business cost against the threshold.


In [None]:
import matplotlib.pyplot as plt

# Fine-grained threshold grid and the positive-class probabilities to cut.
thresholds = np.arange(0.05, 0.96, 0.01)
y_proba = model.predict_proba(X_test)[:, 1]

# Business cost of the predictions obtained at each candidate threshold.
costs = [
    compute_business_cost(
        y_test, (y_proba >= cutoff).astype(int), cost_fn=10, cost_fp=1
    )
    for cutoff in thresholds
]

# First index with the lowest cost, and the threshold/cost found there.
best_idx = np.argmin(costs)
best_threshold = thresholds[best_idx]
min_cost = costs[best_idx]

# Cost curve with a vertical marker on the selected threshold.
fig, ax = plt.subplots(figsize=(10, 5))
ax.plot(thresholds, costs, marker='o')
ax.axvline(best_threshold, color='red', linestyle='--', label=f'Threshold optimal = {best_threshold:.2f}')
ax.set_title("Business cost vs decision threshold")
ax.set_xlabel("Threshold de classification")
ax.set_ylabel("Business cost (FN*10 + FP*1)")
ax.legend()
ax.grid(True)
plt.show()

print(f"Optimal business threshold : {best_threshold:.2f} avec un total cost de {min_cost:.0f}")

### Conclusion

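The notebook produces an XGBoost model tuned with Optuna and a decision threshold chosen to minimise the business cost, both logged to MLflow. The threshold logged in section 6 comes from the coarse grid of section 5, whereas the plot in section 7 uses a finer grid, so the two reported optima may differ.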


## Watchouts

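- The decision threshold is selected on the test set, so the reported minimum cost is an optimistic estimate; a separate validation set would give an unbiased figure.
- The cost weights (10 per false negative, 1 per false positive) are the defaults of `compute_business_cost`; the selected threshold depends directly on them.
- The study runs only 9 trials, which explores a small part of the hyperparameter space.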
