In [6]:
# %% [markdown]
# # Tuning Notebook — Projet 7
# - Préprocessing: `basic_feature_engineering`
# - Score métier normalisé (FN & FP)
# - VarianceThreshold après OHE
# - Randomized/Grid Search
# - MLflow local par défaut (sécurise l'exécution)

# %%
import os, sys, time, json, warnings
from pathlib import Path

import numpy as np
import pandas as pd

from sklearn.pipeline import Pipeline
from sklearn.model_selection import (
    StratifiedKFold, RandomizedSearchCV, GridSearchCV
)
from sklearn.metrics import roc_auc_score
from sklearn.feature_selection import VarianceThreshold
from scipy.stats import randint, uniform, loguniform

warnings.filterwarnings("ignore")

# Permet d'importer src/* quand on lance depuis un notebook
ROOT = Path(".").resolve()
if str(ROOT) not in sys.path:
    sys.path.append(str(ROOT))

# %%
# -----------------------------
# Imports projet
# -----------------------------
import yaml
from src.features import basic_feature_engineering
from src.metrics import make_business_scorer, evaluate_all
# Si tu as un module data.py, on lit simplement les CSV via conf
# from src.data import load_training_data  # (si tu en as un)
# met ça tout en haut du notebook, AVANT tout import mlflow
MLFLOW_DIR = "./mlruns_tune"           # nouveau répertoire
Path(MLFLOW_DIR).mkdir(parents=True, exist_ok=True)
os.environ["MLFLOW_TRACKING_URI"] = f"file:{MLFLOW_DIR}"
os.environ["MLFLOW_EXPERIMENT"]   = "tuning_local"


# %%
# -----------------------------
# Config
# -----------------------------
CONF_PATH = "conf/params.yaml"

with open(CONF_PATH, "r") as f:
    cfg = yaml.safe_load(f)

def cfg_get(d, path, default=None):
    cur = d
    for p in path.split("."):
        if isinstance(cur, dict) and p in cur:
            cur = cur[p]
        else:
            return default
    return cur

TRAIN_CSV = cfg_get(cfg, "data.train_csv")
TARGET    = "TARGET"  # adapté au dataset Home Credit
FN_COST   = float(cfg_get(cfg, "cost.fn", 10.0))
FP_COST   = float(cfg_get(cfg, "cost.fp", 1.0))
TH_GRID   = int(cfg_get(cfg, "cost.threshold_grid", 501))

N_SPLITS  = int(cfg_get(cfg, "cv.n_splits", 5))
RSTATE    = int(cfg_get(cfg, "cv.random_state", 42))

# ---------------- MLflow (local par défaut) ----------------
USE_MLFLOW_REMOTE = False  # mets True si tu veux loguer sur Databricks
EXPERIMENT_NAME   = "tuning_local"

if not USE_MLFLOW_REMOTE:
    os.environ["MLFLOW_TRACKING_URI"] = "file:./mlruns"
    os.environ["MLFLOW_EXPERIMENT"]   = EXPERIMENT_NAME

import mlflow

if USE_MLFLOW_REMOTE:
    # suppose que tes variables env Databricks sont déjà OK dans ce notebook
    mlflow.set_tracking_uri(cfg_get(cfg, "mlflow.default_tracking_uri", "databricks"))
    mlflow.set_experiment(cfg_get(cfg, "mlflow.default_experiment", "/Users/.../projet7_scoring"))
else:
    mlflow.set_experiment(EXPERIMENT_NAME)

# %%
# -----------------------------
# Chargement & Préprocessing
# -----------------------------
df = pd.read_csv(TRAIN_CSV)
assert TARGET in df.columns, f"Colonne cible '{TARGET}' absente."

y = df[TARGET].astype(int).reset_index(drop=True)
X = df.drop(columns=[TARGET])

preprocessor = basic_feature_engineering(X)

# On ajoute un VarianceThreshold APRÈS le transformer (marche sur sparse)
var_filter = VarianceThreshold(threshold=0.0)

# %%
# -----------------------------
# Scorer métier normalisé
# -----------------------------
business_scorer = make_business_scorer(
    fn_cost=FN_COST,
    fp_cost=FP_COST,
    grid=TH_GRID
)

# %%
# -----------------------------
# Aides: builders d'estimateurs
# -----------------------------
import lightgbm as lgb
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier

def build_lgbm_for_search(y):
    pos = int(np.sum(y == 1))
    neg = int(np.sum(y == 0))
    spw = neg / max(pos, 1)  # stabilise les splits pour données déséquilibrées
    return lgb.LGBMClassifier(
        n_estimators=2000,
        learning_rate=0.05,
        n_jobs=-1,
        random_state=RSTATE,
        min_gain_to_split=0.0,  # IMPORTANT pour éviter "best gain: -inf"
        verbosity=-1,
        scale_pos_weight=spw,   # plutôt que class_weight
    )

def build_logreg_for_search():
    return LogisticRegression(
        penalty="l2",
        solver="saga",
        max_iter=3000,
        class_weight="balanced",
        n_jobs=-1,
        random_state=RSTATE,
    )

def build_rf_for_search():
    return RandomForestClassifier(
        n_estimators=600,
        n_jobs=-1,
        random_state=RSTATE,
        class_weight="balanced",
        oob_score=False,
    )

# %%
# -----------------------------
# Espaces de recherche
# -----------------------------
def lgbm_space(random=True):
    if random:
        return {
            "clf__num_leaves": randint(63, 255),
            "clf__max_depth": randint(8, 16),          # -1 si tu veux illimité -> ajoute une valeur
            "clf__min_child_samples": randint(5, 100),
            "clf__subsample": uniform(0.6, 0.4),       # 0.6..1.0
            "clf__colsample_bytree": uniform(0.6, 0.4),
            "clf__reg_lambda": loguniform(1e-3, 10),
            "clf__reg_alpha": loguniform(1e-3, 10),
            "clf__max_bin": randint(128, 512),
        }
    else:
        return {
            "clf__num_leaves": [63, 127, 255],
            "clf__max_depth": [8, 12, -1],
            "clf__min_child_samples": [5, 20, 50, 100],
            "clf__subsample": [0.6, 0.8, 1.0],
            "clf__colsample_bytree": [0.6, 0.8, 1.0],
            "clf__reg_lambda": [0.01, 0.1, 1.0, 10.0],
            "clf__reg_alpha": [0.0, 0.01, 0.1, 1.0],
            "clf__max_bin": [128, 255, 512],
        }

def logreg_space(random=True):
    if random:
        return {
            "clf__C": loguniform(1e-3, 1e+2),
            "clf__l1_ratio": uniform(0.0, 1.0),  # si penalty='elasticnet' (optionnel)
            # si tu veux tester elasticnet:
            # "clf__penalty": ["l1", "l2", "elasticnet"],  # attention combinaisons solver
        }
    else:
        return {
            "clf__C": [0.01, 0.1, 1.0, 10.0, 50.0, 100.0],
        }

def rf_space(random=True):
    if random:
        return {
            "clf__max_depth": randint(6, 20),
            "clf__min_samples_leaf": randint(1, 50),
            "clf__min_samples_split": randint(2, 50),
            "clf__max_features": ["sqrt", "log2", None],
            "clf__n_estimators": randint(300, 900),
        }
    else:
        return {
            "clf__max_depth": [8, 12, 16, None],
            "clf__min_samples_leaf": [1, 2, 5, 10, 20],
            "clf__min_samples_split": [2, 5, 10, 20],
            "clf__max_features": ["sqrt", "log2", None],
            "clf__n_estimators": [300, 500, 700, 900],
        }

def get_estimator_by_name(name: str):
    if name == "lgbm":
        import lightgbm as lgb
        return lgb.LGBMClassifier(
            n_estimators=1500, learning_rate=0.03, num_leaves=64,
            subsample=0.8, colsample_bytree=0.8, class_weight="balanced",
            n_jobs=-1
        )
    elif name == "rf":
        from sklearn.ensemble import RandomForestClassifier
        return RandomForestClassifier(
            n_estimators=400, max_depth=None, n_jobs=-1, class_weight="balanced"
        )
    elif name == "logreg":
        from sklearn.linear_model import LogisticRegression
        return LogisticRegression(
            penalty="l2", solver="saga", max_iter=2000,
            class_weight="balanced", n_jobs=-1
        )
    else:
        raise ValueError(f"Unknown model {name}")

def sanity_fit(model_name="lgbm"):
    # Objets NEUFS (pas de fit_transform ici)
    prep = basic_feature_engineering(X)         # <- transformeur
    var_sel = VarianceThreshold(threshold=0.0)  # <- transformeur
    est = get_estimator_by_name(model_name)     # <- estimateur

    pipe = Pipeline([
        ("prep", var_sel if prep is None else prep),  # au cas où
        ("var", var_sel),
        ("est", est),
    ])

    skf = StratifiedKFold(n_splits=3, shuffle=True, random_state=42)
    tr_idx, va_idx = next(iter(skf.split(X, y)))
    pipe.fit(X.iloc[tr_idx], y.iloc[tr_idx])
    proba = pipe.predict_proba(X.iloc[va_idx])[:, 1]
    auc = roc_auc_score(y.iloc[va_idx], proba)
    print(f"[Sanity {model_name}] AUC ~ {auc:.4f}")

# Lance une sanity rapide (tu peux changer "lgbm" en "rf"/"logreg")
sanity_fit("lgbm")

# %%
# -----------------------------
# Recherche (Randomized ou Grid)
# -----------------------------
SEARCH_MODE = "random"   # "random" ou "grid"
MODEL       = "lgbm"     # "lgbm" | "logreg" | "rf"
N_ITER      = 40         # utilisé si random
N_JOBS      = 4          # safe pour desktop; ajuste selon ta machine
VERBOSE_CV  = 1

if MODEL == "lgbm":
    clf = build_lgbm_for_search(y)
    space = lgbm_space(random=(SEARCH_MODE=="random"))
elif MODEL == "logreg":
    clf = build_logreg_for_search()
    space = logreg_space(random=(SEARCH_MODE=="random"))
elif MODEL == "rf":
    clf = build_rf_for_search()
    space = rf_space(random=(SEARCH_MODE=="random"))
else:
    raise ValueError("MODEL ∈ {lgbm, logreg, rf}")

pipe = Pipeline([
    ("prep", preprocessor),
    ("var", VarianceThreshold(0.0)),
    ("clf", clf),
])

cv = StratifiedKFold(n_splits=N_SPLITS, shuffle=True, random_state=RSTATE)

if SEARCH_MODE == "random":
    search = RandomizedSearchCV(
        estimator=pipe,
        param_distributions=space,
        n_iter=N_ITER,
        scoring=business_scorer,         # métrique métier normalisée
        cv=cv,
        n_jobs=N_JOBS,
        verbose=VERBOSE_CV,
        refit=True,
        random_state=RSTATE,
    )
else:
    search = GridSearchCV(
        estimator=pipe,
        param_grid=space,
        scoring=business_scorer,
        cv=cv,
        n_jobs=N_JOBS,
        verbose=VERBOSE_CV,
        refit=True,
    )

t0 = time.time()
with mlflow.start_run(run_name=f"tune_{MODEL}_{SEARCH_MODE}") as run:
    mlflow.log_params({
        "model": MODEL,
        "search_mode": SEARCH_MODE,
        "n_splits": N_SPLITS,
        "n_iter": N_ITER if SEARCH_MODE=="random" else None,
        "n_jobs": N_JOBS,
        "fn_cost": FN_COST,
        "fp_cost": FP_COST,
        "th_grid": TH_GRID,
    })
    search.fit(X, y)
    dur = time.time() - t0

    best_score = float(search.best_score_)
    best_params = search.best_params_
    mlflow.log_metric("best_business_score", best_score)
    mlflow.log_metric("search_secs", dur)
    mlflow.log_dict(best_params, "best_params.json")

    print(f"\n✅ DONE in {dur/60:.1f} min")
    print(f"Best business score (normalized): {best_score:.4f}")
    print("Best params:")
    for k, v in best_params.items():
        print(f"  - {k}: {v}")

# %%
# -----------------------------
# Évaluation AUC / métriques sur CV refit (train complet)
# -----------------------------
best_model = search.best_estimator_
proba_all = np.zeros(len(y), dtype=float)

fold_rows = []
for fold, (tr_idx, va_idx) in enumerate(cv.split(X, y), 1):
    m = best_model
    # Refit par fold (séparé du refit global) pour sortir des métriques propres
    m.fit(X.iloc[tr_idx], y[tr_idx])
    p = m.predict_proba(X.iloc[va_idx])[:, 1]
    proba_all[va_idx] = p
    fold_auc = roc_auc_score(y[va_idx], p)
    fold_rows.append({"fold": fold, "auc": fold_auc})

fold_df = pd.DataFrame(fold_rows)
overall_auc = roc_auc_score(y, proba_all)

print("\nAUC par fold:")
display(fold_df)
print(f"Overall AUC OOF: {overall_auc:.4f}")

# Score métier (normalisé) + seuil optimal sur OOF
m_oof = evaluate_all(y_true=y, y_prob=proba_all,
                     fn_cost=FN_COST, fp_cost=FP_COST,
                     threshold_grid=TH_GRID)
print("\nMétriques OOF (extrait):")
for k in ["auc", "ap", "brier", "ks", "business_score_norm", "best_threshold"]:
    if k in m_oof:
        print(f"  {k}: {m_oof[k]}")

# Sauvegarde artifacts locaux utiles
ART_DIR = Path("reports")
ART_DIR.mkdir(exist_ok=True, parents=True)
np.save(ART_DIR/"oof_prob.npy", proba_all)
with open(ART_DIR/"tuning_summary.json", "w") as f:
    json.dump({
        "model": MODEL,
        "search_mode": SEARCH_MODE,
        "best_business_score_norm": float(m_oof.get("business_score_norm", float("nan"))),
        "best_threshold": float(m_oof.get("best_threshold", float("nan"))),
        "overall_auc_oof": float(overall_auc)
    }, f, indent=2)

print("\nArtifacts sauvegardés dans ./reports/")


Traceback (most recent call last):
  File "/home/nicolasd/.pyenv/versions/scoring_project7/lib/python3.10/site-packages/mlflow/store/tracking/file_store.py", line 366, in search_experiments
    exp = self._get_experiment(exp_id, view_type)
  File "/home/nicolasd/.pyenv/versions/scoring_project7/lib/python3.10/site-packages/mlflow/store/tracking/file_store.py", line 464, in _get_experiment
    meta = FileStore._read_yaml(experiment_dir, FileStore.META_DATA_FILE_NAME)
  File "/home/nicolasd/.pyenv/versions/scoring_project7/lib/python3.10/site-packages/mlflow/store/tracking/file_store.py", line 1634, in _read_yaml
    return _read_helper(root, file_name, attempts_remaining=retries)
  File "/home/nicolasd/.pyenv/versions/scoring_project7/lib/python3.10/site-packages/mlflow/store/tracking/file_store.py", line 1627, in _read_helper
    result = read_yaml(root, file_name)
  File "/home/nicolasd/.pyenv/versions/scoring_project7/lib/python3.10/site-packages/mlflow/utils/yaml_utils.py", line 107

ValueError: The truth value of a DataFrame is ambiguous. Use a.empty, a.bool(), a.item(), a.any() or a.all().