In [1]:
import pandas as pd
import numpy as np
import lightgbm as lgb
from lightgbm import LGBMClassifier
import optuna
import mlflow
import warnings

from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import StratifiedKFold, cross_val_score, train_test_split
from sklearn.metrics import (
    roc_auc_score, roc_curve,
    f1_score, precision_score, recall_score,
    accuracy_score, matthews_corrcoef
)
from mlflow.models.signature import infer_signature

warnings.filterwarnings('ignore')

In [None]:
# Load the modelling base; the 'safra' column is handed to the date parser
# with day-first interpretation.
df = pd.read_csv(
    'base_modelo.csv',
    parse_dates=['safra'],
    dayfirst=True,
)


In [3]:
# Names of every column whose header starts with the 'VAR_' prefix.
var_cols = [name for name in df.columns if name.startswith('VAR_')]

# Replace every missing value in the frame with the sentinel -1.
df = df.fillna(value=-1)


In [None]:
# Split the base into DEV (development) and OOT (out-of-time) windows.
# NOTE(review): the bounds below are compared as strings, so this assumes
# 'safra' renders as a 6-character YYYYMM value after astype(str) -- TODO
# confirm, since the loader also passes this column to parse_dates.
df["safra"] = df["safra"].astype(str)

is_dev = df["safra"].between("201402", "201409")
# Lower bound was "2014010" (7-character typo); it only worked because the
# DEV window was tested first and absorbed the 201402-201409 rows.
is_oot = df["safra"].between("201410", "201412")

# Rows outside both windows get the string "nan". Using a string literal keeps
# every branch the same type instead of relying on NumPy coercing a float NaN
# into a string array.
df["period_label"] = np.where(
    is_dev, "DEV",
    np.where(is_oot, "OOT", "nan")
)
dev = df[df["period_label"] == "DEV"].copy()
oot = df[df["period_label"] == "OOT"].copy()


In [6]:
# Variables to exclude from the candidate list (presumably flagged by a PSI
# check, going by the name -- confirm with the analysis that produced it).
excl_psi = [
    'VAR_53',  # the comma was missing here, fusing this entry into 'VAR_53VAR_54'
    'VAR_54',
    'VAR_30',
    'VAR_1', 'VAR_2', 'VAR_3', 'VAR_4',
]

# Candidate features (presumably the Boruta selection output, per the name).
cols_boruta = [
    'VAR_1', 'VAR_2', 'VAR_3', 'VAR_4', 'VAR_6', 'VAR_9', 'VAR_10', 'VAR_12', 'VAR_14', 'VAR_16', 'VAR_17',
    'VAR_18', 'VAR_19', 'VAR_20', 'VAR_21', 'VAR_22', 'VAR_23', 'VAR_25', 'VAR_26', 'VAR_27', 'VAR_28', 'VAR_29', 'VAR_30',
    'VAR_31', 'VAR_32', 'VAR_33', 'VAR_34', 'VAR_37', 'VAR_38', 'VAR_40', 'VAR_41', 'VAR_42', 'VAR_43', 'VAR_44', 'VAR_46',
    'VAR_47', 'VAR_48', 'VAR_49', 'VAR_51', 'VAR_53', 'VAR_57', 'VAR_59', 'VAR_60', 'VAR_62', 'VAR_64', 'VAR_65', 'VAR_66',
    'VAR_67', 'VAR_68', 'VAR_69', 'VAR_70', 'VAR_71', 'VAR_72', 'VAR_73', 'VAR_74', 'VAR_76', 'VAR_77', 'VAR_78',
]

# Drop the excluded variables while preserving the original ordering.
cols_boruta = [col for col in cols_boruta if col not in excl_psi]

In [7]:
# Absolute pairwise correlations between the selected DEV features.
corr = dev[cols_boruta].corr().abs()

# Keep only the strict upper triangle so each pair is inspected once.
upper_mask = np.triu(np.ones(corr.shape), k=1).astype(bool)
upper = corr.where(upper_mask)

# Flag every column that correlates above 0.9 with a column listed before it,
# then keep the remaining features in their original order.
to_drop = [name for name in upper.columns if (upper[name] > 0.9).any()]
cols_uncorr = [name for name in cols_boruta if name not in to_drop]


In [10]:
# Feature frames restricted to the decorrelated columns.
X_dev_raw = dev[cols_uncorr]
X_oot_raw = oot[cols_uncorr]

# Fit the scaler on DEV only, then apply the same transform to OOT.
scaler = StandardScaler()
X_dev = scaler.fit_transform(X_dev_raw)
X_oot = scaler.transform(X_oot_raw)

# Targets as plain NumPy arrays.
y_dev = dev["y"].values
y_oot = oot["y"].values

In [None]:
# Custom cutoff
CUTOFF = 0.60

# 5) Robust metrics with cutoff

def coefval(x):
    """Return the coefficient of variation of ``x`` (population std / mean).

    Yields ``np.nan`` when ``x`` has fewer than two elements or when its mean
    is exactly zero.
    """
    if len(x) >= 2:
        avg = np.mean(x)
        if avg != 0:
            return np.std(x, ddof=0) / avg
    return np.nan


def ks_score(y_true, y_proba):
    """Return the KS statistic in percent: the largest |TPR - FPR| gap on the ROC curve."""
    false_pos_rate, true_pos_rate, _thresholds = roc_curve(y_true, y_proba)
    gaps = np.abs(true_pos_rate - false_pos_rate)
    return gaps.max() * 100


def metricas_models(X, y, model, cutoff=None):
    """Score ``model`` on ``(X, y)`` and return classification metrics in percent.

    Scores are thresholded three ways: at 0.5, at the mean predicted score,
    and at a custom ``cutoff``.

    Parameters
    ----------
    X : features passed straight to the model.
    y : true binary labels.
    model : estimator exposing ``predict_proba`` (column 1 is used as the
        score) or, failing that, ``predict``.
    cutoff : float, optional
        Threshold used for ``f1_cut`` / ``acc_cut``. Defaults to the
        module-level ``CUTOFF``.

    Returns
    -------
    dict
        ``f1, prec, rec, acc, mcc`` at threshold 0.5; ``ks`` and ``gini`` on
        the raw scores; ``f1c, prec_c, rec_c, acc_c`` at the mean-score
        threshold; ``f1_cut, acc_cut`` at ``cutoff``. Every value is NaN when
        ``y`` holds fewer than two distinct classes.
    """
    metric_names = (
        "f1", "prec", "rec", "ks", "gini", "acc",
        "f1c", "prec_c", "rec_c", "acc_c",
        "f1_cut", "acc_cut", "mcc",
    )

    # With a single class present the metrics are reported as NaN; checking
    # first avoids running inference whose result would be discarded.
    if len(np.unique(y)) < 2:
        return {name: np.nan for name in metric_names}

    if cutoff is None:
        cutoff = CUTOFF

    # use predict_proba when available
    if hasattr(model, "predict_proba"):
        probs = model.predict_proba(X)[:, 1]
    else:
        probs = model.predict(X)

    thr_05 = 0.5
    thr_m = np.mean(probs)

    p05 = (probs > thr_05).astype(int)
    pm = (probs > thr_m).astype(int)
    # Predictions at the custom cutoff. The *_cut metrics previously called
    # f1_score / accuracy_score with only y_true, which raises TypeError.
    p_cut = (probs > cutoff).astype(int)

    # metrics at the 0.5 threshold, plus rank-based KS / Gini
    f1 = f1_score(y, p05) * 100
    prec = precision_score(y, p05) * 100
    rec = recall_score(y, p05) * 100
    ks = ks_score(y, probs)
    gini = (2 * roc_auc_score(y, probs) - 1) * 100
    acc = accuracy_score(y, p05) * 100

    # metrics at the mean-score threshold
    f1c = f1_score(y, pm) * 100
    prec_c = precision_score(y, pm) * 100
    rec_c = recall_score(y, pm) * 100
    acc_c = accuracy_score(y, pm) * 100

    # metrics at the custom cutoff
    f1_cut = f1_score(y, p_cut) * 100
    acc_cut = accuracy_score(y, p_cut) * 100

    mcc = matthews_corrcoef(y, p05) * 100

    return dict(
        f1=f1, prec=prec, rec=rec,
        ks=ks, gini=gini, acc=acc,
        f1c=f1c, prec_c=prec_c, rec_c=rec_c, acc_c=acc_c,
        f1_cut=f1_cut, acc_cut=acc_cut, mcc=mcc
    )

In [None]:
# Select the MLflow experiment that subsequent runs are logged under.
mlflow.set_experiment(experiment_name="lgbm_full_with_scaling_corr_cv_cutoff60")

def objective(trial):
    """Optuna objective: train one LightGBM candidate and score its stability.

    Samples hyper-parameters from ``trial``, fits an ``LGBMClassifier`` on the
    DEV matrix, evaluates it on DEV and OOT, runs a 5-fold stratified
    cross-validation on DEV, and logs parameters, metrics and the fitted model
    to an MLflow run.

    Reads the module-level ``X_dev``, ``y_dev``, ``X_oot`` and ``y_oot``.

    Parameters
    ----------
    trial : optuna trial object used to sample the hyper-parameters.

    Returns
    -------
    float
        Cross-validated Gini (percent) minus the absolute DEV-to-OOT Gini
        shift.
    """
    params = {
        "boosting_type": trial.suggest_categorical("boosting_type", ["gbdt","dart"]),
        "num_leaves": trial.suggest_int("num_leaves", 10, 500),
        "max_depth": trial.suggest_int("max_depth", 2, 20),
        "min_child_samples": trial.suggest_int("min_child_samples", 10, 1000, step=50),
        "learning_rate": trial.suggest_float("learning_rate", 0.01, 0.3),
        "feature_fraction": trial.suggest_float("feature_fraction", 0.3, 0.9),
        "reg_alpha": trial.suggest_float("reg_alpha", 0.1, 0.9),
        "reg_lambda": trial.suggest_float("reg_lambda", 0.1, 0.9),
        "subsample": trial.suggest_float("subsample", 0.3, 0.8),
        # LightGBM only applies row subsampling when the bagging frequency is
        # non-zero; without this the tuned "subsample" value has no effect.
        "subsample_freq": 1,
        "objective": "binary",
        "metric": "binary_error",
        "is_unbalance": trial.suggest_categorical("is_unbalance", [True,False]),
        "verbosity": -1,
        "random_state": 42,
        "min_split_gain": trial.suggest_float("min_split_gain", 0.0, 1.0)
    }
    num_boost = trial.suggest_int("n_estimators", 350, 500, step=10)

    # train on the full DEV matrix
    model = LGBMClassifier(**params, n_estimators=num_boost)
    model.fit(X_dev, y_dev)

    # NOTE(review): OOT performance feeds the optimisation target through
    # shift_gini, so OOT is not an untouched hold-out for the selected trial.
    m_tr = metricas_models(X_dev, y_dev, model)
    m_oo = metricas_models(X_oot, y_oot, model)
    shift_gini = m_oo["gini"] - m_tr["gini"]

    # 5-fold stratified CV on DEV; AUC statistics are expressed in percent
    cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
    aucs = cross_val_score(
        model, X_dev, y_dev,
        scoring="roc_auc", cv=cv, n_jobs=-1
    )
    auc_mean = np.mean(aucs)*100
    auc_min  = np.min(aucs)*100
    auc_max  = np.max(aucs)*100
    auc_amp  = auc_max - auc_min
    auc_cv   = coefval(aucs)*100
    gini_cv  = (auc_mean*2 - 100)

    # log to MLflow
    with mlflow.start_run(nested=True):
        mlflow.log_params(params)
        mlflow.log_param("n_estimators", num_boost)

        mlflow.log_metrics({
            "gini_train":    round(m_tr["gini"],2),
            "gini_oot":      round(m_oo["gini"],2),
            "ks_train":      round(m_tr["ks"],2),
            "ks_oot":        round(m_oo["ks"],2),
            "f1_train":      round(m_tr["f1"],2),
            "f1_oot":        round(m_oo["f1"],2),
            "f1_train_cut":  round(m_tr["f1_cut"],2),
            "f1_oot_cut":    round(m_oo["f1_cut"],2),
            "acc_train_cut": round(m_tr["acc_cut"],2),
            "acc_oot_cut":   round(m_oo["acc_cut"],2),
            "shift_gini":    round(shift_gini,2),
            "auc_mean_cv":   round(auc_mean,2),
            "auc_min_cv":    round(auc_min,2),
            "auc_max_cv":    round(auc_max,2),
            "auc_ampl_cv":   round(auc_amp,2),
            "auc_coefv_cv":  round(auc_cv,2),
            "gini_cv":       round(gini_cv,2)
        })

        sig = infer_signature(X_dev, model.predict_proba(X_dev))
        mlflow.lightgbm.log_model(model, "model", signature=sig)

    # Reward cross-validated discrimination, penalise DEV/OOT instability.
    return gini_cv - abs(shift_gini)

# Seed the sampler so repeated runs of the notebook propose the same trials.
# The study name previously ended in "cutoff30", which disagreed with
# CUTOFF = 0.60 and with the "...cutoff60" experiment name.
study = optuna.create_study(
    direction="maximize",
    study_name="lgbm_scaling_corr_cutoff60",
    sampler=optuna.samplers.TPESampler(seed=42),
)
study.optimize(objective, n_trials=100)
