In [9]:
# Install the third-party packages this notebook uses into the kernel's environment (-q = quiet output).
# NOTE(review): versions are unpinned, so a fresh environment may resolve different releases — consider pinning.
# NOTE(review): pandas and scipy are imported later but not listed here; presumably they arrive as
# transitive dependencies of the packages below — verify.
%pip install -q scikit-learn mlflow matplotlib joblib


Note: you may need to restart the kernel to use updated packages.


In [10]:
# Load the prepared train/test splits (with our features) produced by the previous notebook.
from pathlib import Path
import pandas as pd

TARGET = "default.payment.next.month"
PROC = Path("data/processed")

train = pd.read_csv(PROC / "train.csv")
test = pd.read_csv(PROC / "test.csv")

# Numeric feature columns; BILL_AMT* and PAY_AMT* are numbered 1 through 6.
num_feats = (
    ["LIMIT_BAL", "AGE"]
    + [f"BILL_AMT{i}" for i in range(1, 7)]
    + [f"PAY_AMT{i}" for i in range(1, 7)]
    + ["utilization1", "payment_ratio1", "max_delay"]
)
# Categorical feature columns; the PAY_* suffixes are 0, 2, 3, 4, 5, 6 (there is no PAY_1 in this list).
cat_feats = ["SEX", "EDUCATION", "MARRIAGE"] + [f"PAY_{i}" for i in (0, 2, 3, 4, 5, 6)]


def _split_xy(frame):
    """Return (features, target): every column except TARGET, and the TARGET column."""
    return frame.drop(columns=[TARGET]), frame[TARGET]


X_train, y_train = _split_xy(train)
X_test, y_test = _split_xy(test)

# Quick sanity display: feature counts and the training matrix shape.
len(num_feats), len(cat_feats), X_train.shape


(17, 9, (23974, 26))

In [11]:
from sklearn.base import clone
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import GradientBoostingClassifier

# Seed for the estimators themselves. A seed passed to a hyper-parameter search only
# controls parameter sampling, not the randomness inside the fitted models.
RANDOM_STATE = 42

# Preprocessing.
# Numeric columns: median imputation followed by standardisation.
num_tf = Pipeline(steps=[
    ("imp", SimpleImputer(strategy="median")),
    ("sc",  StandardScaler())
])
# Categorical columns: most-frequent imputation, then one-hot encoding.
# handle_unknown="ignore" keeps categories unseen during fit from raising at predict time.
cat_tf = Pipeline(steps=[
    ("imp", SimpleImputer(strategy="most_frequent")),
    ("oh",  OneHotEncoder(handle_unknown="ignore"))
])

# Template transformer; columns not listed in num_feats/cat_feats are dropped
# (ColumnTransformer's default remainder behaviour).
preprocess = ColumnTransformer([
    ("num", num_tf, num_feats),
    ("cat", cat_tf, cat_feats),
])

# Baseline pipeline: logistic regression.
# Each pipeline gets its own clone of the preprocessor so that fitting one pipeline
# can never refit a transformer object that the other pipeline also holds.
pipe_logreg = Pipeline([
    ("pre", clone(preprocess)),
    ("clf", LogisticRegression(max_iter=400, class_weight="balanced", random_state=RANDOM_STATE))
])

# Alternative pipeline: gradient-boosted trees.
pipe_gbdt = Pipeline([
    ("pre", clone(preprocess)),
    ("clf", GradientBoostingClassifier(random_state=RANDOM_STATE))
])


In [12]:
from sklearn.model_selection import RandomizedSearchCV
from scipy.stats import loguniform, randint

# Logistic regression: raise the iteration cap on the pipeline's classifier before searching.
pipe_logreg.set_params(clf__max_iter=1000)

# Search space for the logistic-regression pipeline.
param_logreg = dict(
    clf__C=loguniform(1e-2, 1e1),
    clf__penalty=["l2"],
    clf__solver=["lbfgs", "saga"],
)

# Search space for the GBDT pipeline (scipy's randint excludes the upper bound).
param_gbdt = dict(
    clf__n_estimators=randint(80, 300),
    clf__learning_rate=loguniform(1e-2, 3e-1),
    clf__max_depth=randint(2, 5),
)

# Both searches share the same budget, metric, folds and sampling seed.
search_opts = dict(n_iter=20, scoring="roc_auc", cv=3, random_state=42, n_jobs=-1, verbose=0)

cv_logreg = RandomizedSearchCV(pipe_logreg, param_logreg, **search_opts)
cv_gbdt = RandomizedSearchCV(pipe_gbdt, param_gbdt, **search_opts)

for search in (cv_logreg, cv_gbdt):
    search.fit(X_train, y_train)

# Best mean cross-validated ROC AUC of each search.
cv_logreg.best_score_, cv_gbdt.best_score_


(0.7757013885448544, 0.7856355398406617)

In [13]:
from sklearn.metrics import roc_auc_score, f1_score, precision_score, recall_score, RocCurveDisplay
import matplotlib.pyplot as plt
import json
from joblib import dump
from pathlib import Path

# Choose the winner by best cross-validated ROC AUC; a tie goes to the GBDT pipeline.
if cv_gbdt.best_score_ >= cv_logreg.best_score_:
    best_name, best_est = "gbdt", cv_gbdt.best_estimator_
else:
    best_name, best_est = "logreg", cv_logreg.best_estimator_
print("Лучшая модель:", best_name)

# Probability of the class at index 1, thresholded at 0.5 for the label-based metrics.
proba = best_est.predict_proba(X_test)[:, 1]
pred = (proba >= 0.5).astype(int)

# Test-set metrics, cast to plain floats so they serialise cleanly.
metrics = {"model": best_name, "test_auc": float(roc_auc_score(y_test, proba))}
metrics.update(
    test_f1=float(f1_score(y_test, pred)),
    test_precision=float(precision_score(y_test, pred, zero_division=0)),
    test_recall=float(recall_score(y_test, pred)),
)
metrics


Лучшая модель: gbdt


{'model': 'gbdt',
 'test_auc': 0.7804616314386661,
 'test_f1': 0.47093307278944796,
 'test_precision': 0.6685159500693482,
 'test_recall': 0.363499245852187}

In [14]:
# ROC curve of the selected model, written to artifacts/roc.png.
# The figure is addressed through the display object instead of pyplot's "current figure",
# so an unrelated open figure can never be titled, saved or closed by mistake.
roc_display = RocCurveDisplay.from_predictions(y_test, proba)
roc_display.ax_.set_title(f"ROC ({best_name})")
roc_display.figure_.tight_layout()
Path("artifacts").mkdir(exist_ok=True, parents=True)
roc_display.figure_.savefig("artifacts/roc.png")
plt.close(roc_display.figure_)

# Persist the fitted pipeline (preprocessing + classifier).
Path("models").mkdir(exist_ok=True, parents=True)
dump(best_est, "models/credit_default_model.pkl")

# Metrics and the feature list for DVC / the report / APIs.
# ensure_ascii=False can emit non-ASCII characters, so the file encoding is pinned to UTF-8
# rather than left to the platform's locale default.
Path("metrics.json").write_text(json.dumps(metrics, indent=2, ensure_ascii=False), encoding="utf-8")
feat_spec = {"num_feats": num_feats, "cat_feats": cat_feats}
Path("feature_list.json").write_text(json.dumps(feat_spec, indent=2, ensure_ascii=False), encoding="utf-8")

"Сохранил: models/credit_default_model.pkl, metrics.json, artifacts/roc.png, feature_list.json"


'Сохранил: models/credit_default_model.pkl, metrics.json, artifacts/roc.png, feature_list.json'

In [32]:
import mlflow, mlflow.sklearn
from mlflow.models.signature import infer_signature

mlflow.set_tracking_uri("file:./mlruns")
mlflow.set_experiment("CreditDefault_Prediction")  # single shared experiment name

# Integer-coded columns are cast to float64 so the inferred input schema declares them as doubles.
# NOTE(review): presumably this avoids MLflow's integer-column schema enforcement failing on
# missing values at inference time — confirm.
int_like = ["SEX","EDUCATION","MARRIAGE","AGE","PAY_0","PAY_2","PAY_3","PAY_4","PAY_5","PAY_6","max_delay"]
X_sig = X_train.astype({c: "float64" for c in int_like if c in X_train.columns})

# Use real rows as the input example and as the sample for signature inference.
input_example = X_sig.head(1)
sig_sample = X_sig.head(200)
# The logged sklearn flavor serves `predict` by default (pyfunc_predict_fn="predict"),
# so the output schema is inferred from `predict`, not from `predict_proba`; otherwise the
# recorded signature would describe values the served model never returns.
signature = infer_signature(sig_sample, best_est.predict(sig_sample))

with mlflow.start_run():
    # Model family plus every hyper-parameter of the final classifier step.
    mlflow.log_param("model", best_name)
    for k, v in best_est.named_steps["clf"].get_params().items():
        mlflow.log_param(f"clf__{k}", v)

    # Only the numeric test_* entries are metrics; "model" is a string and was logged as a param.
    for k, v in metrics.items():
        if k.startswith("test_"):
            mlflow.log_metric(k, v)

    mlflow.log_artifact("artifacts/roc.png")

    mlflow.sklearn.log_model(
        sk_model=best_est,
        artifact_path="CreditDefaultModel",
        signature=signature,
        input_example=input_example
    )

"MLflow run logged with signature (int-like -> float64 for signature)."




Downloading artifacts:   0%|          | 0/7 [00:00<?, ?it/s]

'MLflow run logged with signature (int-like -> float64 for signature).'

In [34]:
import mlflow, mlflow.sklearn
from sklearn.base import clone
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import GradientBoostingClassifier, RandomForestClassifier
from sklearn.metrics import roc_auc_score, f1_score, precision_score, recall_score

mlflow.set_tracking_uri("file:./mlruns")
mlflow.set_experiment("CreditDefault_Prediction")

# Fixed-configuration candidates, each logged as its own MLflow run.
# random_state is set on every estimator so re-running the cell reproduces the same metrics.
candidates = [
    ("logreg", LogisticRegression(max_iter=1000, class_weight="balanced", C=0.3, solver="lbfgs", random_state=42)),
    ("logreg", LogisticRegression(max_iter=1000, class_weight="balanced", C=1.0, solver="saga", random_state=42)),
    ("gbdt",   GradientBoostingClassifier(n_estimators=150, learning_rate=0.08, max_depth=3, random_state=42)),
    ("gbdt",   GradientBoostingClassifier(n_estimators=230, learning_rate=0.06, max_depth=3, random_state=42)),
    ("rf",     RandomForestClassifier(n_estimators=300, max_depth=8, class_weight="balanced_subsample",
                                      n_jobs=-1, random_state=42)),
]

for name, clf in candidates:
    # Same preprocessing, different classifier. set_params mutates the pipeline in place and
    # returns it, so calling it on best_est directly would silently replace the selected model.
    # Work on an unfitted clone instead.
    pipe = clone(best_est).set_params(clf=clf)
    with mlflow.start_run():
        pipe.fit(X_train, y_train)
        # Loop-local names: the best model's `proba`/`pred` from the evaluation cell stay intact.
        cand_proba = pipe.predict_proba(X_test)[:, 1]
        cand_pred = (cand_proba >= 0.5).astype(int)

        # Test-set metrics for this candidate.
        m = {
            "test_auc": float(roc_auc_score(y_test, cand_proba)),
            "test_f1": float(f1_score(y_test, cand_pred)),
            "test_precision": float(precision_score(y_test, cand_pred, zero_division=0)),
            "test_recall": float(recall_score(y_test, cand_pred)),
        }

        mlflow.log_param("model", name)
        for k, v in clf.get_params().items():
            mlflow.log_param(f"clf__{k}", v)
        for k, v in m.items():
            mlflow.log_metric(k, v)

        mlflow.sklearn.log_model(pipe, artifact_path="model")
print("добили 5+ экспериментов.")




Готово: добили 5+ экспериментов.


In [36]:
import mlflow
import pandas as pd

mlflow.set_tracking_uri("file:./mlruns")

# Look the experiment up by name with a store-side filter and fail with a clear message if it
# does not exist, instead of an opaque IndexError from indexing an empty list.
matching = mlflow.search_experiments(filter_string="name = 'CreditDefault_Prediction'")
if not matching:
    raise LookupError(
        "MLflow experiment 'CreditDefault_Prediction' not found under file:./mlruns; "
        "run the logging cells first."
    )
exp = matching[0]
runs = mlflow.search_runs(experiment_ids=[exp.experiment_id])

# Leaderboard: one row per run, best test ROC AUC first, capped at 20 rows.
cols = ["run_id","params.model","metrics.test_auc","metrics.test_f1","metrics.test_precision","metrics.test_recall"]
display(runs[cols].sort_values("metrics.test_auc", ascending=False).reset_index(drop=True).head(20))


Unnamed: 0,run_id,params.model,metrics.test_auc,metrics.test_f1,metrics.test_precision,metrics.test_recall
0,ab9832d3f78a4194b85b054bdb09ebb4,gbdt,0.781826,0.465071,0.660194,0.358974
1,2e79c690178846f8b83d6efc948980e9,gbdt,0.780689,0.467545,0.662517,0.361237
2,b36985e20f0940f88f35962083329ea9,gbdt,0.780462,0.470933,0.668516,0.363499
3,84ad7277802c4fd0b58f2c9846a70459,gbdt,0.780462,0.470933,0.668516,0.363499
4,9e5c378a85964dbfb9071be40dbf0099,gbdt,0.780462,0.470933,0.668516,0.363499
5,bdcc4a37c418465a9236fe84d6d9cc8c,gbdt,0.780462,0.470933,0.668516,0.363499
6,e305b9f190d84882a8e2131995d56dbf,gbdt,0.780462,0.470933,0.668516,0.363499
7,82340676557847719b9d6aed46a1a5f7,rf,0.776793,0.540121,0.488415,0.604072
8,dc983047e0f94ed08dcc82ee036da9a1,logreg,0.763192,0.520847,0.464033,0.593514
9,94706bc6eac948988801fdaebb4f8f0e,logreg,0.76313,0.520875,0.464539,0.59276
