# Шаг 3. MLflow Tracking (эксперименты)

Цель: залогировать 5+ экспериментов с разными моделями/гиперами.
Логирую: параметры, метрики (roc_auc, precision, recall, f1), ROC-кривую и модель.


In [1]:
%pip install -q mlflow==2.14.1 matplotlib

import mlflow, mlflow.sklearn
from pathlib import Path
import pandas as pd
import matplotlib.pyplot as plt

from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import GradientBoostingClassifier, RandomForestClassifier
from sklearn.metrics import roc_auc_score, precision_score, recall_score, f1_score, roc_curve


Note: you may need to restart the kernel to use updated packages.


  import pkg_resources  # noqa: TID251


## Подготовка данных и препроцессинга (как в baseline)


In [4]:
ROOT = Path.cwd() if Path.cwd().name=="credit" else Path.cwd().parent
PROC = ROOT / "data" / "processed"
TARGET = "default.payment.next.month"

train = pd.read_csv(PROC/"train.csv")
test  = pd.read_csv(PROC/"test.csv")

for df in (train, test):
    if "ID" in df.columns:
        df.drop(columns=["ID"], inplace=True)

X_train, y_train = train.drop(columns=[TARGET]), train[TARGET]
X_test,  y_test  = test.drop(columns=[TARGET]),  test[TARGET]

all_cols = X_train.columns.tolist()
cat_cols = [c for c in ["SEX","EDUCATION","MARRIAGE"] if c in all_cols] + [c for c in all_cols if c.startswith("PAY_")]
cat_cols = sorted(list(dict.fromkeys(cat_cols)))
num_cols = [c for c in all_cols if c not in cat_cols]

numeric_tf = Pipeline([("imputer", SimpleImputer(strategy="median")),
                       ("scaler", StandardScaler())])
categorical_tf = Pipeline([("imputer", SimpleImputer(strategy="most_frequent")),
                           ("onehot", OneHotEncoder(handle_unknown="ignore"))])
preprocess = ColumnTransformer([("num", numeric_tf, num_cols),
                                ("cat", categorical_tf, cat_cols)])


## Настраиваю MLflow 


In [7]:
mlruns_dir = str(ROOT / "mlruns")
mlflow.set_tracking_uri(f"file:///{mlruns_dir.replace('\\','/')}")
mlflow.set_experiment("PD_Baselines");
mlruns_dir


2025/10/28 14:50:23 INFO mlflow.tracking.fluent: Experiment with name 'PD_Baselines' does not exist. Creating a new experiment.


'C:\\Users\\USER\\Desktop\\credit\\mlruns'

## Цикл экспериментов (5 запусков)


In [14]:
from sklearn.exceptions import UndefinedMetricWarning
import warnings
warnings.filterwarnings("ignore", category=UndefinedMetricWarning)

def train_and_log(model, params: dict, run_name: str):
    pipe = Pipeline([("preprocess", preprocess), ("clf", model.set_params(**params))])

    with mlflow.start_run(run_name=run_name):
        mlflow.log_param("model", type(model).__name__)
        mlflow.log_params({f"clf__{k}": v for k,v in params.items()})

        pipe.fit(X_train, y_train)

        y_proba = pipe.predict_proba(X_test)[:, 1]
        y_pred  = pipe.predict(X_test)

        # метрики без ворнингов + доля предсказанных единиц
        metrics = {
            "roc_auc": float(roc_auc_score(y_test, y_proba)),
            "precision": float(precision_score(y_test, y_pred, zero_division=0)),
            "recall": float(recall_score(y_test, y_pred, zero_division=0)),
            "f1": float(f1_score(y_test, y_pred, zero_division=0)),
            "pred_pos_rate": float(y_pred.mean()),   # сколько «1» предсказали
        }
        mlflow.log_metrics(metrics)

        # ROC арт
        fpr, tpr, _ = roc_curve(y_test, y_proba)
        plt.figure(figsize=(5,4))
        plt.plot(fpr, tpr, label=f"AUC={metrics['roc_auc']:.3f}")
        plt.plot([0,1],[0,1],"--")
        plt.xlabel("FPR"); plt.ylabel("TPR"); plt.title(run_name); plt.legend(); plt.tight_layout()
        out_png = ROOT / "models" / f"roc_{run_name}.png"
        out_png.parent.mkdir(exist_ok=True, parents=True)
        plt.savefig(out_png, dpi=150); plt.close()
        mlflow.log_artifact(str(out_png))
        try: out_png.unlink()
        except: pass

        mlflow.sklearn.log_model(pipe, artifact_path="model")
        return metrics


runs = []
runs.append(train_and_log(LogisticRegression(max_iter=500, solver="lbfgs"), {"C":0.3}, "LR_C0.3"))
runs.append(train_and_log(LogisticRegression(max_iter=500, solver="lbfgs"), {"C":1.0}, "LR_C1.0"))
runs.append(train_and_log(GradientBoostingClassifier(), {"n_estimators":150,"learning_rate":0.1}, "GB_150_0.1"))
runs.append(train_and_log(RandomForestClassifier(random_state=42), {"n_estimators":300,"max_depth":8}, "RF_300_d8"))
runs.append(train_and_log(RandomForestClassifier(random_state=42), {"n_estimators":500,"max_depth":10}, "RF_500_d10"))

runs


[{'roc_auc': 0.7575339808236351,
  'precision': 0.6563814866760168,
  'recall': 0.35267520723436324,
  'f1': 0.4588235294117647,
  'pred_pos_rate': 0.11883333333333333},
 {'roc_auc': 0.7479272532115823,
  'precision': 0.6457461645746164,
  'recall': 0.34890730972117556,
  'f1': 0.45303326810176126,
  'pred_pos_rate': 0.1195},
 {'roc_auc': 0.7705216566622121,
  'precision': 0.6642441860465116,
  'recall': 0.3443858327053504,
  'f1': 0.45359801488833745,
  'pred_pos_rate': 0.11466666666666667},
 {'roc_auc': 0.7491205148271969,
  'precision': 0.0,
  'recall': 0.0,
  'f1': 0.0,
  'pred_pos_rate': 0.0},
 {'roc_auc': 0.7517428844146438,
  'precision': 0.0,
  'recall': 0.0,
  'f1': 0.0,
  'pred_pos_rate': 0.0}]

In [16]:
runs.append(train_and_log(
    LogisticRegression(max_iter=500, solver="lbfgs"),
    {"C": 1.0, "class_weight": "balanced"},
    "LR_bal_C1.0"
))
runs.append(train_and_log(
    RandomForestClassifier(random_state=42),
    {"n_estimators": 400, "max_depth": 10, "class_weight": "balanced"},
    "RF_bal_400_d10"
))


## Как открыть MLflow UI

### Как запустить MLflow UI
 1. Терминал (CMD/PowerShую**

1. Открой терминал и перейди в проект:

```powershell
cd C:\Users\USER\Desktop\credit
```

2. Запусти UI через питон из venv:

```powershell
.\.venv\Scripts\python -m mlflow ui --backend-store-uri mlruns --port 5001
```

3. Открой в браузере: [http://127.0.0.1:5001](http:го** окружения и увидит локальные эксперименты.


In [24]:
import subprocess, pathlib
ROOT = pathlib.Path.cwd() if pathlib.Path.cwd().name=="credit" else pathlib.Path.cwd().parent

def sh(c):
    r = subprocess.run(c, cwd=ROOT, shell=True, text=True, capture_output=True)
    print(">", c, "\n", r.stdout or r.stderr); 
    return r

# на всякий случай покажем, не игнорятся ли картинки .gitignore-ом
sh('git check-ignore -v reports/figures/* || echo "not ignored"')

# добавляем только скрины + README (если ты его менял)
sh("git add reports/figures/*")
sh("git add README.md")  # не страшно, если файла нет

# отдельный docs-коммит
sh('git commit -m "docs(mlflow): add screenshots (UI start, experiments list, RF_bal_400_d10 run)"')

# пуш
sh("git push")


> git check-ignore -v reports/figures/* || echo "not ignored" 
 "not ignored"

> git add reports/figures/* 
 
> git add README.md 
 fatal: pathspec 'README.md' did not match any files

> git commit -m "docs(mlflow): add screenshots (UI start, experiments list, RF_bal_400_d10 run)" 
 [main 7eaf1c9] docs(mlflow): add screenshots (UI start, experiments list, RF_bal_400_d10 run)
 3 files changed, 0 insertions(+), 0 deletions(-)
 create mode 100644 reports/figures/03-mlflow-run-RF_bal_400_d10.png
 create mode 100644 reports/figures/03-mlflow-terminal-ui-start.png
 create mode 100644 reports/figures/03-mlflow-ui-runs.png

> git push 
 To https://github.com/pero1x1/credit.git
   a69187a..7eaf1c9  main -> main



CompletedProcess(args='git push', returncode=0, stdout='', stderr='To https://github.com/pero1x1/credit.git\n   a69187a..7eaf1c9  main -> main\n')