# Шаг 2. Baseline-модель (pipeline + метрики)

Цель: собрать sklearn Pipeline (импьютер, скейлер, one-hot) и обучить простую модель.
Сравню LogisticRegression и GradientBoosting, выберу лучшую по ROC-AUC.
Сохраню модель и ROC-кривую в папку models/.


In [2]:
%pip install -q matplotlib


Note: you may need to restart the kernel to use updated packages.


## 1) Загружаю train/test
Важно: убираю колонку ID.


In [1]:
from pathlib import Path
import pandas as pd

ROOT = Path.cwd() if Path.cwd().name == "credit" else Path.cwd().parent
PROC = ROOT / "data" / "processed"

train = pd.read_csv(PROC / "train.csv")
test  = pd.read_csv(PROC / "test.csv")

TARGET = "default.payment.next.month"

for df in (train, test):
    if "ID" in df.columns:
        df.drop(columns=["ID"], inplace=True)

X_train, y_train = train.drop(columns=[TARGET]), train[TARGET]
X_test,  y_test  = test.drop(columns=[TARGET]),  test[TARGET]

X_train.shape, X_test.shape


((24000, 23), (6000, 23))

## 2) Фичи и препроцессинг
Категориальные: SEX, EDUCATION, MARRIAGE и все PAY_*.
Численные: всё остальное.


In [4]:
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder, StandardScaler

all_cols = X_train.columns.tolist()
cat_cols = [c for c in ["SEX", "EDUCATION", "MARRIAGE"] if c in all_cols] + \
           [c for c in all_cols if c.startswith("PAY_")]
cat_cols = sorted(list(dict.fromkeys(cat_cols)))  # уникальные + порядок
num_cols = [c for c in all_cols if c not in cat_cols]

numeric_tf = Pipeline([
    ("imputer", SimpleImputer(strategy="median")),
    ("scaler", StandardScaler())
])
categorical_tf = Pipeline([
    ("imputer", SimpleImputer(strategy="most_frequent")),
    ("onehot", OneHotEncoder(handle_unknown="ignore"))
])

preprocess = ColumnTransformer([
    ("num", numeric_tf, num_cols),
    ("cat", categorical_tf, cat_cols)
])
len(num_cols), len(cat_cols)


(8, 15)

## 3) Обучение с GridSearchCV
Две модели: LogisticRegression и GradientBoosting. Метрика — ROC-AUC.


In [7]:
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.model_selection import GridSearchCV

pipe = Pipeline([
    ("preprocess", preprocess),
    ("clf", LogisticRegression(max_iter=500))
])

param_grid = [
    {"clf": [LogisticRegression(max_iter=500)],
     "clf__C": [0.3, 1.0, 3.0],
     "clf__solver": ["lbfgs"]},
    {"clf": [GradientBoostingClassifier()],
     "clf__n_estimators": [150, 250],
     "clf__learning_rate": [0.05, 0.1]}
]

gs = GridSearchCV(
    pipe, param_grid=param_grid,
    scoring="roc_auc", cv=5, n_jobs=-1, verbose=0
)
gs.fit(X_train, y_train)

gs.best_params_, round(gs.best_score_, 4)


({'clf': GradientBoostingClassifier(),
  'clf__learning_rate': 0.1,
  'clf__n_estimators': 150},
 0.7789)

## 4) Оценка на тесте + ROC-кривая
Посчитаю ROC-AUC, Precision, Recall, F1 и сохраню график.


In [12]:
from sklearn.metrics import roc_auc_score, precision_score, recall_score, f1_score, roc_curve
import matplotlib.pyplot as plt
import json, joblib
from pathlib import Path

best = gs.best_estimator_
model_name = type(best.named_steps["clf"]).__name__
best_model_params = {k.replace("clf__", ""): v
                     for k, v in gs.best_params_.items()
                     if k.startswith("clf__")}  # только гиперы модели

y_proba = best.predict_proba(X_test)[:, 1]
y_pred  = best.predict(X_test)

metrics = {
    "model": model_name,
    "model_params": best_model_params,  # здесь уже нет несериализуемых объектов
    "roc_auc": float(roc_auc_score(y_test, y_proba)),
    "precision": float(precision_score(y_test, y_pred)),
    "recall": float(recall_score(y_test, y_pred)),
    "f1": float(f1_score(y_test, y_pred)),
}

models_dir = ROOT / "models"
models_dir.mkdir(exist_ok=True, parents=True)

# ROC-кривая
fpr, tpr, _ = roc_curve(y_test, y_proba)
plt.figure(figsize=(5,4))
plt.plot(fpr, tpr, label=f"{model_name} | AUC={metrics['roc_auc']:.3f}")
plt.plot([0,1],[0,1],"--")
plt.xlabel("FPR"); plt.ylabel("TPR"); plt.title("ROC curve (test)")
plt.legend(loc="lower right"); plt.tight_layout()
plt.savefig(models_dir / "roc_curve.png", dpi=150); plt.close()

# Сохраняю модель и метрики
joblib.dump(best, models_dir / "credit_default_model.pkl")
(Path(models_dir / "metrics.json")).write_text(json.dumps(metrics, indent=2), encoding="utf-8")

metrics  # посмотрю, что получилось


{'model': 'GradientBoostingClassifier',
 'model_params': {'learning_rate': 0.1, 'n_estimators': 150},
 'roc_auc': 0.7705572956671517,
 'precision': 0.6647230320699709,
 'recall': 0.3436322532027129,
 'f1': 0.45305514157973176}

## 5) Итог по шагу
— Лучшая модель и её параметры в metrics.json  
— ROC-кривая: models/roc_curve.png  
— Модель: models/credit_default_model.pkl


## Git: коммит и пуш
Удалю лишний `$null` и запушу всё, чтобы был полный трек.


In [16]:
import subprocess, pathlib

def sh(c):
    r = subprocess.run(c, cwd=ROOT, shell=True, text=True, capture_output=True)
    print(">", c, "\n", r.stdout or r.stderr); 
    return r

if (ROOT/"$null").exists():
    try:
        (ROOT/"$null").unlink()
    except: 
        pass
    sh('git rm -f --cached "$null"')

sh("git add -A")
sh('git commit -m "feat(model): baseline pipeline + GridSearch + metrics + ROC plot; chore: remove $null"')
sh("git push")


> git rm -f --cached "$null" 
 rm '$null'

> git add -A 

> git commit -m "feat(model): baseline pipeline + GridSearch + metrics + ROC plot; chore: remove $null" 
 [main a69187a] feat(model): baseline pipeline + GridSearch + metrics + ROC plot; chore: remove $null
 6 files changed, 363 insertions(+), 18 deletions(-)
 delete mode 100644 $null
 create mode 100644 models/credit_default_model.pkl
 create mode 100644 models/metrics.json
 create mode 100644 models/roc_curve.png
 create mode 100644 notebooks/02_model_baseline.ipynb

> git push 
 To https://github.com/pero1x1/credit.git
   aff2a78..a69187a  main -> main



CompletedProcess(args='git push', returncode=0, stdout='', stderr='To https://github.com/pero1x1/credit.git\n   aff2a78..a69187a  main -> main\n')