# HW06 – (S06-hw-dataset-01)


In [1]:
import json
from pathlib import Path

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn.dummy import DummyClassifier
from sklearn.ensemble import GradientBoostingClassifier, RandomForestClassifier
from sklearn.inspection import permutation_importance
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import (
    ConfusionMatrixDisplay,
    accuracy_score,
    f1_score,
    roc_auc_score,
    roc_curve,
)
from sklearn.model_selection import GridSearchCV, StratifiedKFold, train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.tree import DecisionTreeClassifier

# Input CSV, addressed relative to the working directory.
DATA_PATH = Path("data", "S06-hw-dataset-01.csv")

# Folder that later cells write the metrics JSON and figures into; it is
# created up front and an already existing folder is not an error.
ARTIFACTS_DIR = Path("artifacts")
ARTIFACTS_DIR.mkdir(exist_ok=True, parents=True)



In [2]:
# Load the dataset and take a first look at it.
df = pd.read_csv(DATA_PATH)
print(df.head())
# DataFrame.info() prints its report itself and returns None, so wrapping it
# in print() only appended a stray "None" line to the output.
df.info()
# Relative frequency of each target class.
print(df["target"].value_counts(normalize=True))

# Features are every column except the label and, when it is present, "id".
X = df.drop(columns=["target"]).drop(columns=["id"], errors="ignore")
y = df["target"]



   id     num01     num02     num03     num04     num05     num06     num07  \
0   1 -0.946058 -0.070313  1.824445 -2.754422  0.808865 -0.111094 -0.268950   
1   2 -2.484027  0.739378  1.596908 -2.586479 -0.033225 -3.054412 -4.706908   
2   3  1.522629  7.159635 -0.564903 -4.493110  1.623610  5.450187 -0.974595   
3   4  0.463373 -1.073908  1.752813  0.362786  2.790872  4.082385  0.322283   
4   5  3.188390 -4.701692 -0.689918 -0.448995  0.373821 -3.275363 -1.760931   

      num08     num09  ...     num20     num21     num22     num23     num24  \
0 -3.078210  0.801275  ... -1.616515 -1.989464  1.407390 -0.218362  2.016052   
1 -9.795169  0.145911  ... -1.727040 -0.583997  1.136761  0.285978 -0.310879   
2 -5.189589  1.600591  ...  0.524408  2.022430  1.278358 -0.850547  0.847457   
3  3.390984 -0.033929  ...  2.399834 -1.431576 -0.746987  0.049639  2.414689   
4  0.923689  0.537345  ... -2.183407 -2.896590  2.440343 -1.097168  1.457323   

   cat_contract  cat_region  cat_payment  te

In [3]:
# Hold out 25% of the rows as the test part. The split is stratified on the
# target and seeded, so it is repeatable from run to run.
split_parts = train_test_split(X, y, test_size=0.25, stratify=y, random_state=42)
X_train, X_test, y_train, y_test = split_parts

# Class shares in the train part first, then in the test part.
for target_part in (y_train, y_test):
    print(target_part.value_counts(normalize=True))



target
0    0.676556
1    0.323444
Name: proportion, dtype: float64
target
0    0.676667
1    0.323333
Name: proportion, dtype: float64


In [4]:
from sklearn.metrics import accuracy_score, f1_score, roc_auc_score

def compute_metrics(model, X_test, y_test, is_multiclass=False):
    """Score a fitted classifier on held-out data.

    Parameters
    ----------
    model
        Fitted estimator with a ``predict`` method. ``predict_proba`` (or,
        for a binary target, ``decision_function``) is used for ROC AUC when
        the estimator provides it.
    X_test
        Feature matrix to predict on.
    y_test
        True labels for ``X_test``.
    is_multiclass : bool, default False
        When True, F1 is macro-averaged and ROC AUC is computed one-vs-rest
        from the class probabilities.

    Returns
    -------
    dict
        Keys ``"accuracy"``, ``"f1"`` and ``"roc_auc"``. ``"roc_auc"`` is
        ``None`` when the estimator exposes no usable scores.
    """
    preds = model.predict(X_test)

    # "binary" is f1_score's own default; it is spelled out so that both
    # branches go through the same call.
    f1_average = "macro" if is_multiclass else "binary"
    metrics = {
        "accuracy": accuracy_score(y_test, preds),
        # zero_division=0 keeps the 0.0 result for a model that never predicts
        # the positive class, without the UndefinedMetricWarning.
        "f1": f1_score(y_test, preds, average=f1_average, zero_division=0),
    }

    roc_auc = None
    if hasattr(model, "predict_proba"):
        proba = model.predict_proba(X_test)
        if is_multiclass:
            roc_auc = roc_auc_score(y_test, proba, multi_class="ovr")
        else:
            # Column 1 is used as the score of the positive class.
            roc_auc = roc_auc_score(y_test, proba[:, 1])
    elif not is_multiclass and hasattr(model, "decision_function"):
        # Binary ROC AUC only needs a ranking, so non-thresholded decision
        # values are an accepted substitute for probabilities.
        roc_auc = roc_auc_score(y_test, model.decision_function(X_test))

    metrics["roc_auc"] = roc_auc
    return metrics

from sklearn.dummy import DummyClassifier
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier

# More than two distinct target labels switches the metrics to their
# multiclass variants.
is_multiclass = y.nunique() > 2

# Untuned reference models, all built with the same seed.
dummy = DummyClassifier(strategy="most_frequent", random_state=42)
logreg = Pipeline(
    steps=[
        ("scaler", StandardScaler()),
        ("logreg", LogisticRegression(max_iter=1000, random_state=42)),
    ]
)
dt = DecisionTreeClassifier(random_state=42)
rf = RandomForestClassifier(random_state=42)
gb = GradientBoostingClassifier(random_state=42)

# Report name -> estimator, in the order the results are listed.
baselines = {
    "Dummy": dummy,
    "LogisticRegression": logreg,
    "DecisionTree": dt,
    "RandomForest": rf,
    "Boosting": gb,
}

# Fit every baseline on the train part and record its test-part metrics.
metrics = {}
for baseline_name, baseline in baselines.items():
    baseline.fit(X_train, y_train)
    metrics[baseline_name] = compute_metrics(baseline, X_test, y_test, is_multiclass)

metrics


{'Dummy': {'accuracy': 0.6766666666666666, 'f1': 0.0, 'roc_auc': 0.5},
 'LogisticRegression': {'accuracy': 0.8296666666666667,
  'f1': 0.7146845337800112,
  'roc_auc': 0.8789091463104972},
 'DecisionTree': {'accuracy': 0.8566666666666667,
  'f1': 0.7767393561786086,
  'roc_auc': 0.8343354832156822},
 'RandomForest': {'accuracy': 0.9306666666666666,
  'f1': 0.8878101402373247,
  'roc_auc': 0.9696805647250012},
 'Boosting': {'accuracy': 0.9063333333333333,
  'f1': 0.8446655610834716,
  'roc_auc': 0.9582387892946016}}

In [5]:
# Selection metric: macro F1 for a multiclass target, ROC AUC otherwise.
scoring = "f1_macro" if is_multiclass else "roc_auc"
cv = StratifiedKFold(n_splits=3, shuffle=True, random_state=42)


def _run_grid_search(estimator, param_grid):
    """Run an exhaustive grid search for ``estimator`` on the train part.

    Uses the cell-level ``scoring`` and ``cv`` settings and fits on
    ``X_train`` / ``y_train``. ``refit=True`` makes the search expose
    ``best_estimator_`` after fitting.

    Parameters
    ----------
    estimator
        Unfitted scikit-learn estimator to tune.
    param_grid : dict
        Parameter name -> list of candidate values.

    Returns
    -------
    GridSearchCV
        The fitted search object.
    """
    search = GridSearchCV(
        estimator,
        param_grid,
        scoring=scoring,
        cv=cv,
        n_jobs=-1,
        refit=True,
    )
    search.fit(X_train, y_train)
    return search


# Model selection relies only on cross-validation over the train part;
# the test part is not touched here.

# Decision Tree: complexity control
tree = DecisionTreeClassifier(random_state=42)
tree_grid = {
    # structural limits
    "max_depth": [None, 3, 5, 8, 12],
    "max_leaf_nodes": [None, 15, 31, 63],
    # split constraints
    "min_samples_split": [2, 10, 30],
    "min_samples_leaf": [1, 5, 15],
    # improvement threshold and post-pruning
    "min_impurity_decrease": [0.0, 1e-4, 1e-3],
    "ccp_alpha": [0.0, 1e-4, 1e-3],
}
tree_search = _run_grid_search(tree, tree_grid)

# Random Forest
rf = RandomForestClassifier(random_state=42, n_jobs=-1)
rf_grid = {
    "n_estimators": [150],
    "max_depth": [None, 10],
    "min_samples_leaf": [1, 5],
    "max_features": ["sqrt"],
}
rf_search = _run_grid_search(rf, rf_grid)

# Gradient Boosting
gb = GradientBoostingClassifier(random_state=42)
gb_grid = {
    "n_estimators": [100],
    "learning_rate": [0.05, 0.1],
    "max_depth": [2, 3],
}
gb_search = _run_grid_search(gb, gb_grid)

# Bookkeeping keyed by estimator class name; the insertion order matches the
# order in which the searches were run.
searches = {
    "DecisionTreeClassifier": tree_search,
    "RandomForestClassifier": rf_search,
    "GradientBoostingClassifier": gb_search,
}
models = {name: search.best_estimator_ for name, search in searches.items()}
cv_scores = {name: search.best_score_ for name, search in searches.items()}

# CV summary
search_summaries = {
    name: {
        "scoring": scoring,
        "best_score_cv": float(search.best_score_),
        "best_params": search.best_params_,
    }
    for name, search in searches.items()
}

# Winner = highest mean CV score (on a tie, the first one in insertion order).
best_model_name = max(cv_scores, key=cv_scores.get)
best_model = models[best_model_name]

models


{'DecisionTreeClassifier': DecisionTreeClassifier(min_samples_leaf=15, random_state=42),
 'RandomForestClassifier': RandomForestClassifier(n_estimators=150, n_jobs=-1, random_state=42),
 'GradientBoostingClassifier': GradientBoostingClassifier(random_state=42)}

In [6]:
# Score the CV-selected model on the test part and store the result under the
# model's own name, next to the baseline entries.
metrics.update(
    {best_model_name: compute_metrics(best_model, X_test, y_test, is_multiclass)}
)
metrics


{'Dummy': {'accuracy': 0.6766666666666666, 'f1': 0.0, 'roc_auc': 0.5},
 'LogisticRegression': {'accuracy': 0.8296666666666667,
  'f1': 0.7146845337800112,
  'roc_auc': 0.8789091463104972},
 'DecisionTree': {'accuracy': 0.8566666666666667,
  'f1': 0.7767393561786086,
  'roc_auc': 0.8343354832156822},
 'RandomForest': {'accuracy': 0.9306666666666666,
  'f1': 0.8878101402373247,
  'roc_auc': 0.9696805647250012},
 'Boosting': {'accuracy': 0.9063333333333333,
  'f1': 0.8446655610834716,
  'roc_auc': 0.9582387892946016},
 'RandomForestClassifier': {'accuracy': 0.9313333333333333,
  'f1': 0.8890086206896551,
  'roc_auc': 0.9696493321822152}}

In [7]:
import json

# Persist the test metrics as indented UTF-8 JSON; non-ASCII characters are
# written as-is rather than escaped.
metrics_path = ARTIFACTS_DIR / "metrics_test.json"
metrics_path.write_text(
    json.dumps(metrics, ensure_ascii=False, indent=2),
    encoding="utf-8",
)
best_model_name


'RandomForestClassifier'

In [8]:
# Look up the estimator that won the cross-validated comparison.
best_model = models[best_model_name]

# Confusion matrix on the test part: drawn, written to disk, then closed.
confusion_png = ARTIFACTS_DIR / "figures_confusion_matrix_from_nb.png"
fig, ax = plt.subplots(figsize=(5, 4))
ConfusionMatrixDisplay.from_estimator(best_model, X_test, y_test, colorbar=False, ax=ax)
ax.set_title("Confusion Matrix")
fig.tight_layout()
fig.savefig(confusion_png, dpi=150)
plt.close(fig)

# The ROC curve is drawn only for a binary target and only when the model can
# produce class probabilities.
if not is_multiclass and hasattr(best_model, "predict_proba"):
    proba = best_model.predict_proba(X_test)[:, 1]
    fpr, tpr, _ = roc_curve(y_test, proba)
    roc_auc = roc_auc_score(y_test, proba)

    roc_png = ARTIFACTS_DIR / "figures_roc_curve_from_nb.png"
    roc_fig, roc_ax = plt.subplots(figsize=(6, 4))
    roc_ax.plot(fpr, tpr, label=f"ROC AUC = {roc_auc:.4f}")
    # Diagonal reference line from (0, 0) to (1, 1).
    roc_ax.plot([0, 1], [0, 1], linestyle="--", color="gray")
    roc_ax.set_xlabel("False Positive Rate")
    roc_ax.set_ylabel("True Positive Rate")
    roc_ax.set_title("ROC Curve")
    roc_ax.legend(loc="lower right")
    roc_fig.tight_layout()
    roc_fig.savefig(roc_png, dpi=150)
    plt.close(roc_fig)

"done"


'done'

In [9]:
# Permutation importance of every feature for the selected model, measured on
# the test part with 5 shuffles per feature and a fixed seed.
result = permutation_importance(best_model, X_test, y_test, n_repeats=5, random_state=42)
importances = result.importances_mean
# Positions of the (at most) 15 largest mean importances, largest first.
indices = np.argsort(importances)[::-1][:15]

# The bars are fed in reverse order so that the largest importance lands on
# the highest bar position.
bottom_up = indices[::-1]
bar_positions = range(len(indices))

imp_fig, imp_ax = plt.subplots(figsize=(7, 5))
imp_ax.barh(bar_positions, importances[bottom_up])
imp_ax.set_yticks(bar_positions)
imp_ax.set_yticklabels(np.array(X.columns)[bottom_up])
imp_ax.set_xlabel("Mean Importance (Permutation)")
imp_ax.set_title("Top Features")
imp_fig.tight_layout()
imp_fig.savefig(ARTIFACTS_DIR / "figures_permutation_importance_from_nb.png", dpi=150)
plt.close(imp_fig)

"done"


'done'