# HW06 (S06-hw-dataset-03)



In [1]:
import json
from pathlib import Path

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn.dummy import DummyClassifier
from sklearn.ensemble import GradientBoostingClassifier, RandomForestClassifier
from sklearn.inspection import permutation_importance
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import (
    ConfusionMatrixDisplay,
    accuracy_score,
    f1_score,
    roc_auc_score,
)
from sklearn.model_selection import GridSearchCV, StratifiedKFold, train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.tree import DecisionTreeClassifier

# Input dataset lives two directories above the notebook.
DATA_PATH = Path("..", "..", "S06-hw-dataset-03.csv")

# Output folder for metrics and figures; created up front so later cells can write into it.
ARTIFACTS_DIR = Path("artifacts")
ARTIFACTS_DIR.mkdir(exist_ok=True, parents=True)



In [2]:
# Load the raw dataset and take a first look at it.
df = pd.read_csv(DATA_PATH)
print(df.head())
# DataFrame.info() writes its report to stdout itself and returns None,
# so wrapping it in print() only appended a stray "None" line.
df.info()
print(df["target"].value_counts(normalize=True))

# Features are every column except the label and, when present, the row identifier.
non_feature_columns = ["target"]
if "id" in df.columns:
    non_feature_columns.append("id")
X = df.drop(columns=non_feature_columns)
y = df["target"]



   id       f01       f02       f03       f04       f05       f06       f07  \
0   1 -2.721419  0.652294  1.867234 -0.245331 -0.241182 -0.195509  1.180193   
1   2 -4.191520 -0.647731 -0.881929 -0.968159  3.530725 -4.858592  0.240979   
2   3 -0.582739  0.415128 -4.205613 -0.320853  0.313570 -2.655451  2.215387   
3   4 -1.766082  1.253523  1.610804  0.466067  3.837868 -3.564073 -1.831031   
4   5 -2.157834 -1.361285 -0.917199  0.937285  0.408551 -0.062032 -0.480196   

        f08       f09  ...       f20       f21       f22       f23       f24  \
0 -0.724816  1.804165  ...  0.042851 -0.153232  1.566167 -1.516125 -1.586857   
1 -0.714017  0.285769  ... -1.170056  0.631661  1.277915 -0.464432  1.927986   
2  1.492222 -0.516727  ...  0.083281 -0.757912  5.672669 -0.283472  0.275362   
3  1.066265 -0.198636  ... -0.674648  1.780285 -4.718432  0.711573  1.705610   
4 -0.554454 -1.026434  ... -0.096277  0.212875  1.710699  2.476220  0.669305   

        f25       f26       f27       f28  t

In [3]:
# Hold out 25% of the rows for testing; stratify on the label so both parts
# keep the same class proportions, and fix the seed for reproducibility.
split_options = {"test_size": 0.25, "random_state": 42, "stratify": y}
X_train, X_test, y_train, y_test = train_test_split(X, y, **split_options)

# Sanity check: class shares in train and test should match closely.
for target_part in (y_train, y_test):
    print(target_part.value_counts(normalize=True))



target
0    0.542578
1    0.302311
2    0.155111
Name: proportion, dtype: float64
target
0    0.5424
1    0.3024
2    0.1552
Name: proportion, dtype: float64


In [4]:
def compute_metrics(model, X_test, y_test, is_multiclass=False):
    """Evaluate a fitted classifier on a hold-out set.

    Parameters
    ----------
    model : fitted estimator exposing ``predict`` and, optionally, ``predict_proba``.
    X_test : features of the evaluation set.
    y_test : true labels of the evaluation set.
    is_multiclass : bool, default False
        When True, F1 is macro-averaged and ROC AUC is computed one-vs-rest
        from the full probability matrix. When False, the default (binary)
        F1 is used and ROC AUC is computed from probability column 1.

    Returns
    -------
    dict
        Keys ``"accuracy"``, ``"f1"`` and ``"roc_auc"``. ``"roc_auc"`` is
        ``None`` when the model has no ``predict_proba``.
    """
    y_pred = model.predict(X_test)
    f1_options = {"average": "macro"} if is_multiclass else {}
    scores = {
        "accuracy": accuracy_score(y_test, y_pred),
        "f1": f1_score(y_test, y_pred, **f1_options),
    }

    # Without probability estimates there is nothing to rank, so ROC AUC stays None.
    if not hasattr(model, "predict_proba"):
        scores["roc_auc"] = None
        return scores

    y_proba = model.predict_proba(X_test)
    if is_multiclass:
        scores["roc_auc"] = roc_auc_score(y_test, y_proba, multi_class="ovr")
    else:
        # The positive class is 1 (assumes labels are {0, 1}, so column 1 is class 1).
        scores["roc_auc"] = roc_auc_score(y_test, y_proba[:, 1])
    return scores

# More than two distinct labels switches every metric to its multiclass variant.
is_multiclass = y.nunique() > 2

# Baselines: a majority-class predictor and a scaled logistic regression.
dummy = DummyClassifier(strategy="most_frequent", random_state=42)
logreg = Pipeline(
    steps=[
        ("scaler", StandardScaler()),
        ("logreg", LogisticRegression(max_iter=1000, random_state=42)),
    ]
)

# Test-set scores per model name; later cells add the tuned models to this dict.
metrics = {}
for baseline_name, baseline in (
    ("DummyClassifier", dummy),
    ("LogisticRegression", logreg),
):
    baseline.fit(X_train, y_train)
    metrics[baseline_name] = compute_metrics(baseline, X_test, y_test, is_multiclass)

metrics


{'DummyClassifier': {'accuracy': 0.5424,
  'f1': 0.2344398340248963,
  'roc_auc': 0.5},
 'LogisticRegression': {'accuracy': 0.72,
  'f1': 0.6632634159041243,
  'roc_auc': 0.8467788482997881}}

In [5]:
# Selection metric for the grid searches: macro F1 for multiclass targets,
# ROC AUC for binary ones.
scoring = "f1_macro" if is_multiclass else "roc_auc"
# One shared, seeded 3-fold stratified splitter so every model is tuned on the same folds.
cv = StratifiedKFold(n_splits=3, shuffle=True, random_state=42)


def _fit_grid_search(estimator, param_grid):
    """Run an exhaustive grid search for ``estimator`` on the training split.

    Uses the notebook-level ``scoring`` and ``cv`` settings and fits on
    ``X_train`` / ``y_train``. Returns the fitted ``GridSearchCV`` object.
    """
    search = GridSearchCV(
        estimator,
        param_grid,
        scoring=scoring,
        cv=cv,
        n_jobs=-1,
    )
    search.fit(X_train, y_train)
    return search


# Decision Tree
tree = DecisionTreeClassifier(random_state=42)
tree_grid = {
    "max_depth": [3, 5, 8],
    "min_samples_leaf": [1, 5],
    "ccp_alpha": [0.0, 0.001],
}
tree_search = _fit_grid_search(tree, tree_grid)

# Random Forest
# NOTE(review): the forest itself uses n_jobs=-1 while the search also runs with
# n_jobs=-1; this nesting may oversubscribe CPU cores — confirm on the target machine.
rf = RandomForestClassifier(random_state=42, n_jobs=-1)
rf_grid = {
    "n_estimators": [150],
    "max_depth": [None, 10],
    "min_samples_leaf": [1, 5],
    "max_features": ["sqrt"],
}
rf_search = _fit_grid_search(rf, rf_grid)

# Gradient Boosting
gb = GradientBoostingClassifier(random_state=42)
gb_grid = {
    "n_estimators": [100],
    "learning_rate": [0.05, 0.1],
    "max_depth": [2, 3],
}
gb_search = _fit_grid_search(gb, gb_grid)

# Best estimator of each search, keyed by model name.
models = {
    "DecisionTreeClassifier": tree_search.best_estimator_,
    "RandomForestClassifier": rf_search.best_estimator_,
    "GradientBoostingClassifier": gb_search.best_estimator_,
}

models


{'DecisionTreeClassifier': DecisionTreeClassifier(max_depth=8, min_samples_leaf=5, random_state=42),
 'RandomForestClassifier': RandomForestClassifier(n_estimators=150, n_jobs=-1, random_state=42),
 'GradientBoostingClassifier': GradientBoostingClassifier(random_state=42)}

In [6]:
# Score every tuned model on the held-out test set and add it next to the baselines.
metrics.update(
    {
        model_name: compute_metrics(estimator, X_test, y_test, is_multiclass)
        for model_name, estimator in models.items()
    }
)

metrics


{'DummyClassifier': {'accuracy': 0.5424,
  'f1': 0.2344398340248963,
  'roc_auc': 0.5},
 'LogisticRegression': {'accuracy': 0.72,
  'f1': 0.6632634159041243,
  'roc_auc': 0.8467788482997881},
 'DecisionTreeClassifier': {'accuracy': 0.7773333333333333,
  'f1': 0.7127050263372623,
  'roc_auc': 0.8637234867383086},
 'RandomForestClassifier': {'accuracy': 0.8842666666666666,
  'f1': 0.8547630499128241,
  'roc_auc': 0.9521801686426321},
 'GradientBoostingClassifier': {'accuracy': 0.8277333333333333,
  'f1': 0.7878856130711229,
  'roc_auc': 0.9267500849815864}}

In [7]:
# Persist the test-set metrics of every evaluated model as JSON.
metrics_path = ARTIFACTS_DIR / "metrics_test.json"
with metrics_path.open("w", encoding="utf-8") as metrics_file:
    json.dump(metrics, metrics_file, ensure_ascii=False, indent=2)

# Metric used to rank the models: macro F1 for multiclass, ROC AUC otherwise.
best_metric_name = "f1" if is_multiclass else "roc_auc"


def _ranking_value(model_name):
    """Return the ranking metric for ``model_name``; a missing (None) score ranks last."""
    value = metrics[model_name][best_metric_name]
    return -np.inf if value is None else value


best_model_name = max(metrics, key=_ranking_value)

best_model_name


'RandomForestClassifier'

In [8]:
# best_model_name is picked from `metrics`, which also contains the baselines,
# while `models` holds only the tuned estimators. Look the name up among all
# fitted models so a winning baseline does not raise KeyError.
fitted_models = {
    "DummyClassifier": dummy,
    "LogisticRegression": logreg,
    **models,
}
best_model = fitted_models[best_model_name]

# Draw the test-set confusion matrix of the selected model and save it to disk.
fig, ax = plt.subplots(figsize=(5, 4))
ConfusionMatrixDisplay.from_estimator(best_model, X_test, y_test, ax=ax, colorbar=False)
ax.set_title("Confusion Matrix")
fig.tight_layout()
fig.savefig(ARTIFACTS_DIR / "figures_confusion_matrix_from_nb.png", dpi=150)
# Close the figure: it is only needed as a saved file, not as inline output.
plt.close(fig)

"done"


'done'

In [9]:
# Permutation importance of the selected model on the test set (5 shuffles per feature).
result = permutation_importance(best_model, X_test, y_test, n_repeats=5, random_state=42)
importances = result.importances_mean

# Indices of the 15 most important features, most important first.
indices = np.argsort(importances)[::-1][:15]

# barh draws position 0 at the bottom, so plot in reverse to put the top feature on top.
plot_order = indices[::-1]
bar_positions = range(len(indices))
feature_names = np.array(X.columns)

fig, ax = plt.subplots(figsize=(7, 5))
ax.barh(bar_positions, importances[plot_order])
ax.set_yticks(bar_positions)
ax.set_yticklabels(feature_names[plot_order])
ax.set_xlabel("Mean Importance (Permutation)")
ax.set_title("Top Features")
fig.tight_layout()
fig.savefig(ARTIFACTS_DIR / "figures_permutation_importance_from_nb.png", dpi=150)
plt.close(fig)

"done"


'done'