Przygotowanie środowiska i danych

In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split, StratifiedKFold, GridSearchCV, cross_val_score
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, f1_score, roc_auc_score
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB
from sklearn.pipeline import Pipeline


def _build_pipelines(random_state):
    """Return the candidate pipelines keyed by short model name.

    Every pipeline standardizes the features ('scaler') and then fits one
    classifier under the step name 'clf', so the grids returned by
    ``_build_param_grids`` can address hyperparameters as ``clf__<param>``.
    """
    classifiers = {
        'rf': RandomForestClassifier(random_state=random_state),
        # probability=True because predict_proba is called on every tuned
        # model when the test-set AUC is computed.
        'svm': SVC(class_weight='balanced', probability=True, random_state=random_state),
        'ada': AdaBoostClassifier(random_state=random_state),
        'nb': GaussianNB(),
    }
    return {
        name: Pipeline([('scaler', StandardScaler()), ('clf', clf)])
        for name, clf in classifiers.items()
    }


def _build_param_grids():
    """Return the GridSearchCV search space for each pipeline name."""
    return {
        'rf': {
            'clf__n_estimators': [50, 100],
            'clf__max_depth': [None, 5]
        },
        'svm': {
            'clf__C': [0.1, 1, 10],
            'clf__gamma': ['scale', 'auto']
        },
        'ada': {
            'clf__n_estimators': [50, 100, 200],
            'clf__learning_rate': [0.5, 1.0, 1.5]
        },
        'nb': {
            'clf__var_smoothing': [1e-9, 1e-8, 1e-7]
        }
    }


def calculations_with_tuning(dataPath, target="DEATH_EVENT", test_size=0.25,
                             n_splits=10, random_state=42):
    """Tune four classifiers on a CSV dataset and print their scores.

    The CSV is read, split into a stratified train/test pair, and each of
    the pipelines (random forest, SVM, AdaBoost, Gaussian naive Bayes) is
    tuned with GridSearchCV (scoring='roc_auc') on the training split.
    The best estimator of each search is then scored on the test split
    (accuracy, F1, ROC AUC) and finally cross-validated for accuracy.

    Parameters
    ----------
    dataPath : str
        Path passed to ``pd.read_csv``.
    target : str, default "DEATH_EVENT"
        Name of the label column; all remaining columns are used as features.
    test_size : float, default 0.25
        Fraction of rows held out by ``train_test_split``.
    n_splits : int, default 10
        Number of folds of the shared ``StratifiedKFold``.
    random_state : int, default 42
        Seed used for the split, the fold shuffling and the seeded classifiers.

    Returns
    -------
    None
        All results are printed to stdout.

    Raises
    ------
    KeyError
        If ``target`` is not a column of the loaded frame.
    """
    # === 1. Load the data ===
    df = pd.read_csv(dataPath)
    if target not in df.columns:
        raise KeyError(f"Column {target!r} not found in {dataPath!r}")

    # === 2. Split into features X and label y ===
    X = df.drop(columns=target)
    y = df[target]

    # === 3. Train/test split (stratified on the label) ===
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=test_size, stratify=y, random_state=random_state
    )

    # === 4. Pipelines and the hyperparameter grids to search ===
    cv = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=random_state)
    pipelines = _build_pipelines(random_state)
    param_grids = _build_param_grids()

    best_models = {}

    # === 5. Hyperparameter tuning with GridSearchCV (training split only) ===
    for name, pipeline in pipelines.items():
        print(f"Trening i strojenie: {name}")
        grid = GridSearchCV(
            pipeline, param_grids[name], cv=cv, scoring='roc_auc', n_jobs=-1
        )
        grid.fit(X_train, y_train)
        best_models[name] = grid.best_estimator_
        print(f"{name} best params: {grid.best_params_}")

    # === 6. Score the best models on the held-out test split ===
    print("\n=== Metryki na zbiorze testowym (po tuning) ===")
    for name, model in best_models.items():
        y_pred = model.predict(X_test)
        # Column 1 of predict_proba is used as the score for roc_auc_score.
        y_proba = model.predict_proba(X_test)[:, 1]

        acc = accuracy_score(y_test, y_pred)
        f1 = f1_score(y_test, y_pred)
        auc = roc_auc_score(y_test, y_proba)

        print(f"\nModel: {name}")
        print(f"Accuracy: {acc:.3f}")
        print(f"F1-score: {f1:.3f}")
        print(f"AUC-ROC: {auc:.3f}")

    # === 7. Cross-validation after tuning ===
    # NOTE(review): this runs on the full X, y (including the rows held out
    # in step 3), not only on the training split - confirm this is intended.
    print("\n=== Cross-validation (Accuracy ± SD) po tuningu ===")
    for name, model in best_models.items():
        scores = cross_val_score(
            model, X, y, cv=cv, scoring='accuracy', n_jobs=-1
        )
        print(f"{name}: {scores.mean():.3f} ± {scores.std():.3f}")


In [2]:
# Run the same tuning/evaluation routine on each dataset in turn; every entry
# pairs the banner printed before the run with the CSV path handed to it.
dataset_runs = [
    ("=== Wyniki dla zbioru heart_1.csv ===", "../dataset/heart_1.csv"),
    ("\n=== Wyniki dla zbioru heart_2.csv ===", "../dataset/heart_2.csv"),
]

for banner, csv_path in dataset_runs:
    print(banner)
    calculations_with_tuning(csv_path)

=== Wyniki dla zbioru heart_1.csv ===
Trening i strojenie: rf
rf best params: {'clf__max_depth': None, 'clf__n_estimators': 50}
Trening i strojenie: svm
svm best params: {'clf__C': 10, 'clf__gamma': 'scale'}
Trening i strojenie: ada
ada best params: {'clf__learning_rate': 1.5, 'clf__n_estimators': 200}
Trening i strojenie: nb
nb best params: {'clf__var_smoothing': 1e-09}

=== Metryki na zbiorze testowym (po tuning) ===

Model: rf
Accuracy: 0.994
F1-score: 0.990
AUC-ROC: 0.998

Model: svm
Accuracy: 0.982
F1-score: 0.971
AUC-ROC: 0.988

Model: ada
Accuracy: 0.959
F1-score: 0.934
AUC-ROC: 0.991

Model: nb
Accuracy: 0.791
F1-score: 0.603
AUC-ROC: 0.889

=== Cross-validation (Accuracy ± SD) po tuningu ===
rf: 0.992 ± 0.004
svm: 0.979 ± 0.007
ada: 0.960 ± 0.010
nb: 0.792 ± 0.013

=== Wyniki dla zbioru heart_2.csv ===
Trening i strojenie: rf
rf best params: {'clf__max_depth': 5, 'clf__n_estimators': 100}
Trening i strojenie: svm
svm best params: {'clf__C': 1, 'clf__gamma': 'scale'}
Trening i 