Przygotowanie środowiska i danych

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split, cross_val_score, StratifiedKFold
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, f1_score, roc_auc_score
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC


def analyzeModels(dataPath):
  # === 1. Wczytanie danych ===

  df = pd.read_csv(dataPath)  

  # === 2. Podział na X/y i standaryzacja ===

  X = df.drop("DEATH_EVENT", axis=1)
  y = df["DEATH_EVENT"]

  # Standaryzacja
  scaler = StandardScaler()
  X_scaled = scaler.fit_transform(X)

  # === 3. Train/test split ===

  # Podział na trening/test
  X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.2, stratify=y, random_state=42)

  # === 4. Definicja modeli ===

  models = {
      "Random Forest": RandomForestClassifier(random_state=42),
      "SVM (RBF)": SVC(probability=True, random_state=42),
      "AdaBoost": AdaBoostClassifier(random_state=42)
  }

  # === 5. Trening i metryki ===

  print("=== Metryki na zbiorze testowym ===")
  for name, model in models.items():
      model.fit(X_train, y_train)
      y_pred = model.predict(X_test)
      y_proba = model.predict_proba(X_test)[:, 1] if hasattr(model, "predict_proba") else None

      acc = accuracy_score(y_test, y_pred)
      f1 = f1_score(y_test, y_pred)
      auc = roc_auc_score(y_test, y_proba) if y_proba is not None else "N/A"

      print(f"\nModel: {name}")
      print(f"Accuracy: {acc:.3f}")
      print(f"F1-score: {f1:.3f}")
      print(f"AUC-ROC: {auc}")

  # === 6. Cross-validation (z pomiarem wariancji) ===

  cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

  print("\n=== Cross-validation (Accuracy ± SD) ===")
  for name, model in models.items():
      scores = cross_val_score(model, X_scaled, y, cv=cv, scoring="accuracy")
      print(f"{name}: {scores.mean():.3f} ± {scores.std():.3f}")


In [2]:
print("=== Wyniki dla zbioru heart_1.csv ===")
analyzeModels("../dataset/heart_1.csv")

print("\n=== Wyniki dla zbioru heart_2.csv ===")
analyzeModels("../dataset/heart_2.csv")

=== Wyniki dla zbioru heart_1.csv ===
=== Metryki na zbiorze testowym ===

Model: Random Forest
Accuracy: 0.991
F1-score: 0.986
AUC-ROC: 0.9998723329186088

Model: SVM (RBF)
Accuracy: 0.956
F1-score: 0.929
AUC-ROC: 0.9818759168817665

Model: AdaBoost
Accuracy: 0.882
F1-score: 0.805
AUC-ROC: 0.9576632745910011

=== Cross-validation (Accuracy ± SD) ===
Random Forest: 0.992 ± 0.002
SVM (RBF): 0.957 ± 0.010
AdaBoost: 0.892 ± 0.008

=== Wyniki dla zbioru heart_2.csv ===
=== Metryki na zbiorze testowym ===

Model: Random Forest
Accuracy: 0.833
F1-score: 0.706
AUC-ROC: 0.8915275994865212

Model: SVM (RBF)
Accuracy: 0.767
F1-score: 0.533
AUC-ROC: 0.8446726572528884

Model: AdaBoost
Accuracy: 0.800
F1-score: 0.600
AUC-ROC: 0.8741976893453145

=== Cross-validation (Accuracy ± SD) ===
Random Forest: 0.836 ± 0.043
SVM (RBF): 0.809 ± 0.032
AdaBoost: 0.826 ± 0.035
