In [4]:
# ============================================
# Cell 1 – Import libraries (Classification notebook)
# --------------------------------------------
# What this cell does:
# - Imports all Python libraries we need for:
#   * Data handling (NumPy, pandas)
#   * Splitting data into train / validation / test
#   * Preprocessing (scaling numeric features, encoding categorical features)
#   * Building pipelines (preprocess + model together)
#   * Evaluating classification performance
#   * Models for the SAME classification dataset:
#       - Random Forest (trained and evaluated in Cell 6)
#       - LinearSVC (imported here)
#       - Logistic Regression (imported here; correct for classification)
# ============================================

import joblib  # joblib: save / load fitted scikit-learn objects to disk
import numpy as np  # NumPy: fast numerical operations and array handling
import pandas as pd  # pandas: work with tabular data (dataframes similar to Excel tables)

from sklearn.model_selection import train_test_split  # function to split data into train / validation / test sets

from sklearn.compose import ColumnTransformer  # lets us apply different preprocessing to different columns
from sklearn.preprocessing import OneHotEncoder  # transforms categorical (text) columns into numeric dummy variables
from sklearn.preprocessing import StandardScaler  # scales numeric features (mean 0, std 1), useful for KNN and logistic regression

from sklearn.pipeline import Pipeline  # allows us to chain preprocessing and model into one pipeline object

# ---- Classification metrics ----
from sklearn.metrics import accuracy_score  # basic classification metric: fraction of correct predictions
from sklearn.metrics import classification_report  # detailed classification metrics (precision, recall, f1-score for each class)

# ---- Models we will use on the CLASSIFICATION dataset ----
from sklearn.ensemble import RandomForestClassifier  # Random Forest model (ensemble of decision trees)
from sklearn.svm import LinearSVC
from sklearn.linear_model import LogisticRegression  # Logistic Regression model (well-suited for binary classification)


In [5]:
# ============================================
# Cell 2 – Load the Heart Disease dataset
# --------------------------------------------
# What this cell does:
# - Loads heart.csv from the current working directory.
# - Displays the first few rows to understand the structure.
# - Prints dataset shape, data types, and missing values.
# - Verifies that the target column (HeartDisease) exists and is binary.
# ============================================

# The path is relative to the working directory; adjust it if the CSV lives
# somewhere else (for example inside a Kaggle input folder).
df = pd.read_csv("heart.csv")

# Show the first 5 rows.
# A bare `df.head()` is only rendered when it is the LAST expression of a
# cell, so it has to be passed to display() explicitly here.
display(df.head())

# Print dataset shape (rows, columns)
print("Dataset shape:", df.shape)

# Show information about column types and non-null counts
df.info()

# Print missing values for each column
print("\nMissing values per column:")
print(df.isnull().sum())

# Set the target variable for classification
target = "HeartDisease"  # 1 = disease present, 0 = no disease

# Fail early with a readable message if the expected column is missing
if target not in df.columns:
    raise KeyError(f"Target column {target!r} not found; available columns: {list(df.columns)}")

# Check uniqueness of the target values
print("\nTarget variable selected:", target)
print("Unique values in target:", df[target].unique())

# The rest of the notebook treats the target as a 0/1 label, so enforce it
# (a missing value in the target also trips this check).
assert set(df[target].unique()) <= {0, 1}, f"{target} must only contain 0 and 1"

Dataset shape: (918, 12)
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 918 entries, 0 to 917
Data columns (total 12 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   Age             918 non-null    int64  
 1   Sex             918 non-null    object 
 2   ChestPainType   918 non-null    object 
 3   RestingBP       918 non-null    int64  
 4   Cholesterol     918 non-null    int64  
 5   FastingBS       918 non-null    int64  
 6   RestingECG      918 non-null    object 
 7   MaxHR           918 non-null    int64  
 8   ExerciseAngina  918 non-null    object 
 9   Oldpeak         918 non-null    float64
 10  ST_Slope        918 non-null    object 
 11  HeartDisease    918 non-null    int64  
dtypes: float64(1), int64(6), object(5)
memory usage: 86.2+ KB

Missing values per column:
Age               0
Sex               0
ChestPainType     0
RestingBP         0
Cholesterol       0
FastingBS         0
RestingECG        0
MaxHR    

In [6]:
# ============================================
# Cell 3 – Separate features & target, detect feature types
# --------------------------------------------
# What this cell does:
# - Splits the DataFrame into:
#     * X = input features (all columns except the target)
#     * y = target labels (HeartDisease)
# - Checks the distribution of the target (how many 0 vs 1).
# - Automatically detects:
#     * numeric feature columns
#     * categorical feature columns
# - Prints lists of numeric and categorical columns so we know
#   what will be scaled and what will be one-hot encoded later.
# ============================================

# Separate input features (X) and target labels (y)
X = df.drop(columns=[target])  # X = all columns except the target
y = df[target]                 # y = the target column only

# Show basic information about the target distribution (class balance)
print("Target value counts (0 = no disease, 1 = disease):")
print(y.value_counts())

# Detect numeric columns by the generic "number" selector instead of the
# concrete int64/float64 dtypes: a column stored as int32 or float32 would
# otherwise be classified as categorical and end up one-hot encoded.
numeric_features = X.select_dtypes(include="number").columns.tolist()
# Everything that is not numeric is treated as categorical
categorical_features = X.select_dtypes(exclude="number").columns.tolist()

print("\nNumeric feature columns:")
print(numeric_features)

print("\nCategorical feature columns:")
print(categorical_features)

Target value counts (0 = no disease, 1 = disease):
HeartDisease
1    508
0    410
Name: count, dtype: int64

Numeric feature columns:
['Age', 'RestingBP', 'Cholesterol', 'FastingBS', 'MaxHR', 'Oldpeak']

Categorical feature columns:
['Sex', 'ChestPainType', 'RestingECG', 'ExerciseAngina', 'ST_Slope']


In [7]:
# ============================================
# Cell 4 – Train / Validation / Test split (70% / 15% / 15%)
# --------------------------------------------
# Two consecutive stratified splits:
#   1. hold out 15% of all rows as the TEST set;
#   2. carve the VALIDATION set out of the remaining 85% so that it
#      also amounts to 15% of the full dataset.
# What is left (about 70%) is the TRAINING set.
# Stratifying on the labels keeps the 0/1 ratio the same in every part.
# ============================================

# --- Step 1: hold out the test set (15% of everything) ---
X_temp, X_test, y_temp, y_test = train_test_split(
    X, y, test_size=0.15, random_state=42, stratify=y
)

# --- Step 2: split the remaining 85% into train and validation ---
# 15% of the total expressed as a share of the remaining 85%:
#       0.15 / 0.85 = ~0.17647
validation_ratio = 0.15 / 0.85

X_train, X_val, y_train, y_val = train_test_split(
    X_temp, y_temp, test_size=validation_ratio, random_state=42, stratify=y_temp
)

# Report absolute sizes to confirm the 70/15/15 split
for label, part in (
    ("Train set size:     ", X_train),
    ("Validation set size:", X_val),
    ("Test set size:      ", X_test),
):
    print(f"{label} {part.shape[0]} rows")

# Optional check: the same sizes as a percentage of the full dataset
total = len(df)
print("\nPercentage breakdown:")
for label, part in (
    ("Train:      ", X_train),
    ("Validation: ", X_val),
    ("Test:       ", X_test),
):
    print(f"{label} {round(part.shape[0] / total * 100, 2)} %")


Train set size:      642 rows
Validation set size: 138 rows
Test set size:       138 rows

Percentage breakdown:
Train:       69.93 %
Validation:  15.03 %
Test:        15.03 %


In [8]:
# ============================================
# Cell 5 – Manual preprocessing (without a Pipeline)
# --------------------------------------------
# What this cell does:
# 1. Defines a ColumnTransformer (scaling for numeric columns,
#    one-hot encoding for categorical columns).
# 2. Fits it on X_train and applies it to X_train, X_val and X_test.
# 3. Wraps the resulting arrays in DataFrames with column names.
# ============================================

# 1. Transformers
categorical_transformer = OneHotEncoder(
    handle_unknown="ignore",
    sparse_output=False,  # dense output, so it can be wrapped in a readable DataFrame
)
numeric_transformer = StandardScaler()

# The preprocessor combines both transformers
preprocessor = ColumnTransformer(
    [
        ("num", numeric_transformer, numeric_features),
        ("cat", categorical_transformer, categorical_features),
    ],
    verbose_feature_names_out=False,  # plain column names without the "num__" / "cat__" prefix
)

# 2. Fit and transform
# IMPORTANT: fit() must only ever see X_train.
print("Starte Preprocessing...")

X_train_np = preprocessor.fit_transform(X_train)  # learn the parameters and apply them
X_val_np = preprocessor.transform(X_val)          # apply only, nothing new is learned
X_test_np = preprocessor.transform(X_test)        # apply only

# 3. Keep the feature names (important for XAI)
# The names of the generated columns come straight from the preprocessor.
feature_names = preprocessor.get_feature_names_out()

# Turn the NumPy arrays back into DataFrames that carry the column names and
# the row index of the split they were computed from.
X_train_processed, X_val_processed, X_test_processed = (
    pd.DataFrame(matrix, columns=feature_names, index=raw_part.index)
    for matrix, raw_part in (
        (X_train_np, X_train),
        (X_val_np, X_val),
        (X_test_np, X_test),
    )
)

print("Preprocessing abgeschlossen!")
print(f"Neue Anzahl Features: {X_train_processed.shape[1]}")
print("\nSo sehen die Daten jetzt aus (erste 3 Zeilen X_train_processed):")
display(X_train_processed.head(3))

Starte Preprocessing...
Preprocessing abgeschlossen!
Neue Anzahl Features: 20

So sehen die Daten jetzt aus (erste 3 Zeilen X_train_processed):


Unnamed: 0,Age,RestingBP,Cholesterol,FastingBS,MaxHR,Oldpeak,Sex_F,Sex_M,ChestPainType_ASY,ChestPainType_ATA,ChestPainType_NAP,ChestPainType_TA,RestingECG_LVH,RestingECG_Normal,RestingECG_ST,ExerciseAngina_N,ExerciseAngina_Y,ST_Slope_Down,ST_Slope_Flat,ST_Slope_Up
43,-0.178734,0.377473,0.49781,-0.525682,1.31363,-0.843569,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,1.0
760,-0.074608,-0.515599,0.70998,-0.525682,-1.568765,1.02779,0.0,1.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0
255,-0.074608,-0.6732,0.636182,-0.525682,-0.223647,-0.843569,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0


In [17]:
# ============================================
# Cell 6 – Train & Evaluate Random Forest
# --------------------------------------------
# Trains a RandomForestClassifier on the preprocessed training data and
# evaluates it on the validation split. The notebook picks this model
# because it is an interesting subject for XAI methods such as SHAP or LIME.
#
# RandomForestClassifier, accuracy_score and classification_report are
# imported in the import cell at the top of the notebook.
# ============================================

# 1. Define the model
# A fixed random_state keeps the results reproducible between runs.
rf_model = RandomForestClassifier(
    n_estimators=100,
    max_depth=None,   # no depth limit on the individual trees
    random_state=39,
    n_jobs=-1         # use all available CPU cores for faster training
)

# 2. Train the model
# Training uses the preprocessed features (X_train_processed) from Cell 5.
rf_model.fit(X_train_processed, y_train)

# 3. Predict on the validation split
y_val_pred = rf_model.predict(X_val_processed)

# 4. Evaluate
val_accuracy = accuracy_score(y_val, y_val_pred)

print(f"Random Forest – Validation accuracy: {val_accuracy:.4f}")
print("\nClassification report:")
print(classification_report(y_val, y_val_pred))

Random Forest – Validation accuracy: 0.8406

Classification report:
              precision    recall  f1-score   support

           0       0.83      0.81      0.82        62
           1       0.85      0.87      0.86        76

    accuracy                           0.84       138
   macro avg       0.84      0.84      0.84       138
weighted avg       0.84      0.84      0.84       138



In [19]:
# ============================================
# Cell 7 – Final Evaluation on Test Set
# --------------------------------------------
# IMPORTANT: this is done only ONCE, at the very end.
# If the result is bad, the right reaction is to start over
# (new features, a different model), not to keep turning small
# knobs until the test score looks good.
# ============================================

print("--- FINALE PRÜFUNG (Test Set) ---")

# 1. Predict on the test features that were already transformed in Cell 5
y_test_pred = rf_model.predict(X_test_processed)

# 2. Accuracy and per-class report
test_accuracy = accuracy_score(y_test, y_test_pred)

print(f"Test Set Accuracy: {test_accuracy:.4f}")
print("\nClassification Report (Test Set):")
print(classification_report(y_test, y_test_pred))

# 3. Compare validation and test accuracy
print("-" * 30)
print(f"Vergleich Accuracy: Val={val_accuracy:.4f} vs. Test={test_accuracy:.4f}")

# NOTE(review): a gap below 0.05 only shows that the two hold-out estimates
# agree; judging overfitting would also need the training accuracy.
accuracy_gap = abs(val_accuracy - test_accuracy)
verdict = (
    "-> Das Modell ist stabil (kein großes Overfitting)."
    if accuracy_gap < 0.05
    else "-> Vorsicht: Großer Unterschied zwischen Validierung und Test."
)
print(verdict)

--- FINALE PRÜFUNG (Test Set) ---
Test Set Accuracy: 0.8841

Classification Report (Test Set):
              precision    recall  f1-score   support

           0       0.87      0.87      0.87        62
           1       0.89      0.89      0.89        76

    accuracy                           0.88       138
   macro avg       0.88      0.88      0.88       138
weighted avg       0.88      0.88      0.88       138

------------------------------
Vergleich Accuracy: Val=0.8406 vs. Test=0.8841
-> Das Modell ist stabil (kein großes Overfitting).


In [20]:
# Persist the trained artifacts (joblib is imported in the import cell at the
# top of the notebook).

# rf_model was trained on the scaled / one-hot encoded columns produced by
# `preprocessor` in Cell 5. Raw rows must pass through that same fitted
# transformer before predict(), so it is saved next to the model.
joblib.dump(preprocessor, 'v_01_Preprocessor.pkl')

# Kept as the last expression so the cell output still lists the model file.
joblib.dump(rf_model, 'v_01_Random_Forest_Model.pkl')

['v_01_Random_Forest_Model.pkl']