In [None]:
%load_ext autoreload

%autoreload 2

from IPython.core.interactiveshell import InteractiveShell

InteractiveShell.ast_node_interactivity = "all"

# jupyter-black is an optional helper; the notebook must also run without it.
# NOTE(review): importing the package alone presumably does not activate
# formatting (that normally needs `jupyter_black.load()`) - confirm the intent.
try:
    import jupyter_black
except ImportError:
    # Only a missing package is tolerated; any other error should surface.
    print("Jupyter-Black not found")

# Applied Machine Learning - LE3

- Modell Selektion
- Pipelines

## Imports

In [None]:
from pathlib import Path

import numpy as np
import pandas as pd
import seaborn as sns
from matplotlib import pyplot as plt
from mlxtend import plotting
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neural_network import MLPClassifier
from sklearn.svm import SVC

## Functions & Parameters

Wir definieren einige Parameter wie Pfade und für die Visualisierung der Daten.


In [None]:
# Folder containing the input data, relative to the notebook.
DATA_PATH = Path("../data")

# Class order for the plot hue: string labels and their integer counterparts.
HUE_ORDER = ["Benign", "Malign"]
HUE_ORDER_NUM = [0, 1]

# First two colours (hex strings) of the current seaborn palette, one per class.
colors = sns.color_palette().as_hex()
COLORS = list(colors[:2])

Nun definieren wir einige Funktionen.

In [None]:
def plot_decision_region(X, y, clf, ax, colors, title, xlabel="Fläche Tumor", ylabel="Symmetrie"):
    """Draw the decision regions of a classifier with the samples on top.

    Parameters
    ----------
    X : array indexed as ``X[:, 0]`` (x-axis) and ``X[:, 1]`` (y-axis).
    y : array of labels; flattened with ``ravel()`` before plotting.
    clf : classifier handed to ``mlxtend.plotting.plot_decision_regions``.
    ax : matplotlib Axes to draw on.
    colors : sequence of colour strings, used for the regions and the points.
    title : title of the plot.
    xlabel, ylabel : axis labels; the defaults are the labels that were
        previously hard-coded.
    """
    labels = y.ravel()

    # Marker size 0: mlxtend only contributes the coloured regions here,
    # the points themselves are drawn by seaborn below.
    _ = plotting.plot_decision_regions(
        X=X, y=labels, clf=clf, ax=ax, scatter_kwargs={'s': 0}, colors=",".join(colors))
    _ = sns.scatterplot(
        x=X[:, 0], y=X[:, 1], hue=labels, ax=ax, hue_order=HUE_ORDER_NUM, palette=colors
    ).set(
        title=title,
        xlabel=xlabel,
        ylim=(0, X[:, 1].max()),
        xlim=(X[:, 0].min(), X[:, 0].max()),
        ylabel=ylabel)

    # The first two legend handles are skipped (presumably the entries added by
    # plot_decision_regions - verify); the rest are labelled in HUE_ORDER order.
    handles, _ = ax.get_legend_handles_labels()
    ax.legend(handles[2:], list(HUE_ORDER), framealpha=0.3, scatterpoints=1)


def undersample_malign(df, n: int = 80, random_state: int = 123):
    """Return all Benign rows plus a random sample of at most ``n`` Malign rows.

    This does not balance the classes; it reduces the Malign class, which can
    be used to increase the class imbalance for experiments.

    Parameters
    ----------
    df : DataFrame with a ``diagnosis`` column holding 'Benign' / 'Malign'.
    n : number of Malign rows to keep. If fewer Malign rows exist, all of
        them are kept instead of raising an error.
    random_state : seed for the sampling; the default reproduces the
        previously hard-coded seed.

    Returns
    -------
    DataFrame with the Benign rows first, followed by the sampled Malign rows.
    """
    malign_rows = df.query("diagnosis == 'Malign'")
    benign_rows = df.query("diagnosis == 'Benign'")

    # Sampling without replacement cannot draw more rows than are available.
    n_keep = min(n, len(malign_rows))
    malign_sample = malign_rows.sample(n=n_keep, random_state=random_state)

    return pd.concat([benign_rows, malign_sample])

## Data

Wir lesen nun einen Datensatz ein. Dieser Datensatz ist aus der Medizin und beinhaltet Messwerte von Tumoren in der Brust mit Verdacht auf Brustkrebs. Diese Tumore wurden entsprechend diagnostiziert und in gutartig (benign) und bösartig (malign) eingestuft. Der Datensatz hat verschiedene Attribute, wobei wir uns auf einige wenige beschränken werden.

In [None]:
# Read the raw CSV and discard the 'Unnamed: 32' column.
df_raw = pd.read_csv(DATA_PATH / "breast-cancer.csv").drop(columns='Unnamed: 32')

# Map the one-letter diagnosis codes to readable labels and store the column
# as a categorical.
df = df_raw.assign(
    diagnosis=lambda frame: frame['diagnosis'].map({'M': "Malign", 'B': "Benign"})
).astype({"diagnosis": "category"})

Wir schauen uns den Datensatz mal grob an.

In [None]:
# Preview the first rows, then show the first record on its own.
df.head()
df.iloc[0]

In [None]:
# Number of rows per diagnosis class.
df.groupby("diagnosis", observed=True).size()

Für Experiment-Zwecke können wir die Klassen-Imbalance noch vergrössern.

In [None]:
# Optional experiment, disabled by default: keep only 100 Malign samples.
# The frame in this notebook is called `df` (there is no `df_breast_cancer`).
# df = undersample_malign(df, 100)

Wir wählen zwei Features aus. Dadurch kann man den Datensatz gut darstellen.

In [None]:
# Preview the label together with the two features used for the 2D examples.
df[["diagnosis", "symmetry_worst", "area_mean"]].head()

In [None]:
# Scatter plot of the two selected features, coloured by diagnosis.
fig, ax = plt.subplots(figsize=(12, 6))
_ = sns.scatterplot(
    data=df,
    x='area_mean',
    y="symmetry_worst",
    hue='diagnosis',
    hue_order=HUE_ORDER,
    palette=COLORS,
    ax=ax,
)
_ = ax.set(
    title="Diagnose in Abhängigkeit der Fläche und Symmetrie",
    xlabel="Fläche Tumor",
    ylabel="Symmetrie",
)

**Lässt sich das Problem gut modellieren?**

Nun bereiten wir die Daten für die Modellierung vor.

In [None]:
from sklearn.preprocessing import LabelEncoder

# Feature matrices: X2d holds only the two plotted features, X holds every
# column except the id and the label.
X2d = df[['area_mean', 'symmetry_worst']].to_numpy()
X = df.drop(columns=["id", "diagnosis"]).to_numpy()

# Encode the diagnosis strings as integers; classes_ lists the original
# labels in the order of their integer codes.
label_encoder = LabelEncoder()
y = label_encoder.fit_transform(df[['diagnosis']].to_numpy().ravel())

print(label_encoder.classes_)

## Erste Modelle

Wir trainieren nun erste Modelle und schauen uns die Ergebnisse an.

In [None]:
rng = np.random.RandomState(123)

# Logistic regression with default settings on the two selected features.
clf = LogisticRegression().fit(X2d, y)

# Random forest with 50 trees, seeded through the RandomState created above.
clf2 = RandomForestClassifier(n_estimators=50, random_state=rng).fit(X=X2d, y=y)

In [None]:
# One panel per model, drawn on the same data the models were fitted on.
fig, axes = plt.subplots(figsize=(16, 7), ncols=2)
plot_decision_region(X2d, y, clf, axes[0], COLORS, title="Logistic Regression")
plot_decision_region(X2d, y, clf2, axes[1], COLORS, title="Random Forest")

**Wie gefallen Euch die Modelle?**

Accuracy:

In [None]:
# Accuracy on X2d / y, i.e. on the very data that was passed to fit() above.
print(f"Accuracy Logistic Regression: {clf.score(X2d, y):.2f}")
print(f"Accuracy Random Forest: {clf2.score(X2d, y):.2f}")

**Offensichtlich: Random Forest is the Winner!**

## Modell Selektion

So geht's natürlich nicht. Wir brauchen ein unabhängiges Set um die Modelle fair zu evaluieren.

![Train Test Split](../figures/train_test.jpg)

Wir generieren nun Splits für Training, Validation und Test.


Dazu verwenden wir: [sklearn.model_selection.train_test_split](https://scikit-learn.org/stable/modules/generated/sklearn.model_selection.train_test_split.html)


Wir _stratifizieren_ dabei nach der Zielvariablen.

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 20 % of the data as test set, stratified by the label.
X2d_train, X2d_test, y_train, y_test = train_test_split(
    X2d, y, test_size=0.2, stratify=y, random_state=123
)

# Split a validation set (again 20 %) off the remaining training data.
X2d_train, X2d_val, y_train, y_val = train_test_split(
    X2d_train, y_train, test_size=0.2, stratify=y_train, random_state=123
)

# Fit both candidates on the training part only.
clf = LogisticRegression(random_state=123).fit(X2d_train, y_train)

rng = np.random.RandomState(123)
clf2 = RandomForestClassifier(n_estimators=50, random_state=rng).fit(X=X2d_train, y=y_train)

# Compare the two models on the validation set.
print(f"Accuracy Logistic Regression: {clf.score(X2d_val, y_val):.2f}")
print(f"Accuracy Random Forest: {clf2.score(X2d_val, y_val):.2f}")

Nun können wir das beste Modell auf dem Testset evaluieren.

In [None]:
# Score the random forest on the held-out test split.
print(f"Accuracy Random Forest: {clf2.score(X2d_test, y_test):.2f}")

Wie robust ist dieses Resultat?

In [None]:
# Repeat the model comparison for 100 different validation splits and collect
# the accuracy difference (logistic regression minus random forest) per split.
deltas = list()
for random_state in range(0, 100):

    # The test split is always the same (seed 123); it is redone here because
    # the second split below overwrites X2d_train / y_train.
    X2d_train, X2d_test, y_train, y_test = train_test_split(
        X2d, y, test_size=0.2, random_state=123, stratify=y)
    # Only the train/validation split changes with the loop variable.
    X2d_train, X2d_val, y_train, y_val = train_test_split(
        X2d_train, y_train, test_size=0.2, random_state=random_state, stratify=y_train)

    clf = LogisticRegression(random_state=123)
    clf = clf.fit(X2d_train, y_train)

    # train a random forest classifier
    rng = np.random.RandomState(123)
    clf2 = RandomForestClassifier(n_estimators=50, random_state=rng)
    clf2 = clf2.fit(X=X2d_train, y=y_train)

    # Positive delta: logistic regression scored higher on this validation split.
    deltas.append(clf.score(X2d_val, y_val) - clf2.score(X2d_val, y_val))

In [None]:
# Distribution of the accuracy differences over all validation splits.
# NOTE(review): the bin edges only span -0.06 .. 0.05; deltas outside that
# range would presumably be missing from the histogram - confirm.
fig, ax = plt.subplots(figsize=(12, 6))
_ = sns.histplot(deltas, bins=np.arange(-0.06, 0.06, 0.01), ax=ax).set(
    title="Differenz Accuracy Logistic Regression - Random Forest", xlabel="Differenz", ylabel="Häufigkeit")

# The printed value is a share of the splits (between 0 and 1), not a count,
# so the message says "Anteil" rather than "Anzahl".
print(f"Anteil Cases bei denen die Logistische Regression besser wäre: {np.mean(np.array(deltas) > 0):.2f}")

## Kreuzvalidierung

Ein robusteres Verfahren ist die Kreuzvalidierung.

![Kreuzvalidierung](../figures/xval.jpg)

Quelle: _Sebastian Raschka and Vahid Mirjalili. Python Machine Learning, 3rd Ed. Packt Publishing, Birmingham, UK, 3 edition, 2019. ISBN 978-1-78995-575-0._

Ein Beispiel.

In [None]:
from sklearn.model_selection import KFold

# Fresh train/test split; the training part is then cross-validated by hand.
X2d_train, X2d_test, y_train, y_test = train_test_split(
    X2d, y, test_size=0.2, random_state=123, stratify=y)

# Shuffled 5-fold splitter (plain KFold, i.e. not stratified).
cv = KFold(n_splits=5, shuffle=True, random_state=123)

# One record per (fold, model) with the validation accuracy.
results  = list()

for train_idx, validation_idx in cv.split(X2d_train, y_train):

    # Index the training data with the fold's row positions.
    X_train_split, y_train_split = X2d_train[train_idx], y_train[train_idx]
    X_val_split, y_val_split = X2d_train[validation_idx], y_train[validation_idx]

    clf = LogisticRegression()
    clf = clf.fit(X_train_split, y_train_split)

    # The RandomState is re-created per fold, so every forest starts from the same seed.
    rng = np.random.RandomState(123)
    clf2 = RandomForestClassifier(n_estimators=50, random_state=rng)
    clf2 = clf2.fit(X=X_train_split, y=y_train_split)

    results.append({"model": "Logistic Regression", "accuracy": clf.score(X_val_split, y_val_split)})
    results.append({"model": "Random Forest", "accuracy": clf2.score(X_val_split, y_val_split)})
    

In [None]:
# Mean and standard deviation of the fold accuracies per model.
df_results = pd.DataFrame.from_records(results)
df_results.groupby("model").agg({"accuracy": ["mean", "std"]})

In [None]:
# Bars show the aggregated accuracy per model; error bars use the standard
# deviation (errorbar="sd").
fig, ax = plt.subplots(figsize=(12, 6))
_ = sns.barplot(df_results, x="model", y="accuracy", errorbar="sd", ax=ax).set(
    title="Performance Vergleich Kreuzvalidierung")

**Was passiert wenn man mehr oder weniger Folds wählt?**

## Hyper-Parameter Optimierung


Die meisten Algorithmen haben Hyper-Parameter, welche den Optimierungsprozess beeinflussen. Diese können wir nicht direkt mit dem Trainingsset optimieren, sondern müssen ein Validation Set verwenden.


Wir möchten z.B. testen ob mehr Bäume (`n_estimators`) besser sind.

In [None]:
# Same manual cross-validation as before, now with a second random forest
# that uses 150 instead of 50 trees.
X2d_train, X2d_test, y_train, y_test = train_test_split(
    X2d, y, test_size=0.2, random_state=123, stratify=y)

cv = KFold(n_splits=5, shuffle=True, random_state=123)

results  = list()

# One RandomState shared by all forests and folds: each fit consumes random
# numbers from it, so the results depend on the order of the fits.
rng = np.random.RandomState(123)

for train_idx, validation_idx in cv.split(X2d_train, y_train):

    X_train_split, y_train_split = X2d_train[train_idx], y_train[train_idx]
    X_val_split, y_val_split = X2d_train[validation_idx], y_train[validation_idx]

    clf = LogisticRegression(random_state=123)
    clf = clf.fit(X_train_split, y_train_split)


    clf2 = RandomForestClassifier(n_estimators=50, random_state=rng)
    clf2 = clf2.fit(X=X_train_split, y=y_train_split)

    clf3 = RandomForestClassifier(n_estimators=150, random_state=rng)
    clf3 = clf3.fit(X=X_train_split, y=y_train_split)

    # "Random Forest" = 50 trees, "Random Forest2" = 150 trees.
    results.append({"model": "Logistic Regression", "accuracy": clf.score(X_val_split, y_val_split)})
    results.append({"model": "Random Forest", "accuracy": clf2.score(X_val_split, y_val_split)})
    results.append({"model": "Random Forest2", "accuracy": clf3.score(X_val_split, y_val_split)})

In [None]:
# Mean and standard deviation of the fold accuracies per model.
df_results = pd.DataFrame.from_records(results)
df_results.groupby("model").agg({"accuracy": ["mean", "std"]})

In [None]:
df_results = pd.DataFrame.from_records(results)

# Same bar plot as for the first cross-validation, now with a title so the
# figure can be read on its own ("Random Forest2" is the 150-tree forest).
fig, ax = plt.subplots(figsize=(12, 6))
_ = sns.barplot(df_results, x="model", y="accuracy", errorbar="sd", ax=ax).set(
    title="Performance Vergleich Kreuzvalidierung (50 vs. 150 Bäume)")

**Ist ein Random Forest mit mehr Bäumen besser?**

### Grid-Search

Man kann verschiedene Hyper-Parameter Kombinationen einfach mit [sklearn.model_selection.GridSearchCV](https://scikit-learn.org/stable/modules/generated/sklearn.model_selection.GridSearchCV.html) ausprobieren.

In [None]:
from sklearn.model_selection import GridSearchCV

cv = KFold(n_splits=5, shuffle=True, random_state=123)

rng = np.random.RandomState(123)

# One entry per algorithm: the estimator ("cls") and the hyper-parameter
# values to try ("hyper_params").
algorithms = {
    "random_forest": {
        "cls": RandomForestClassifier(random_state=rng),
        "hyper_params": {
            "n_estimators": [50, 100, 200],
            "max_depth": [1, 3, 5, 10, 15]
        }
    },
    "logistic_regression": {
        "cls": LogisticRegression(random_state=123),
        "hyper_params": {
            "C": [0.1,  0.0001]
        }
    },
    "mlp": {
        "cls": MLPClassifier(random_state=rng),
        "hyper_params": {
            "hidden_layer_sizes": [[5, 5], [10, 10]],
            "max_iter": [1000]
        }
    }
}

# Run one grid search per algorithm. After the loop, `grid_search` refers to
# the search of the last entry ("mlp").
for algorithm_name, algorithm_data in algorithms.items():

    param_grid = algorithm_data["hyper_params"]
    algorithm = algorithm_data["cls"]
    grid_search = GridSearchCV(
            algorithm,
            param_grid=param_grid,
            cv=cv,
            n_jobs=-1,
        )

    grid_search.fit(X2d_train, y_train)

    # Name the algorithm so the three result blocks can be told apart.
    print(f"--- {algorithm_name} ---")
    print("Best parameters:", grid_search.best_params_)
    print("Best estimator:", grid_search.best_estimator_)
    print(f"Best score: {grid_search.best_score_:.2f}")

**Warum ist das MLP so schlecht?**

In [None]:
# grid_search still refers to the last search of the loop (the "mlp" entry).
fig, ax = plt.subplots(figsize=(6, 6), ncols=1)
plot_decision_region(X2d_train, y_train, grid_search.best_estimator_, ax, COLORS, title="MLP")

**Ist das eine erwartete Decision Boundary?**

**Was könnte man verbessern?**

## Pipelines

Pipelines vereinfachen die Hyper-Parameter-Optimierung und integrieren Pre-Processing-Schritte in ein gemeinsames Objekt, welches man fitten und auf neue Daten anwenden kann. Das verringert die Gefahr von Fehlern und Data Leakage enorm.

![Pipelines](../figures/ml_pipelines.jpg)

Quelle: _Sebastian Raschka and Vahid Mirjalili. Python Machine Learning, 3rd Ed. Packt Publishing, Birmingham, UK, 3 edition, 2019. ISBN 978-1-78995-575-0._

In [None]:
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler

cv = KFold(n_splits=5, shuffle=True, random_state=123)

rng = np.random.RandomState(123)

# One entry per algorithm. The hyper-parameter names carry the "cls__" prefix
# because the estimator is the pipeline step named "cls".
algorithms = {
    "random_forest": {
        "cls": RandomForestClassifier(random_state=rng),
        "hyper_params": {
            "cls__n_estimators": [50, 100, 200],
            "cls__max_depth": [1, 3, 5, 10, 15],
        }
    },
    "logistic_regression": {
        "cls": LogisticRegression(random_state=rng),
        "hyper_params": {
            "cls__C": [0.1,  0.0001],
        }
    },
    "mlp": {
        "cls": MLPClassifier(random_state=rng),
        "hyper_params": {
            "cls__hidden_layer_sizes": [[5, 5], [10, 10]],
            "cls__max_iter": [1000],
        }
    }
}

# Run one grid search per algorithm. After the loop, `grid_search` refers to
# the search of the last entry ("mlp").
for algorithm_name, algorithm_data in algorithms.items():

    param_grid = algorithm_data["hyper_params"]
    algorithm = algorithm_data["cls"]

    # Scaling is part of the pipeline, so it is fitted inside every CV fold.
    pipeline = Pipeline([("scaler", StandardScaler()), ("cls", algorithm)])

    grid_search = GridSearchCV(
            pipeline,
            param_grid=param_grid,
            cv=cv,
            n_jobs=-1,
        )

    grid_search.fit(X2d_train, y_train)

    # Name the algorithm so the three result blocks can be told apart.
    print(f"--- {algorithm_name} ---")
    print("Best parameters:", grid_search.best_params_)
    print("Best estimator:", grid_search.best_estimator_)
    print(f"Best score: {grid_search.best_score_:.2f}")

Wie sieht die Decision Boundary aus?

In [None]:
# grid_search still refers to the last search of the loop (the "mlp" entry).
fig, ax = plt.subplots(figsize=(6, 6), ncols=1)
plot_decision_region(X2d_train, y_train, grid_search.best_estimator_, ax, COLORS, title="MLP")

**Was wenn ich verschiedene Pre-Processing Schritte ausprobieren möchte?**

In [None]:
from sklearn.preprocessing import MinMaxScaler, StandardScaler

cv = KFold(n_splits=5, shuffle=True, random_state=123)
rng = np.random.RandomState(123)

# Two-step pipeline whose steps are only placeholders: the param grid below
# replaces both the pre-processing step and the classifier.
pipeline = Pipeline([
    ('pre_processing', 'passthrough'),  # placeholder
    ('cls', LogisticRegression())       # placeholder
])

# A list of grids: each dict is searched separately, so every classifier is
# only combined with its own hyper-parameters and pre-processing options.
param_grid = [
    # Random Forest (no scaling)
    {
        "pre_processing": ["passthrough"],
        "cls": [RandomForestClassifier(random_state=rng)],
        "cls__n_estimators": [50, 100, 200],
        "cls__max_depth": [1, 3, 5, 10, 15],
    },
    # Logistic Regression (with scaling)
    {
        "pre_processing": [StandardScaler(), MinMaxScaler()],
        "cls": [LogisticRegression(random_state=rng)],
        "cls__C": [0.1, 0.0001],
    },
    # MLP (with scaling)
    {
        "pre_processing": [StandardScaler(), MinMaxScaler()],
        "cls": [MLPClassifier(random_state=rng)],
        "cls__hidden_layer_sizes": [[5, 5], [10, 10]],
        "cls__max_iter": [1000],
    }
]

# GridSearch with conditional preprocessing
grid_search = GridSearchCV(
    pipeline,
    param_grid=param_grid,
    cv=cv,
    n_jobs=-1,
)

grid_search.fit(X2d_train, y_train)

# Best parameters and estimator
print("Best parameters:", grid_search.best_params_)
print("Best estimator:", grid_search.best_estimator_)
print(f"Best score: {grid_search.best_score_:.2f}")

In [None]:
# The five best-ranked candidates with their score and main parameters.
df_results = pd.DataFrame.from_records(grid_search.cv_results_)
df_results.sort_values("rank_test_score").head(5)[["mean_test_score", "param_cls", "param_cls__C", "param_cls__n_estimators", "param_cls__max_depth", "param_pre_processing"]]

In [None]:
# Decision regions of the best pipeline found by the grid search.
fig, ax = plt.subplots(figsize=(6, 6), ncols=1)
plot_decision_region(X2d_train, y_train, grid_search.best_estimator_, ax, COLORS, title="Best Model")

## Metriken

Wie messe ich die Güte / Performance von einem Modell?

In [None]:
# Scatter plot of the two selected features, coloured by diagnosis.
fig, ax = plt.subplots(figsize=(12, 6))
_ = sns.scatterplot(
    data=df,
    x='area_mean',
    y="symmetry_worst",
    hue='diagnosis',
    hue_order=HUE_ORDER,
    palette=COLORS,
    ax=ax,
)
_ = ax.set(
    title="Diagnose in Abhängigkeit der Fläche und Symmetrie",
    xlabel="Fläche Tumor",
    ylabel="Symmetrie",
)

Confusion Matrix

In [None]:
from sklearn.metrics import ConfusionMatrixDisplay, confusion_matrix
from sklearn.model_selection import cross_val_predict

# Obtain the best estimator pipeline from GridSearchCV
best_pipeline = grid_search.best_estimator_

# Use cross_val_predict to get validation predictions across folds
y_pred_cv = cross_val_predict(best_pipeline, X2d_train, y_train, cv=cv, n_jobs=-1)

# Compute confusion matrix
cm = confusion_matrix(y_train, y_pred_cv)

# Display confusion matrix
disp = ConfusionMatrixDisplay(confusion_matrix=cm)
_ = disp.plot(cmap='Blues')
_ = plt.title('Confusion Matrix (Cross-validated predictions)')
plt.show()

# Despite its name this is a rate, not a count: the entries with true label 1
# and predicted label 0, divided by all entries with true label 1.
num_false_negatives = cm[1][0] / cm[1].sum()

In [None]:
# num_false_negatives holds a fraction; it is scaled to percent for printing.
print(f"{100 * num_false_negatives:.2f}% der positiven Fälle werden nicht erkannt.")

In [None]:
cv = KFold(n_splits=5, shuffle=True, random_state=123)
rng = np.random.RandomState(123)

# Combined pipeline with conditional preprocessing
pipeline = Pipeline([
    ('pre_processing', 'passthrough'),  # placeholder
    ('cls', LogisticRegression())       # placeholder
])

# Updated param grid to conditionally apply scaling
param_grid = [
    # Random Forest (no scaling)
    {
        "pre_processing": ["passthrough"],
        "cls": [RandomForestClassifier(random_state=rng)],
        "cls__n_estimators": [50, 100, 200],
        "cls__max_depth": [1, 3, 5, 10, 15],
    },
    # Logistic Regression (with scaling)
    {
        "pre_processing": [StandardScaler()],
        "cls": [LogisticRegression(random_state=rng)],
        "cls__C": [0.1, 0.0001],
    },
    # MLP (with scaling)
    {
        "pre_processing": [StandardScaler()],
        "cls": [MLPClassifier(random_state=rng)],
        "cls__hidden_layer_sizes": [[5, 5], [10, 10]],
        "cls__max_iter": [1000],
    }
]

# GridSearch with conditional preprocessing; candidates are ranked by ROC AUC
# here instead of the estimator's default score.
grid_search = GridSearchCV(
    pipeline,
    param_grid=param_grid,
    cv=cv,
    scoring='roc_auc',
    n_jobs=-1,
)

grid_search.fit(X2d_train, y_train)

# Best parameters and estimator
print("Best parameters:", grid_search.best_params_)
print("Best estimator:", grid_search.best_estimator_)
print(f"Best score: {grid_search.best_score_:.2f}")

In [None]:
# Best pipeline of the ROC-AUC grid search.
best_pipeline = grid_search.best_estimator_

# Use cross_val_predict to get validation predictions across folds
y_pred_cv = cross_val_predict(best_pipeline, X2d_train, y_train, cv=cv, n_jobs=-1)

# Compute confusion matrix
cm = confusion_matrix(y_train, y_pred_cv)

# Display confusion matrix
disp = ConfusionMatrixDisplay(confusion_matrix=cm)
disp.plot(cmap='Blues')
plt.title('Confusion Matrix (Cross-validated predictions)')
plt.show()

Was, wenn wir eine komplexere Kostenfunktion haben?

In [None]:
from sklearn.metrics import confusion_matrix, make_scorer


def cost_metric(y_true, y_pred, cost_fp=5, cost_fn=50, cost_tp=0, cost_tn=0):
    """Total misclassification cost of binary predictions (lower is better).

    Label 1 is treated as the positive class and every other label as
    negative, which matches the 0/1 encoding used in this notebook. Counting
    the four outcomes directly also works when only one class occurs in the
    inputs, where unpacking a flattened confusion matrix into four values
    would fail.

    Parameters
    ----------
    y_true, y_pred : array-likes of equal length with the true and the
        predicted labels.
    cost_fp, cost_fn, cost_tp, cost_tn : cost per false positive, false
        negative, true positive and true negative. The defaults are the
        example values that were previously hard-coded.

    Returns
    -------
    The summed cost over all samples.
    """
    is_positive = np.asarray(y_true) == 1
    predicted_positive = np.asarray(y_pred) == 1

    # Count the four outcomes of the binary confusion matrix.
    tp = int(np.sum(is_positive & predicted_positive))
    fp = int(np.sum(~is_positive & predicted_positive))
    fn = int(np.sum(is_positive & ~predicted_positive))
    tn = int(np.sum(~is_positive & ~predicted_positive))

    # Compute total cost
    total_cost = (fp * cost_fp) + (fn * cost_fn) + (tp * cost_tp) + (tn * cost_tn)
    return total_cost

# Create scorer (lower is better, hence greater_is_better=False).
# With greater_is_better=False the scorer reports the negated cost, so
# scores obtained with it are negative numbers.
cost_scorer = make_scorer(cost_metric, greater_is_better=False)

Wir können diese verwenden für die Grid-Search.

In [None]:
cv = KFold(n_splits=5, shuffle=True, random_state=123)
rng = np.random.RandomState(123)

# Pipeline with placeholder steps; both are replaced by the param grid.
pipeline = Pipeline([
    ('pre_processing', 'passthrough'),  # placeholder
    ('cls', LogisticRegression())       # placeholder
])

param_grid = [
    # Random Forest (no scaling)
    {
        "pre_processing": ["passthrough"],
        "cls": [RandomForestClassifier(random_state=rng)],
        "cls__n_estimators": [50, 100, 200],
        "cls__max_depth": [1, 3, 5, 10, 15],
    },
    # Logistic Regression (with scaling)
    {
        "pre_processing": [StandardScaler()],
        "cls": [LogisticRegression(random_state=123)],
        "cls__C": [0.1, 0.0001],
    },
    # MLP (with scaling)
    {
        "pre_processing": [StandardScaler()],
        "cls": [MLPClassifier(random_state=rng)],
        "cls__hidden_layer_sizes": [[5, 5], [10, 10]],
        "cls__max_iter": [1000],
    }
]

# Candidates are ranked with the custom cost scorer instead of accuracy.
grid_search = GridSearchCV(
    pipeline,
    param_grid=param_grid,
    cv=cv,
    scoring=cost_scorer,
    n_jobs=-1,
)

grid_search.fit(X2d_train, y_train)

print("Best parameters:", grid_search.best_params_)
print("Best estimator:", grid_search.best_estimator_)
print(f"Best score: {grid_search.best_score_:.2f}")

In [None]:
# Best pipeline of the cost-based grid search.
best_pipeline = grid_search.best_estimator_

# Use cross_val_predict to get validation predictions across folds
y_pred_cv = cross_val_predict(best_pipeline, X2d_train, y_train, cv=cv, n_jobs=-1)

# Compute confusion matrix
cm = confusion_matrix(y_train, y_pred_cv)

# Display confusion matrix
disp = ConfusionMatrixDisplay(confusion_matrix=cm)
disp.plot(cmap='Blues')
plt.title('Confusion Matrix (Cross-validated predictions)')
plt.show()

### ROC Kurve

Wichtiges Instrument um einen Classifier zu beurteilen.

In [None]:
from sklearn.metrics import RocCurveDisplay

fig, axs = plt.subplots(nrows=1, ncols=1, figsize=(14, 6))

# Out-of-fold class probabilities instead of hard predictions.
y_pred_cv = cross_val_predict(best_pipeline, X2d_train, y_train, cv=cv, n_jobs=-1, method="predict_proba")

# Column 1 of the probabilities is used as the score for pos_label=1.
RocCurveDisplay.from_predictions(
    y_train, y_pred_cv[:, 1].ravel(), pos_label=1, ax=axs
)

### Tuning the Threshold

Bei Klassifikations-Problemen möchte man oft Precision vs Recall optimieren. 

Dies kann man tun, indem man den Decision-Threshold optimiert.

In [None]:
from sklearn.model_selection import TunedThresholdClassifierCV

In [None]:
# Tune the decision threshold of the best pipeline with the cost scorer,
# using the same CV splitter as the grid search.
tuned_model = TunedThresholdClassifierCV(
    estimator=grid_search.best_estimator_,
    cv=cv,
    scoring=cost_scorer,
    store_cv_results=True,  # necessary to inspect all results
)

tuned_model.fit(X2d_train, y_train)
print(f"{tuned_model.best_threshold_=:0.2f}")

In [None]:
# Use cross_val_predict to get validation predictions across folds
# (the threshold-tuned model is re-fitted within every fold).
y_pred_cv = cross_val_predict(tuned_model, X2d_train, y_train, cv=cv, n_jobs=-1)

# Compute confusion matrix
cm = confusion_matrix(y_train, y_pred_cv)

# Display confusion matrix
disp = ConfusionMatrixDisplay(confusion_matrix=cm)
disp.plot(cmap='Blues')
plt.title('Confusion Matrix (Cross-validated predictions)')
plt.show()

Achtung positiver Bias!

In [None]:
fig, axs = plt.subplots(nrows=1, ncols=1, figsize=(14, 6))

# Out-of-fold class probabilities of the threshold-tuned model.
y_pred_cv = cross_val_predict(tuned_model, X2d_train, y_train, cv=cv, n_jobs=-1, method="predict_proba")

# Column 1 of the probabilities is used as the score for pos_label=1.
RocCurveDisplay.from_predictions(
    y_train, y_pred_cv[:, 1].ravel(), pos_label=1, ax=axs
)

## Oversampling

Wir haben ja gesehen, dass die positive Klasse unterrepräsentiert ist. Könnte Oversampling helfen?

Es gibt dazu ein eigenes sklearn-kompatibles Package.

[imbalanced-learn](https://imbalanced-learn.org/stable/index.html)

In [None]:
from imblearn.over_sampling import ADASYN, SMOTE

# Oversample the training split with SMOTE (fixed seed).
X2d_resampled_train, y_resampled_train = SMOTE(random_state=123).fit_resample(X2d_train, y_train)

# Number of rows added by the resampling.
print(f"Sampled: {X2d_resampled_train.shape[0] - X2d_train.shape[0]}")

# Class counts after resampling (index = encoded label).
np.bincount(y_resampled_train)

In [None]:
cv = KFold(n_splits=5, shuffle=True, random_state=123)
rng = np.random.RandomState(123)

# Pipeline with placeholder steps; both are replaced by the param grid.
pipeline = Pipeline([
    ('pre_processing', 'passthrough'),  # placeholder
    ('cls', LogisticRegression())       # placeholder
])

param_grid = [
    # Random Forest (no scaling)
    {
        "pre_processing": ["passthrough"],
        "cls": [RandomForestClassifier(random_state=rng)],
        "cls__n_estimators": [50, 100, 200],
        "cls__max_depth": [1, 3, 5, 10, 15],
    },
    # Logistic Regression (with scaling)
    {
        "pre_processing": [StandardScaler()],
        "cls": [LogisticRegression(random_state=123)],
        "cls__C": [0.1, 0.0001],
    },
    # MLP (with scaling)
    {
        "pre_processing": [StandardScaler()],
        "cls": [MLPClassifier(random_state=rng)],
        "cls__hidden_layer_sizes": [[5, 5], [10, 10]],
        "cls__max_iter": [1000],
    }
]

# GridSearch with conditional preprocessing
grid_search = GridSearchCV(
    pipeline,
    param_grid=param_grid,
    cv=cv,
    scoring="roc_auc",
    n_jobs=-1,
)

# The search runs on the data that was already resampled before any CV split.
grid_search.fit(X2d_resampled_train, y_resampled_train)

# Best parameters and estimator
print("Best parameters:", grid_search.best_params_)
print("Best estimator:", grid_search.best_estimator_)
print(f"Best score: {grid_search.best_score_:.2f}")

In [None]:
fig, axs = plt.subplots(nrows=1, ncols=1, figsize=(14, 6))

# Out-of-fold probabilities, computed on the resampled training data.
y_pred_cv = cross_val_predict(grid_search.best_estimator_, X2d_resampled_train, y_resampled_train, cv=cv, n_jobs=-1, method="predict_proba")

RocCurveDisplay.from_predictions(
    y_resampled_train, y_pred_cv[:, 1].ravel(), pos_label=1, ax=axs
)

**Alles gut?**

In [None]:
from imblearn.over_sampling import SMOTE
from imblearn.pipeline import Pipeline as ImbPipeline

cv = KFold(n_splits=5, shuffle=True, random_state=123)
rng = np.random.RandomState(123)

# Combined pipeline with conditional preprocessing.
# The imbalanced-learn pipeline is used so that a sampler can be a step; the
# "sampling" step is either SMOTE or 'passthrough', chosen by the grid.
pipeline = ImbPipeline([
    ('pre_processing', 'passthrough'),
    ('sampling', 'passthrough'),
    ('cls', LogisticRegression())       # placeholder
])

# Updated param grid to conditionally apply scaling.
# Every algorithm appears twice: once without and once with SMOTE.
param_grid = [
    # Random Forest (no scaling)
    {
        "pre_processing": ["passthrough"],
        "cls": [RandomForestClassifier(random_state=rng)],
        "cls__n_estimators": [50, 100, 200],
        "cls__max_depth": [1, 3, 5, 10, 15],
        "sampling": ["passthrough"],
    },
    {
        "pre_processing": ["passthrough"],
        "cls": [RandomForestClassifier(random_state=rng)],
        "cls__n_estimators": [50, 100, 200],
        "cls__max_depth": [1, 3, 5, 10, 15],
        "sampling": [SMOTE(random_state=rng)],
        'sampling__k_neighbors': [3, 5, 7],
    },
    # Logistic Regression (with scaling)
    {
        "pre_processing": [StandardScaler()],
        "cls": [LogisticRegression(random_state=rng)],
        "cls__class_weight": ["balanced", None, {1: 10, 0: 1}],
        "cls__C": [0.1, 0.0001],
        "sampling": ["passthrough"],
    },
    {
        "pre_processing": [StandardScaler()],
        "cls": [LogisticRegression(random_state=rng)],
        "cls__class_weight": ["balanced", None, {1: 10, 0: 1}],
        "cls__C": [0.1, 0.0001],
        "sampling": [SMOTE(random_state=rng)],
        'sampling__k_neighbors': [3, 5, 7],
    },
    # MLP (with scaling)
    {
        "pre_processing": [StandardScaler()],
        "cls": [MLPClassifier(random_state=rng)],
        "cls__hidden_layer_sizes": [[5, 5], [10, 10]],
        "cls__max_iter": [1000],
        "sampling": ["passthrough"],
    },
    {
        "pre_processing": [StandardScaler()],
        "cls": [MLPClassifier(random_state=rng)],
        "cls__hidden_layer_sizes": [[5, 5], [10, 10]],
        "cls__max_iter": [1000],
        "sampling": [SMOTE(random_state=rng)],
        'sampling__k_neighbors': [3, 5, 7],
    }
]

# GridSearch with conditional preprocessing
grid_search = GridSearchCV(
    pipeline,
    param_grid=param_grid,
    cv=cv,
    scoring="roc_auc",
    n_jobs=-1,
)


# Fitted on the original (not resampled) training split.
grid_search.fit(X2d_train, y=y_train)

# Best parameters and estimator
print("Best parameters:", grid_search.best_params_)
print("Best estimator:", grid_search.best_estimator_)
print(f"Best score: {grid_search.best_score_:.2f}")

In [None]:
# The five best-ranked candidates with their score and main parameters.
df_results = pd.DataFrame.from_records(grid_search.cv_results_)
df_results.sort_values("rank_test_score").head(5)[["mean_test_score", "param_cls", "param_cls__C", "param_cls__n_estimators", "param_cls__max_depth", "param_pre_processing", "param_sampling"]]

In [None]:
fig, axs = plt.subplots(nrows=1, ncols=1, figsize=(14, 6))

# Out-of-fold probabilities of the best pipeline on the original training split.
y_pred_cv = cross_val_predict(grid_search.best_estimator_, X2d_train, y_train, cv=cv, n_jobs=-1, method="predict_proba")

RocCurveDisplay.from_predictions(
    y_train, y_pred_cv[:, 1].ravel(), pos_label=1, ax=axs
)

Noch etwas eleganter:

In [None]:
from imblearn.over_sampling import SMOTE
from imblearn.pipeline import Pipeline as ImbPipeline

cv = KFold(n_splits=5, shuffle=True, random_state=123)
rng = np.random.RandomState(123)

# Pipeline skeleton; every step is a placeholder that the grid replaces.
pipeline = ImbPipeline([
    ('pre_processing', 'passthrough'),
    ('sampling', 'passthrough'),
    ('cls', LogisticRegression()),  # placeholder
])

# Grids that describe only the algorithm and its pre-processing.
param_grid_algos = [
    # Random Forest (no scaling)
    dict([
        ("pre_processing", ["passthrough"]),
        ("cls", [RandomForestClassifier(random_state=rng)]),
        ("cls__n_estimators", [200]),
        ("cls__max_depth", [1, 3, 5, 10, None]),
    ]),
    # Logistic Regression (with scaling)
    dict([
        ("pre_processing", [StandardScaler()]),
        ("cls", [LogisticRegression(random_state=rng)]),
        ("cls__class_weight", ["balanced", None, {1: 10, 0: 1}]),
        ("cls__C", [0.15, 0.1, 0.01, 0.001]),
    ]),
    # MLP (with scaling)
    dict([
        ("pre_processing", [StandardScaler()]),
        ("cls", [MLPClassifier(random_state=rng)]),
        ("cls__hidden_layer_sizes", [[5, 5], [10, 10]]),
        ("cls__max_iter", [1000]),
    ]),
]

# Grids that describe only the sampling step: SMOTE with several
# neighbourhood sizes, or no sampling at all.
param_grid_sampling = [
    dict([
        ("sampling", [SMOTE(random_state=rng)]),
        ("sampling__k_neighbors", [3, 5, 7]),
    ]),
    dict([
        ("sampling", ['passthrough']),
    ]),
]

# Merge every algorithm grid with every sampling grid (algorithms vary slowest).
param_grid = [
    {**algo_grid, **sampling_grid}
    for algo_grid in param_grid_algos
    for sampling_grid in param_grid_sampling
]

# Grid search ranked by ROC AUC, fitted on the original training split.
grid_search = GridSearchCV(
    pipeline,
    param_grid=param_grid,
    cv=cv,
    scoring="roc_auc",
    n_jobs=-1,
)
grid_search.fit(X2d_train, y=y_train)

# Best parameters and estimator
print("Best parameters:", grid_search.best_params_)
print("Best estimator:", grid_search.best_estimator_)
print(f"Best score: {grid_search.best_score_:.2f}")

In [None]:
# The five best-ranked candidates with their score and main parameters.
df_results = pd.DataFrame.from_records(grid_search.cv_results_)
df_results.sort_values("rank_test_score").head(5)[["mean_test_score", "param_cls", "param_cls__C", "param_cls__n_estimators", "param_cls__max_depth", "param_pre_processing", "param_sampling"]]

## Full Dataset

In [None]:
# 80/20 split of the full feature matrix X, stratified by the label.
# Note that y_train / y_test are overwritten here.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=123, stratify=y)

Wir verwenden Stratified KFold um die Modelle zu evaluieren.  

Auch möchten wir PCA als Pre-Processing Schritt verwenden.

In [None]:
from imblearn.over_sampling import SMOTE
from imblearn.pipeline import Pipeline as ImbPipeline
from sklearn.decomposition import PCA
from sklearn.model_selection import StratifiedKFold
from sklearn.preprocessing import StandardScaler

cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=123)
rng = np.random.RandomState(123)

# Combined pipeline with conditional preprocessing. Steps set to
# 'passthrough' are placeholders that the grids below may replace.
# NOTE(review): sampling runs before scaling, so SMOTE sees the unscaled
# features - confirm this order is intended.
pipeline = ImbPipeline([
    ('sampling', 'passthrough'),
    ('pre_processing', StandardScaler()),
    ('pca', 'passthrough'),  # conditional PCA step
    ('cls', LogisticRegression())
])


# The SMOTE candidates are seeded (like the other SMOTE uses in this
# notebook) so that the search is reproducible.
param_grid_algos = [
    # Random Forest: no scaling, no PCA
    {
        "pre_processing": ["passthrough"],
        "pca": ["passthrough"],
        "cls": [RandomForestClassifier(random_state=rng)],
        "cls__n_estimators": [200],
        "cls__max_depth": [1, 3, 5, 10, None],
        "sampling": [SMOTE(k_neighbors=k, random_state=123) for k in [3, 5, 7]] + ['passthrough'],
    },
    # Logistic Regression: scaling, no PCA
    {
        "pre_processing": [StandardScaler()],
        "pca": ["passthrough"],
        "cls": [LogisticRegression(random_state=rng)],
        "cls__class_weight": ["balanced", None, {1: 10, 0: 1}],
        "cls__C": [0.15, 0.1, 0.01, 0.001],
        "sampling": [SMOTE(k_neighbors=k, random_state=123) for k in [3, 5, 7]] + ['passthrough'],
    },
    # Logistic Regression: scaling followed by PCA
    {
        "pre_processing": [StandardScaler()],
        "pca": [PCA()],
        "pca__n_components": [2, 3, 4, 5, 6],
        "cls": [LogisticRegression(random_state=rng)],
        "cls__class_weight": ["balanced", None, {1: 10, 0: 1}],
        "cls__C": [0.15, 0.1, 0.01, 0.001],
        "sampling": [SMOTE(k_neighbors=k, random_state=123) for k in [3, 5, 7]] + ['passthrough'],
    },
    # MLP: scaling, optional PCA; no "sampling" key, so the pipeline's
    # default ('passthrough') is used for this grid.
    {
        "pre_processing": [StandardScaler()],
        "pca": [PCA(n_components=n) for n in [3, 5, 7]] + ['passthrough'],
        "cls": [MLPClassifier(random_state=rng)],
        "cls__hidden_layer_sizes": [[5, 5], [10, 10], [20, 20]],
        "cls__max_iter": [1000],
    }
]


# GridSearch with conditional preprocessing.
# The grid defined in this cell must be passed here: a `param_grid` variable
# left over from an earlier cell contains no PCA settings, so using it would
# silently skip everything defined above.
grid_search = GridSearchCV(
    pipeline,
    param_grid=param_grid_algos,
    cv=cv,
    scoring="roc_auc",
    n_jobs=-1,
)

grid_search.fit(X_train, y=y_train)

# Best parameters and estimator
print("Best parameters:", grid_search.best_params_)
print("Best estimator:", grid_search.best_estimator_)
print(f"Best score: {grid_search.best_score_:.2f}")

In [None]:
# Tune the decision threshold of the best full-dataset pipeline with the
# cost scorer, using the same CV splitter as the grid search.
tuned_model = TunedThresholdClassifierCV(
    estimator=grid_search.best_estimator_,
    cv=cv,
    scoring=cost_scorer,
    store_cv_results=True,  # necessary to inspect all results
)

tuned_model.fit(X_train, y_train)
print(f"{tuned_model.best_threshold_=:0.2f}")

## Final Performance Estimation

In [None]:
# Predictions of the threshold-tuned model on the held-out test split.
y_hat_test = tuned_model.predict(X_test)

In [None]:
# Compute confusion matrix on the held-out test set
cm = confusion_matrix(y_test, y_hat_test )

# Display confusion matrix. These are test-set predictions, not
# cross-validated ones, so the title says so.
disp = ConfusionMatrixDisplay(confusion_matrix=cm)
disp.plot(cmap='Blues')
plt.title('Confusion Matrix (Test set)')
plt.show()

In [None]:
from sklearn.metrics import classification_report

# Total misclassification cost and per-class metrics on the test set.
cost_metric(y_test, y_hat_test)
print(classification_report(y_test, y_hat_test))