# Pipline for Breast Cancer data set

In [12]:
# imports 
import os
import pandas as pd
import numpy as np


In [13]:
# load csv file into df 
df = pd.read_csv("data/breast-cancer-diagnostic.shuf.lrn.csv")

## Imputation

In [None]:
from sklearn.base import BaseEstimator, TransformerMixin

class OutlierImputer(BaseEstimator, TransformerMixin):
    """
    Ersetzt Ausreißer (Z-Score > threshold) durch den Median
    der jeweiligen Spalte (berechnet aus Trainingsdaten).
    """
    def __init__(self, threshold=3):
        self.threshold = threshold
        self.medians_ = None
        self.numeric_cols_ = None

    def fit(self, X, y=None):
        # Nur numerische Spalten
        self.numeric_cols_ = X.select_dtypes(include='number').columns
        # 
        self.medians_ = X[self.numeric_cols_].median()
        return self

    def transform(self, X, y=None):
        # Kopie, um Original-Daten nicht zu überschreiben
        X_transformed = X.copy()
        for col in self.numeric_cols_:
            mean = X_transformed[col].mean()
            std = X_transformed[col].std()

            # Z-Scores
            z_scores = (X_transformed[col] - mean) / (std if std != 0 else 1e-9)
            outlier_idx = np.where(np.abs(z_scores) > self.threshold)[0]  # int positions

            # Outlier -> Median
            X_transformed.iloc[outlier_idx, X_transformed.columns.get_loc(col)] = self.medians_[col]

            z_scores = (X_transformed[col] - mean) / (std if std != 0 else 1e-9)
            
        return X_transformed

## Pipline

In [15]:
from imblearn.pipeline import Pipeline
from imblearn.over_sampling import SMOTE
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.ensemble import RandomForestClassifier


pipeline = Pipeline([
    ('outlier_imputer', OutlierImputer(threshold=3)),
    ('smote', SMOTE(random_state=123)),          # Minority Over-Sampling
    ('scaler', StandardScaler()),               # StandardScaler
    ('pca', PCA(n_components=0.95)),            # PCA mit 95% Varianzerhalt
    ('classifier', RandomForestClassifier())    # random frorest
])


## Cross Validation
Vorteil: In jedem Fold wird erst der Outlier-Imputer „gefitet“, dann SMOTE auf den jeweiligen Fold angewendet, anschließend skaliert, PCA berechnet und zuletzt das Modell trainiert. Auf dem Test-Fold werden alle Transformationen mit den Parametern des Trainingsfolds angewandt. Dadurch vermeidest du jegliches Data Leakage.

## Optimazation

In [18]:
def objective(trial):
    # === Hyperparameter per trial vorschlagen
    threshold = trial.suggest_float("outlier_threshold", 2.0, 5.0)  # Range für Outlier
    n_estimators = trial.suggest_int("n_estimators", 50, 300)
    max_depth = trial.suggest_int("max_depth", 3, 20)
    min_samples_split = trial.suggest_int("min_samples_split", 2, 10)
    min_samples_leaf = trial.suggest_int("min_samples_leaf", 1, 4)
    max_features = trial.suggest_categorical("max_features", ["sqrt", "log2"])
    pca_n_components = trial.suggest_float("pca_n_components", 0.90, 0.99)

    # === Pipeline zusammenbauen ===
    pipeline = Pipeline([
        ("outlier_imputer", OutlierImputer(threshold=threshold)),
        ("smote", SMOTE(random_state=42)),
        ("scaler", StandardScaler()),
        ("pca", PCA(n_components=pca_n_components)),
        ("rf", RandomForestClassifier(
            n_estimators=n_estimators,
            max_depth=max_depth,
            min_samples_split=min_samples_split,
            min_samples_leaf=min_samples_leaf,
            max_features=max_features,
            random_state=42,
            n_jobs=-1
        ))
    ])

    # === Cross Validation ===
    # Die rohen X und y (nicht vorher skaliert/PCA etc.)
    cv_score = cross_val_score(
        pipeline, 
        X, 
        y, 
        cv=5,                # z.B. 5-Fold CV
        scoring='f1_macro'   # oder f1_weighted, accuracy, etc.
    ).mean()

    return cv_score

# === Optuna Study starten ===
study = optuna.create_study(direction="maximize")
study.optimize(objective, n_trials=30)

# Ergebnis
print("Beste Parameter:", study.best_params)
print("Bester Score:", study.best_value)

# Optional: Visualisierungen
# fig1 = optuna.visualization.plot_optimization_history(study)
# fig1.show()

[I 2025-04-16 13:10:36,601] A new study created in memory with name: no-name-46d98217-a959-4e11-acd4-a81d7d331d6e
  X_transformed.iloc[outlier_idx, X_transformed.columns.get_loc(col)] = self.medians_[col]
  X_transformed.iloc[outlier_idx, X_transformed.columns.get_loc(col)] = self.medians_[col]
  X_transformed.iloc[outlier_idx, X_transformed.columns.get_loc(col)] = self.medians_[col]
  X_transformed.iloc[outlier_idx, X_transformed.columns.get_loc(col)] = self.medians_[col]
  X_transformed.iloc[outlier_idx, X_transformed.columns.get_loc(col)] = self.medians_[col]
  X_transformed.iloc[outlier_idx, X_transformed.columns.get_loc(col)] = self.medians_[col]
[I 2025-04-16 13:10:37,462] Trial 0 finished with value: 0.9340752917197955 and parameters: {'outlier_threshold': 4.477169652736231, 'n_estimators': 127, 'max_depth': 19, 'min_samples_split': 9, 'min_samples_leaf': 2, 'max_features': 'log2', 'pca_n_components': 0.9173255125319805}. Best is trial 0 with value: 0.9340752917197955.
  X_trans

Beste Parameter: {'outlier_threshold': 3.3025959088240784, 'n_estimators': 195, 'max_depth': 14, 'min_samples_split': 2, 'min_samples_leaf': 2, 'max_features': 'log2', 'pca_n_components': 0.9099093925024171}
Bester Score: 0.9452258522258523
