# Ensembling: Random Forest vs Gradient Boosting
Diese Notebook‑Demo führt ein kurzes Experiment auf dem Iris‑Datensatz durch: Vergleich von RandomForest und GradientBoosting, Cross‑Validation, Feature‑Importances und Lernkurve. Outputs werden in `../assets/` geschrieben.

In [None]:
# Standard‑Imports
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split, cross_val_score, learning_curve
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.metrics import accuracy_score, classification_report

# Global plot style for every figure in this notebook.
sns.set(style='whitegrid')

# Resolve the output folder relative to this file when __file__ is defined;
# otherwise fall back to the current working directory.
if '__file__' in globals():
    BASE = os.path.dirname(__file__)
else:
    BASE = '.'
ASSETS = os.path.abspath(os.path.join(BASE, os.pardir, 'assets'))
# Create the assets folder up front so the later savefig calls cannot fail on a missing directory.
os.makedirs(ASSETS, exist_ok=True)

In [None]:
# Load the iris data set and print a short overview
data = load_iris()
X, y = data.data, data.target
feature_names = data.feature_names

print('X shape:', X.shape)
print('Classes:', np.unique(y))
print('Feature names:', feature_names)

In [None]:
# Hold out 30 % of the samples as a test set; stratify=y keeps the class
# proportions equal in both parts, random_state=0 makes the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=0,
    stratify=y,
)
print('Train size:', X_train.shape[0], 'Test size:', X_test.shape[0])

In [None]:
# Define both ensembles and train them on the training split.
# fit() returns the estimator itself, so construction and training can be chained.
rf = RandomForestClassifier(
    n_estimators=200,
    random_state=0,
).fit(X_train, y_train)
gb = GradientBoostingClassifier(
    n_estimators=200,
    learning_rate=0.05,
    random_state=0,
).fit(X_train, y_train)
print('Modelle trainiert')

In [None]:
# Evaluation: accuracy on the held-out test split plus 5-fold cross-validation
# on the full data set, reported for each model.
# cross_val_score is already imported in the notebook's import cell, so it is
# not imported a second time here.
for name, model in [('RandomForest', rf), ('GradientBoosting', gb)]:
    # Hold-out estimate using the model fitted on the training split.
    preds = model.predict(X_test)
    acc = accuracy_score(y_test, preds)
    # Independent estimate: one accuracy value per fold over all of X, y.
    cv = cross_val_score(model, X, y, cv=5, scoring='accuracy')
    print(f'{name} test accuracy: {acc:.3f}')
    print(f'{name} 5-fold CV mean: {cv.mean():.3f} (std {cv.std():.3f})')

In [None]:
# Feature importances of both models as a grouped bar chart (saved to assets)
importances = pd.DataFrame({
    'feature': feature_names,
    'rf_importance': rf.feature_importances_,
    'gb_importance': gb.feature_importances_,
})

# Long format: one row per (feature, model) pair, which is what the
# hue-grouped bar plot consumes.
importances_long = importances.melt(
    id_vars='feature',
    value_vars=['rf_importance', 'gb_importance'],
    var_name='model',
    value_name='importance',
)

fig, ax = plt.subplots(figsize=(6, 4))
sns.barplot(
    data=importances_long,
    x='feature',
    y='importance',
    hue='model',
    ax=ax,
)
# Tilt the feature names so the long labels do not overlap.
plt.xticks(rotation=20)
fig.tight_layout()

png_path = os.path.join(ASSETS, 'feature_importances.png')
fig.savefig(png_path, dpi=200)
# Close the figure after saving so it is not kept open.
plt.close(fig)
print('Saved:', png_path)

In [None]:
# Learning curve of the random forest (saved to assets)
# The iris samples are ordered by class, and learning_curve takes the first
# rows of each CV training fold as the reduced training sets. Without
# shuffling, the small training sizes would contain only one or two of the
# three classes and the validation accuracy would be artificially low.
# shuffle=True with a fixed random_state mixes the classes reproducibly.
train_sizes, train_scores, test_scores = learning_curve(
    rf,
    X,
    y,
    cv=5,
    train_sizes=[0.2, 0.4, 0.6, 0.8, 1.0],
    scoring='accuracy',
    shuffle=True,
    random_state=0,
)
# Scores have one row per training size and one column per CV fold;
# average over the folds to get a single curve each.
train_mean = train_scores.mean(axis=1)
test_mean = test_scores.mean(axis=1)

fig2, ax2 = plt.subplots()
ax2.plot(train_sizes, train_mean, 'o-', label='Train')
ax2.plot(train_sizes, test_mean, 'o-', label='Validation')
ax2.set_xlabel('Training set size')
ax2.set_ylabel('Accuracy')
ax2.legend()
ax2.grid(True)

lc_path = os.path.join(ASSETS, 'learning_curve_rf.png')
fig2.savefig(lc_path, dpi=200)
# Close the figure after saving so it is not kept open.
plt.close(fig2)
print('Saved:', lc_path)

---
**Weiteres:** In `code/demo_run.py` ist ein skriptfähiger Minimallauf enthalten, der die gleichen Plots erzeugt (nützlich für Batch‑Läufe oder CI).