In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.ensemble import GradientBoostingRegressor, RandomForestRegressor, AdaBoostRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.model_selection import cross_validate
from sklearn.metrics import mean_squared_error, mean_absolute_error, make_scorer

In [None]:
# 1. Build a synthetic regression problem
# Dimensions mirror the diabetes dataset: 442 samples by 10 features.
np.random.seed(42)
n_samples, n_features = 442, 10
X = np.random.rand(n_samples, n_features)
# Draw order is part of the result: coefficients are sampled before the noise.
coefficients = np.random.rand(n_features)
noise = np.random.randn(n_samples)
y = X @ coefficients + noise * 0.5  # linear signal plus Gaussian noise scaled by 0.5

In [None]:
# 2. Scorers handed to cross_validate
# Each key is the short metric name; the cross-validation results are later
# read back under 'test_<key>' (e.g. 'test_mse', 'test_mae').
scoring = dict(
    mse=make_scorer(mean_squared_error),
    mae=make_scorer(mean_absolute_error),
)

In [None]:
# 3. Record the training-loss curve for several tree depths
# Fit one gradient-boosting model per max_depth on the full data set and keep
# its per-iteration training loss, keyed by depth, for plotting afterwards.
losses = {}
for depth in [1, 2, 3, 4]:
    # 'absolute_error' is the current name of the least-absolute-deviation loss;
    # the old alias 'lad' was deprecated in scikit-learn 1.0 and removed in 1.2.
    model = GradientBoostingRegressor(
        n_estimators=100,
        max_depth=depth,
        loss='absolute_error',
        learning_rate=1.0,
    )
    model.fit(X, y)
    # train_score_ holds one training-loss value per boosting iteration
    losses[depth] = model.train_score_


In [None]:
# Draw one training-loss curve per max_depth on a shared set of axes.
ax = plt.gca()
for tree_depth, curve in losses.items():
    ax.plot(curve, label=f'max_depth={tree_depth}')
ax.set_xlabel('Number of Trees')
ax.set_ylabel('MAE Loss')
ax.set_title('Training Loss by Boosting Iteration')
ax.legend()
plt.show()

In [None]:
# 4. Compare model performance using cross-validation
# All four models use depth-1 trees; the three ensembles use 20 estimators each.

models = {
    # 'absolute_error' replaces the alias 'lad', which was deprecated in
    # scikit-learn 1.0 and removed in 1.2.
    "GradientBoost (sklearn)": GradientBoostingRegressor(
        n_estimators=20, max_depth=1, loss='absolute_error', learning_rate=1.0
    ),
    "Decision Stump": DecisionTreeRegressor(max_depth=1),
    "Random Forest": RandomForestRegressor(n_estimators=20, max_depth=1),
    "AdaBoost": AdaBoostRegressor(DecisionTreeRegressor(max_depth=1), n_estimators=20, loss='linear'),
}

# Evaluate and print results
# Header and rows share the same fixed column widths (24 for the name, 8 for
# MSE) so the headings line up with the values regardless of tab rendering.
print(f"{'Model':<24}{'MSE':<8}MAE")
for name, model in models.items():
    # 10-fold cross-validation; per-fold scores are returned under 'test_<metric>'
    scores = cross_validate(model, X, y, cv=10, scoring=scoring)
    mean_mse = np.mean(scores['test_mse'])
    mean_mae = np.mean(scores['test_mae'])
    print(f"{name:<24}{mean_mse:<8.2f}{mean_mae:.2f}")