Exercise – Ensemble Methods and Hyperparameter Tuning

1. Implement Classification Models

1.1 Train a Decision Tree Classifier and a Random Forest Classifier

In [None]:
import numpy as np
from sklearn.datasets import load_wine
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import f1_score, classification_report

# Load the Wine dataset. The `wine` bunch is kept around because later cells
# read wine.target_names and wine.data from it.
wine = load_wine()
X = wine.data
y = wine.target

# Hold out 20% of the rows for testing; the split is stratified on the labels
# and seeded so that it is reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)

# Baseline decision tree: fit on the training split, predict the test split.
dt_clf = DecisionTreeClassifier(random_state=42).fit(X_train, y_train)
dt_pred = dt_clf.predict(X_test)

# Baseline random forest (default hyperparameters), same protocol.
rf_clf = RandomForestClassifier(random_state=42).fit(X_train, y_train)
rf_pred = rf_clf.predict(X_test)


1.2 Compare the models based on their F1 scores

In [None]:
# Macro-averaged F1 on the held-out test set for both baseline classifiers.
dt_f1, rf_f1 = (
    f1_score(y_test, predictions, average="macro")
    for predictions in (dt_pred, rf_pred)
)

print("Decision Tree F1 Score (Macro):", dt_f1)
print("Random Forest F1 Score (Macro):", rf_f1)

# Per-class precision / recall / F1 tables. sep="\n" puts each report on the
# line directly after its heading, exactly as two separate prints would.
print(
    "\nDecision Tree Classification Report:",
    classification_report(y_test, dt_pred, target_names=wine.target_names),
    sep="\n",
)

print(
    "\nRandom Forest Classification Report:",
    classification_report(y_test, rf_pred, target_names=wine.target_names),
    sep="\n",
)


Decision Tree F1 Score (Macro): 0.9457411645054665
Random Forest F1 Score (Macro): 1.0

Decision Tree Classification Report:
              precision    recall  f1-score   support

     class_0       1.00      0.92      0.96        12
     class_1       0.88      1.00      0.93        14
     class_2       1.00      0.90      0.95        10

    accuracy                           0.94        36
   macro avg       0.96      0.94      0.95        36
weighted avg       0.95      0.94      0.94        36


Random Forest Classification Report:
              precision    recall  f1-score   support

     class_0       1.00      1.00      1.00        12
     class_1       1.00      1.00      1.00        14
     class_2       1.00      1.00      1.00        10

    accuracy                           1.00        36
   macro avg       1.00      1.00      1.00        36
weighted avg       1.00      1.00      1.00        36



2. Hyperparameter Tuning

2.1 Identify three hyperparameters of the Random Forest Classifier

Three commonly tuned hyperparameters are:

n_estimators: number of trees in the forest

max_depth: maximum depth of each tree

max_features: number of features considered at each split

2.2 Perform hyperparameter tuning using GridSearchCV

In [None]:
from sklearn.model_selection import GridSearchCV

# Candidate values for the three hyperparameters listed in section 2.1
# (3 x 4 x 3 = 36 combinations, each evaluated with 5-fold cross-validation).
param_grid = {
    "n_estimators": [50, 100, 200],
    "max_depth": [None, 5, 10, 20],
    "max_features": ["sqrt", "log2", None]
}

# The base estimator is created inline instead of rebinding `rf_clf`:
# GridSearchCV works on clones of the estimator it is given and never fits the
# original object, so reusing the name would silently replace the fitted
# baseline model from section 1.1 with an unfitted one.
grid_search = GridSearchCV(
    estimator=RandomForestClassifier(random_state=42),
    param_grid=param_grid,
    scoring="f1_macro",  # same metric used for the baseline comparison
    cv=5,
    n_jobs=-1            # use all available cores
)

# Tune on the training split only; the test split stays untouched until 2.3.
grid_search.fit(X_train, y_train)

print("Best Parameters:", grid_search.best_params_)
print("Best Cross-Validation F1 (Macro):", grid_search.best_score_)


Best Parameters: {'max_depth': None, 'max_features': 'sqrt', 'n_estimators': 50}
Best Cross-Validation F1 (Macro): 0.9863203463203462


2.3 Evaluate the tuned model on the test set

In [None]:
# best_estimator_ is the model GridSearchCV refit with the winning
# hyperparameters; score it once on the held-out test split.
best_rf_clf = grid_search.best_estimator_
best_pred = best_rf_clf.predict(X_test)
best_f1 = f1_score(y_test, best_pred, average="macro")

print("Test F1 Score (Macro) after tuning:", best_f1)

# Heading and report are emitted by a single print; sep="\n" keeps the report
# on the line after the heading.
print(
    "\nTuned Random Forest Classification Report:",
    classification_report(y_test, best_pred, target_names=wine.target_names),
    sep="\n",
)


Test F1 Score (Macro) after tuning: 1.0

Tuned Random Forest Classification Report:
              precision    recall  f1-score   support

     class_0       1.00      1.00      1.00        12
     class_1       1.00      1.00      1.00        14
     class_2       1.00      1.00      1.00        10

    accuracy                           1.00        36
   macro avg       1.00      1.00      1.00        36
weighted avg       1.00      1.00      1.00        36



3. Implement Regression Model

In this implementation, the “alcohol” feature (the first column of the Wine data) is treated as the continuous target variable, and the remaining features are used as predictors.

3.1 Train a Decision Tree Regressor and a Random Forest Regressor

In [None]:
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

# Regression setup: y = alcohol, X = remaining features.
# The target column is looked up by name rather than by a hard-coded position,
# so the code cannot silently pick the wrong feature.
alcohol_idx = list(wine.feature_names).index("alcohol")
y_reg = wine.data[:, alcohol_idx]                  # alcohol
X_reg = np.delete(wine.data, alcohol_idx, axis=1)  # remaining features

# 80/20 hold-out split (no stratification: the target is continuous).
X_train_r, X_test_r, y_train_r, y_test_r = train_test_split(
    X_reg, y_reg, test_size=0.2, random_state=42
)

# Decision Tree Regressor
dt_reg = DecisionTreeRegressor(random_state=42)
dt_reg.fit(X_train_r, y_train_r)
dt_reg_pred = dt_reg.predict(X_test_r)

# Random Forest Regressor
rf_reg = RandomForestRegressor(random_state=42)
rf_reg.fit(X_train_r, y_train_r)
rf_reg_pred = rf_reg.predict(X_test_r)

def regression_metrics(y_true, y_pred, model_name):
    """Print MSE, MAE and R2 for one set of regression predictions.

    Args:
        y_true: Ground-truth target values.
        y_pred: Predicted target values, aligned with ``y_true``.
        model_name: Heading printed (after a blank line) above the metrics.

    Returns:
        None. The metrics are only printed.
    """
    print(f"\n{model_name}")
    # (label, scorer) pairs in the order they are reported.
    scorers = (
        ("MSE:", mean_squared_error),
        ("MAE:", mean_absolute_error),
        ("R2 :", r2_score),
    )
    for label, scorer in scorers:
        print(label, scorer(y_true, y_pred))

# Report test-set metrics for both baseline regressors, tree first.
regression_metrics(
    y_true=y_test_r,
    y_pred=dt_reg_pred,
    model_name="Decision Tree Regressor",
)
regression_metrics(
    y_true=y_test_r,
    y_pred=rf_reg_pred,
    model_name="Random Forest Regressor",
)



Decision Tree Regressor
MSE: 0.31197222222222226
MAE: 0.4383333333333332
R2 : 0.4774648003821349

Random Forest Regressor
MSE: 0.15426672999999946
MAE: 0.31587222222222167
R2 : 0.7416122628458712


3.2 Identify three hyperparameters of the Random Forest Regressor and tune them using RandomizedSearchCV

In [None]:
from sklearn.model_selection import RandomizedSearchCV

# Search space: three Random Forest hyperparameters
# (5 x 5 x 4 = 100 combinations, of which n_iter=20 are sampled).
param_dist = {
    "n_estimators": [50, 100, 200, 300, 500],
    "max_depth": [None, 5, 10, 20, 30],
    "min_samples_split": [2, 5, 10, 15]
}

# The base estimator is created inline instead of rebinding `rf_reg`:
# RandomizedSearchCV works on clones of the estimator it is given and never
# fits the original object, so reusing the name would silently replace the
# fitted baseline regressor from section 3.1 with an unfitted one.
random_search = RandomizedSearchCV(
    estimator=RandomForestRegressor(random_state=42),
    param_distributions=param_dist,
    n_iter=20,
    scoring="neg_mean_squared_error",  # negated MSE: higher is better
    cv=5,
    random_state=42,                   # makes the sampled candidates reproducible
    n_jobs=-1
)

# Tune on the training split only; the test split is reserved for 3.3.
random_search.fit(X_train_r, y_train_r)

print("Best Parameters:", random_search.best_params_)


Best Parameters: {'n_estimators': 500, 'min_samples_split': 15, 'max_depth': 5}


3.3 Evaluate tuned Random Forest Regressor

In [None]:
# best_estimator_ is the regressor RandomizedSearchCV refit with the winning
# hyperparameters; evaluate it once on the held-out test split.
best_rf_reg = random_search.best_estimator_
tuned_pred = best_rf_reg.predict(X_test_r)

regression_metrics(
    y_true=y_test_r,
    y_pred=tuned_pred,
    model_name="Tuned Random Forest Regressor",
)



Tuned Random Forest Regressor
MSE: 0.1562382386485518
MAE: 0.3130770389411607
R2 : 0.7383101013332805
