In [17]:
pip install xgboost

Collecting xgboost
  Downloading xgboost-3.0.0-py3-none-macosx_10_15_x86_64.whl.metadata (2.1 kB)
Downloading xgboost-3.0.0-py3-none-macosx_10_15_x86_64.whl (2.2 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.2/2.2 MB[0m [31m10.1 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
[?25hInstalling collected packages: xgboost
Successfully installed xgboost-3.0.0
Note: you may need to restart the kernel to use updated packages.


In [18]:
import pandas as pd
import numpy as np
from sklearn.compose import make_column_selector, ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.ensemble import RandomForestRegressor, StackingRegressor, BaggingRegressor
from sklearn.linear_model import LinearRegression, Ridge, RidgeCV, LassoCV
from sklearn.svm import SVR
from sklearn.neighbors import KNeighborsRegressor
from xgboost import XGBRegressor
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
import matplotlib.pyplot as plt
import seaborn as sns

In [7]:
from ucimlrepo import fetch_ucirepo 

# fetch dataset 
abalone = fetch_ucirepo(id=1) 
  
# data (as pandas dataframes) 
X = abalone.data.features 
y = abalone.data.targets 
  
# metadata 
print(abalone.metadata) 
  
# variable information 
print(abalone.variables) 

{'uci_id': 1, 'name': 'Abalone', 'repository_url': 'https://archive.ics.uci.edu/dataset/1/abalone', 'data_url': 'https://archive.ics.uci.edu/static/public/1/data.csv', 'abstract': 'Predict the age of abalone from physical measurements', 'area': 'Biology', 'tasks': ['Classification', 'Regression'], 'characteristics': ['Tabular'], 'num_instances': 4177, 'num_features': 8, 'feature_types': ['Categorical', 'Integer', 'Real'], 'demographics': [], 'target_col': ['Rings'], 'index_col': None, 'has_missing_values': 'no', 'missing_values_symbol': None, 'year_of_dataset_creation': 1994, 'last_updated': 'Mon Aug 28 2023', 'dataset_doi': '10.24432/C55C7W', 'creators': ['Warwick Nash', 'Tracy Sellers', 'Simon Talbot', 'Andrew Cawthorn', 'Wes Ford'], 'intro_paper': None, 'additional_info': {'summary': 'Predicting the age of abalone from physical measurements.  The age of abalone is determined by cutting the shell through the cone, staining it, and counting the number of rings through a microscope -- 

In [9]:
ct = ColumnTransformer(
    [
        # Dummify all categorical columns
        ("dummify", OneHotEncoder(handle_unknown="ignore", sparse_output=False), make_column_selector(dtype_include=object)),

        # Standardize numerical columns
        ("standardize", StandardScaler(), make_column_selector(dtype_include=np.number))
    ],
    # Keeps any column not specified in ct as is in final output
    remainder='passthrough'  
)

# Split data into training and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# Define base models (level-0 models) — these are your experts
base_models = [
    # Random Forest Regressor — tree-based model, good for handling non-linear patterns
    ('rf', RandomForestRegressor(n_estimators=100)),

    # KNN with StandardScaler — KNN needs scaled features for distance calculations
    ('knn', KNeighborsRegressor(n_neighbors=5)),

    # SVR with StandardScaler — Support Vector Regression also sensitive to feature scaling
    ('svr', SVR())
]

# Define meta-model (level-1 model) — the judge that combines the base models’ predictions
meta_model = Ridge()  # Regularized linear regression model to avoid overfitting

# Build the stacking regressor ensemble
stack = StackingRegressor(
    estimators=base_models,      # Base models
    final_estimator=meta_model,  # Meta-model
    cv=5                         # 5-fold cross-validation to avoid data leakage to meta-model
)

In [10]:
# Full pipeline = preprocessing + stacking model
full_pipeline = Pipeline([
    ('preprocessing', ct),
    ('model', stack)
])

# Fit the pipeline to the training data
full_pipeline.fit(X_train, y_train)

# Make predictions on the test set
y_pred = full_pipeline.predict(X_test)

# Evaluate the model
rmse = mean_squared_error(y_test, y_pred, squared=False)
mae = mean_absolute_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print(f"RMSE: {rmse}")
print(f"MAE: {mae}")
print(f"R²: {r2}")

  y = column_or_1d(y, warn=True)


RMSE: 2.136488989760084
MAE: 1.527231270925299
R²: 0.5504890051720036




In [16]:
base_models = [
    ('linear_regression', LinearRegression()),
    ('ridge', RidgeCV()),
    ('lasso', LassoCV()),
    ('knn',  KNeighborsRegressor(n_neighbors=5)),
    ('svr', SVR())
]
meta_model = RandomForestRegressor(n_estimators=100)

stack = StackingRegressor(
    estimators=base_models,
    final_estimator=meta_model,
    cv=5,
    passthrough=False  
)

# Full pipeline = preprocessing + stacking model
full_pipeline2 = Pipeline([
    ('preprocessing', ct),
    ('model', stack)
])

# Fit the pipeline to the training data
full_pipeline2.fit(X_train, y_train)

# Make predictions on the test set
y_pred2 = full_pipeline2.predict(X_test)

# Evaluate the model
rmse2 = mean_squared_error(y_test, y_pred2, squared=False)
mae2 = mean_absolute_error(y_test, y_pred2)
r2_2 = r2_score(y_test, y_pred2)

print("\nSecond Stacking Model:")
print(f"RMSE: {rmse2}")
print(f"MAE: {mae2}")
print(f"R²: {r2_2}")

  y = column_or_1d(y, warn=True)



Second Stacking Model:
RMSE: 2.255827328897104
MAE: 1.6242424242424245
R²: 0.4988696471563113




In [23]:
base_models = [
    ('linear', LinearRegression()),
    ('ridge', RidgeCV()),
    ('lasso', LassoCV()),
    ('knn', KNeighborsRegressor(n_neighbors=5)),
    ('xgb', XGBRegressor(n_estimators=100, random_state=42))
]

meta_model = KNeighborsRegressor(n_neighbors=3)

stack = StackingRegressor(
    estimators=base_models,
    final_estimator=meta_model,
    cv=5,
    passthrough=False  
)

# Full pipeline = preprocessing + stacking model
full_pipeline3 = Pipeline([
    ('preprocessing', ct),
    ('model', stack)
])

# Fit the pipeline to the training data
full_pipeline3.fit(X_train, y_train)

# Make predictions on the test set
y_pred3 = full_pipeline3.predict(X_test)

# Evaluate the model
rmse3 = mean_squared_error(y_test, y_pred3, squared=False)
mae3 = mean_absolute_error(y_test, y_pred3)
r2_3 = r2_score(y_test, y_pred3)

print("\nSecond Stacking Model:")
print(f"RMSE: {rmse3}")
print(f"MAE: {mae3}")
print(f"R²: {r2_3}")

  y = column_or_1d(y, warn=True)



Second Stacking Model:
RMSE: 2.466504298002439
MAE: 1.7299308878256248
R²: 0.4008952624436244




The first stacking model I tried performed similar to bagging. It has a slightly better RMSE and R^2 than bagging. The other two stacking models I tried performed worse. 