## Assignment 6, Question 2

In [3]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.metrics import mean_squared_error, r2_score
from ISLP.bart import BART

# Load the training data and drop the "id" column in one step.
df = pd.read_csv("C:/Users/msohr/Desktop/NU/DDS-8555/Abalone/train.csv").drop(columns=["id"])

# Separate the response ("Rings") from the predictors.
y = df["Rings"]
X = df.drop(columns=["Rings"])

categorical_features = ["Sex"]
numeric_features = X.columns.drop(categorical_features).tolist()

# One-hot encode the categorical column(s), dropping the first level;
# every remaining column is passed through untouched.
preprocessor = ColumnTransformer(
    transformers=[("cat", OneHotEncoder(drop="first"), categorical_features)],
    remainder="passthrough",
)

# 80/20 hold-out split with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=55
)

# Fit the encoder on the training split only, then reuse it on the test split.
X_train_transformed = preprocessor.fit_transform(X_train)
X_test_transformed = preprocessor.transform(X_test)

# Test-set metrics of every fitted model are collected here, keyed by model name.
results = {}

# Baseline: linear regression on the encoded features.
model = LinearRegression().fit(X_train_transformed, y_train)
y_pred = model.predict(X_test_transformed)
mse, r2 = mean_squared_error(y_test, y_pred), r2_score(y_test, y_pred)
results["LinearRegression"] = {"MSE": mse, "R2": r2}
print(f"LinearRegression - MSE: {mse:.4f}, R2: {r2:.4f}")

LinearRegression - MSE: 4.0957, R2: 0.6005


In [4]:
# Gradient boosting: 5000 trees of depth 5 with a learning rate of 0.001.
model = GradientBoostingRegressor(
    n_estimators=5000,
    learning_rate=0.001,
    max_depth=5,
    random_state=55,
).fit(X_train_transformed, y_train)

# Score on the hold-out split and record the metrics under "Boosting".
y_pred = model.predict(X_test_transformed)
mse, r2 = mean_squared_error(y_test, y_pred), r2_score(y_test, y_pred)
results["Boosting"] = {"MSE": mse, "R2": r2}
print(f"Boosting - MSE: {mse:.4f}, R2: {r2:.4f}")

Boosting - MSE: 3.5634, R2: 0.6524


In [6]:
# Bagging: a random forest that may consider every column of the
# transformed feature matrix at each split.
model = RandomForestRegressor(
    max_features=X_train_transformed.shape[1],
    random_state=55,
    n_jobs=-1,
).fit(X_train_transformed, y_train)

# Score on the hold-out split and record the metrics under "Bagging".
y_pred = model.predict(X_test_transformed)
mse, r2 = mean_squared_error(y_test, y_pred), r2_score(y_test, y_pred)
results["Bagging"] = {"MSE": mse, "R2": r2}
print(f"Bagging - MSE: {mse:.4f}, R2: {r2:.4f}")

Bagging - MSE: 3.6247, R2: 0.6464


In [9]:
# Random forest: each split considers about half of the available features.
# The feature count is taken from the transformed matrix the forest is
# actually fit on (as the bagging cell does), not from the raw frame, because
# one-hot encoding can change the number of columns. The lower bound of 1
# keeps the value valid for a single-column matrix, where half would round
# down to 0.
model = RandomForestRegressor(
    max_features=max(1, X_train_transformed.shape[1] // 2),
    random_state=55,
    n_jobs=-1,
)
model.fit(X_train_transformed, y_train)

# Score on the hold-out split and record the metrics under "RandomForest".
y_pred = model.predict(X_test_transformed)
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)
results["RandomForest"] = {"MSE": mse, "R2": r2}
print(f"RandomForest - MSE: {mse:.4f}, R2: {r2:.4f}")

RandomForest - MSE: 3.5339, R2: 0.6553


In [24]:
# BART, fitted and evaluated on the same hold-out split as the other models.
bart_model = BART(random_state=55, burnin=5, ndraw=15)
bart_model.fit(X_train_transformed, y_train)

# Score on the hold-out split and record the metrics under "BART".
y_pred_bart = bart_model.predict(X_test_transformed)
mse_bart, r2_bart = (
    mean_squared_error(y_test, y_pred_bart),
    r2_score(y_test, y_pred_bart),
)
results["BART"] = {"MSE": mse_bart, "R2": r2_bart}
print(f"BART - MSE: {mse_bart:.4f}, R2: {r2_bart:.4f}")

BART - MSE: 3.8604, R2: 0.6234


In [11]:
# Print one summary line per entry in `results`, in insertion order.
# Run this after all model cells (including BART) so the summary is complete.
# The loop variable is not named `model`: a `for` target stays bound after
# the loop, so that name would replace the fitted estimator with a string.
for model_name, metrics in results.items():
    print(f"{model_name}: MSE = {metrics['MSE']:.4f}, R2 = {metrics['R2']:.4f}")

LinearRegression: MSE = 4.0957, R2 = 0.6005
Boosting: MSE = 3.5634, R2 = 0.6524
Bagging: MSE = 3.6247, R2 = 0.6464
RandomForest: MSE = 3.5339, R2 = 0.6553


Random Forest performs best with lowest MSE (3.5339) and highest R² (0.6553), meaning it explains ~65.5% of the variance in the test data — better than all other models.  
Linear Regression performs the worst (MSE 4.0957, R² 0.6005), suggesting that the relationships in the data are nonlinear and benefit from ensemble modeling.