<a href="https://colab.research.google.com/github/mersalas/MLBS-2025_workshop/blob/main/Lab_1b_Regression.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## Import packages

In [None]:
# Install packages
!pip install ucimlrepo
!pip install scikit-optimize

In [None]:
# Import the necessary libraries
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.preprocessing import LabelEncoder, MinMaxScaler
from sklearn.model_selection import train_test_split, KFold
from skopt import BayesSearchCV
from skopt.plots import plot_objective
from sklearn.linear_model import Ridge, Lasso, ElasticNet
from sklearn.neighbors import KNeighborsRegressor
from sklearn.model_selection import cross_validate
from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error

## **Load dataset**

In [None]:
# Fetch dataset
# Retrieves the dataset registered under id=1 through the ucimlrepo client.
# NOTE(review): id=1 is presumably the Abalone dataset, as the variable name
# suggests — confirm against the repository listing.
from ucimlrepo import fetch_ucirepo
abalone = fetch_ucirepo(id=1)

## **Exploratory data analysis**

In [None]:
# Pull the feature table and the target table (pandas dataframes) out of the
# fetched dataset object, then preview the first feature rows.
X, y = abalone.data.features, abalone.data.targets
X.head()

In [None]:
# Work on a copy of the features with the target attached as a 'Rings' column,
# so that X itself is left untouched.
data = X.assign(Rings=y)
data.head()

In [None]:
# Metadata
# Prints the metadata object that ucimlrepo attached to the fetched dataset.
print(abalone.metadata)

In [None]:
# Variable information
# Left as the cell's last expression so the notebook renders the variable table
# with its rich (HTML) display instead of print()'s wrapped plain text.
abalone.variables

In [None]:
# Bar chart of the number of samples per 'Sex' category, with each bar
# annotated with its count.
ax = sns.countplot(data=data, x='Sex')
bars = ax.containers[0]
ax.bar_label(bars)
plt.show()

In [None]:
# Replace the categorical 'Sex' values with integer codes.
# The encoder is kept in `label` so the learned classes can be inspected later.
label = LabelEncoder()
label.fit(data['Sex'])
data['Sex'] = label.transform(data['Sex'])

In [None]:
# Check datatypes & if there are missing values
# info() lists each column's dtype and non-null count, so a count lower than
# the number of rows reveals missing values.
data.info()

In [None]:
# Summary statistics (count, mean, std, min, quartiles, max) for each column
data.describe()

In [None]:
# Count the samples whose recorded height is exactly zero
data['Height'].eq(0).sum()

In [None]:
# Show the rows whose recorded height is exactly zero
data.loc[data['Height'].eq(0)]

In [None]:
# Keep only the rows with a non-zero height, then re-check the summary statistics
data = data.loc[data['Height'].ne(0)]
data.describe()

In [None]:
# Convert rings to the age of abalone (Age = Rings + 1.5) and remove the
# rings column.
# assign() builds a new frame instead of writing a column into the filtered
# frame, which avoids pandas' SettingWithCopyWarning. The guard makes the cell
# safe to re-run after 'Rings' has already been replaced by 'Age'.
if 'Rings' in data.columns:
    data = data.assign(Age=data['Rings'] + 1.5).drop(columns='Rings')

data.head()

In [None]:
# Heatmap of the pairwise correlations between all columns,
# with the coefficient written in every cell.
corr_matrix = data.corr()
sns.heatmap(corr_matrix, annot=True)

In [None]:
# Scatter plots: one panel per feature, each plotted against Age

# Features to plot (the encoded 'Sex' column is included as the last panel)
numeric_features = ['Length', 'Diameter', 'Height', 'Whole_weight',
                    'Shucked_weight', 'Viscera_weight', 'Shell_weight', 'Sex']

# 4 x 2 grid, flattened so panels can be paired with features one-to-one
fig, axes = plt.subplots(nrows=4, ncols=2, figsize=(12, 14))
axes = axes.flatten()

for ax, feature in zip(axes, numeric_features):
    sns.scatterplot(data=data, x=feature, y='Age', ax=ax)
    ax.set(title=f"{feature} vs Age", xlabel=feature, ylabel='Age')

plt.tight_layout()
plt.show()

In [None]:
# Split the frame into a feature matrix (first eight columns) and the
# target vector ('Age'), both as NumPy arrays.
y = data['Age'].to_numpy()
X = data.iloc[:, :8].to_numpy()

In [None]:
# Target distribution
# sns.distplot is deprecated; histplot with a KDE overlay on a density scale
# is its documented replacement.
ax = sns.histplot(y, kde=True, stat='density')
ax.set(xlabel='Age', title='Distribution of the target (Age)')
plt.show()

In [None]:
# Box plot to detect outliers
# The array is passed by keyword: a bare positional argument has been
# interpreted differently across seaborn releases, so the orientation of the
# box would depend on the installed version.
ax = sns.boxplot(y=y)
ax.set(ylabel='Age', title='Box plot of the target (Age)')
plt.show()

In [None]:
# Min-max normalise the feature matrix.
# The scaler learns each column's range from X and then applies it to X.
scaler = MinMaxScaler()
scaler.fit(X)
X_scale = scaler.transform(X)

In [None]:
# Split dataset into training & test set
# The raw features are split first and the scaler is then fitted on the
# training rows only, so the test set's min/max never influence preprocessing
# (fitting the scaler on all rows before splitting leaks test information).
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state=42)

scaler = MinMaxScaler().fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)  # values may fall slightly outside [0, 1]

print("Size of training set: {}   Size of test set:"
      " {}\n".format(X_train.shape[0], X_test.shape[0]))

## **Train model**

### Ridge regression

In [None]:
# Bayesian hyper-parameter search for ridge regression:
# alpha is sampled log-uniformly from [1e-3, 1e1] and every candidate is
# scored by R2 under shuffled 10-fold cross-validation.
param = dict(alpha=(1e-3, 1e1, 'log-uniform'))
cv = KFold(n_splits=10, shuffle=True, random_state=42)
ridge = Ridge(random_state=42)

ridge_cv = BayesSearchCV(
    estimator=ridge,
    search_spaces=param,
    scoring='r2',
    cv=cv,
    n_jobs=-1,
    random_state=42,
)
ridge_cv.fit(X_train, y_train)

print('best parameters: ', ridge_cv.best_params_)
print('best score after search cv:', ridge_cv.best_score_)

In [None]:
# Plot the Bayesian objective function
# plot_objective creates its own figure and does not take an `ax` argument;
# `size` sets the height (in inches) of each facet instead.
plot_objective(ridge_cv.optimizer_results_[0], size=4)
plt.show()

In [None]:
# Refit ridge on the whole training set using the best parameters found by the search
ridge_tuned = Ridge(random_state=42).set_params(**ridge_cv.best_params_)

ridge_tuned.fit(X_train, y_train)

In [None]:
# Show the fitted intercept and the per-feature coefficients
for caption, fitted in (('intercept:', ridge_tuned.intercept_),
                        ('coef:', ridge_tuned.coef_)):
    print(caption, fitted)

In [None]:
# Evaluate the ridge model on the training set (10-fold cross-validation)
scoring = ['r2', 'neg_mean_squared_error', 'neg_mean_absolute_error']
cv_results = cross_validate(ridge_tuned, X_train, y_train, cv=10, scoring=scoring, n_jobs=-1)

# The 'neg_*' scorers return the error multiplied by -1 (so that higher is
# better); flip the sign back so the printed MSE/MAE are the actual errors.
print("Performance of the ridge model on the training set:\nR2: {:.4f}\nMSE: {:.4f}\nMAE: {:.4f}".format(
    np.mean(cv_results['test_r2']),
    -np.mean(cv_results['test_neg_mean_squared_error']),
    -np.mean(cv_results['test_neg_mean_absolute_error'])
))

### Lasso regression

In [None]:
# Bayesian hyper-parameter search for lasso regression:
# alpha is sampled log-uniformly from [1e-3, 1] and scored by R2 under
# shuffled 10-fold cross-validation.
cv = KFold(n_splits=10, shuffle=True, random_state=42)
param = {'alpha': (1e-3, 1, 'log-uniform')}
lasso = Lasso(random_state=42)

search_settings = dict(estimator=lasso, search_spaces=param, cv=cv,
                       scoring='r2', n_jobs=-1, random_state=42)
lasso_cv = BayesSearchCV(**search_settings)
lasso_cv.fit(X_train, y_train)

print('best parameters: ', lasso_cv.best_params_)
print('best score after search cv:', lasso_cv.best_score_)

In [None]:
# Plot the Bayesian objective function
# plot_objective creates its own figure and does not take an `ax` argument;
# `size` sets the height (in inches) of each facet instead.
plot_objective(lasso_cv.optimizer_results_[0], size=4)
plt.show()

In [None]:
# Refit lasso on the whole training set using the best parameters found by the search
best_lasso_params = dict(lasso_cv.best_params_)
lasso_tuned = Lasso(random_state=42, **best_lasso_params)

lasso_tuned.fit(X_train, y_train)

In [None]:
# Show the fitted intercept and the per-feature coefficients
lasso_intercept, lasso_coef = lasso_tuned.intercept_, lasso_tuned.coef_
print('intercept:', lasso_intercept)
print('coef:', lasso_coef)

In [None]:
# Evaluate the lasso model on the training set (10-fold cross-validation)
scoring = ['r2', 'neg_mean_squared_error', 'neg_mean_absolute_error']
cv_results = cross_validate(lasso_tuned, X_train, y_train, cv=10, scoring=scoring, n_jobs=-1)

# The 'neg_*' scorers return the error multiplied by -1 (so that higher is
# better); flip the sign back so the printed MSE/MAE are the actual errors.
print("Performance of the lasso model on the training set:\nR2: {:.4f}\nMSE: {:.4f}\nMAE: {:.4f}".format(
    np.mean(cv_results['test_r2']),
    -np.mean(cv_results['test_neg_mean_squared_error']),
    -np.mean(cv_results['test_neg_mean_absolute_error'])
))

### ElasticNet regression

In [None]:
# Perform Bayesian search to find optimum parameters for elasticnet
enet = ElasticNet(random_state=42)

# The l1_ratio bounds must be floats: skopt turns an all-integer pair such as
# (0, 1) into an Integer dimension, which would only ever try l1_ratio=0 or
# l1_ratio=1 instead of searching the continuous range between them.
param = {'alpha': (1e-3, 1, 'log-uniform'),
         'l1_ratio': (0.0, 1.0, 'uniform')}

cv = KFold(n_splits=10, shuffle=True, random_state=42)
enet_cv = BayesSearchCV(estimator=enet, search_spaces=param, cv=cv, scoring='r2',
                       n_jobs=-1, random_state=42)
enet_cv.fit(X_train, y_train)

print('best parameters: ', enet_cv.best_params_)
print('best score after search cv:', enet_cv.best_score_)

In [None]:
# Plot the Bayesian objective function
# plot_objective creates its own grid of axes (one facet per pair of searched
# parameters) and does not take an `ax` argument; `size` sets the height
# (in inches) of each facet instead.
plot_objective(enet_cv.optimizer_results_[0], size=5)
plt.show()

In [None]:
# Refit elastic net on the whole training set using the best parameters found by the search
enet_tuned = ElasticNet(random_state=42)
enet_tuned.set_params(**enet_cv.best_params_)

enet_tuned.fit(X_train, y_train)

In [None]:
# Show the fitted intercept and the per-feature coefficients
fitted_terms = {'intercept:': enet_tuned.intercept_, 'coef:': enet_tuned.coef_}
for caption in ('intercept:', 'coef:'):
    print(caption, fitted_terms[caption])

In [None]:
# Evaluate the enet model on the training set (10-fold cross-validation)
scoring = ['r2', 'neg_mean_squared_error', 'neg_mean_absolute_error']
cv_results = cross_validate(enet_tuned, X_train, y_train, cv=10, scoring=scoring, n_jobs=-1)

# The 'neg_*' scorers return the error multiplied by -1 (so that higher is
# better); flip the sign back so the printed MSE/MAE are the actual errors.
print("Performance of the enet model on the training set:\nR2: {:.4f}\nMSE: {:.4f}\nMAE: {:.4f}".format(
    np.mean(cv_results['test_r2']),
    -np.mean(cv_results['test_neg_mean_squared_error']),
    -np.mean(cv_results['test_neg_mean_absolute_error'])
))

### KNR

In [None]:
# Perform Bayesian search to find the optimum parameters for KNR
knr = KNeighborsRegressor(n_jobs=-1)

# n_neighbors is an integer range (5 to 20); the other entries are
# categorical choices.
param = {'n_neighbors': (5, 20),
         'weights': ['uniform', 'distance'],
         'p': (1,2,3),
         'metric': ['minkowski', 'manhattan', 'euclidean']
         }

cv = KFold(n_splits=10, shuffle=True, random_state=42)
knr_cv = BayesSearchCV(estimator=knr, search_spaces=param, cv=cv, scoring='r2',
                       n_jobs=-1, random_state=42)

knr_cv.fit(X_train, y_train)

print('best parameters: ', knr_cv.best_params_)
# Message aligned with the other models: this is a Bayesian search, not a random one.
print('best score after search cv:', knr_cv.best_score_)

In [None]:
# Plot the Bayesian objective function
# plot_objective creates its own grid of axes (one facet per pair of searched
# parameters) and does not take an `ax` argument; `size` sets the height
# (in inches) of each facet instead.
plot_objective(knr_cv.optimizer_results_[0], size=2.5)
plt.show()

In [None]:
# Train KNR
# Use the parameters the search actually selected (as the other models do)
# rather than values copied by hand from an earlier run, which go stale as
# soon as the data, the split or the search space changes.
knr_tuned = KNeighborsRegressor(**knr_cv.best_params_, n_jobs=-1)

knr_tuned.fit(X_train, y_train)

In [None]:
# Evaluate the KNR model on the training set (10-fold cross-validation)
scoring = ['r2', 'neg_mean_squared_error', 'neg_mean_absolute_error']
cv_results = cross_validate(knr_tuned, X_train, y_train, cv=10, scoring=scoring, n_jobs=-1)

# The 'neg_*' scorers return the error multiplied by -1 (so that higher is
# better); flip the sign back so the printed MSE/MAE are the actual errors.
# The heading now names the KNR model (it previously said "enet").
print("Performance of the KNR model on the training set:\nR2: {:.4f}\nMSE: {:.4f}\nMAE: {:.4f}".format(
    np.mean(cv_results['test_r2']),
    -np.mean(cv_results['test_neg_mean_squared_error']),
    -np.mean(cv_results['test_neg_mean_absolute_error'])
))

## **Evaluation**

In [None]:
# Residual plot
# Residuals are computed as (predicted - actual) for the tuned KNR model;
# the test set is drawn on the left panel and the training set on the right.
y_train_pred = knr_tuned.predict(X_train)
y_test_pred = knr_tuned.predict(X_test)

fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(7,3), sharey=True)

ax1.scatter(
    y_test_pred, y_test_pred - y_test,
    c='limegreen', marker='s',
    edgecolor='white', label='Test data')

ax2.scatter(
    y_train_pred, y_train_pred - y_train,
    c='steelblue', marker='o', edgecolor='white',
    label='Training data')
ax1.set_ylabel('Residuals')

for ax in (ax1, ax2):
  ax.set_xlabel('Predicted values')
  ax.legend(loc='upper left')
  # axhline spans the full width of the axes without extending the data
  # limits; a zero line drawn with hlines padded by +/-100 stretches the
  # x-axis far beyond the predicted values and squashes the points together.
  ax.axhline(y=0, color='black', lw=2)

plt.tight_layout()
plt.show()

In [None]:
# Score the tuned KNR model on the held-out test set
y_pred_knr = knr_tuned.predict(X_test)

print("Performance of the KNR model on the test set:")
for template, metric in (("R2: {:.4f}", r2_score),
                         ("MSE: {:.4f}", mean_squared_error),
                         ("MAE: {:.4f}", mean_absolute_error)):
    print(template.format(metric(y_test, y_pred_knr)))

## **Exercise 1b**

Using the abalone dataset, build regressors using the following algorithms:


*   SVR
*   RandomForestRegressor



