In [48]:
# Import Relevant Packages
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score

# Import the 'abalone' dataset
abalone = pd.read_csv('./abalone.csv', header=None)
abalone_x = abalone.iloc[:,:-1]
abalone_y = abalone.iloc[:,-1]

# Split 
abalone_x_train, abalone_x_test, abalone_y_train, abalone_y_test = train_test_split(abalone_x,
                                                                                    abalone_y,
                                                                                    test_size=0.2,
                                                                                    shuffle = True,
                                                                                    random_state = 0)

# Homework 2
Author: Mao Nishino

## Problem Statement
In this problem we use the abalone dataset available on Canvas. The dataset
is about predicting the age of the abalone from its physical measurements. Use the first 7 variables as predictors and the 8-th as the response.
Report all results as the average of 20 random splits. For each random split divide the data at random into 85% for training and 15% for testing, train the models and compute the training error and the test error (or R2) for that split. Repeat this process 20 times obtaining 20 different random splits of the data and report the average training or test MSE or R2 obtained over the 20 splits for the following models.

## Problem (a)


Null model. Report the average train and test MSE of the null model that always
predicts training  ̄y (average training y). (1 point)

In [49]:
# Calculate the average y-value in the training set
y_bar = abalone_y_train.mean()

# Create the prediction
y_train_pred = np.full(len(abalone_x_train), y_bar)
y_test_pred = np.full(len(abalone_x_test), y_bar)

# Find the training and the MSEs
train_mse = mean_squared_error(abalone_y_train, y_train_pred)
test_mse = mean_squared_error(abalone_y_test, y_test_pred)

# Create a dataframe to show the result
table = {
    'Dataset': ['Train', 'Test'],
    'Mean Squared Error': [train_mse, test_mse]
}

print('The following table shows the desired MSEs.')
df = pd.DataFrame(table)
df


The following table shows the desired MSEs.


Unnamed: 0,Dataset,Mean Squared Error
0,Train,10.272738
1,Test,10.875555


## Problem (b)
OLS regression, analytic, by solving the normal equations, with λ = 0.0001.
Report the average training and test R2 and MSE. (2 points)

In [50]:
# Calculate B = X^TX+lambda*I_p
B = np.matmul(abalone_x_train.T, abalone_x_train) + 0.0001*np.eye(abalone_x_train.shape[1])

# Calculate the coefficients using the normal equation beta = B^{-1}X^T y
invBX = np.matmul(np.linalg.inv(B), abalone_x_train.T)
beta = np.matmul(invBX, abalone_y_train)

# Create predictions
y_train_pred = np.matmul(abalone_x_train, beta)
y_test_pred = np.matmul(abalone_x_test, beta)

# Find the training and the MSEs/R2
train_mse = mean_squared_error(abalone_y_train, y_train_pred)
test_mse = mean_squared_error(abalone_y_test, y_test_pred)
train_r2 = r2_score(abalone_y_train, y_train_pred)
test_r2 = r2_score(abalone_y_test, y_test_pred)

# Create a dataframe to show the result
table = {
    'Dataset': ['Train', 'Test'],
    'Mean Squared Error': [train_mse, test_mse],
    'R squared': [train_r2, test_r2]
}

print('The following table shows the desired MSEs and R2s.')
df = pd.DataFrame(table)
df


The following table shows the desired MSEs and R2s.


Unnamed: 0,Dataset,Mean Squared Error,R squared
0,Train,5.018612,0.511463
1,Test,5.218919,0.51945
