In [1]:
import boto3
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression, Ridge, RidgeCV, Lasso, LassoCV

## Defining the bucket
s3 = boto3.resource('s3')
bucket_name = 'data-445'
bucket = s3.Bucket(bucket_name)

## Defining the csv file
file_key = 'Demos/Fish.csv'
bucket_object = bucket.Object(file_key)
file_object = bucket_object.get()
file_content_stream = file_object.get('Body')

## Reading the csv file
fish = pd.read_csv(file_content_stream)
fish.head()

Unnamed: 0,Species,Weight,Length1,Length2,Length3,Height,Width
0,Bream,242.0,23.2,25.4,30.0,11.52,4.02
1,Bream,290.0,24.0,26.3,31.2,12.48,4.3056
2,Bream,340.0,23.9,26.5,31.1,12.3778,4.6961
3,Bream,363.0,26.3,29.0,33.5,12.73,4.4555
4,Bream,430.0,26.5,29.0,34.0,12.444,5.134


In [2]:
## Defining the input and target variables
X = fish[['Length1', 'Length2', 'Length3', 'Height', 'Width']]
Y = fish['Weight']

## Splitting the data 
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size = 0.2)

In [5]:
## Running cross-validation to estimate lambda in LASSO
lasso_cv = LassoCV(normalize = True, cv = 5, max_iter = 10000).fit(X_train, Y_train)
lasso_alpha = lasso_cv.alpha_

## Building lasso model
lasso_md = Lasso(alpha = lasso_alpha, normalize = True, max_iter = 10000).fit(X_train, Y_train)
lasso_md.coef_

array([ 36.00523702,   0.        , -11.7646496 ,  17.8372833 ,
        40.91689226])

In [6]:
## Dropping the second variable 
X_train = X_train.drop(columns = ['Length2'], axis = 1)
X_test = X_test.drop(columns = ['Length2'], axis = 1)

In [7]:
def l2_normalization(X):
    
    x_mean = np.mean(X)
    l2 = np.sqrt(sum(X**2))
    return (X - x_mean) / l2

X_train = X_train.apply(l2_normalization, axis = 1)
X_test = X_test.apply(l2_normalization, axis = 1)

In [9]:
## Linear Regression
md1 = LinearRegression().fit(X_train, Y_train)

## Predicting on the test dataset
md1_pred = md1.predict(X_test)

## Computing the mse
mse1 = np.mean(np.power(md1_pred - Y_test, 2))
mse1

75680.49789091074

In [12]:
## Reidge regression
ridge_cv = RidgeCV(alphas = [0.001, 0.01, 0.1, 1, 10, 100], cv = 5).fit(X_train, Y_train)
ridge_alpha = ridge_cv.alpha_

## Building the ridge model
ridge_md = Ridge(alpha = ridge_alpha).fit(X_train, Y_train)

## Predicting on the test dataset
md2_pred = ridge_md.predict(X_test)

## Computing the mse
mse2 = np.mean(np.power(md2_pred - Y_test, 2))
mse2

76564.0149823623