In [2]:
import boto3
import pandas as pd; pd.set_option('display.max_columns', 100)
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression, Ridge, RidgeCV, Lasso, LassoCV

s3 = boto3.resource('s3')
bucket_name = 'data-445'
bucket = s3.Bucket(bucket_name)

file_key = 'CarPrice_Assignment.csv'

bucket_object = bucket.Object(file_key)
file_object = bucket_object.get()
file_content_stream = file_object.get('Body')

# Reading the data-file
car_price = pd.read_csv(file_content_stream)
car_price.head()



Unnamed: 0,car_ID,symboling,CarName,fueltype,aspiration,doornumber,carbody,drivewheel,enginelocation,wheelbase,carlength,carwidth,carheight,curbweight,enginetype,cylindernumber,enginesize,fuelsystem,boreratio,stroke,compressionratio,horsepower,peakrpm,citympg,highwaympg,price
0,1,3,alfa-romero giulia,gas,std,two,convertible,rwd,front,88.6,168.8,64.1,48.8,2548,dohc,four,130,mpfi,3.47,2.68,9.0,111,5000,21,27,13495.0
1,2,3,alfa-romero stelvio,gas,std,two,convertible,rwd,front,88.6,168.8,64.1,48.8,2548,dohc,four,130,mpfi,3.47,2.68,9.0,111,5000,21,27,16500.0
2,3,1,alfa-romero Quadrifoglio,gas,std,two,hatchback,rwd,front,94.5,171.2,65.5,52.4,2823,ohcv,six,152,mpfi,2.68,3.47,9.0,154,5000,19,26,16500.0
3,4,2,audi 100 ls,gas,std,four,sedan,fwd,front,99.8,176.6,66.2,54.3,2337,ohc,four,109,mpfi,3.19,3.4,10.0,102,5500,24,30,13950.0
4,5,2,audi 100ls,gas,std,four,sedan,4wd,front,99.4,176.6,66.4,54.3,2824,ohc,five,136,mpfi,3.19,3.4,8.0,115,5500,18,22,17450.0


In [11]:
## Defining the input and target variables
X = car_price[['wheelbase', 'enginesize', 'horsepower', 'compressionratio', 'peakrpm', 'citympg', 'highwaympg']]
Y = car_price['price']

## Splitting the data into train and test
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size = 0.2)

In [12]:
## Estimating lambda for lasso
lasso_cv = LassoCV(alphas = [0.001, 0.01, 0.1, 1, 10, 100], normalize = True, cv = 5).fit(X_train, Y_train)

## Extracting the best lambda
cv_lambda = lasso_cv.alpha_
print('The estimated lambda for the lasso model is', cv_lambda)

## Building lasso
lasso_md = Lasso(alpha = cv_lambda, normalize = True).fit(X_train, Y_train)
lasso_md.coef_

The estimated lambda for the lasso model is 1.0


array([ 133.20289726,  119.92088318,   36.35620372,  354.17428231,
          2.3426193 , -147.97420566,   -0.        ])

In [14]:
## Dropping highmpg
X_train = X_train.drop(columns = ['highwaympg'], axis = 1)
X_test = X_test.drop(columns = ['highwaympg'], axis = 1)

def l2_normalization(X):
    
    x_mean = np.mean(X)
    l2 = np.sqrt(sum(X**2))
    return (X - x_mean) / l2

X_train = X_train.apply(l2_normalization, axis = 1)
X_test = X_test.apply(l2_normalization, axis = 1)

In [15]:
## Linear regression
lm_md = LinearRegression().fit(X_train, Y_train)

## Predicting on test
lm_pred = lm_md.predict(X_test)

## Computing the mse of the linear regression model
mse1 = np.mean(np.power(Y_test - lm_pred, 2))
print('The mse of the linear model is', mse1)

The mse of the linear model is 6852405.149714986


In [16]:
## Ridge regression
ridge_cv = RidgeCV(alphas = [0.001, 0.01, 0.1, 1, 10, 100], cv = 5).fit(X_train, Y_train)

## Extracting the best lambda
cv_lambda = ridge_cv.alpha_
print('The best lambda of the ridge model is', cv_lambda)

## Building the ridge model
ridge_md = Ridge(alpha = cv_lambda).fit(X_train, Y_train)

## Predicting on test
ridge_pred = ridge_md.predict(X_test)

## Computing the mse of the ridge regression model 
mse2 = np.mean(np.power(Y_test - ridge_pred, 2))
print('The mse of the ridge model is', mse2)

The best lambda of the ridge model is 0.001
The mse of the ridge model is 6040959.637958733


In [None]:
## The mse of the ridge regression model is smaller, so it preferred to predict car price.