In [1]:
import boto3
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression, Ridge, RidgeCV, Lasso, LassoCV

## Defining the bucket
s3 = boto3.resource('s3')
bucket_name = 'data-445'
bucket = s3.Bucket(bucket_name)

## Defining the csv file
file_key = 'Demos/CarPrice_Assignment.csv'
bucket_object = bucket.Object(file_key)
file_object = bucket_object.get()
file_content_stream = file_object.get('Body')

## Reading the csv file
car_price = pd.read_csv(file_content_stream)
car_price.head()

Unnamed: 0,car_ID,symboling,CarName,fueltype,aspiration,doornumber,carbody,drivewheel,enginelocation,wheelbase,...,enginesize,fuelsystem,boreratio,stroke,compressionratio,horsepower,peakrpm,citympg,highwaympg,price
0,1,3,alfa-romero giulia,gas,std,two,convertible,rwd,front,88.6,...,130,mpfi,3.47,2.68,9.0,111,5000,21,27,13495.0
1,2,3,alfa-romero stelvio,gas,std,two,convertible,rwd,front,88.6,...,130,mpfi,3.47,2.68,9.0,111,5000,21,27,16500.0
2,3,1,alfa-romero Quadrifoglio,gas,std,two,hatchback,rwd,front,94.5,...,152,mpfi,2.68,3.47,9.0,154,5000,19,26,16500.0
3,4,2,audi 100 ls,gas,std,four,sedan,fwd,front,99.8,...,109,mpfi,3.19,3.4,10.0,102,5500,24,30,13950.0
4,5,2,audi 100ls,gas,std,four,sedan,4wd,front,99.4,...,136,mpfi,3.19,3.4,8.0,115,5500,18,22,17450.0


In [2]:
## Defining the input and target variables
X = car_price[['wheelbase', 'enginesize', 'compressionratio', 'horsepower', 'peakrpm', 'citympg', 'highwaympg']]
Y = car_price['price']

In [3]:
## Defining the list to store estimated coefficients at each split
coef = list()

for i in range(0, 1000):
    
    ## Splitting the data
    X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size = 0.2)
    
    ## Running LASSO cross-validation to estimate optimal lambda
    lasso_cv = LassoCV(normalize = True, cv = 5).fit(X_train, Y_train)
    
    ## Building LASSO regression with optimal lambda
    lasso_md = Lasso(alpha = lasso_cv.alpha_, normalize = True).fit(X_train, Y_train)

    ## Storing estimated coefficients
    coef.append(lasso_md.coef_)

In [4]:
## Putting the list as data-frame
coef_data = pd.DataFrame(coef)
coef_data.head()

Unnamed: 0,0,1,2,3,4,5,6
0,303.854008,102.701928,174.416944,34.261588,1.422018,-69.239281,-25.860884
1,229.925395,86.938954,314.605502,50.25288,1.397806,-153.27261,0.0
2,220.487317,98.873834,299.08005,66.533016,1.946246,-219.326683,144.812791
3,185.688481,107.679124,255.371912,53.248151,2.074106,-109.660898,-0.0
4,144.14036,113.915111,311.479525,44.736918,2.331367,-163.950283,-8.041023


In [6]:
def counting_zeros(X):
    
    return sum(X == 0.0)

coef_data.apply(counting_zeros, axis = 0)

0      0
1      0
2      0
3      0
4      0
5     16
6    759
dtype: int64

In [7]:
## Removing highway
X_train = X_train.drop(columns = ['highwaympg'], axis = 1)
X_test = X_test.drop(columns = ['highwaympg'], axis = 1)

In [8]:
def l2_normalization(X):
    
    x_mean = np.mean(X)
    l2 = np.sqrt(sum(X**2))
    return (X - x_mean) / l2

X_train = X_train.apply(l2_normalization, axis = 1)
X_test = X_test.apply(l2_normalization, axis = 1)

In [9]:
X_train.shape

(164, 6)

In [10]:
## Building the linear model
md1 = LinearRegression().fit(X_train, Y_train)

## Predicting on the test dataset
pred1 = md1.predict(X_test)

## Computing the mse
mse1 = np.mean(np.power(pred1 - Y_test, 2))
mse1

20617834.01011127

In [11]:
alphas = list()

for i in range(0, 100):
    
    ## Estimating the optimal lambda
    ridge_cv = RidgeCV(alphas = [0.001, 0.01, 0.1, 1, 10, 100], cv = 5).fit(X_train, Y_train)
    alphas.append(ridge_cv.alpha_)

In [14]:
## Ridge regression model
md2 = Ridge(alpha = 0.001).fit(X_train, Y_train)

## Predicting on the test datset
pred2 = md2.predict(X_test)

## Computing the mse
mse2 = np.mean(np.power(pred2 - Y_test, 2))
mse2

18973755.4321262