In [9]:
import boto3
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression, Ridge, RidgeCV, Lasso, LassoCV

## Where the dataset lives on S3: bucket name and object key
bucket_name = 'data-445'
file_key = 'Demos/College.csv'

## Opening the bucket through the boto3 resource interface
s3 = boto3.resource('s3')
bucket = s3.Bucket(bucket_name)

## Requesting the object; the 'Body' entry of the response is what gets handed to pandas
bucket_object = bucket.Object(file_key)
file_object = bucket_object.get()
file_content_stream = file_object.get('Body')

## Parsing the csv into a data-frame and giving it a fresh 0..n-1 index
college = pd.read_csv(file_content_stream)
college = college.reset_index(drop = True)
college.head()

Unnamed: 0,Private,Apps,Accept,Enroll,Top10perc,Top25perc,F.Undergrad,P.Undergrad,Outstate,Room.Board,Books,Personal,PhD,Terminal,S.F.Ratio,perc.alumni,Expend,Grad.Rate
0,Yes,1660,1232,721,23,52,2885,537,7440,3300,450,2200,70,78,18.1,12,7041,60
1,Yes,2186,1924,512,16,29,2683,1227,12280,6450,750,1500,29,30,12.2,16,10527,56
2,Yes,1428,1097,336,22,50,1036,99,11250,3750,400,1165,53,66,12.9,30,8735,54
3,Yes,417,349,137,60,89,510,63,12960,5450,450,875,92,97,7.7,37,19016,59
4,Yes,193,146,55,16,44,249,869,7560,4120,800,1500,76,72,11.9,2,10922,15


In [6]:
## Changing Private to 0-1 ('Yes' -> 1, anything else -> 0).
## The recoding is guarded so that re-running this cell is harmless: once the
## column is numeric, comparing it with 'Yes' again would turn every row into 0.
if not pd.api.types.is_numeric_dtype(college['Private']):
    college['Private'] = np.where(college['Private'] == 'Yes', 1, 0)
college['Private']

0      1
1      1
2      1
3      1
4      1
      ..
772    0
773    1
774    1
775    1
776    1
Name: Private, Length: 777, dtype: int64

In [7]:
## Defining the input and target variables
X = college[['Private', 'F.Undergrad', 'P.Undergrad', 'Outstate', 'Room.Board', 'Books', 'Personal', 'S.F.Ratio', 'Grad.Rate']]
Y = college['Apps']

## Splitting the data into training and testing
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size = 0.2)

In [8]:
## Transforming inputs to 0-1 scale
scaler = MinMaxScaler()

X_train = scaler.fit_transform(X_train)
X_test = scaler.fit_transform(X_test)

In [11]:
## Linear regression
md1 = LinearRegression().fit(X_train, Y_train)

## Predicting on the test dataset
pred1 = md1.predict(X_test)

## Computing the mse
mse1 = np.mean(np.power(pred1 - Y_test, 2))
mse1

9331485.068039903

In [17]:
## Estimating lambda
alphas = np.linspace(0.001, 100, num = 100)
ridge_cv = RidgeCV(alphas = alphas, cv = 5).fit(X_train, Y_train)
ridge_cv.alpha_

## Building ridge regression model with optimal lambda
md2 = Ridge(alpha = ridge_cv.alpha_).fit(X_train, Y_train)

## Predicting on the test dataset
pred2 = md2.predict(X_test)

## Computing the mse
mse2 = np.mean(np.power(pred2 - Y_test, 2))
mse2

9331684.694795828

In [15]:
## Displaying the grid of candidate lambdas (100 evenly spaced points, both end points included)
np.linspace(start = 0.001, stop = 100, num = 100)

array([1.00000000e-03, 1.01109091e+00, 2.02118182e+00, 3.03127273e+00,
       4.04136364e+00, 5.05145455e+00, 6.06154545e+00, 7.07163636e+00,
       8.08172727e+00, 9.09181818e+00, 1.01019091e+01, 1.11120000e+01,
       1.21220909e+01, 1.31321818e+01, 1.41422727e+01, 1.51523636e+01,
       1.61624545e+01, 1.71725455e+01, 1.81826364e+01, 1.91927273e+01,
       2.02028182e+01, 2.12129091e+01, 2.22230000e+01, 2.32330909e+01,
       2.42431818e+01, 2.52532727e+01, 2.62633636e+01, 2.72734545e+01,
       2.82835455e+01, 2.92936364e+01, 3.03037273e+01, 3.13138182e+01,
       3.23239091e+01, 3.33340000e+01, 3.43440909e+01, 3.53541818e+01,
       3.63642727e+01, 3.73743636e+01, 3.83844545e+01, 3.93945455e+01,
       4.04046364e+01, 4.14147273e+01, 4.24248182e+01, 4.34349091e+01,
       4.44450000e+01, 4.54550909e+01, 4.64651818e+01, 4.74752727e+01,
       4.84853636e+01, 4.94954545e+01, 5.05055455e+01, 5.15156364e+01,
       5.25257273e+01, 5.35358182e+01, 5.45459091e+01, 5.55560000e+01,
      

In [None]:
## LASSO regression
