In [1]:
import boto3
import pandas as pd; pd.set_option('display.max_column', 100)
import numpy as np


from sklearn.model_selection import KFold
from sklearn.linear_model import LinearRegression, Ridge, RidgeCV, Lasso, LassoCV
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler


s3= boto3.resource('s3')
bucket_name= 'morgangant-bata-445-bucket'
bucket= s3.Bucket(bucket_name)

file_key= 'College.csv'

bucket_object= bucket.Object(file_key)
file_object = bucket_object.get()
file_content_stream = file_object.get('Body')

#reading the datefile
college = pd.read_csv(file_content_stream)
college.head()

Unnamed: 0,Private,Apps,Accept,Enroll,Top10perc,Top25perc,F.Undergrad,P.Undergrad,Outstate,Room.Board,Books,Personal,PhD,Terminal,S.F.Ratio,perc.alumni,Expend,Grad.Rate
Abilene Christian University,Yes,1660,1232,721,23,52,2885,537,7440,3300,450,2200,70,78,18.1,12,7041,60
Adelphi University,Yes,2186,1924,512,16,29,2683,1227,12280,6450,750,1500,29,30,12.2,16,10527,56
Adrian College,Yes,1428,1097,336,22,50,1036,99,11250,3750,400,1165,53,66,12.9,30,8735,54
Agnes Scott College,Yes,417,349,137,60,89,510,63,12960,5450,450,875,92,97,7.7,37,19016,59
Alaska Pacific University,Yes,193,146,55,16,44,249,869,7560,4120,800,1500,76,72,11.9,2,10922,15


In [2]:
#Changing yes/no to 1/0
college['Private'].replace(['Yes', 'No'], [1,0], inplace= True)
college.head()

Unnamed: 0,Private,Apps,Accept,Enroll,Top10perc,Top25perc,F.Undergrad,P.Undergrad,Outstate,Room.Board,Books,Personal,PhD,Terminal,S.F.Ratio,perc.alumni,Expend,Grad.Rate
Abilene Christian University,1,1660,1232,721,23,52,2885,537,7440,3300,450,2200,70,78,18.1,12,7041,60
Adelphi University,1,2186,1924,512,16,29,2683,1227,12280,6450,750,1500,29,30,12.2,16,10527,56
Adrian College,1,1428,1097,336,22,50,1036,99,11250,3750,400,1165,53,66,12.9,30,8735,54
Agnes Scott College,1,417,349,137,60,89,510,63,12960,5450,450,875,92,97,7.7,37,19016,59
Alaska Pacific University,1,193,146,55,16,44,249,869,7560,4120,800,1500,76,72,11.9,2,10922,15


In [3]:
## Defining the input and taregt variables
x= college[['Private', 'F.Undergrad', 'P.Undergrad', 'Outstate', 'Room.Board', 'Books', 'Personal', 'S.F.Ratio', 'Grad.Rate']]
y= college['Apps']

#Splitting data into train and test
x_train, x_test, y_train, y_test= train_test_split(x, y, test_size= 0.2)

In [4]:
#Min Max Scaler
scaler= MinMaxScaler()

x_train= scaler.fit_transform(x_train)
x_test= scaler.fit_transform(x_test)

In [5]:
#Building Linear Regression Model
lm_md= LinearRegression().fit(x_train, y_train)

#Predictiong on test
lm_pred= lm_md.predict(x_test)

#Compute MSE of linear regression model
mse1= np.mean(np.power(y_test - lm_pred, 2))
print('MSE of Linear Model:', mse1)

MSE of Linear Model: 3484126.177526141


In [6]:
#Ridge Regression
ridge_cv= RidgeCV(alphas= np.linspace(0.001,100, num=100), cv=5).fit(x_train, y_train)

#Extracting the best lambda
cv_lambda= ridge_cv.alpha_
print('The best lambda of the ridge model is', cv_lambda)

#Building the ridge model
ridge_md= Ridge(alpha= cv_lambda).fit(x_train, y_train)

#predicting on test
ridge_pred= ridge_md.predict(x_test)

#Computing MSE
mse2= np.mean(np.power(y_test - ridge_pred,2 ))
print('MSE of Ridge Model:', mse2)

The best lambda of the ridge model is 1.011090909090909
MSE of Ridge Model: 3527849.9346838878


In [7]:
#Estimating lambda for lasso
lasso_cv= LassoCV(alphas= np.linspace(0.001,100, num=100), normalize= True, cv=5).fit(x_train, y_train)

#Extracting best lambda
cv_lambda= lasso_cv.alpha_
print('Estimated lambda for the lasso model is:', cv_lambda)

#Building lasso
lasso_md= Lasso(alpha= cv_lambda, normalize= True).fit(x_train, y_train)
lasso_md.coef_

#predicting on test
lasso_pred= lasso_md.predict(x_test)


#Computing MSE
mse3= np.mean(np.power(y_test - lasso_pred,2 ))
print('MSE of Lasso Model:', mse3)

Estimated lambda for the lasso model is: 1.011090909090909
MSE of Lasso Model: 3346658.3681953666


If you wish to scale the data, use Pipeline with a StandardScaler in a preprocessing stage. To reproduce the previous behavior:

from sklearn.pipeline import make_pipeline

model = make_pipeline(StandardScaler(with_mean=False), Lasso())

If you wish to pass a sample_weight parameter, you need to pass it as a fit parameter to each step of the pipeline as follows:

kwargs = {s[0] + '__sample_weight': sample_weight for s in model.steps}
model.fit(X, y, **kwargs)

Set parameter alpha to: original_alpha * np.sqrt(n_samples). 
If you wish to scale the data, use Pipeline with a StandardScaler in a preprocessing stage. To reproduce the previous behavior:

from sklearn.pipeline import make_pipeline

model = make_pipeline(StandardScaler(with_mean=False), Lasso())

If you wish to pass a sample_weight parameter, you need to pass it as a fit parameter to each step of the pipeline as follows:

kwargs = {s[0] + '__sample_weight': sample_weight for s in model.steps}
model.fit(X, y, **kwargs)

Set para

In [None]:
#Model 1 Linear Regression MSE: 10032501.7945517
#Model 2 Ridge Regression MSE: 10033091.889840508
#Model 3 Lasso Regression MSE: 10090359.568750948
#Based on these numbers, we would want to use the Model 1, the linear  Regression model to predict the number of applications that a university receives!