In [2]:
import pandas as pd
import numpy as np

import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.linear_model import ElasticNetCV, LinearRegression
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.feature_selection import VarianceThreshold
from sklearn.metrics import mean_squared_error
%matplotlib inline

In [3]:
# Read the feature table, the target table, and the table that is later
# scored for the submission file; each one is indexed by its 'Id' column.
X = pd.read_csv(
    '../data/to_be_split_train_X',
    index_col='Id',
)
y = pd.read_csv(
    '../data/to_be_split_train_y',
    index_col='Id',
)
test = pd.read_csv(
    '../data/to_be_modeled_test',
    index_col='Id',
)

In [4]:
# Sanity check: the features and the target should have the same row count.
print(
    X.shape,
    y.shape,
)

(2044, 381) (2044, 1)


#### Splitting our data into training and testing sets.

In [5]:
# Hold out part of the labelled rows for evaluation; the fixed random_state
# makes the split reproducible from run to run.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=9,
)

By splitting the data into two groups, I can check my model on data it was not trained on and make sure that it does not suffer from overfitting.

#### Scaling the training data, and then scaling the testing data based on the scale created by the training data

In [14]:
# Scratch calculation on the unscaled 'Garage Cars_2.0' training column:
# (1 - mean) * std * -1400.
# NOTE(review): -1400 looks like a rounded stand-in for this feature's fitted
# coefficient (about -1377 in the coefficient table) — confirm the intent,
# name the constant, or delete this exploratory cell.
(1 - X_train['Garage Cars_2.0'].mean()) * X_train['Garage Cars_2.0'].std() * -1400

-310.11726068894905

In [11]:
# Standard deviation of the 'Garage Cars_2.0' column in the training split.
X_train.loc[:, 'Garage Cars_2.0'].std()

0.4971865306799402

In [8]:
# List every training column whose name contains 'Cars'.
train_columns = X_train.columns
train_columns[train_columns.str.contains('Cars')]

Index(['Garage Cars_0.0', 'Garage Cars_1.0', 'Garage Cars_2.0',
       'Garage Cars_3.0', 'Garage Cars_4.0', 'Garage Cars_5.0'],
      dtype='object')

In [5]:
ss = StandardScaler()

# Learn the scaling parameters from the training split only, and scale it.
X_Scaled_train = ss.fit_transform(X=X_train)

# Re-use the training-derived scaling for the held-out split so that no
# information from the test rows leaks into the scaler.
X_Scaled_test = ss.transform(X=X_test)

Standardizing the data puts all of our features on the same scale, which makes it easier to interpret the weight of each coefficient relative to the other features.

# Linear Regression Modeling

In [6]:
# Baseline model: ordinary least squares on the standardized training features.
lr = LinearRegression()
lr.fit(X_Scaled_train, y_train)

# `fit` returns the estimator itself, so `lr_model` is the same object as `lr`.
lr_model = lr

In [7]:
# Predictions of the linear model on both splits.
y_hat_train = lr_model.predict(X_Scaled_train)
y_hat_test = lr_model.predict(X_Scaled_test)

# Score (R^2) on each split.
print('train score:', lr_model.score(X_Scaled_train, y_train))
print('test score:', lr_model.score(X_Scaled_test, y_test))

# Root mean squared error on each split.
for rmse_label, y_true, y_pred in (
    ('Train RMSE:', y_train, y_hat_train),
    ('Test RMSE:', y_test, y_hat_test),
):
    print(rmse_label, np.sqrt(mean_squared_error(y_true, y_pred)))

train score: 0.950684231346211
test score: -1.5578422105594534e+24
Train RMSE: 17358.072999943728
Test RMSE: 1.0215258410353336e+17


We can see that our linear regression is extremely overfit: the training RMSE is about 17,000, while the test RMSE is about 1e17 and the test R^2 is hugely negative.

# Elastic Net Modeling

In [8]:
# Cross-validated elastic net: try six L1/L2 mixes (from almost pure ridge,
# 0.0001, up to pure lasso, 1) with a 100-value alpha path for each mix.
ENCV = ElasticNetCV(l1_ratio=[.0001, .3, .5, .7, .9, 1], n_alphas=100)

# y_train is a one-column DataFrame of shape (n, 1). Flatten it to a 1-D
# array so scikit-learn does not have to convert the column vector itself
# and emit a DataConversionWarning on every fit.
ENCV.fit(X_Scaled_train, np.ravel(y_train))

  y = column_or_1d(y, warn=True)


ElasticNetCV(alphas=None, copy_X=True, cv=None, eps=0.001, fit_intercept=True,
       l1_ratio=[0.0001, 0.3, 0.5, 0.7, 0.9, 1], max_iter=1000,
       n_alphas=100, n_jobs=1, normalize=False, positive=False,
       precompute='auto', random_state=None, selection='cyclic',
       tol=0.0001, verbose=0)

By using the Elastic Net cross-validated regression as our model, I get the benefits of both Lasso (L1) and Ridge (L2) regularization, at the cost of additional compute.

# Model Evaluation

In [9]:
# Predictions of the tuned elastic net on both splits.
y_hat_train = ENCV.predict(X_Scaled_train)
y_hat_test = ENCV.predict(X_Scaled_test)

# Root mean squared error on each split.
train_rmse = np.sqrt(mean_squared_error(y_train, y_hat_train))
test_rmse = np.sqrt(mean_squared_error(y_test, y_hat_test))

# Hyper-parameters chosen by cross-validation.
print("l1_ratio:", ENCV.l1_ratio_)
print("Alpha:", ENCV.alpha_)

# Goodness of fit and error on each split, followed by the intercept.
print('train score:', ENCV.score(X_Scaled_train, y_train))
print('test score:', ENCV.score(X_Scaled_test, y_test))
print('Train RMSE:', train_rmse)
print('Test RMSE:', test_rmse)
print('Model Intercept:', ENCV.intercept_)

l1_ratio: 1.0
Alpha: 615.4120546203621
train score: 0.9399132398990824
test score: 0.9260326756208075
Train RMSE: 19160.11012237124
Test RMSE: 22259.098859392256
Model Intercept: 181170.8754076973


Our l1_ratio of 1.0 tells us that the best-performing model, chosen by cross-validated mean squared error, is a pure lasso regularization. This makes sense, as lasso regularization can be used to select only the features that have an effect on our target feature, which is sale price. Our RMSE, or root mean squared error, tells us the typical dollar amount by which our predictions are wrong; because the errors are squared before averaging, large misses count more heavily than small ones.

In [10]:
# Pair each fitted coefficient with the name of its feature column.
X_col_list = X.columns
coef_df = pd.DataFrame(
    data=ENCV.coef_,
    index=X_col_list,
    columns=["Coefficient"],
)

# Keep only the features whose coefficient is not exactly zero.
nonzero_mask = coef_df['Coefficient'] != 0
coef_df = coef_df.loc[nonzero_mask]

# Show the most negative coefficients (ascending sort, first 15 rows).
coef_df.sort_values(by='Coefficient').head(15)

Unnamed: 0,Coefficient
Overall Cond_3,-3481.946644
Exter Qual_TA,-2928.727535
Fireplaces_0,-2709.504401
Bsmt Full Bath_0.0,-2133.84029
BsmtFin Type 1_Unf,-1967.220876
Full Bath_1,-1777.395424
Functional_Maj1,-1540.562241
Garage Type_2Types,-1526.800903
Garage Cars_2.0,-1377.129463
Overall Cond_4,-1293.355916


In [11]:
# Show the most positive coefficients (ascending sort, last 15 rows).
coef_df.sort_values(by='Coefficient', ascending=True).tail(15)

Unnamed: 0,Coefficient
Half Bath_1,3435.078711
Full Bath_3,3497.982191
Neighborhood_NoRidge,3530.899065
Neighborhood_GrnHill,3621.962282
Neighborhood_StoneBr,3643.255654
Bsmt Exposure_Gd,3688.915281
Year Remod/Add,3829.146427
Bsmt Qual_Ex,4853.123874
Kitchen Qual_Ex,5098.736973
Lot Area,6157.746661


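These data frames show the features and their coefficients, i.e. their weight on sale price. Because the features were standardized, one way to interpret a coefficient is that a one-standard-deviation increase in that feature, with the other features held fixed, changes the predicted sale price by the value of the coefficient (in dollars).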

# Kaggle Submission

In [12]:
# Scale the submission features with the scaler fitted on the training split.
scaled_test = ss.transform(X=test)

In [13]:
# Predicted sale prices for the scaled submission rows.
data = ENCV.predict(X=scaled_test)

In [14]:
# One 'SalePrice' column of predictions, indexed like the submission features.
submission = pd.DataFrame(
    data=data,
    index=test.index,
    columns=['SalePrice'],
)

In [15]:
# Write the predictions to disk, labelling the index column 'Id'.
submission.to_csv(
    "../data/Submission_File",
    index_label='Id',
)

# Data For Presentation

In [16]:
# Work on a copy of the feature matrix. A plain `presentation_df = X` only
# creates a second name for the same DataFrame, so the two columns added
# below would also be added to X itself. The scaler would then see two extra
# columns and raise on the next run of this cell, and any cell that reads
# X.columns would pick up the extra labels.
presentation_df = X.copy()

# Scale with the scaler fitted on the training split, then predict.
presentation_df_fit = ss.transform(presentation_df)
presentation_df["Predictions"] = ENCV.predict(presentation_df_fit)

# Attach the actual target next to the predictions (aligned on the index).
presentation_df["SalePrice"] = y

# Persist the presentation table and the non-zero coefficient table.
presentation_df.to_csv('../data/Presentation_df')
coef_df.to_csv('../data/coef_df')