In [1]:
# Loading Data and Packages
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn import tree
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression, Lasso, LogisticRegression, ElasticNet, Ridge
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import StandardScaler
import pickle

In [2]:
# Reading and viewing CVSs
train = pd.read_csv("Resources/train.csv")
train

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
0,1,60,RL,65.0,8450,Pave,,Reg,Lvl,AllPub,...,0,,,,0,2,2008,WD,Normal,208500
1,2,20,RL,80.0,9600,Pave,,Reg,Lvl,AllPub,...,0,,,,0,5,2007,WD,Normal,181500
2,3,60,RL,68.0,11250,Pave,,IR1,Lvl,AllPub,...,0,,,,0,9,2008,WD,Normal,223500
3,4,70,RL,60.0,9550,Pave,,IR1,Lvl,AllPub,...,0,,,,0,2,2006,WD,Abnorml,140000
4,5,60,RL,84.0,14260,Pave,,IR1,Lvl,AllPub,...,0,,,,0,12,2008,WD,Normal,250000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1455,1456,60,RL,62.0,7917,Pave,,Reg,Lvl,AllPub,...,0,,,,0,8,2007,WD,Normal,175000
1456,1457,20,RL,85.0,13175,Pave,,Reg,Lvl,AllPub,...,0,,MnPrv,,0,2,2010,WD,Normal,210000
1457,1458,70,RL,66.0,9042,Pave,,Reg,Lvl,AllPub,...,0,,GdPrv,Shed,2500,5,2010,WD,Normal,266500
1458,1459,20,RL,68.0,9717,Pave,,Reg,Lvl,AllPub,...,0,,,,0,4,2010,WD,Normal,142125


In [3]:
# Dropping 'Id' column
train.drop("Id", axis = 1, inplace = True)
train

Unnamed: 0,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,LotConfig,...,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
0,60,RL,65.0,8450,Pave,,Reg,Lvl,AllPub,Inside,...,0,,,,0,2,2008,WD,Normal,208500
1,20,RL,80.0,9600,Pave,,Reg,Lvl,AllPub,FR2,...,0,,,,0,5,2007,WD,Normal,181500
2,60,RL,68.0,11250,Pave,,IR1,Lvl,AllPub,Inside,...,0,,,,0,9,2008,WD,Normal,223500
3,70,RL,60.0,9550,Pave,,IR1,Lvl,AllPub,Corner,...,0,,,,0,2,2006,WD,Abnorml,140000
4,60,RL,84.0,14260,Pave,,IR1,Lvl,AllPub,FR2,...,0,,,,0,12,2008,WD,Normal,250000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1455,60,RL,62.0,7917,Pave,,Reg,Lvl,AllPub,Inside,...,0,,,,0,8,2007,WD,Normal,175000
1456,20,RL,85.0,13175,Pave,,Reg,Lvl,AllPub,Inside,...,0,,MnPrv,,0,2,2010,WD,Normal,210000
1457,70,RL,66.0,9042,Pave,,Reg,Lvl,AllPub,Inside,...,0,,GdPrv,Shed,2500,5,2010,WD,Normal,266500
1458,20,RL,68.0,9717,Pave,,Reg,Lvl,AllPub,Inside,...,0,,,,0,4,2010,WD,Normal,142125


In [4]:
# Cleaning up data (string columns with None and integer columns with 0)
train["PoolQC"] = train["PoolQC"].fillna("None")
train["MiscFeature"] = train["MiscFeature"].fillna("None")
train["Alley"] = train["Alley"].fillna("None")
train["Fence"] = train["Fence"].fillna("None")
train["FireplaceQu"] = train["FireplaceQu"].fillna("None")
train["LotFrontage"] = train.groupby("Neighborhood")["LotFrontage"].transform(lambda x: x.fillna(x.median()))
for col in ('GarageType', 'GarageFinish', 'GarageQual', 'GarageCond'):
    train[col] = train[col].fillna('None')
for col in ('GarageYrBlt', 'GarageArea', 'GarageCars'):
    train[col] = train[col].fillna(0)
for col in ('BsmtFinSF1', 'BsmtFinSF2', 'BsmtUnfSF','TotalBsmtSF', 'BsmtFullBath', 'BsmtHalfBath'):
    train[col] = train[col].fillna(0)
for col in ('BsmtQual', 'BsmtCond', 'BsmtExposure', 'BsmtFinType1', 'BsmtFinType2'):
    train[col] = train[col].fillna('None')
train["MasVnrType"] = train["MasVnrType"].fillna("None")
train["MasVnrArea"] = train["MasVnrArea"].fillna(0)
train['MSZoning'] = train['MSZoning'].fillna(train['MSZoning'].mode()[0])
train = train.drop(['Utilities'], axis=1)
train["Functional"] = train["Functional"].fillna("Typ")
train['Electrical'] = train['Electrical'].fillna(train['Electrical'].mode()[0])
train['KitchenQual'] = train['KitchenQual'].fillna(train['KitchenQual'].mode()[0])
train['Exterior1st'] = train['Exterior1st'].fillna(train['Exterior1st'].mode()[0])
train['Exterior2nd'] = train['Exterior2nd'].fillna(train['Exterior2nd'].mode()[0])
train['SaleType'] = train['SaleType'].fillna(train['SaleType'].mode()[0])
train['MSSubClass'] = train['MSSubClass'].fillna("None")
train

Unnamed: 0,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,LotConfig,LandSlope,...,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
0,60,RL,65.0,8450,Pave,,Reg,Lvl,Inside,Gtl,...,0,,,,0,2,2008,WD,Normal,208500
1,20,RL,80.0,9600,Pave,,Reg,Lvl,FR2,Gtl,...,0,,,,0,5,2007,WD,Normal,181500
2,60,RL,68.0,11250,Pave,,IR1,Lvl,Inside,Gtl,...,0,,,,0,9,2008,WD,Normal,223500
3,70,RL,60.0,9550,Pave,,IR1,Lvl,Corner,Gtl,...,0,,,,0,2,2006,WD,Abnorml,140000
4,60,RL,84.0,14260,Pave,,IR1,Lvl,FR2,Gtl,...,0,,,,0,12,2008,WD,Normal,250000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1455,60,RL,62.0,7917,Pave,,Reg,Lvl,Inside,Gtl,...,0,,,,0,8,2007,WD,Normal,175000
1456,20,RL,85.0,13175,Pave,,Reg,Lvl,Inside,Gtl,...,0,,MnPrv,,0,2,2010,WD,Normal,210000
1457,70,RL,66.0,9042,Pave,,Reg,Lvl,Inside,Gtl,...,0,,GdPrv,Shed,2500,5,2010,WD,Normal,266500
1458,20,RL,68.0,9717,Pave,,Reg,Lvl,Inside,Gtl,...,0,,,,0,4,2010,WD,Normal,142125


In [5]:
# Getting dummies for string based columns
train = pd.get_dummies(train)
train

Unnamed: 0,MSSubClass,LotFrontage,LotArea,OverallQual,OverallCond,YearBuilt,YearRemodAdd,MasVnrArea,BsmtFinSF1,BsmtFinSF2,...,SaleType_ConLw,SaleType_New,SaleType_Oth,SaleType_WD,SaleCondition_Abnorml,SaleCondition_AdjLand,SaleCondition_Alloca,SaleCondition_Family,SaleCondition_Normal,SaleCondition_Partial
0,60,65.0,8450,7,5,2003,2003,196.0,706,0,...,0,0,0,1,0,0,0,0,1,0
1,20,80.0,9600,6,8,1976,1976,0.0,978,0,...,0,0,0,1,0,0,0,0,1,0
2,60,68.0,11250,7,5,2001,2002,162.0,486,0,...,0,0,0,1,0,0,0,0,1,0
3,70,60.0,9550,7,5,1915,1970,0.0,216,0,...,0,0,0,1,1,0,0,0,0,0
4,60,84.0,14260,8,5,2000,2000,350.0,655,0,...,0,0,0,1,0,0,0,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1455,60,62.0,7917,6,5,1999,2000,0.0,0,0,...,0,0,0,1,0,0,0,0,1,0
1456,20,85.0,13175,6,6,1978,1988,119.0,790,163,...,0,0,0,1,0,0,0,0,1,0
1457,70,66.0,9042,7,9,1941,2006,0.0,275,0,...,0,0,0,1,0,0,0,0,1,0
1458,20,68.0,9717,5,6,1950,1996,0.0,49,1029,...,0,0,0,1,0,0,0,0,1,0


In [6]:
# Setting up x(target) and y(data) values
# Independent variable (dataframe)
x = train.drop(columns=['SalePrice'])
# Dependent variable (series)
y = train['SalePrice'].values.reshape(-1,1)
print(x.shape, y.shape)

(1460, 300) (1460, 1)


In [7]:
# Splitting data into test and train
x_train, x_test, y_train, y_test = train_test_split(x, y, random_state=42)

In [8]:
# Creating scaler
x_scaler = StandardScaler().fit(x_train)
y_scaler = StandardScaler().fit(y_train)

In [9]:
# Transforming with scaler
x_train_scaled = x_scaler.transform(x_train)
x_test_scaled = x_scaler.transform(x_test)
y_train_scaled = y_scaler.transform(y_train)
y_test_scaled = y_scaler.transform(y_test)

In [10]:
# Creating empty regular linear regression model
regressor = LinearRegression()
# Training model
regressor.fit(x_train_scaled, y_train_scaled)
# Making predictions on test portion
reg_predictions = regressor.predict(x_test_scaled)
reg_MSE = mean_squared_error(y_test_scaled, reg_predictions)
r2_reg = regressor.score(x_test_scaled, y_test_scaled)
print(f"Regular Mean Squared Error: {reg_MSE}, Regular Linear Score: {r2_reg}")

Regular Mean Squared Error: 6.468112512446369e+25, Regular Linear Score: -5.605861201088467e+25


In [11]:
# Training lasso linear regression model
lasso = Lasso(alpha=.01).fit(x_train_scaled, y_train_scaled)
# Making predictions on test portion
predictions = lasso.predict(x_test_scaled)
MSE = mean_squared_error(y_test_scaled, predictions)
r2_las = lasso.score(x_test_scaled, y_test_scaled)
print(f"Lasso Mean Squared Error: {MSE}, Lasso Score: {r2_las}")

Lasso Mean Squared Error: 0.11198392620528083, Lasso Score: 0.9029444299474153


In [12]:
# Creating empty random forest regression model
rf = RandomForestRegressor(n_estimators=200)
# Training random forest regression model
rf = rf.fit(x_train, y_train)
# Making predictions on test portion
rf_predictions = rf.predict(x_test_scaled)
rf_MSE = mean_squared_error(y_test_scaled, rf_predictions)
r2_rf = rf.score(x_test, y_test)
print(f"Random Forest Mean Squared Error: {rf_MSE}, Random Forest Score: {r2_rf}")

  after removing the cwd from sys.path.


Random Forest Mean Squared Error: 4093169485.868983, Random Forest Score: 0.8924218142052752


In [13]:
# Training ridge linear regression model
ridge = Ridge(alpha=.01).fit(x_train_scaled, y_train_scaled)
# Making predictions on test portion
ridge_predictions = ridge.predict(x_test_scaled)
ridge_MSE = mean_squared_error(y_test_scaled, ridge_predictions)
r2_ridge = ridge.score(x_test_scaled, y_test_scaled)
print(f"Ridge Mean Squared Error: {ridge_MSE}, Ridge Score: {r2_ridge}")

Ridge Mean Squared Error: 0.5314679747086739, Ridge Score: 0.5393809719129983


In [14]:
# Training elastic net linear regression model
en = ElasticNet(alpha=.01).fit(x_train_scaled, y_train_scaled)
# Making predictions on test portion
en_predictions = en.predict(x_test_scaled)
en_MSE = mean_squared_error(y_test_scaled, en_predictions)
r2_en = en.score(x_test_scaled, y_test_scaled)
print(f"Elastic Net Mean Squared Error: {en_MSE}, Elastic Net Score: {r2_en}")

Elastic Net Mean Squared Error: 0.1119869652540367, Elastic Net Score: 0.9029417960282502


In [15]:
# Lasso is slightly better than Elastic Net so it will be chosen to continue
# Inversing scaling on y
predictions = y_scaler.inverse_transform(predictions)
predictions

array([148393.82114147, 347809.50100843,  96722.26069262, 168137.67632608,
       329083.88596242,  67212.12802028, 226046.74134092, 140023.60195975,
        60461.31903823, 138348.18087433, 142482.99110059, 109299.2212298 ,
        95114.2207917 , 214841.26135097, 174047.67141066, 128022.88550453,
       193488.17486018, 131858.6511305 , 123289.00161286, 214300.72524759,
       155905.43474898, 206592.38445656, 175419.5410394 , 123297.71182395,
       202508.86968095, 147352.55681164, 199406.39971571,  98957.15865461,
       173769.56711842, 217498.08088464, 121658.18049414, 263704.22841   ,
       238749.59942329, 105611.63381192, 235079.02149958, 152845.10313446,
       135449.76910182, 204194.30075427, 310659.57544884, 106673.89209417,
       127786.72370494, 228463.76731429, 110738.5104073 , 367803.1054551 ,
       129891.04360621, 154516.44024207,  98975.93373136, 131300.0614023 ,
       411969.05066003, 130875.62950369, 113503.73114918, 235463.10863629,
       114939.64567602, 2

In [16]:
# Reducing y_test from list of lists to just a list
y_test_ravel = np.ravel(y_test)

In [17]:
# Turning raveled predictions and y_test_scaled into a dataframe to evaluate
preds_tv = pd.DataFrame({"predictions": predictions,"true values": y_test_ravel})
preds_tv

Unnamed: 0,predictions,true values
0,148393.821141,154500
1,347809.501008,325000
2,96722.260693,115000
3,168137.676326,159000
4,329083.885962,315500
...,...,...
360,190630.703907,195000
361,118948.091413,120000
362,236239.641635,228500
363,259203.004069,248000


In [18]:
# Labeling index to match columns in tableau
preds_tv.rename_axis(index='Id', inplace=True)
preds_tv

Unnamed: 0_level_0,predictions,true values
Id,Unnamed: 1_level_1,Unnamed: 2_level_1
0,148393.821141,154500
1,347809.501008,325000
2,96722.260693,115000
3,168137.676326,159000
4,329083.885962,315500
...,...,...
360,190630.703907,195000
361,118948.091413,120000
362,236239.641635,228500
363,259203.004069,248000


In [19]:
# Saving to csv for tableau
preds_tv.to_csv('Output/predictions_vs_true_values.csv', index = True)

In [20]:
# Listing variables and their scores in descending order
lasso_scores = sorted(list(zip(x_train, lasso.coef_)),reverse=True)
lasso_scores

[('YrSold', 0.0),
 ('YearRemodAdd', 0.030861170087827718),
 ('YearBuilt', 0.08934640382350317),
 ('WoodDeckSF', 0.019226227857973687),
 ('TotalBsmtSF', 0.08340174171828263),
 ('TotRmsAbvGrd', 0.0006513975506880858),
 ('Street_Pave', 1.6222436889500004e-18),
 ('Street_Grvl', -0.0009838212814519054),
 ('ScreenPorch', 0.01777972329932037),
 ('SaleType_WD', -0.0),
 ('SaleType_Oth', 0.0),
 ('SaleType_New', 0.0),
 ('SaleType_ConLw', -0.0),
 ('SaleType_ConLI', -0.0),
 ('SaleType_ConLD', 0.0),
 ('SaleType_Con', 0.006325406239965981),
 ('SaleType_CWD', 0.0),
 ('SaleType_COD', -0.0),
 ('SaleCondition_Partial', 0.04970214517250172),
 ('SaleCondition_Normal', 0.0),
 ('SaleCondition_Family', -0.0032235401222167038),
 ('SaleCondition_Alloca', 0.013086206907631412),
 ('SaleCondition_AdjLand', 0.0),
 ('SaleCondition_Abnorml', -0.014776873214665464),
 ('RoofStyle_Shed', 0.0),
 ('RoofStyle_Mansard', 0.00035809738817386174),
 ('RoofStyle_Hip', 0.0),
 ('RoofStyle_Gambrel', 0.0),
 ('RoofStyle_Gable', -0.00

In [21]:
# Turning variables and their scores into a dataframe to evaluate
var_df = pd.DataFrame(lasso_scores)
var_df

Unnamed: 0,0,1
0,YrSold,0.000000
1,YearRemodAdd,0.030861
2,YearBuilt,0.089346
3,WoodDeckSF,0.019226
4,TotalBsmtSF,0.083402
...,...,...
295,Alley_None,0.000000
296,Alley_Grvl,-0.000000
297,3SsnPorch,0.005350
298,2ndFlrSF,0.000000


In [22]:
# Renaming columns
var_df.columns=["variables","coefficients"]

In [23]:
var_df

Unnamed: 0,variables,coefficients
0,YrSold,0.000000
1,YearRemodAdd,0.030861
2,YearBuilt,0.089346
3,WoodDeckSF,0.019226
4,TotalBsmtSF,0.083402
...,...,...
295,Alley_None,0.000000
296,Alley_Grvl,-0.000000
297,3SsnPorch,0.005350
298,2ndFlrSF,0.000000


In [24]:
# Taking absolute value of coefficients
var_df['coefficients'] = abs(var_df['coefficients'])
var_df

Unnamed: 0,variables,coefficients
0,YrSold,0.000000
1,YearRemodAdd,0.030861
2,YearBuilt,0.089346
3,WoodDeckSF,0.019226
4,TotalBsmtSF,0.083402
...,...,...
295,Alley_None,0.000000
296,Alley_Grvl,0.000000
297,3SsnPorch,0.005350
298,2ndFlrSF,0.000000


In [25]:
# Sorting absolute value of coefficients
lasso_sorted = var_df.sort_values(by="coefficients",ascending=False)
top_10 = lasso_sorted.head(10)
top_10
# GrLivArea: living area square feet (above ground)
# RoofMatl_ClyTile: Clay or Tile roof material
# OverallQual: Rating of overall material and finish of the house
# Condition2_PosN: Near to positive off-site feature (park, greenbelt, etc.)
# BsmtQual_Ex: height of the basement (100+ inches)
# YearBuilt: Original construction date
# TotalBsmtSF: Total square feet of basement area
# BsmtFinSF1: Rating of basement finished area's squared feet
# KitchenQual_Ex: Excellent Kitchen quality
# Neighborhood_NoRidge: Northridge neighborhood

Unnamed: 0,variables,coefficients
136,GrLivArea,0.339096
37,RoofMatl_ClyTile,0.211141
46,OverallQual,0.141465
238,Condition2_PosN,0.10181
259,BsmtQual_Ex,0.090708
2,YearBuilt,0.089346
4,TotalBsmtSF,0.083402
277,BsmtFinSF1,0.082735
114,KitchenQual_Ex,0.079694
58,Neighborhood_NoRidge,0.070848


In [26]:
# Labeling index to match columns in tableau
lasso_sorted.rename_axis(index='Id', inplace=True)
lasso_sorted

Unnamed: 0_level_0,variables,coefficients
Id,Unnamed: 1_level_1,Unnamed: 2_level_1
136,GrLivArea,0.339096
37,RoofMatl_ClyTile,0.211141
46,OverallQual,0.141465
238,Condition2_PosN,0.101810
259,BsmtQual_Ex,0.090708
...,...,...
135,HalfBath,0.000000
137,GarageYrBlt,0.000000
139,GarageType_Detchd,0.000000
140,GarageType_CarPort,0.000000


In [27]:
# Saving to csv for tableau
lasso_sorted.to_csv('Output/variables.csv', index = True)

In [28]:
# Retraining from original data but with top 3 variables only
x_train, x_test, y_train, y_test = train_test_split(x, y, random_state=42)
x_train_final = x_train[['GrLivArea', 'OverallQual', 'YearBuilt']]
x_test_final = x_test[['GrLivArea', 'OverallQual', 'YearBuilt']]
# Creating scaler
x_scaler_final = StandardScaler().fit(x_train_final)
# Transforming with scaler
x_train_final_scaled = x_scaler_final.transform(x_train_final)
x_test_final_scaled = x_scaler_final.transform(x_test_final)

In [29]:
# Lasso is slightly better than Elastic Net so it will be chosen to continue
x_train_2 = pd.DataFrame(x_train_scaled)
x_test_2 = pd.DataFrame(x_test_scaled)

In [30]:
# Narrowing down to top 3 variables for user
# Training lasso linear regression model
lasso2 = Lasso(alpha=.01).fit(x_train_final, y_train_scaled)
# Making predictions on test portion
predictions2 = lasso2.predict(x_test_final)
MSE2 = mean_squared_error(y_test_scaled, predictions2)
r2_las2 = lasso2.score(x_test_final, y_test_scaled)
print(f"Lasso Mean Squared Error: {MSE2}, Lasso Score: {r2_las2}")

Lasso Mean Squared Error: 0.2787217461529849, Lasso Score: 0.7584340996462217


In [31]:
# Saving model to disk
pickle.dump(lasso2, open('model.pkl','wb'))
pickle.dump(x_scaler, open('x_scaler.pkl', 'wb'))
pickle.dump(y_scaler, open('y_scaler.pkl', 'wb'))