In [1]:
# Generic inputs for most ML tasks
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Ridge
from sklearn.linear_model import Lasso
from sklearn.ensemble import RandomForestRegressor

# Render floats with thousands separators and two decimals in pandas output.
pd.options.display.float_format = '{:,.2f}'.format

# Display the value of every top-level expression in a cell, not only the last
# one; the cells below rely on this to show several results from a single cell.
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

from IPython.display import display, HTML

#### Read and pre-process data

In [2]:
# Load the raw house-price table; the path is relative to the working directory.
housing_data = pd.read_csv(filepath_or_buffer='house_price_subset_expanded.csv')

# Preview the first five rows.
housing_data.head(n=5)

Unnamed: 0,LotArea,OverallQual,OverallCond,GrLivArea,FullBath,BedroomAbvGr,GarageCars,SaleCondition,LotShape,GarageType,SalePrice
0,8450,7,5,1710,2,3,2,Normal,Reg,Attchd,208500
1,9600,6,8,1262,2,3,2,Normal,Reg,Attchd,181500
2,11250,7,5,1786,2,3,2,Normal,IR1,Attchd,223500
3,9550,7,5,1717,1,3,3,Abnorml,IR1,Detchd,140000
4,14260,8,5,2198,2,4,3,Normal,IR1,Attchd,250000


In [3]:
# Keep only the rows whose SaleCondition is 'Normal', then drop that column
# since it no longer carries information.

# Row count before filtering.
len(housing_data)

subset_data = (
    housing_data
    .loc[housing_data['SaleCondition'] == 'Normal']
    .drop(columns='SaleCondition')
)

subset_data.head(n=5)

# Row count after filtering.
len(subset_data)

1460

Unnamed: 0,LotArea,OverallQual,OverallCond,GrLivArea,FullBath,BedroomAbvGr,GarageCars,LotShape,GarageType,SalePrice
0,8450,7,5,1710,2,3,2,Reg,Attchd,208500
1,9600,6,8,1262,2,3,2,Reg,Attchd,181500
2,11250,7,5,1786,2,3,2,IR1,Attchd,223500
4,14260,8,5,2198,2,4,3,IR1,Attchd,250000
5,14115,5,5,1362,1,1,2,IR1,Attchd,143000


1198

In [4]:
# Number of missing values in each column.
subset_data.isnull().sum()

LotArea          0
OverallQual      0
OverallCond      0
GrLivArea        0
FullBath         0
BedroomAbvGr     0
GarageCars       0
LotShape         0
GarageType      61
SalePrice        0
dtype: int64

In [5]:
# Show the distinct levels of each categorical column.
for categorical_col in ('LotShape', 'GarageType'):
    print(set(subset_data[categorical_col]))

{'IR1', 'IR2', 'Reg', 'IR3'}
{nan, 'CarPort', 'Attchd', 'Basment', 'BuiltIn', 'Detchd', '2Types'}


In [6]:
# Collapse categorical levels:
#   * LotShape:   IR1 / IR2 / IR3            -> 'Irr'
#   * GarageType: BuiltIn / Basment / 2Types / CarPort and missing values -> 'Other'
#
# The results are assigned back to the frame rather than calling
# `subset_data[col].replace(..., inplace=True)`: that is an in-place method on
# an intermediate Series (chained assignment), which pandas warns about and
# which no longer updates the DataFrame once Copy-on-Write is active.
subset_data['LotShape'] = subset_data['LotShape'].replace(['IR1', 'IR2', 'IR3'], 'Irr')
subset_data['GarageType'] = (
    subset_data['GarageType']
    .replace(['BuiltIn', 'Basment', '2Types', 'CarPort'], 'Other')
    .fillna('Other')
)

# Confirm the remaining levels, then preview the frame.
print(set(subset_data['LotShape']))
print(set(subset_data['GarageType']))
subset_data.head(20)

{'Irr', 'Reg'}
{'Other', 'Attchd', 'Detchd'}


Unnamed: 0,LotArea,OverallQual,OverallCond,GrLivArea,FullBath,BedroomAbvGr,GarageCars,LotShape,GarageType,SalePrice
0,8450,7,5,1710,2,3,2,Reg,Attchd,208500
1,9600,6,8,1262,2,3,2,Reg,Attchd,181500
2,11250,7,5,1786,2,3,2,Irr,Attchd,223500
4,14260,8,5,2198,2,4,3,Irr,Attchd,250000
5,14115,5,5,1362,1,1,2,Irr,Attchd,143000
6,10084,8,5,1694,2,3,2,Reg,Attchd,307000
7,10382,7,6,2090,2,3,2,Irr,Attchd,200000
9,7420,5,6,1077,1,2,1,Reg,Attchd,118000
10,11200,5,5,1040,1,3,1,Reg,Detchd,129500
12,12968,5,6,912,1,2,1,Irr,Detchd,144000


In [7]:
# Engineered non-linear features. Flip the guard to False to skip them and
# model the raw columns only.
if True:
    subset_data = subset_data.assign(
        LogLotArea=np.log(subset_data['LotArea']),
        # NOTE(review): the name reads like a ratio ("Lot over Liv") but the
        # value is the product LotArea * GrLivArea — confirm which is intended.
        LotoverLiv=subset_data['LotArea'] * subset_data['GrLivArea'],
        OvQualSq=subset_data['OverallQual'] ** 2,
        OvCondsq=subset_data['OverallCond'] ** 2,
        OvCondcu=subset_data['OverallCond'] ** 3,
        BedAbGndsq=subset_data['BedroomAbvGr'] ** 2,
        GarageCarsSq=subset_data['GarageCars'] ** 2,
    )

In [8]:
# Demonstration only: the encoded frame is displayed but not stored, so
# subset_data still holds the original categorical columns afterwards.
# drop_first=True omits one level per categorical column.
pd.get_dummies(data=subset_data, drop_first=True)
subset_data.head(n=20)
# get_dummies is the simple route, but it has some downsides (read up on them).

Unnamed: 0,LotArea,OverallQual,OverallCond,GrLivArea,FullBath,BedroomAbvGr,GarageCars,SalePrice,LogLotArea,LotoverLiv,OvQualSq,OvCondsq,OvCondcu,BedAbGndsq,GarageCarsSq,LotShape_Reg,GarageType_Detchd,GarageType_Other
0,8450,7,5,1710,2,3,2,208500,9.04,14449500,49,25,125,9,4,1,0,0
1,9600,6,8,1262,2,3,2,181500,9.17,12115200,36,64,512,9,4,1,0,0
2,11250,7,5,1786,2,3,2,223500,9.33,20092500,49,25,125,9,4,0,0,0
4,14260,8,5,2198,2,4,3,250000,9.57,31343480,64,25,125,16,9,0,0,0
5,14115,5,5,1362,1,1,2,143000,9.55,19224630,25,25,125,1,4,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1455,7917,6,5,1647,2,3,2,175000,8.98,13039299,36,25,125,9,4,1,0,0
1456,13175,6,6,2073,2,3,2,210000,9.49,27311775,36,36,216,9,4,1,0,0
1457,9042,7,9,2340,2,4,1,266500,9.11,21158280,49,81,729,16,1,1,0,0
1458,9717,5,6,1078,1,2,1,142125,9.18,10474926,25,36,216,4,1,1,0,0


Unnamed: 0,LotArea,OverallQual,OverallCond,GrLivArea,FullBath,BedroomAbvGr,GarageCars,LotShape,GarageType,SalePrice,LogLotArea,LotoverLiv,OvQualSq,OvCondsq,OvCondcu,BedAbGndsq,GarageCarsSq
0,8450,7,5,1710,2,3,2,Reg,Attchd,208500,9.04,14449500,49,25,125,9,4
1,9600,6,8,1262,2,3,2,Reg,Attchd,181500,9.17,12115200,36,64,512,9,4
2,11250,7,5,1786,2,3,2,Irr,Attchd,223500,9.33,20092500,49,25,125,9,4
4,14260,8,5,2198,2,4,3,Irr,Attchd,250000,9.57,31343480,64,25,125,16,9
5,14115,5,5,1362,1,1,2,Irr,Attchd,143000,9.55,19224630,25,25,125,1,4
6,10084,8,5,1694,2,3,2,Reg,Attchd,307000,9.22,17082296,64,25,125,9,4
7,10382,7,6,2090,2,3,2,Irr,Attchd,200000,9.25,21698380,49,36,216,9,4
9,7420,5,6,1077,1,2,1,Reg,Attchd,118000,8.91,7991340,25,36,216,4,1
10,11200,5,5,1040,1,3,1,Reg,Detchd,129500,9.32,11648000,25,25,125,9,1
12,12968,5,6,912,1,2,1,Irr,Detchd,144000,9.47,11826816,25,36,216,4,1


In [9]:
from sklearn.preprocessing import OneHotEncoder

def get_ohe(df, col):
    """One-hot encode one categorical column of ``df`` with scikit-learn.

    The first category of ``col`` is dropped (``drop='first'``), so a column
    with k levels yields k-1 integer indicator columns.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing ``col``. It is not modified.
    col : str
        Name of the categorical column to encode.

    Returns
    -------
    pandas.DataFrame
        ``df`` without ``col`` and with its index reset to 0..n-1, followed by
        the indicator columns. Indicator names come from the encoder:
        ``get_feature_names_out()`` when the installed scikit-learn provides
        it, otherwise the legacy ``get_feature_names()``.
    """
    # `sparse` was renamed to `sparse_output` in scikit-learn 1.2 and removed
    # in 1.4; older releases reject the new keyword with a TypeError.
    try:
        ohe = OneHotEncoder(drop='first', handle_unknown='error', sparse_output=False, dtype='int')
    except TypeError:
        ohe = OneHotEncoder(drop='first', handle_unknown='error', sparse=False, dtype='int')

    encoded = ohe.fit_transform(df[[col]])

    # `get_feature_names` was removed in scikit-learn 1.2 in favour of
    # `get_feature_names_out`.
    if hasattr(ohe, 'get_feature_names_out'):
        feature_names = ohe.get_feature_names_out()
    else:
        feature_names = ohe.get_feature_names()
    temp_df = pd.DataFrame(data=encoded, columns=feature_names)

    # Drop on a copy so the caller's frame is left untouched, and reset the
    # index so rows line up positionally with the freshly built indicator frame.
    remaining = df.drop(columns=[col]).reset_index(drop=True)
    return pd.concat([remaining, temp_df], axis=1)

In [10]:
# One-hot encode each categorical column in turn, then preview the result.
for categorical_col in ['LotShape', 'GarageType']:
    subset_data = get_ohe(subset_data, categorical_col)
subset_data.head(n=20)



Unnamed: 0,LotArea,OverallQual,OverallCond,GrLivArea,FullBath,BedroomAbvGr,GarageCars,SalePrice,LogLotArea,LotoverLiv,OvQualSq,OvCondsq,OvCondcu,BedAbGndsq,GarageCarsSq,x0_Reg,x0_Detchd,x0_Other
0,8450,7,5,1710,2,3,2,208500,9.04,14449500,49,25,125,9,4,1,0,0
1,9600,6,8,1262,2,3,2,181500,9.17,12115200,36,64,512,9,4,1,0,0
2,11250,7,5,1786,2,3,2,223500,9.33,20092500,49,25,125,9,4,0,0,0
3,14260,8,5,2198,2,4,3,250000,9.57,31343480,64,25,125,16,9,0,0,0
4,14115,5,5,1362,1,1,2,143000,9.55,19224630,25,25,125,1,4,0,0,0
5,10084,8,5,1694,2,3,2,307000,9.22,17082296,64,25,125,9,4,1,0,0
6,10382,7,6,2090,2,3,2,200000,9.25,21698380,49,36,216,9,4,0,0,0
7,7420,5,6,1077,1,2,1,118000,8.91,7991340,25,36,216,4,1,1,0,0
8,11200,5,5,1040,1,3,1,129500,9.32,11648000,25,25,125,9,1,1,1,0
9,12968,5,6,912,1,2,1,144000,9.47,11826816,25,36,216,4,1,0,1,0


In [11]:
# Hold out 25% of the rows for testing. random_state is pinned (35) so the same
# train/test partition is reproduced on every run; omit it for a fresh random
# split each time.
# To repeat the run without categorical features, also drop the one-hot
# columns produced by get_ohe from the feature frame below.
X_train, X_test, y_train, y_test = train_test_split(
    subset_data.drop(columns='SalePrice'),
    subset_data['SalePrice'],
    test_size=0.25,
    random_state=35,
)

# Show all four pieces of the split.
X_train
X_test
y_train
y_test

Unnamed: 0,LotArea,OverallQual,OverallCond,GrLivArea,FullBath,BedroomAbvGr,GarageCars,LogLotArea,LotoverLiv,OvQualSq,OvCondsq,OvCondcu,BedAbGndsq,GarageCarsSq,x0_Reg,x0_Detchd,x0_Other
473,10500,5,7,1109,1,3,1,9.26,11644500,25,49,343,9,1,1,0,1
540,13560,6,3,1392,1,2,2,9.51,18875520,36,9,27,4,4,1,0,0
80,10921,4,5,960,1,3,1,9.30,10484160,16,25,125,9,1,1,0,0
156,7472,7,9,1479,1,4,2,8.92,11051088,49,81,729,16,4,0,0,0
630,53504,8,5,3279,3,4,3,10.89,175439616,64,25,125,16,9,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
744,9286,5,7,1268,1,3,1,9.14,11774648,25,49,343,9,1,0,1,0
959,6951,5,5,923,1,3,1,8.85,6415773,25,25,125,9,1,0,0,0
1004,7728,5,6,1190,1,3,2,8.95,9196320,25,36,216,9,4,1,0,0
1057,8400,5,5,1052,1,3,1,9.04,8836800,25,25,125,9,1,1,0,0


Unnamed: 0,LotArea,OverallQual,OverallCond,GrLivArea,FullBath,BedroomAbvGr,GarageCars,LogLotArea,LotoverLiv,OvQualSq,OvCondsq,OvCondcu,BedAbGndsq,GarageCarsSq,x0_Reg,x0_Detchd,x0_Other
1104,11643,5,5,2634,2,6,4,9.36,30667662,25,25,125,36,16,1,1,0
1179,7407,6,7,1236,1,2,2,8.91,9155052,36,49,343,4,4,1,0,0
794,2651,7,5,1382,2,3,2,7.88,3663682,49,25,125,9,4,1,1,0
131,5500,4,6,882,1,1,0,8.61,4851000,16,36,216,1,0,1,0,1
999,10482,6,8,1138,1,3,1,9.26,11928516,36,64,512,9,1,1,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
107,7388,5,6,1327,1,3,2,8.91,9803876,25,36,216,9,4,1,1,0
477,11841,6,5,816,1,3,0,9.38,9662256,36,25,125,9,0,1,0,1
1139,9100,7,5,1525,2,3,2,9.12,13877500,49,25,125,9,4,1,0,0
77,9337,6,5,1786,2,3,2,9.14,16675882,36,25,125,9,4,0,0,0


473     139000
540     110000
80       94750
156     184000
630     538000
         ...  
744     143500
959     119500
1004    132500
1057    138500
951     235000
Name: SalePrice, Length: 898, dtype: int64

1104    200000
1179    149700
794     165000
131     103200
999     145000
         ...  
107     150750
477     118500
1139    235000
77      204750
635     142500
Name: SalePrice, Length: 300, dtype: int64

In [12]:
# Ordinary least squares with an intercept term.
model = LinearRegression(fit_intercept=True)
model.fit(X=X_train, y=y_train)

# R-squared on the training data
model.score(X=X_train, y=y_train)

# Slope coefficients (Beta_1 onwards), one per column of X_train
model.coef_

# Intercept (Beta_0)
model.intercept_

LinearRegression()

0.8796773073762235

array([-4.16430963e+00, -3.49663419e+04,  4.53474380e+04,  3.34956147e+01,
        3.08512878e+02, -4.96621317e+03,  9.05651684e+03,  2.35651728e+04,
        2.18038822e-03,  4.59178118e+03, -4.77629060e+03,  1.48006590e+02,
       -7.20800195e+02,  2.15668063e+03, -6.63617543e+03, -1.48234472e+04,
       -8.36611463e+03])

-166650.4928382334

In [1]:
# Predict on the held-out rows, keeping X_test's index on the result so each
# prediction stays tied to its original row label.
test_output = pd.DataFrame({'pred_SalePrice': model.predict(X_test)}, index=X_test.index)
test_output.head(n=5)

NameError: name 'pd' is not defined

In [14]:
# Attach the true prices to the predictions by index. Only the prediction
# column is taken from test_output so this cell can be re-run safely: merging
# the already-merged frame again would produce SalePrice_x / SalePrice_y and
# the 'SalePrice' lookup below would raise a KeyError.
test_output = test_output[['pred_SalePrice']].merge(y_test, left_index=True, right_index=True)
test_output.head()

# Mean absolute error between predicted and actual sale price on the test split.
mean_absolute_error = (test_output['pred_SalePrice'] - test_output['SalePrice']).abs().mean()
print('Mean absolute error is ')
print(mean_absolute_error)

Unnamed: 0,pred_SalePrice,SalePrice
1104,220525.15,200000
1179,171099.19,149700
794,152921.07,165000
131,98851.06,103200
999,143860.64,145000


Mean absolute error is 
19654.589712322086


#### Visualize data

In [15]:
# Visualization imports and plotly notebook initialisation (no function is
# defined in this cell).
import plotly
plotly.offline.init_notebook_mode(connected=True)
# NOTE(review): the star import and `tools` do not appear to be used by the
# cells visible here — confirm before relying on or removing them.
from plotly.graph_objs import *
from plotly import tools
import plotly.graph_objects as go
import seaborn as sns

In [16]:
# Per-feature diagnostic plots: actual and predicted SalePrice against each
# feature, for both the training and the test split. Disabled by default; set
# the guard to True to render one plotly figure per column of X_train.
if False:
    # The predictions do not depend on which feature is on the x-axis, so
    # compute them once instead of once per column.
    train_pred = model.predict(X_train)
    test_pred = model.predict(X_test)

    for col in X_train.columns:
        # The stray hover label text="country" (a leftover from another
        # dataset) has been removed from the first trace.
        plot_data = [
            go.Scatter(x=X_train[col], y=y_train, name='Train data actual', mode='markers'),
            go.Scatter(x=X_train[col], y=train_pred, name='Train data predicted', mode='markers'),
            go.Scatter(x=X_test[col], y=y_test, name='Test data actual', mode='markers'),
            go.Scatter(x=X_test[col], y=test_pred, name='Test data predicted', mode='markers'),
        ]

        layout = go.Layout(xaxis=dict(title=col), yaxis=dict(title='SalePrice'),
                           title='Plot of predicted and actual')
        fig = go.Figure(data=plot_data, layout=layout)
        plotly.offline.iplot(fig)


In [17]:
# Mean absolute error expressed as a fraction of the mean actual sale price on
# the test split.
abs_error = (test_output['pred_SalePrice'] - test_output['SalePrice']).abs()
mean_absolute_error_ratio = abs_error.mean() / test_output['SalePrice'].mean()
print('Mean absolute error ratio is ')
print(mean_absolute_error_ratio)

# Ratios recorded from earlier runs:
#   0.145  no non-linear features, no categorical features
#   0.139  no non-linear features, with categorical features
#   0.125  non-linear features, no categorical features

# R-squared on the test split
model.score(X=X_test, y=y_test)

Mean absolute error ratio is 
0.1164919589373251


0.831982273718285