In [1]:
import pandas as pd
import numpy as np
from sklearn import linear_model
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from scipy import stats
from scipy.stats import zscore
import itertools
import re
import matplotlib.pyplot as plt
import seaborn as sns
import helper

In [2]:
# Render inline figures and saved figures at the same resolution (300 dpi).
plt.rcParams.update({'figure.dpi': 300, 'savefig.dpi': 300})
# Five-colour cycle repeated six times, giving a 30-entry palette list.
isu_pal = [
    "#FF0B04", "#9A3324", "#B9975B", "#707372", "#F1BE48",
] * 6
# Apply seaborn's default theme and the 'notebook' context, then replace the
# default colour cycle with the custom palette. set_palette resolves its
# argument through color_palette itself, so the list can be passed directly.
sns.set_theme()
sns.set_context('notebook')
sns.set_palette(isu_pal)

In [3]:
# Single ordinary-least-squares estimator shared by the modelling cells below.
# Each of those cells calls reg.fit(...) again, so `reg` only ever holds the
# coefficients of the most recently executed fit.
reg = linear_model.LinearRegression()

In [4]:
# Load the full data set; the first CSV column becomes the row index.
# low_memory=False makes pandas read the file in a single pass instead of in
# chunks, which avoids mixed-dtype inference on columns.
hous_all = pd.read_csv('all.csv', index_col=0, low_memory=False)

In [5]:
# Keep only the rows whose 'SaleCondition' is 'Normal' or 'Partial';
# every other sale condition is discarded.
hous_all = hous_all[hous_all['SaleCondition'].isin(['Normal', 'Partial'])]

In [6]:
# Sanity check on the filter above: only 'Normal' and 'Partial' should remain.
hous_all['SaleCondition'].value_counts()

Normal     2414
Partial      82
Name: SaleCondition, dtype: int64

In [7]:
# Perform the split using stratified sampling based on 'Neighborhood'.
# The result is unpacked into a training frame and a test frame; the split
# logic itself lives in the project-local `helper` module.
# NOTE(review): no seed or test-size argument is passed, so reproducibility and
# the split ratio depend on helper.stratified_split's defaults -- confirm.
# NOTE(review): the recorded shapes (1871 + 624 = 2495 rows) are one row short
# of the filtered frame's value counts (2414 + 82 = 2496) -- confirm whether
# the helper drops a row.
hous_trn, hous_tst = helper.stratified_split(hous_all, 'Neighborhood')

In [8]:
# Show (rows, columns) of the training and test frames, one per line.
print(hous_trn.shape, hous_tst.shape, sep='\n')

(1871, 81)
(624, 81)


In [9]:
# Add the natural log of 'GrLivArea' as a new feature column.
# assign() builds a new frame instead of writing into the object returned by
# helper.stratified_split, so the cell is idempotent and cannot trigger a
# SettingWithCopyWarning if that object is a slice of hous_all.
hous_trn = hous_trn.assign(LogGrLivArea=np.log(hous_trn['GrLivArea']))

In [10]:
# Response: log(SalePrice). Features: log(GrLivArea) + Neighborhood.
# Neighborhood is one-hot encoded with its first level dropped; the remaining
# column (LogGrLivArea) is passed through unchanged.
transformer = ColumnTransformer(
    transformers=[("Cat", OneHotEncoder(drop='first'), ['Neighborhood'])],
    remainder='passthrough',
)
X = transformer.fit_transform(hous_trn[['LogGrLivArea', 'Neighborhood']])
y = np.log(hous_trn['SalePrice'])
# fit() returns the estimator, so the cell still displays the R^2 of the
# shared `reg` model on the data it was just fitted to.
reg.fit(X, y).score(X, y)

0.7853806529862986

In [11]:
# Response: log(SalePrice). Features: raw GrLivArea + Neighborhood.
# Neighborhood is one-hot encoded with its first level dropped; GrLivArea is
# passed through unchanged.
transformer = ColumnTransformer(
    transformers=[("Cat", OneHotEncoder(drop='first'), ['Neighborhood'])],
    remainder='passthrough',
)
X = transformer.fit_transform(hous_trn[['GrLivArea', 'Neighborhood']])
y = np.log(hous_trn['SalePrice'])
# fit() returns the estimator, so the cell still displays the R^2 of the
# shared `reg` model on the data it was just fitted to.
reg.fit(X, y).score(X, y)

0.7818429791603757

In [12]:
# Response: SalePrice (untransformed). Features: raw GrLivArea + Neighborhood.
# Neighborhood is one-hot encoded with its first level dropped; GrLivArea is
# passed through unchanged.
transformer = ColumnTransformer(
    transformers=[("Cat", OneHotEncoder(drop='first'), ['Neighborhood'])],
    remainder='passthrough',
)
X = transformer.fit_transform(hous_trn[['GrLivArea', 'Neighborhood']])
y = hous_trn['SalePrice']
# fit() returns the estimator, so the cell still displays the R^2 of the
# shared `reg` model on the data it was just fitted to.
reg.fit(X, y).score(X, y)

0.7607642362334881

In [13]:
# Response: SalePrice / GrLivArea. Features: log(GrLivArea) + Neighborhood.
# Neighborhood is one-hot encoded with its first level dropped; LogGrLivArea
# is passed through unchanged.
transformer = ColumnTransformer(
    transformers=[("Cat", OneHotEncoder(drop='first'), ['Neighborhood'])],
    remainder='passthrough',
)
X = transformer.fit_transform(hous_trn[['LogGrLivArea', 'Neighborhood']])
y = hous_trn['SalePrice'].div(hous_trn['GrLivArea'])
# fit() returns the estimator, so the cell still displays the R^2 of the
# shared `reg` model on the data it was just fitted to.
reg.fit(X, y).score(X, y)

0.5153763380911464

In [14]:
# Response: SalePrice / GrLivArea. Features: raw GrLivArea + Neighborhood.
# Neighborhood is one-hot encoded with its first level dropped; GrLivArea is
# passed through unchanged.
transformer = ColumnTransformer(
    transformers=[("Cat", OneHotEncoder(drop='first'), ['Neighborhood'])],
    remainder='passthrough',
)
X = transformer.fit_transform(hous_trn[['GrLivArea', 'Neighborhood']])
y = hous_trn['SalePrice'].div(hous_trn['GrLivArea'])
# fit() returns the estimator, so the cell still displays the R^2 of the
# shared `reg` model on the data it was just fitted to.
reg.fit(X, y).score(X, y)

0.4977267693396702

In [15]:
# Response: SalePrice / GrLivArea. Feature: Neighborhood only.
# Neighborhood is one-hot encoded with its first level dropped. It is the only
# selected column, so remainder='passthrough' has nothing left to pass through.
transformer = ColumnTransformer(
    transformers=[("Cat", OneHotEncoder(drop='first'), ['Neighborhood'])],
    remainder='passthrough',
)
X = transformer.fit_transform(hous_trn[['Neighborhood']])
y = hous_trn['SalePrice'].div(hous_trn['GrLivArea'])
# fit() returns the estimator, so the cell still displays the R^2 of the
# shared `reg` model on the data it was just fitted to.
reg.fit(X, y).score(X, y)

0.39629197253798154

In [16]:
# Response: log(SalePrice / GrLivArea). Feature: Neighborhood only.
# Neighborhood is one-hot encoded with its first level dropped. It is the only
# selected column, so remainder='passthrough' has nothing left to pass through.
transformer = ColumnTransformer(
    transformers=[("Cat", OneHotEncoder(drop='first'), ['Neighborhood'])],
    remainder='passthrough',
)
X = transformer.fit_transform(hous_trn[['Neighborhood']])
y = np.log(hous_trn['SalePrice'].div(hous_trn['GrLivArea']))
# fit() returns the estimator, so the cell still displays the R^2 of the
# shared `reg` model on the data it was just fitted to.
reg.fit(X, y).score(X, y)

0.38508667411156094