In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import random

%matplotlib inline

In [2]:
def str_to_seed(s):
    """Turn a string into an integer seed.

    The seed is the sum of the Unicode code points of every character in
    ``s``; an empty string yields ``0``.
    """
    return sum(ord(ch) for ch in s)

In [3]:
# Derive one deterministic seed from the project name and hand it to both
# random number sources used here: NumPy's global generator and the stdlib one.
seed = str_to_seed('AMES Housing')
for seed_rng in (np.random.seed, random.seed):
    seed_rng(seed)

## Importing Data

In [4]:
# Read the raw housing table from the local dataset folder and preview it.
raw_csv_path = './dataset/data.csv'
data_df = pd.read_csv(raw_csv_path)
data_df.head()

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
0,1,60,RL,65.0,8450,Pave,,Reg,Lvl,AllPub,...,0,,,,0,2,2008,WD,Normal,208500
1,2,20,RL,80.0,9600,Pave,,Reg,Lvl,AllPub,...,0,,,,0,5,2007,WD,Normal,181500
2,3,60,RL,68.0,11250,Pave,,IR1,Lvl,AllPub,...,0,,,,0,9,2008,WD,Normal,223500
3,4,70,RL,60.0,9550,Pave,,IR1,Lvl,AllPub,...,0,,,,0,2,2006,WD,Abnorml,140000
4,5,60,RL,84.0,14260,Pave,,IR1,Lvl,AllPub,...,0,,,,0,12,2008,WD,Normal,250000


In [5]:
# Disabled on purpose: uncomment the next line to print the text file that
# ships alongside the CSV in ./dataset (shell command, needs `cat`).
#!cat ./dataset/data_description.txt

## Null Handling & Splitting

In [6]:
# Remove the `Id` column, which is only a row identifier.
# `errors='ignore'` makes the cell safe to re-run: once `Id` is gone the drop
# is a no-op. The previous fallback branch referenced an undefined `train_df`
# and raised NameError on a second run.
data_df = data_df.drop(columns=['Id'], errors='ignore')
len(data_df)

1460

In [7]:
# Count missing values per column once, then show only the columns that have any.
null_counts = data_df.isna().sum()
null_counts[null_counts > 0]

LotFrontage      259
Alley           1369
MasVnrType         8
MasVnrArea         8
BsmtQual          37
BsmtCond          37
BsmtExposure      38
BsmtFinType1      37
BsmtFinType2      38
Electrical         1
FireplaceQu      690
GarageType        81
GarageYrBlt       81
GarageFinish      81
GarageQual        81
GarageCond        81
PoolQC          1453
Fence           1179
MiscFeature     1406
dtype: int64

Let's get rid of the _very_ null-ridden columns.

In [8]:
# Any column with more than `na_thresh` missing values is removed entirely.
na_thresh = 1000
nulls_per_column = data_df.isna().sum()
too_sparse = nulls_per_column > na_thresh
data_df = data_df.drop(list(nulls_per_column[too_sparse].index), axis=1)

Let's drop the rows with nulls in `LotFrontage` and `GarageYrBlt` (the nulls for the other `Garage*` columns are likely to be in the same rows), since those are the most reasonable to drop. I'm not really concerned with the 8, 37–38, and 1 nulls from `MasVnr*`, `Bsmt*`, and `Electrical`, respectively.

In [9]:
# Keep only the rows where both of these columns are present, then report
# how many rows survive.
required_columns = ['LotFrontage', 'GarageYrBlt']
data_df = data_df.dropna(subset=required_columns)
data_df.shape[0]

1127

We're now down from 1460 rows to 1127. That's not looking too good, so we should re-evaluate this step if our results aren't satisfactory.

In [10]:
# Separate the prediction target from the feature columns.
target_column = 'SalePrice'
y = data_df[target_column]
X = data_df.drop(columns=[target_column])

In [11]:
from sklearn.model_selection import train_test_split

# Hold out roughly a third of the rows for evaluation.
# NOTE(review): no `random_state` is passed here, so the split presumably
# depends on the NumPy global seed set earlier in the notebook — confirm.
test_fraction = 0.33
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=test_fraction,
)

## Basic heuristics

Taking a look at things correlated with sale price

In [12]:
# Re-attach the target to the training features (aligned on the row index)
# so that correlations with the sale price can be computed in one frame.
train_all = X_train.assign(SalePrice=y_train)
train_all

Unnamed: 0,MSSubClass,MSZoning,LotFrontage,LotArea,Street,LotShape,LandContour,Utilities,LotConfig,LandSlope,...,EnclosedPorch,3SsnPorch,ScreenPorch,PoolArea,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
704,20,RL,70.0,8400,Pave,Reg,Lvl,AllPub,Inside,Gtl,...,0,144,0,0,0,5,2010,WD,Normal,213000
394,50,RL,60.0,10134,Pave,Reg,Lvl,AllPub,Inside,Gtl,...,0,0,0,0,0,7,2007,WD,Normal,109000
9,190,RL,50.0,7420,Pave,Reg,Lvl,AllPub,Corner,Gtl,...,0,0,0,0,0,1,2008,WD,Normal,118000
831,160,FV,30.0,3180,Pave,Reg,Lvl,AllPub,Inside,Gtl,...,0,0,0,0,0,6,2006,WD,Normal,151000
79,50,RM,60.0,10440,Pave,Reg,Lvl,AllPub,Corner,Gtl,...,128,0,0,0,0,5,2009,WD,Normal,110000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
324,80,RL,96.0,11275,Pave,Reg,Lvl,AllPub,Corner,Gtl,...,0,0,0,0,0,6,2010,WD,Normal,242000
1275,90,RL,95.0,11345,Pave,Reg,Lvl,AllPub,Corner,Gtl,...,0,0,0,0,0,7,2007,WD,Normal,137000
141,20,RL,78.0,11645,Pave,Reg,Lvl,AllPub,Inside,Gtl,...,0,0,0,0,0,1,2006,WD,Normal,260000
1010,50,RL,115.0,21286,Pave,Reg,Lvl,AllPub,Inside,Gtl,...,108,0,0,0,0,8,2008,WD,Normal,135000


In [13]:
# Correlation of each numeric feature with the sale price (first few rows).
# The numeric columns are selected explicitly because `DataFrame.corr()` on a
# frame that still contains string columns raises on pandas >= 2.0 instead of
# silently skipping them.
train_all.select_dtypes(include='number').corr()['SalePrice'].head()

MSSubClass    -0.112697
LotFrontage    0.310951
LotArea        0.306409
OverallQual    0.794528
OverallCond   -0.141200
Name: SalePrice, dtype: float64

Let's find the features whose correlation with `SalePrice` is stronger than (roughly) the median correlation.

In [14]:
# Median correlation with the sale price across the numeric columns.
# Numeric columns are selected explicitly so this also runs on pandas >= 2.0,
# where `corr()` no longer skips string columns automatically.
train_all.select_dtypes(include='number').corr()['SalePrice'].median()

0.3064086820379551

In [15]:
# Compute the correlation with the sale price once, on numeric columns only:
# `corr()` raises on string columns in pandas >= 2.0, and the matrix was
# previously recomputed three times in this cell.
sale_price_corr = train_all.select_dtypes(include='number').corr()['SalePrice']

# Threshold = median correlation lowered by an absolute margin of 0.1
# (an offset in correlation units, not a relative ten percent).
correlation_threshold = sale_price_corr.median() - 0.1

# Keep every feature whose absolute correlation exceeds the threshold, so
# strongly negative correlations qualify too.
correlated_features = list(sale_price_corr[sale_price_corr.abs() > correlation_threshold].index)
# The target always correlates perfectly with itself; it is not a feature.
correlated_features.remove('SalePrice')

correlated_features

['LotFrontage',
 'LotArea',
 'OverallQual',
 'YearBuilt',
 'YearRemodAdd',
 'MasVnrArea',
 'BsmtFinSF1',
 'TotalBsmtSF',
 '1stFlrSF',
 '2ndFlrSF',
 'GrLivArea',
 'BsmtFullBath',
 'FullBath',
 'HalfBath',
 'TotRmsAbvGrd',
 'Fireplaces',
 'GarageYrBlt',
 'GarageCars',
 'GarageArea',
 'WoodDeckSF',
 'OpenPorchSF']

Now we create a new X for test and train with our selected correlated features

In [16]:
# Restrict both splits to the columns chosen by the correlation filter.
X_train_select, X_test_select = (
    split.loc[:, correlated_features] for split in (X_train, X_test)
)

X_train_select

Unnamed: 0,LotFrontage,LotArea,OverallQual,YearBuilt,YearRemodAdd,MasVnrArea,BsmtFinSF1,TotalBsmtSF,1stFlrSF,2ndFlrSF,...,BsmtFullBath,FullBath,HalfBath,TotRmsAbvGrd,Fireplaces,GarageYrBlt,GarageCars,GarageArea,WoodDeckSF,OpenPorchSF
704,70.0,8400,7,2004,2005,109.0,712,1473,1484,0,...,1,2,0,7,0,2004.0,2,606,0,35
394,60.0,10134,5,1940,1950,0.0,0,735,735,299,...,0,1,0,5,0,1940.0,1,240,0,39
9,50.0,7420,5,1939,1950,0.0,851,991,1077,0,...,1,1,0,5,2,1939.0,1,205,0,4
831,30.0,3180,7,2005,2005,0.0,0,600,520,600,...,0,2,1,4,0,2005.0,2,480,0,166
79,60.0,10440,5,1910,1981,0.0,0,440,682,548,...,0,1,1,5,0,1966.0,2,440,74,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
324,96.0,11275,7,1967,2007,300.0,0,710,1898,1080,...,0,2,1,11,1,1961.0,2,564,240,0
1275,95.0,11345,5,1948,1950,567.0,220,928,928,992,...,0,2,0,10,0,1948.0,2,400,0,0
141,78.0,11645,7,2005,2005,0.0,1300,1734,1734,0,...,1,2,0,7,0,2005.0,2,660,160,24
1010,115.0,21286,5,1948,1950,0.0,0,720,720,551,...,0,2,0,7,1,1948.0,1,312,0,0


Let's check our selected features for nulls.

In [17]:
# Missing values per selected feature in the training split.
train_null_counts = X_train_select.isna().sum()
train_null_counts

LotFrontage     0
LotArea         0
OverallQual     0
YearBuilt       0
YearRemodAdd    0
MasVnrArea      6
BsmtFinSF1      0
TotalBsmtSF     0
1stFlrSF        0
2ndFlrSF        0
GrLivArea       0
BsmtFullBath    0
FullBath        0
HalfBath        0
TotRmsAbvGrd    0
Fireplaces      0
GarageYrBlt     0
GarageCars      0
GarageArea      0
WoodDeckSF      0
OpenPorchSF     0
dtype: int64

Let's just drop the rows with a missing `MasVnrArea` (6 in the training set).

In [18]:
# Discard every row that still has a missing value, in both splits.
X_train_select, X_test_select = (
    features.dropna(axis=0, how='any')
    for features in (X_train_select, X_test_select)
)

Let's make sure our target has the same samples

In [19]:
def ids_to_drop_from_y(X, y):
    """List the index labels of ``y`` that are absent from ``X``'s index.

    Labels are returned in the order they appear in ``y.index``; the result
    is an empty list when every label of ``y`` is also present in ``X``.
    """
    kept_labels = X.index
    return [label for label in y.index if label not in kept_labels]

In [20]:
# Remove from each target the rows whose features were dropped above, so
# features and targets cover the same samples again.
train_ids_without_features = ids_to_drop_from_y(X_train_select, y_train)
test_ids_without_features = ids_to_drop_from_y(X_test_select, y_test)
y_train = y_train.drop(train_ids_without_features)
y_test = y_test.drop(test_ids_without_features)

## Pipeline creation

Now let's build our pipeline!

In [21]:
from sklearn.preprocessing import StandardScaler # preprocessing
from sklearn.pipeline import Pipeline #pipeline class

### Linear Regression

In [22]:
from sklearn.linear_model import LinearRegression #linear regression

# Standardise the selected features, then fit an ordinary linear regression.
linear_regression_steps = [
    ('scalar', StandardScaler()),
    ('linear', LinearRegression()),
]
linear_regression_pipeline = Pipeline(linear_regression_steps)
linear_regression_pipeline.fit(X_train_select, y_train)

Pipeline(steps=[('scalar', StandardScaler()), ('linear', LinearRegression())])

In [23]:
# Score the fitted linear pipeline on the held-out split.
linear_test_score = linear_regression_pipeline.score(X_test_select, y_test)
linear_test_score

0.8035346731968731

### Stochastic Gradient Descent

In [24]:
from sklearn.linear_model import SGDRegressor

# Same preprocessing as the linear model, with an SGD-trained regressor.
# NOTE(review): the recorded output below this cell shows `tol=1e-05`, while
# the code passes `tol=1e-3` — the saved output looks stale; confirm which
# tolerance is intended and re-run.
sgd_steps = [
    ('scalar', StandardScaler()),
    ('sgd', SGDRegressor(max_iter=9000, tol=1e-3)),
]
sgd_pipeline = Pipeline(sgd_steps)
sgd_pipeline.fit(X_train_select, y_train)

Pipeline(steps=[('scalar', StandardScaler()),
                ('sgd', SGDRegressor(max_iter=9000, tol=1e-05))])

In [25]:
# Score the fitted SGD pipeline on the held-out split.
sgd_test_score = sgd_pipeline.score(X_test_select, y_test)
sgd_test_score

0.729975986942317