In [1]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import Imputer
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error
from sklearn.ensemble import RandomForestRegressor

In [2]:
np.random.seed(0)
###### Read the data
train = pd.read_csv('original/train.csv')
train.sample(3)
target = train.SalePrice

# Descide which variables to keep
First I go through the whole list and decide which fields I will drop for further investigations...

In [3]:
#columns_of_interest = ["LotFrontage", "LotArea", "Street", "Alley", "LotShape", "LandContour", "Utilities", "Utilities", "LotConfig", "LandSlope", "Condition1", "Condition2", "YearBuilt", "YearRemodAdd", "MoSold", "YrSold", "SaleType", "SaleCondition"]
#train = train[columns_of_interest]

In [7]:
train = train.drop(["Id", "SalePrice"], axis = 1)

In [8]:
train.sample(3)

Unnamed: 0,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,LotConfig,...,ScreenPorch,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition
627,80,RL,80.0,9600,Pave,,Reg,Lvl,AllPub,Inside,...,0,0,,,,0,3,2010,WD,Normal
1373,20,RL,,11400,Pave,,Reg,Lvl,AllPub,Inside,...,0,0,,,,0,3,2007,WD,Normal
339,20,RL,66.0,12400,Pave,,IR1,Lvl,AllPub,Inside,...,234,0,,,,0,6,2009,WD,Normal


In [9]:
#pd.get_dummies(train, prefix= train.select_dtypes(include=list(train.select_dtypes(include=['object']).columns))

# Find reasonable object variables
First I want to decide, which kind of object type variables I want to include! So I will go through the whole list...

In [10]:
low_cardinality = []
numeric_cols = []
for cname in train.columns:
    if train[cname].dtype == "object":
        if train[cname].nunique() < 5:
            low_cardinality.append(cname)
            print(cname)
    else:
        numeric_cols.append(cname)

Street
Alley
LotShape
LandContour
Utilities
LandSlope
MasVnrType
ExterQual
BsmtQual
BsmtCond
BsmtExposure
CentralAir
KitchenQual
GarageFinish
PavedDrive
PoolQC
Fence
MiscFeature


In [12]:
numeric_cols

['MSSubClass',
 'LotFrontage',
 'LotArea',
 'OverallQual',
 'OverallCond',
 'YearBuilt',
 'YearRemodAdd',
 'MasVnrArea',
 'BsmtFinSF1',
 'BsmtFinSF2',
 'BsmtUnfSF',
 'TotalBsmtSF',
 '1stFlrSF',
 '2ndFlrSF',
 'LowQualFinSF',
 'GrLivArea',
 'BsmtFullBath',
 'BsmtHalfBath',
 'FullBath',
 'HalfBath',
 'BedroomAbvGr',
 'KitchenAbvGr',
 'TotRmsAbvGrd',
 'Fireplaces',
 'GarageYrBlt',
 'GarageCars',
 'GarageArea',
 'WoodDeckSF',
 'OpenPorchSF',
 'EnclosedPorch',
 '3SsnPorch',
 'ScreenPorch',
 'PoolArea',
 'MiscVal',
 'MoSold',
 'YrSold']

In [13]:
train = train[numeric_cols + low_cardinality]

In [14]:
##### Detect object type columns
train_objects = train[low_cardinality]
train_objects.sample(3)

Unnamed: 0,Street,Alley,LotShape,LandContour,Utilities,LandSlope,MasVnrType,ExterQual,BsmtQual,BsmtCond,BsmtExposure,CentralAir,KitchenQual,GarageFinish,PavedDrive,PoolQC,Fence,MiscFeature
623,Pave,,Reg,Lvl,AllPub,Gtl,BrkFace,Gd,Gd,TA,No,Y,Gd,Unf,Y,,,
650,Pave,,Reg,Lvl,AllPub,Gtl,,Gd,Gd,TA,No,Y,Gd,RFn,Y,,,
1330,Pave,,Reg,Lvl,AllPub,Gtl,Stone,Gd,Gd,Gd,Av,Y,Gd,RFn,Y,,,


Figure out which entries have null values!

In [15]:
object_nas = train_objects.isnull().sum()*100/len(train_objects)
object_nas[object_nas != 0]

Alley           93.767123
MasVnrType       0.547945
BsmtQual         2.534247
BsmtCond         2.534247
BsmtExposure     2.602740
GarageFinish     5.547945
PoolQC          99.520548
Fence           80.753425
MiscFeature     96.301370
dtype: float64

Set Alleys with Nas to no, since there is no alley!


In [21]:
train.Alley = train["Alley"].fillna("No")
train.MasVnrType = train["MasVnrType"].fillna("No")
train.BsmtQual = train["BsmtQual"].fillna("No")
train.BsmtCond = train["BsmtCond"].fillna("No")
train.BsmtExposure = train["BsmtExposure"].fillna("No")
train.GarageFinish = train["GarageFinish"].fillna("No")
train.PoolQC = train["PoolQC"].fillna("No")
train.Fence = train["Fence"].fillna("No")
train.MiscFeature = train["MiscFeature"].fillna("No")

Search for other missing values

In [22]:
object_nas = train.isnull().sum()*100/len(train)
object_nas[object_nas != 0]

LotFrontage    17.739726
MasVnrArea      0.547945
GarageYrBlt     5.547945
dtype: float64

Since only 17% of values are missing, do imputation with average LotFrontage.

In [23]:
my_imputer = Imputer()
train["LotFrontage"] = my_imputer.fit_transform(train[["LotFrontage"]]).ravel()
train["MasVnrArea"] = my_imputer.fit_transform(train[["MasVnrArea"]]).ravel()
train["GarageYrBlt"] = my_imputer.fit_transform(train[["GarageYrBlt"]]).ravel()
train.sample(5)

Unnamed: 0,MSSubClass,LotFrontage,LotArea,OverallQual,OverallCond,YearBuilt,YearRemodAdd,MasVnrArea,BsmtFinSF1,BsmtFinSF2,...,BsmtQual,BsmtCond,BsmtExposure,CentralAir,KitchenQual,GarageFinish,PavedDrive,PoolQC,Fence,MiscFeature
1229,80,70.0,7910,5,5,1960,1960,0.0,666,0,...,TA,TA,No,Y,TA,Unf,Y,No,GdWo,No
189,120,41.0,4923,8,5,2001,2002,0.0,1153,0,...,Ex,TA,Av,Y,Ex,Fin,Y,No,No,No
512,20,70.0,9100,5,5,1958,1958,0.0,521,174,...,TA,TA,No,Y,TA,Unf,Y,No,No,No
1208,20,70.0,7763,5,7,1962,1980,0.0,504,108,...,TA,TA,No,Y,TA,Unf,Y,No,No,No
397,60,69.0,7590,5,5,1962,1962,288.0,540,0,...,TA,TA,No,Y,TA,RFn,Y,No,No,No


In [24]:
object_nas = train.isnull().sum()*100/len(train)
object_nas[object_nas != 0]

Series([], dtype: float64)

Make Hot-Encoding for objects

In [25]:
train.shape

(1460, 54)

In [26]:
train.columns

Index(['MSSubClass', 'LotFrontage', 'LotArea', 'OverallQual', 'OverallCond',
       'YearBuilt', 'YearRemodAdd', 'MasVnrArea', 'BsmtFinSF1', 'BsmtFinSF2',
       'BsmtUnfSF', 'TotalBsmtSF', '1stFlrSF', '2ndFlrSF', 'LowQualFinSF',
       'GrLivArea', 'BsmtFullBath', 'BsmtHalfBath', 'FullBath', 'HalfBath',
       'BedroomAbvGr', 'KitchenAbvGr', 'TotRmsAbvGrd', 'Fireplaces',
       'GarageYrBlt', 'GarageCars', 'GarageArea', 'WoodDeckSF', 'OpenPorchSF',
       'EnclosedPorch', '3SsnPorch', 'ScreenPorch', 'PoolArea', 'MiscVal',
       'MoSold', 'YrSold', 'Street', 'Alley', 'LotShape', 'LandContour',
       'Utilities', 'LandSlope', 'MasVnrType', 'ExterQual', 'BsmtQual',
       'BsmtCond', 'BsmtExposure', 'CentralAir', 'KitchenQual', 'GarageFinish',
       'PavedDrive', 'PoolQC', 'Fence', 'MiscFeature'],
      dtype='object')

In [27]:
train_hot = pd.get_dummies(train)

In [28]:
train_hot.sample(1)

Unnamed: 0,MSSubClass,LotFrontage,LotArea,OverallQual,OverallCond,YearBuilt,YearRemodAdd,MasVnrArea,BsmtFinSF1,BsmtFinSF2,...,Fence_GdPrv,Fence_GdWo,Fence_MnPrv,Fence_MnWw,Fence_No,MiscFeature_Gar2,MiscFeature_No,MiscFeature_Othr,MiscFeature_Shed,MiscFeature_TenC
378,20,88.0,11394,9,2,2010,2010,350.0,1445,0,...,0,0,0,0,1,0,1,0,0,0


In [29]:
## score dataset
def calc_mae(X, y, seed):
    train_X, val_X, train_y, val_y = train_test_split(X, y,random_state = seed)
    # run ml
    model = RandomForestRegressor()
    model.fit(train_X, train_y)
    # pred values
    pred_val = model.predict(val_X)
    # calc accuracy
    mae = mean_absolute_error(val_y, pred_val)
    return mae

In [30]:
max = 100
mae = 0
for i in range (0, max):
    mae = mae + calc_mae(train_hot, target, i)
print(max,mae/max)

100 19294.7581425
