In [1]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import Imputer
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error
from sklearn.ensemble import RandomForestRegressor

In [2]:
np.random.seed(0)
###### Read the data
train = pd.read_csv('original/train.csv')
train.sample(3)

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
529,530,20,RL,,32668,Pave,,IR1,Lvl,AllPub,...,0,,,,0,3,2007,WD,Alloca,200624
491,492,50,RL,79.0,9490,Pave,,Reg,Lvl,AllPub,...,0,,MnPrv,,0,8,2006,WD,Normal,133000
459,460,50,RL,,7015,Pave,,IR1,Bnk,AllPub,...,0,,,,0,7,2009,WD,Normal,110000


# Descide which variables to keep
First I go through the whole list and decide which fields I will drop for further investigations...

In [3]:
columns_of_interest = ["LotFrontage", "LotArea", "Street", "Alley", "LotShape", "LandContour", "Utilities", "Utilities", "LotConfig", "LandSlope", "Condition1", "Condition2", "YearBuilt", "YearRemodAdd", "MoSold", "YrSold", "SaleType", "SaleCondition", "SalePrice"]
train = train[columns_of_interest]

In [4]:
train.sample(3)

Unnamed: 0,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,Utilities.1,LotConfig,LandSlope,Condition1,Condition2,YearBuilt,YearRemodAdd,MoSold,YrSold,SaleType,SaleCondition,SalePrice
1453,90.0,17217,Pave,,Reg,Lvl,AllPub,AllPub,Inside,Gtl,Norm,Norm,2006,2006,7,2006,WD,Abnorml,84500
216,65.0,8450,Pave,,Reg,Lvl,AllPub,AllPub,Inside,Gtl,Norm,Norm,2004,2004,4,2008,WD,Normal,210000
1413,88.0,10994,Pave,,IR1,Lvl,AllPub,AllPub,Corner,Gtl,Norm,Norm,2005,2006,9,2009,COD,Abnorml,257000


# Find reasonable object variables
First I want to decide, which kind of object type variables I want to include! So I will go through the whole list...

In [5]:
##### Detect object type columns
train_objects = train.select_dtypes(include=['object'])
train_objects.sample(3)

Unnamed: 0,Street,Alley,LotShape,LandContour,Utilities,Utilities.1,LotConfig,LandSlope,Condition1,Condition2,SaleType,SaleCondition
627,Pave,,Reg,Lvl,AllPub,AllPub,Inside,Gtl,Norm,Norm,WD,Normal
1373,Pave,,Reg,Lvl,AllPub,AllPub,Inside,Gtl,Norm,Norm,WD,Normal
339,Pave,,IR1,Lvl,AllPub,AllPub,Inside,Gtl,Feedr,Norm,WD,Normal


Figure out which entries have null values!

In [6]:
object_nas = train_objects.isnull().sum()*100/len(train_objects)
object_nas[object_nas != 0]

Alley    93.767123
dtype: float64

Set Alleys with Nas to no, since there is no alley!


In [7]:
train.Alley = train["Alley"].fillna("NoAlley")

Search for other missing values

In [16]:
object_nas = train.isnull().sum()*100/len(train)
object_nas[object_nas != 0]

LotFrontage    17.739726
dtype: float64

Since only 17% of values are missing, do imputation with average LotFrontage.

In [25]:
my_imputer = Imputer()
train["LotFrontage"] = my_imputer.fit_transform(train[["LotFrontage"]]).ravel()
train.sample(5)

Unnamed: 0,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,Utilities.1,LotConfig,LandSlope,Condition1,Condition2,YearBuilt,YearRemodAdd,MoSold,YrSold,SaleType,SaleCondition,SalePrice
968,50.0,5925,Pave,0,Reg,Lvl,AllPub,AllPub,Inside,Gtl,Norm,Norm,1910,1950,5,2009,WD,Abnorml,37900
330,70.049958,10624,Pave,0,IR1,Lvl,AllPub,AllPub,Inside,Gtl,Norm,Norm,1964,1964,11,2007,WD,Normal,119000
1047,57.0,9245,Pave,0,IR2,Lvl,AllPub,AllPub,Inside,Gtl,Norm,Norm,1994,1995,2,2008,WD,Normal,145000
527,67.0,14948,Pave,0,IR1,Lvl,AllPub,AllPub,Inside,Gtl,Norm,Norm,2008,2008,11,2008,New,Partial,446261
509,80.0,9600,Pave,0,Reg,Lvl,AllPub,AllPub,Corner,Gtl,Norm,Norm,1959,1959,7,2009,WD,Normal,124500


In [28]:
object_nas = train.isnull().sum()*100/len(train)
object_nas[object_nas != 0]

Series([], dtype: float64)

Make Hot-Encoding for objects

In [31]:
train_hot = pd.get_dummies(train)

ValueError: cannot copy sequence with size 2 to array axis with dimension 1460

In [12]:
pd.get_dummies(train)

ValueError: cannot copy sequence with size 2 to array axis with dimension 1460

In [9]:
train.sample(3)

Unnamed: 0,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,Utilities.1,LotConfig,LandSlope,Condition1,Condition2,YearBuilt,YearRemodAdd,MoSold,YrSold,SaleType,SaleCondition,SalePrice
623,,2117,Pave,0,Reg,Lvl,AllPub,AllPub,Inside,Gtl,Norm,Norm,2000,2000,6,2007,WD,Normal,168500
650,65.0,8125,Pave,0,Reg,Lvl,AllPub,AllPub,Inside,Gtl,Norm,Norm,2007,2007,5,2008,WD,Normal,205950
1330,85.0,10000,Pave,0,Reg,Lvl,AllPub,AllPub,Inside,Gtl,Norm,Norm,2006,2006,12,2007,WD,Normal,227000


In [13]:
cols_with_missing = [col for col in train_data.columns 
                                 if train_data[col].isnull().any()] 

NameError: name 'train_data' is not defined