In [1]:
import pandas as pd 
from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split

In [2]:
cols_to_import = ['OverallQual', 'TotalBsmtSF', '1stFlrSF', 'GrLivArea', 'WoodDeckSF',
                  'BsmtUnfSF', 'LotFrontage', 'MasVnrArea', 'GarageYrBlt', 'SalePrice']

df = pd.read_csv('../data/house_price/train.csv', usecols=cols_to_import)

In [3]:
df

Unnamed: 0,LotFrontage,OverallQual,MasVnrArea,BsmtUnfSF,TotalBsmtSF,1stFlrSF,GrLivArea,GarageYrBlt,WoodDeckSF,SalePrice
0,65.0,7,196.0,150,856,856,1710,2003.0,0,208500
1,80.0,6,0.0,284,1262,1262,1262,1976.0,298,181500
2,68.0,7,162.0,434,920,920,1786,2001.0,0,223500
3,60.0,7,0.0,540,756,961,1717,1998.0,0,140000
4,84.0,8,350.0,490,1145,1145,2198,2000.0,192,250000
...,...,...,...,...,...,...,...,...,...,...
1455,62.0,6,0.0,953,953,953,1647,1999.0,0,175000
1456,85.0,6,119.0,589,1542,2073,2073,1978.0,349,210000
1457,66.0,7,0.0,877,1152,1188,2340,1941.0,0,266500
1458,68.0,5,0.0,0,1078,1078,1078,1950.0,366,142125


In [5]:
# check null values 
df.isnull().mean()

LotFrontage    0.177397
OverallQual    0.000000
MasVnrArea     0.005479
BsmtUnfSF      0.000000
TotalBsmtSF    0.000000
1stFlrSF       0.000000
GrLivArea      0.000000
GarageYrBlt    0.055479
WoodDeckSF     0.000000
SalePrice      0.000000
dtype: float64

`LotFrontage`, `MasVnrArea`, and `GarageYrBlt` have missing values. `LotFrontage` has 17.8% data that are missing.

# Split data

In [45]:
cols = ['OverallQual', 'TotalBsmtSF', '1stFlrSF', 'GrLivArea', 'WoodDeckSF',
       'BsmtUnfSF', 'LotFrontage', 'MasVnrArea', 'GarageYrBlt']

X_train, X_test, y_train, y_test = train_test_split(df[cols],
                                                   df['SalePrice'],
                                                   test_size=0.3,
                                                   random_state=0)

X_train.shape, X_test.shape

((1022, 9), (438, 9))

In [46]:
X_train.isnull().mean()

OverallQual    0.000000
TotalBsmtSF    0.000000
1stFlrSF       0.000000
GrLivArea      0.000000
WoodDeckSF     0.000000
BsmtUnfSF      0.000000
LotFrontage    0.184932
MasVnrArea     0.004892
GarageYrBlt    0.052838
dtype: float64

# SimpleImputer

In [47]:
imputer = SimpleImputer(strategy='mean', add_indicator=False)

imputer.fit(X_train)

SimpleImputer()

In [48]:
imputer.statistics_

array([   6.07925636, 1055.34344423, 1161.7221135 , 1522.13796477,
         94.85225049,  565.99217221,   69.66866747,  103.55358899,
       1978.01239669])

In [49]:
# cross check the mean values 
X_train.mean()

OverallQual       6.079256
TotalBsmtSF    1055.343444
1stFlrSF       1161.722114
GrLivArea      1522.137965
WoodDeckSF       94.852250
BsmtUnfSF       565.992172
LotFrontage      69.668667
MasVnrArea      103.553589
GarageYrBlt    1978.012397
dtype: float64

In [50]:
X_train.head()

Unnamed: 0,OverallQual,TotalBsmtSF,1stFlrSF,GrLivArea,WoodDeckSF,BsmtUnfSF,LotFrontage,MasVnrArea,GarageYrBlt
64,7,1057,1057,2034,576,318,,573.0,1998.0
682,6,1291,1291,1291,307,288,,0.0,1996.0
960,5,858,858,858,117,162,50.0,0.0,
1384,6,560,698,1258,0,356,60.0,0.0,1939.0
1100,2,290,438,438,0,0,60.0,0.0,1930.0


In [51]:
X_train_imputed = imputer.transform(X_train)
X_train_imputed = pd.DataFrame(X_train_imputed, columns=cols)

In [52]:
X_train_imputed.head()

Unnamed: 0,OverallQual,TotalBsmtSF,1stFlrSF,GrLivArea,WoodDeckSF,BsmtUnfSF,LotFrontage,MasVnrArea,GarageYrBlt
0,7.0,1057.0,1057.0,2034.0,576.0,318.0,69.668667,573.0,1998.0
1,6.0,1291.0,1291.0,1291.0,307.0,288.0,69.668667,0.0,1996.0
2,5.0,858.0,858.0,858.0,117.0,162.0,50.0,0.0,1978.012397
3,6.0,560.0,698.0,1258.0,0.0,356.0,60.0,0.0,1939.0
4,2.0,290.0,438.0,438.0,0.0,0.0,60.0,0.0,1930.0
