### To-Do
- [x] Libraries
- [x] Load data
- [x] Remove columns with the most missing values
- [x] Categorical data -> One-Hot Encoded
- [x] Imputation
    - [x] Numerical -> Median
- [x] Save the model
- [x] Preprocess the Test set
- [x] Submit if you got better results

##### Old score
- MAE:  17807.438333333328
- RMSE 33919.31112490001

#### Current score
- MAE: 17492.58344748858
- RMSE: 33657.29332096441

In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer

from sklearn.metrics import mean_squared_error, mean_absolute_error

from sklearn.externals import joblib

In [2]:
# Accuracy matrix
from math import sqrt
def RMSE(actuals, predictions):
    return sqrt(mean_squared_error(actuals, predictions))

In [3]:
# Load the data
X_full = pd.read_csv('train.csv', index_col='Id')
X_test_full = pd.read_csv('test.csv', index_col='Id')

# Remove rows with missing targets
X_full.dropna(axis=0, subset=['SalePrice'], inplace=True)

# Seperate targets from predictors
y = X_full.SalePrice
X_full.drop(['SalePrice'], axis=1, inplace=True)

# 

# Break off validation data from train data
X_train, X_valid, y_train, y_valid = train_test_split(X_full, y, test_size=.2,
                                                   random_state=0)

In [4]:
# Train, Validation, Test shapes
print('X_train: ' + str(X_train.shape))
print('X_valid: ' + str(X_valid.shape))
print('X_test: ' + str(X_test_full.shape))

X_train: (1168, 79)
X_valid: (292, 79)
X_test: (1459, 79)


In [5]:
# Number of missing values per columns 
missing_val_cols = X_train.isnull().sum()
print(missing_val_cols[missing_val_cols>0].sort_values(ascending=False))

PoolQC          1164
MiscFeature     1119
Alley           1097
Fence            954
FireplaceQu      551
LotFrontage      212
GarageYrBlt       58
GarageType        58
GarageFinish      58
GarageQual        58
GarageCond        58
BsmtFinType2      29
BsmtFinType1      28
BsmtExposure      28
BsmtCond          28
BsmtQual          28
MasVnrArea         6
MasVnrType         6
Electrical         1
dtype: int64


In [7]:
# Get categorical data with missing values
cat_cols_miss = [col for col in X_train.columns
                if X_train[col].isnull().any()
                and X_train[col].dtype.kind == 'O']

# Get numerical data with missing values
num_cols_miss = [col for col in X_train.columns
                if X_train[col].isnull().any()
                and X_train[col].dtype.kind != 'O']

In [8]:
cat_cols_miss

['Alley',
 'MasVnrType',
 'BsmtQual',
 'BsmtCond',
 'BsmtExposure',
 'BsmtFinType1',
 'BsmtFinType2',
 'Electrical',
 'FireplaceQu',
 'GarageType',
 'GarageFinish',
 'GarageQual',
 'GarageCond',
 'PoolQC',
 'Fence',
 'MiscFeature']

In [9]:
num_cols_miss

['LotFrontage', 'MasVnrArea', 'GarageYrBlt']

In [11]:
# Remove categorical data with missing values
X_train.drop(cat_cols_miss, axis=1, inplace=True)
X_valid.drop(cat_cols_miss, axis=1, inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  errors=errors)


In [12]:
all_cat_cols = [col for col in X_train.columns
                if X_train[col].dtype.kind == 'O']

In [13]:
all_cat_cols

['MSZoning',
 'Street',
 'LotShape',
 'LandContour',
 'Utilities',
 'LotConfig',
 'LandSlope',
 'Neighborhood',
 'Condition1',
 'Condition2',
 'BldgType',
 'HouseStyle',
 'RoofStyle',
 'RoofMatl',
 'Exterior1st',
 'Exterior2nd',
 'ExterQual',
 'ExterCond',
 'Foundation',
 'Heating',
 'HeatingQC',
 'CentralAir',
 'KitchenQual',
 'Functional',
 'PavedDrive',
 'SaleType',
 'SaleCondition']

In [25]:
# High Cardinality columns
object_nunique = list(map(lambda col:X_train[col].nunique(), all_cat_cols))

d = dict(zip(all_cat_cols, object_nunique))

sorted(d.items(), key=lambda x:x[1])

[('Street', 2),
 ('Utilities', 2),
 ('CentralAir', 2),
 ('LandSlope', 3),
 ('PavedDrive', 3),
 ('LotShape', 4),
 ('LandContour', 4),
 ('ExterQual', 4),
 ('KitchenQual', 4),
 ('MSZoning', 5),
 ('LotConfig', 5),
 ('BldgType', 5),
 ('ExterCond', 5),
 ('HeatingQC', 5),
 ('Condition2', 6),
 ('RoofStyle', 6),
 ('Foundation', 6),
 ('Heating', 6),
 ('Functional', 6),
 ('SaleCondition', 6),
 ('RoofMatl', 7),
 ('HouseStyle', 8),
 ('Condition1', 9),
 ('SaleType', 9),
 ('Exterior1st', 15),
 ('Exterior2nd', 16),
 ('Neighborhood', 25)]

In [15]:
# One-hot encoded
from sklearn.preprocessing import OneHotEncoder

my_encoder = OneHotEncoder(handle_unknown='ignore', sparse=False)

OH_cols_train = pd.DataFrame(my_encoder.fit_transform(X_train[all_cat_cols]))
OH_cols_valid = pd.DataFrame(my_encoder.transform(X_valid[all_cat_cols]))

OH_cols_train.index = X_train.index
OH_cols_valid.index = X_valid.index

# Remove categorical data
num_X_train = X_train.drop(all_cat_cols, axis=1)
num_X_valid = X_valid.drop(all_cat_cols, axis=1)

# Concatinate the encoded features
OH_X_train = pd.concat([num_X_train, OH_cols_train], axis=1)
OH_X_valid = pd.concat([num_X_valid, OH_cols_valid], axis=1)

In [16]:
# Impute nan numerical data using the median

my_imputer = SimpleImputer(strategy='median')

X_final_train = pd.DataFrame(my_imputer.fit_transform(OH_X_train))
X_final_valid = pd.DataFrame(my_imputer.transform(OH_X_valid))

X_final_train.columns = OH_X_train.columns
X_final_valid.columns = OH_X_valid.columns

In [17]:
# Random Forest
from sklearn.ensemble import RandomForestRegressor

model = RandomForestRegressor(n_estimators=100, random_state=0)
model.fit(X_final_train, y_train)

preds = model.predict(X_final_valid)

print('MAE:', mean_absolute_error(preds, y_valid))
print('RMSE:', RMSE(y_valid, preds))

MAE: 17492.58344748858
RMSE: 33657.29332096441


In [18]:
# Save Model
joblib.dump(model, 'Models/rf_cat_num_imputation.pkl')

['Models/rf_cat_num_imputation.pkl']

In [74]:
# preprocess the test set

# drop categorical columns that contains nan
X_test = X_test_full.drop(test_cat_cols_miss, axis=1)
# drop columns that aren't in the training dataset


# one-hot encode
OH_test_cols = pd.DataFrame(my_encoder.transform(X_test[all_cat_cols]))
OH_test_cols.index = X_test.index

# remove the rest categorical data
OH_X_test = X_test.drop(all_cat_cols, axis=1)

# Concatinate the numeric features
OH_X_test = pd.concat([OH_X_test, OH_test_cols], axis=1) 

# Imputation
IM_X_test = pd.DataFrame(my_imputer.transform(OH_X_test))
IM_X_test.columns = OH_X_test.columns

KeyError: "['Utilities', 'Functional', 'Exterior1st', 'KitchenQual', 'Exterior2nd', 'SaleType', 'MSZoning'] not in index"

In [64]:
print(OH_X_train.shape)
print(IM_X_test.shape)

(1168, 214)
(1450, 214)


In [66]:
preds = model.predict(IM_X_test)

output = pd.DataFrame({
    'Id': X_test.index,
    'SalePrice': preds
})

output.to_csv('submission.csv', index=False)

In [68]:
all_cat_cols = [col for col in X_train.columns
                if X_train[col].dtype.kind == 'O']

In [69]:
all_cat_cols

['MSZoning',
 'Street',
 'LotShape',
 'LandContour',
 'Utilities',
 'LotConfig',
 'LandSlope',
 'Neighborhood',
 'Condition1',
 'Condition2',
 'BldgType',
 'HouseStyle',
 'RoofStyle',
 'RoofMatl',
 'Exterior1st',
 'Exterior2nd',
 'ExterQual',
 'ExterCond',
 'Foundation',
 'Heating',
 'HeatingQC',
 'CentralAir',
 'KitchenQual',
 'Functional',
 'PavedDrive',
 'SaleType',
 'SaleCondition']

In [72]:
test_cat_cols_miss = [col for col in X_test_full.columns
                if X_test_full[col].isnull().any()
                and X_test_full[col].dtype.kind == 'O']
test_cat_cols_miss

['MSZoning',
 'Alley',
 'Utilities',
 'Exterior1st',
 'Exterior2nd',
 'MasVnrType',
 'BsmtQual',
 'BsmtCond',
 'BsmtExposure',
 'BsmtFinType1',
 'BsmtFinType2',
 'KitchenQual',
 'Functional',
 'FireplaceQu',
 'GarageType',
 'GarageFinish',
 'GarageQual',
 'GarageCond',
 'PoolQC',
 'Fence',
 'MiscFeature',
 'SaleType']