### To-do
- [x] Remove categorical columns with missing values
- Better results on train.csv but slightly worse on test.csv
- MAE: 17523.535102739726
- Kaggle score: 16356.22338

In [2]:
import os

import pandas as pd
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder

from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error

try:
    # joblib is a standalone package; newer scikit-learn no longer vendors it
    import joblib
except ImportError:
    # fallback for old scikit-learn releases that still bundle joblib
    from sklearn.externals import joblib

In [3]:
# Load the raw training and test sets
X_test_full = pd.read_csv('test.csv')
X_full = pd.read_csv('train.csv')

# Rows without a target value cannot be used for training
X_full = X_full.dropna(axis=0, subset=['SalePrice'])

# Separate the target from the predictors
y = X_full['SalePrice']
X_full = X_full.drop(columns=['SalePrice'])

# Hold out 20% of the training data as a validation set (fixed seed)
X_train_full, X_valid_full, y_train, y_valid = train_test_split(
    X_full,
    y,
    test_size=0.2,
    random_state=0,
)

In [6]:
# Categorical columns with missing values
# Object-typed (categorical) columns of the training split that contain
# at least one missing value
cat_cols_missing = [
    name
    for name in X_train_full.columns
    if X_train_full[name].dtype == 'object'
    and X_train_full[name].isnull().any()
]
cat_cols_missing

['Alley',
 'MasVnrType',
 'BsmtQual',
 'BsmtCond',
 'BsmtExposure',
 'BsmtFinType1',
 'BsmtFinType2',
 'Electrical',
 'FireplaceQu',
 'GarageType',
 'GarageFinish',
 'GarageQual',
 'GarageCond',
 'PoolQC',
 'Fence',
 'MiscFeature']

In [7]:
# drop categorical data with missing values
# Remove those columns from every split; the *_full frames are left untouched
X_train, X_valid, X_test = (
    frame.drop(columns=cat_cols_missing)
    for frame in (X_train_full, X_valid_full, X_test_full)
)

In [10]:
# the rest of categorical data and numerical data
# Split the remaining columns by dtype: object columns are treated as
# categorical, float64/int64 columns as numerical
train_dtypes = X_train.dtypes

categorical_cols = [
    name for name, dtype in train_dtypes.items()
    if dtype == 'object'
]

numerical_cols = [
    name for name, dtype in train_dtypes.items()
    if dtype in ('float64', 'int64')
]

In [11]:
# Preview the first rows of the training predictors after the column drop
X_train.head()

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,LotShape,LandContour,Utilities,LotConfig,...,OpenPorchSF,EnclosedPorch,3SsnPorch,ScreenPorch,PoolArea,MiscVal,MoSold,YrSold,SaleType,SaleCondition
618,619,20,RL,90.0,11694,Pave,Reg,Lvl,AllPub,Inside,...,108,0,0,260,0,0,7,2007,New,Partial
870,871,20,RL,60.0,6600,Pave,Reg,Lvl,AllPub,Inside,...,0,0,0,0,0,0,8,2009,WD,Normal
92,93,30,RL,80.0,13360,Pave,IR1,HLS,AllPub,Inside,...,0,44,0,0,0,0,8,2009,WD,Normal
817,818,20,RL,,13265,Pave,IR1,Lvl,AllPub,CulDSac,...,59,0,0,0,0,0,7,2008,WD,Normal
302,303,20,RL,118.0,13704,Pave,IR1,Lvl,AllPub,Corner,...,81,0,0,0,0,0,1,2006,WD,Normal


In [20]:
# Preprocessing + model pipeline: fitted on the training split and scored
# on the validation split.

# Numerical columns: fill missing values with the column median
numerical_transformer = SimpleImputer(strategy='median')

# Categorical columns: impute with the most frequent value, then one-hot
# encode. The training split has no gaps in these columns (the ones that had
# any were dropped), but the test data may; categories not seen during fit
# are ignored by the encoder instead of raising.
categorical_transformer = Pipeline(steps=[
    ('impute', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])

# Bundle the numerical and categorical transformers
preprocessor = ColumnTransformer(transformers=[
    ('numerical', numerical_transformer, numerical_cols),
    ('categorical', categorical_transformer, categorical_cols)
])

# The model (fixed seed for reproducible results)
model = RandomForestRegressor(n_estimators=100, random_state=0)

# Bundle preprocessing and model so both are fitted and applied together.
# The step name was previously misspelled 'prerocessor'; step names are the
# keys used by named_steps and set_params, so it is spelled correctly here.
my_pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('model', model)
])

# Fit on the training split
my_pipeline.fit(X_train, y_train)

# Predict on the held-out validation split
preds = my_pipeline.predict(X_valid)

print('MAE:', mean_absolute_error(y_valid, preds))

MAE: 17523.535102739726


In [23]:
# Save the fitted pipeline and generate the submission file

# Create the target directory first so the dump does not fail on a fresh
# checkout where 'Models/' does not exist yet
model_path = 'Models/rf_pipeline_no_cat_missing.pkl'
os.makedirs(os.path.dirname(model_path), exist_ok=True)
joblib.dump(my_pipeline, model_path)

# The sample submission supplies the Id column for the output file
sample_submission = pd.read_csv('sample_submission.csv')

# Predict on the test set (same column drop as the training data)
test_preds = my_pipeline.predict(X_test)

# NOTE(review): assumes sample_submission.csv lists Ids in the same row order
# as test.csv; confirm, since predictions are matched to Ids by position
output = pd.DataFrame({
    'Id': sample_submission.Id,
    'SalePrice': test_preds
})

output.to_csv('submission.csv', index=False)

In [24]:
# Preview the first rows of the submission frame (Id, SalePrice)
output.head()

Unnamed: 0,Id,SalePrice
0,1461,126499.5
1,1462,156959.0
2,1463,179915.39
3,1464,182621.46
4,1465,195832.79
