### To-Do
- [x] Use only numerical data
- [x] Deal with missing data
    - [x] Imputation
- [x] Submit

In [1]:
import os

import numpy as np
import pandas as pd
from sklearn import preprocessing
from sklearn.ensemble import RandomForestRegressor
from sklearn.impute import SimpleImputer
from sklearn.metrics import mean_squared_error, mean_absolute_error
from sklearn.model_selection import train_test_split

# Cross-validation pipeline
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import make_pipeline

# `sklearn.externals.joblib` was deprecated in scikit-learn 0.21 and removed in
# 0.23; prefer the standalone package and fall back only on old installs.
try:
    import joblib
except ImportError:
    from sklearn.externals import joblib # for saving the model

In [2]:
from math import sqrt


def rmse(actuals, predictions):
    """Return the root-mean-squared error between `actuals` and `predictions`.

    Both arguments are forwarded unchanged to ``mean_squared_error``; the
    square root of its result is returned.
    """
    mse = mean_squared_error(actuals, predictions)
    return sqrt(mse)

In [3]:
# Load the training and test sets, using the 'Id' column as the row index
X_full = pd.read_csv('train.csv', index_col='Id')
X_test_full = pd.read_csv('test.csv', index_col='Id')

# Discard rows whose target is missing, then separate the target from the predictors
X_full = X_full.dropna(axis=0, subset=['SalePrice'])
y = X_full['SalePrice']
X_full = X_full.drop(columns=['SalePrice'])

# Keep only the non-object columns (treated here as the numerical predictors)
X = X_full.select_dtypes(exclude=['object'])
X_test = X_test_full.select_dtypes(exclude=['object'])

# Hold out 20% of the training rows for validation; the seed is fixed so the
# split is the same on every run
X_train, X_valid, y_train, y_valid = train_test_split(
    X, y, test_size=0.2, random_state=0
)

In [4]:
# Sanity check: shape of the training split, then of the test set (one per line)
print(X_train.shape, X_test.shape, sep='\n')

(1168, 36)
(1459, 36)


In [5]:
# Count the missing values in every training column, then show only the
# columns that have at least one, largest count first
missing_val_count_by_column = X_train.isnull().sum()
print(
    missing_val_count_by_column
    .loc[lambda counts: counts > 0]
    .sort_values(ascending=False)
)

LotFrontage    212
GarageYrBlt     58
MasVnrArea       6
dtype: int64


In [18]:
# Median imputation only: each missing value is replaced by its column's
# median, learned from the training split and re-used for the validation split.
mz_imputer = SimpleImputer(strategy='median')

# The imputer's output is wrapped back into DataFrames. Passing the original
# column names AND row index keeps the imputed frames label-aligned with
# y_train / y_valid (building the frame from the bare array would reset the
# index to 0..n-1 and lose the 'Id' labels).
X_final_train = pd.DataFrame(
    mz_imputer.fit_transform(X_train),
    columns=X_train.columns,
    index=X_train.index,
)
X_final_valid = pd.DataFrame(
    mz_imputer.transform(X_valid),
    columns=X_valid.columns,
    index=X_valid.index,
)

In [19]:
# Baseline model: a 100-tree random forest with a fixed seed, trained on the
# median-imputed training split.
model_2 = RandomForestRegressor(n_estimators=100, random_state=0)
model_2.fit(X_final_train, y_train)

# Score on the held-out validation split. Both metrics are called as
# (y_true, y_pred); MAE is symmetric in its arguments, so the printed value is
# the same as with the arguments swapped.
preds_2 = model_2.predict(X_final_valid)
print('MAE: ', mean_absolute_error(y_valid, preds_2))
print('RMSE', rmse(y_valid, preds_2))

MAE:  17807.438333333328
RMSE 33919.31112490001


In [20]:
# Save the fitted model to disk.
# joblib.dump does not create missing parent directories, so make sure
# 'Models/' exists first (a no-op when it is already there).
os.makedirs('Models', exist_ok=True)
joblib.dump(model_2, 'Models/rf_onlyMedian.pkl')

['Models/rf_onlyMedian.pkl']

In [21]:
# Preprocess test data: apply the medians already learned by mz_imputer
# (transform only, no re-fitting) and rebuild a DataFrame that keeps the
# test set's column names and 'Id' row index.
final_X_test = pd.DataFrame(
    mz_imputer.transform(X_test),
    columns=X_test.columns,
    index=X_test.index,
)

In [None]:
# NOTE: leftover empty cell - the test set is already preprocessed in the cell above.

In [22]:
# Predict on the preprocessed test set
preds_test = model_2.predict(final_X_test)

# Assemble the submission table (test Ids alongside their predictions) and
# write it to CSV without the DataFrame's own row index
output = pd.DataFrame({'Id': X_test.index}).assign(SalePrice=preds_test)
output.to_csv('submission.csv', index=False)