Imports

In [41]:
import pandas as pd
from sklearn.metrics import mean_absolute_error
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import Imputer
from xgboost import XGBRegressor

Define an all-purpose function for calculating the Mean Absolute Error

In [42]:
def get_mae(predictors_train, predictors_val, targ_train, targ_val, model, *args, **kwargs):
    """Fit ``model`` on the training split and return its validation MAE.

    Parameters
    ----------
    predictors_train, targ_train
        Predictors and target passed to ``model.fit``.
    predictors_val, targ_val
        Predictors passed to ``model.predict`` and the target the
        predictions are scored against.
    model
        Any object exposing ``fit(X, y, ...)`` and ``predict(X)``.
    *args, **kwargs
        Extra positional / keyword arguments forwarded unchanged to
        ``model.fit``.

    Returns
    -------
    The result of ``mean_absolute_error(targ_val, model.predict(predictors_val))``.
    """
    # Fit exactly once and forward all extras together. Branching separately on
    # ``args`` and ``kwargs`` would refit the model and silently discard the
    # positional extras on the second fit.
    model.fit(predictors_train, targ_train, *args, **kwargs)
    preds_val = model.predict(predictors_val)
    return mean_absolute_error(targ_val, preds_val)

Load the Iowa data and separate the target (y) from the remaining columns (X), which can be used to build a model that predicts y.

In [43]:
# Directory that holds the Iowa data files used below.
main_file_path = './data/'

# Load the training data and keep only the rows that have a sale price.
data = pd.read_csv(main_file_path + 'train.csv')
data = data.dropna(axis=0, subset=['SalePrice'])

# Load the test data.
test = pd.read_csv(main_file_path + 'test.csv')

# Target column vs. every other column of the training data.
y = data['SalePrice']
X = data.drop('SalePrice', axis=1)

In [44]:
# Number of missing values in each column of the training data.
missing_counts = data.isnull().sum()
print(missing_counts)

Id                  0
MSSubClass          0
MSZoning            0
LotFrontage       259
LotArea             0
Street              0
Alley            1369
LotShape            0
LandContour         0
Utilities           0
LotConfig           0
LandSlope           0
Neighborhood        0
Condition1          0
Condition2          0
BldgType            0
HouseStyle          0
OverallQual         0
OverallCond         0
YearBuilt           0
YearRemodAdd        0
RoofStyle           0
RoofMatl            0
Exterior1st         0
Exterior2nd         0
MasVnrType          8
MasVnrArea          8
ExterQual           0
ExterCond           0
Foundation          0
                 ... 
BedroomAbvGr        0
KitchenAbvGr        0
KitchenQual         0
TotRmsAbvGrd        0
Functional          0
Fireplaces          0
FireplaceQu       690
GarageType         81
GarageYrBlt        81
GarageFinish       81
GarageCars          0
GarageArea          0
GarageQual         81
GarageCond         81
PavedDrive

Create one-hot encoded categorical variables

In [45]:
# One-hot encode the predictors of both data sets.
one_hot_encoded_training, one_hot_encoded_test = pd.get_dummies(X), pd.get_dummies(test)

# Keep only the columns that both encoded frames share (inner join on the column axis).
final_train, final_test = one_hot_encoded_training.align(
    one_hot_encoded_test,
    join='inner',
    axis=1,
)

Excluding categorical variables in order to use Imputer

I'm still a little confused as to why one would want to mark these columns as "was missing"

In [46]:
def impute_X(X):
    """Impute the non-object columns of ``X`` with a default ``Imputer``.

    Object-typed columns are dropped first and the remaining columns are
    copied, so the frame passed in is not modified. A fresh ``Imputer()`` is
    created and fitted on every call.

    Returns the result of ``Imputer().fit_transform`` on those columns.
    """
    numeric_columns = X.select_dtypes(exclude=['object']).copy()

    # The optional step that would add a "<column>_was_missing" indicator for
    # each column containing nulls is intentionally left disabled here.

    return Imputer().fit_transform(numeric_columns)

# Impute the training predictors and give the test predictors the same treatment.
# NOTE(review): each impute_X call fits its own Imputer, so the test data is
# filled using statistics from the test set rather than the training set.
X, test_X = impute_X(final_train), impute_X(final_test)

Split the data into training and validation data.

In [47]:
# Hold out part of the data for validation; random_state is fixed at 0.
train_X, val_X, train_y, val_y = train_test_split(
    X,
    y,
    random_state=0,
)

Here is the code for a Random Forest, including a search for a good value of the maximum number of leaf nodes

In [48]:
#opti_max_leaf = []
#for max_leaf_nodes in [5, 25, 50, 75, 100, 125, 150, 200, 250, 500, 1000, 2500, 5000]:
#    iowa_model = RandomForestRegressor(max_leaf_nodes=max_leaf_nodes, random_state=0)
#    my_mae = get_mae(train_X, val_X, train_y, val_y, iowa_model)
#    print("Max leaf nodes: %d \t\t Mean Absolute Error: %d" %(max_leaf_nodes, my_mae))
#    opti_max_leaf.append((max_leaf_nodes, my_mae))

#opti_max_leaf = sorted(opti_max_leaf, key=lambda leaf_num: leaf_num[1])

#iowa_model = RandomForestRegressor(max_leaf_nodes=opti_max_leaf[0][0], random_state=0)
#iowa_model.fit(train_X, train_y)

Code for using XGBoost

In [67]:
# Build the XGBoost regressor; it is not fitted in this cell.
iowa_model = XGBRegressor(
    n_estimators=1000,
    learning_rate=0.08,
)
# silent=True can be added later so you don't have to see so much output
# iowa_model.fit(train_X, train_y, verbose=False)

Check for mean absolute error

In [68]:
# Fit the model through get_mae and report its MAE on the validation split.
# NOTE(review): the eval_set passed for early stopping is the same split the
# MAE is computed on.
validation_mae = get_mae(
    train_X, val_X, train_y, val_y, iowa_model,
    early_stopping_rounds=8,
    eval_set=[(val_X, val_y)],
    verbose=False,
)
print(validation_mae)

15900.888741438355


Submission generation code

In [69]:
# Predict sale prices for the imputed test predictors.
predicted_prices = iowa_model.predict(test_X)
# Print the predictions as a sanity check that they look sensible.
print(predicted_prices)

# Pair each test Id with its predicted price and write the submission file
# (any filename would do; 'submission.csv' is used here).
submission_columns = {'Id': test.Id, 'SalePrice': predicted_prices}
my_submission = pd.DataFrame(submission_columns)
my_submission.to_csv('submission.csv', index=False)

[122183.87 165347.34 182069.83 ... 147391.1  123465.55 241406.47]
