Imports

In [1]:
import pandas as pd
from sklearn.metrics import mean_absolute_error
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import Imputer
from xgboost import XGBRegressor
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV

Load the Iowa housing data and separate the target (`y`, the sale price) from the remaining columns (`X`), which are the candidate features for building a model that predicts `y`.

In [2]:
# Directory that holds the Iowa housing CSV files.
main_file_path = './data/'

# Training data. Rows without a sale price carry no target value, so they
# are discarded right after loading.
data = pd.read_csv(main_file_path + 'train.csv').dropna(axis=0, subset=['SalePrice'])

# Kaggle competition evaluation data (this file has no SalePrice to drop).
test = pd.read_csv(main_file_path + 'test.csv')

# Target vector and feature frame: X is everything except the target column.
y = data['SalePrice']
X = data.drop('SalePrice', axis=1)

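One-hot encode the categorical variables in both the training and the evaluation data, then keep only the columns the two encoded datasets have in common.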

In [3]:
# One-hot encode the training features and the final evaluation data.
one_hot_encoded_training = pd.get_dummies(X)
one_hot_encoded_test = pd.get_dummies(test)

# The two frames are encoded independently, so their dummy columns can differ.
# An inner join on the column axis keeps only the columns present in both.
final_train, final_test = one_hot_encoded_training.align(
    one_hot_encoded_test,
    join='inner',
    axis=1,
)


Split the training data into a training set and a validation set.

In [13]:
# Hold out part of the aligned training data for validation; the fixed
# random_state makes the split reproducible across runs.
train_X, val_X, train_y, val_y = train_test_split(
    final_train,
    y,
    random_state=0,
)

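Create an Imputer -> XGBRegressor pipeline. The XGBRegressor hyperparameters can be reached through the pipeline (using the `XGB__` step prefix), but passing fit parameters such as early stopping through the grid search does not work yet, so they are left out of the search below.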

In [77]:
# Two-step pipeline: impute missing values, then fit a gradient-boosted
# regressor. The step names ("Imputer", "XGB") are the prefixes used when
# addressing step parameters, e.g. "XGB__max_depth".
# NOTE(review): `Imputer` was removed from sklearn.preprocessing in newer
# scikit-learn releases (replaced by sklearn.impute.SimpleImputer) - confirm
# the installed version before re-running this notebook.
pipe = Pipeline(
    steps=[
        ("Imputer", Imputer()),
        ("XGB", XGBRegressor()),
    ]
)

In [78]:
# Search space for the "XGB" pipeline step: 3 x 4 x 3 = 36 candidate settings.
xgb_hyperparams = dict(
    XGB__n_estimators=[1000, 2000, 3000],
    XGB__learning_rate=[0.01, 0.03, 0.05, 0.07],
    XGB__max_depth=[3, 4, 5],
)

# Early-stopping settings intended for the "XGB" step's fit call, evaluated
# against the validation split. Currently unused: the grid-search cells do not
# pass these on.
fit_parameters = dict(
    XGB__early_stopping_rounds=5,
    XGB__eval_metric='mae',
    XGB__eval_set=[(val_X, val_y)],
    XGB__verbose=False,
)

In [79]:
# Exhaustive 5-fold cross-validated search over xgb_hyperparams, run in a
# single process and scored by negative mean squared error (higher is better).
# The fit parameters are intentionally not wired in here; note that the dict
# defined in this notebook is called `fit_parameters`, not `fit_params`.
grid_search = GridSearchCV(
    estimator=pipe,
    param_grid=xgb_hyperparams,
    scoring='neg_mean_squared_error',
    cv=5,
    n_jobs=1,
    verbose=3,
)

In [80]:
# Run the search on the training split only. `fit_parameters` is deliberately
# not forwarded, so no early stopping is applied during the search.
# NOTE(review): if the fit parameters are enabled later, GridSearchCV.fit takes
# them as extra keyword arguments (**fit_parameters) rather than a
# `fit_params=` keyword - confirm against the installed library versions.
grid_search.fit(
    train_X,
    train_y,
)

Fitting 5 folds for each of 36 candidates, totalling 180 fits
[CV] XGB__learning_rate=0.01, XGB__n_estimators=1000, XGB__max_depth=3 
[CV]  XGB__learning_rate=0.01, XGB__n_estimators=1000, XGB__max_depth=3, score=-698039540.968, total=   8.9s
[CV] XGB__learning_rate=0.01, XGB__n_estimators=1000, XGB__max_depth=3 


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    9.0s remaining:    0.0s


[CV]  XGB__learning_rate=0.01, XGB__n_estimators=1000, XGB__max_depth=3, score=-1704956517.51, total=  10.0s
[CV] XGB__learning_rate=0.01, XGB__n_estimators=1000, XGB__max_depth=3 


[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:   19.0s remaining:    0.0s


[CV]  XGB__learning_rate=0.01, XGB__n_estimators=1000, XGB__max_depth=3, score=-743081316.788, total=  10.7s
[CV] XGB__learning_rate=0.01, XGB__n_estimators=1000, XGB__max_depth=3 


KeyboardInterrupt: 

In [None]:
# Score the tuned model on the held-out validation split.
# `pipe` has not been fitted at this point in the notebook: GridSearchCV fits
# clones of the estimator it is given and leaves the original untouched, so
# calling pipe.predict here fails on a fresh top-to-bottom run. The fitted
# search object delegates predict() to the best pipeline, which it refits on
# the whole training split (refit is enabled by default).
y_preds = grid_search.predict(val_X)

# Mean absolute error between the true and predicted validation sale prices.
output_score = mean_absolute_error(val_y, y_preds)
print(output_score)

Before creating the pipeline, the hyperparameter space that will be searched in order to tune the hyperparameters of the XGBRegressor is defined.

In [None]:

# Fit the pipeline as constructed (imputer followed by an XGBRegressor with
# its constructor defaults) on the training split. This is independent of the
# grid search, which works on its own copies of the pipeline.
pipe.fit(
    train_X,
    train_y,
)


Submission generation code

In [None]:
# Predict sale prices for the aligned evaluation features with the fitted pipeline.
predicted_prices = pipe.predict(final_test)

# Print the predictions as a quick sanity check that the values look sensible.
print(predicted_prices)

# Assemble the submission: one row per evaluation house, keyed by its Id.
my_submission = pd.DataFrame({'Id': test['Id'], 'SalePrice': predicted_prices})

# Any filename would do; 'submission.csv' is used here. The index is omitted
# so the file contains only the Id and SalePrice columns.
my_submission.to_csv('submission.csv', index=False)