In [None]:
import numpy as np

import cleaning as cl
import tuning as tn

# Shared random seed; passed as `random_state` to estimators built in later cells
# so that repeated runs of the notebook give the same models.
seed = 1337

## Read in data

In [None]:
import pandas as pd

# Load the labelled training rows and the rows that are scored later for the
# submission file.
train_df = pd.read_csv('train.csv')
X_eval = pd.read_csv('test.csv')

# Separate the target column from the remaining feature columns.
y = train_df['SalePrice']
X = train_df.drop('SalePrice', axis=1)

## Clean up data for model fitting

In [None]:
from sklearn.model_selection import train_test_split

# Encode the training and evaluation features with the local `cleaning` helper
# (by its name, a one-hot encoding step); both frames are replaced by the result.
# NOTE(review): presumably the helper keeps the two frames' columns aligned —
# confirm in cleaning.py.
X, X_eval = cl.create_one_hot_encoding(X, y, X_eval)

# Hold out part of the training data. The partial-dependence cells at the end of
# the notebook fit and plot on X_t / y_t, which no cell defined, so a fresh
# "Restart & Run All" stopped there with a NameError.
X_t, X_v, y_t, y_v = train_test_split(X, y, random_state=seed)

## Create models

In [None]:
from sklearn.ensemble import RandomForestRegressor
from xgboost import XGBRegressor
from sklearn.ensemble import GradientBoostingRegressor

# Candidate regressors keyed by a short alias. Each is built with 100
# estimators and the shared seed; the two scikit-learn models are configured
# with the 'mae' criterion and the 'lad' loss respectively.
# NOTE(review): scikit-learn 1.0 renamed both spellings to 'absolute_error' and
# later removed the old ones — update these strings if the environment is upgraded.
models = dict(
    rfr=RandomForestRegressor(
        n_estimators=100,
        criterion='mae',
        n_jobs=-1,
        random_state=seed,
    ),
    xgbr=XGBRegressor(
        n_estimators=100,
        random_state=seed,
    ),
    skgbr=GradientBoostingRegressor(
        loss='lad',
        n_estimators=100,
        random_state=seed,
    ),
)

## Wrap models in pipelines to build estimators

In [None]:
from sklearn.pipeline import make_pipeline
from sklearn.impute import SimpleImputer

# Put a default SimpleImputer in front of every model. make_pipeline names each
# step after its lowercased class name, which is where prefixes such as
# 'xgbregressor__' in the search parameters come from.
estimators = dict(
    (alias, make_pipeline(SimpleImputer(), regressor))
    for alias, regressor in models.items()
)

## Steps for auto param search

1. Use `RandomizedSearchCV` to find best params over a range.

2. Use the results of `RandomizedSearchCV` to shrink the region of the parameter grid that is searched next.
    + Each new search should be centred on the best parameters found so far, and the search region should be reduced by a fixed factor.
    + Add a parameter specifying how many rounds of `RandomizedSearchCV` to run, each over the reduced parameter grid.

3. Return the parameters, the score, and the model that is the best fit.

In [None]:
# Search ranges handed to the tuning helper, one entry per hyperparameter of the
# XGBoost pipeline step.
# NOTE(review): the field meanings (bounds, number of candidates, log spacing,
# integer rounding) are inferred from the key names — confirm in tuning.py.
param_ranges = [
    dict(name='xgbregressor__n_estimators',
         low=1e2, high=1e3, n=10,
         is_log=False, is_int=True),
    dict(name='xgbregressor__learning_rate',
         low=1e-2, high=1e-1, n=10,
         is_log=True, is_int=False),
]

In [None]:
# Run the iterated hyperparameter search from the local `tuning` module on the
# XGBoost pipeline.
# NOTE(review): every argument is positional, so the meaning of the trailing
# values (10, the scoring string, 5, 3, .5) depends on the helper's signature in
# tuning.py — confirm there; keyword arguments would make this call self-describing.
best_params = tn.find_hyperparams_iterated(
    estimators['xgbr'],
    X,
    y,
    param_ranges,
    {'xgbregressor__eval_metric': 'mae'},
    10,
    'neg_mean_absolute_error',
    5,
    3,
    .5,
)

In [None]:
# Show the tuned parameters. A bare last expression uses the notebook's rich
# display instead of plain print() output.
best_params

### Make predictions on test set

In [None]:
# Refit an imputer + XGBoost pipeline on the full training data with the tuned
# hyperparameters, then predict the evaluation set.
# The tuned values are cast to built-in types: the search range for
# n_estimators is given as floats (1e2..1e3), and the value must reach XGBoost
# as an integer whatever numeric type the search returns.
# NOTE(review): the search passed {'xgbregressor__eval_metric': 'mae'} to the
# tuning helper; that setting is not repeated here — confirm this is intended.
y_pred_test = (
    make_pipeline(
        SimpleImputer(),
        XGBRegressor(
            n_estimators=int(best_params['xgbregressor__n_estimators']),
            learning_rate=float(best_params['xgbregressor__learning_rate']),
            random_state=seed,
        ),
    )
    .fit(X, y)
    .predict(X_eval)
)

In [None]:
# Build the submission table (integer Id plus predicted SalePrice) and write it
# to disk without the DataFrame index.
out = pd.DataFrame({
    'Id': X_eval['Id'].astype(int),
    'SalePrice': y_pred_test,
})
out.to_csv('xgbr_submission.csv', index=False)

### Make partial dependence plots

In [None]:
from sklearn.ensemble.partial_dependence import plot_partial_dependence
from sklearn.ensemble import GradientBoostingRegressor

In [None]:
# Separate gradient-boosting model used only for the partial-dependence plots.
# random_state is set to the shared seed, like every other estimator in this
# notebook, so the fitted model and its plots are repeatable between runs.
# NOTE(review): X_t / y_t must already exist in the kernel (a training split of
# X, y); this cell does not create them. Unlike the pipelines built earlier, no
# imputer is applied here — confirm X_t has no missing values.
bgbt = GradientBoostingRegressor(n_estimators=300, loss='lad', random_state=seed)
bgbt.fit(X_t, y_t)

In [None]:
# Features to inspect, followed by their column positions in X_t
# (Index.get_loc raises KeyError for a name that is not a column).
important_features = [
    'LotArea',
    'BedroomAbvGr',
    'OverallCond',
    'TotRmsAbvGrd',
]
features_indices = list(map(X_t.columns.get_loc, important_features))

In [None]:
# Partial dependence for the first two important features.
# `features` holds column positions in X_t, and `feature_names` must name every
# column of X_t. The previous call passed [0, 1] with the four-item
# important_features list, which plotted the first two columns of X_t under the
# wrong labels.
plot_partial_dependence(bgbt, X_t, features_indices[:2],
                        feature_names=list(X_t.columns));

In [None]:
# Partial dependence for the last two important features.
# `features` holds column positions in X_t, and `feature_names` must name every
# column of X_t. The previous call passed [2, 3] with the four-item
# important_features list, which plotted the third and fourth columns of X_t
# under the wrong labels.
plot_partial_dependence(bgbt, X_t, features_indices[2:4],
                        feature_names=list(X_t.columns));