In [None]:
import cleaning as cl

# Single random seed shared by the train/validation split and the regressors
# in the `models` dict, so repeated runs give the same results.
seed = 1337

## Read in data

In [None]:
import pandas as pd

# Training rows (which include the SalePrice target) and the rows that the
# final submission predictions are made for.
train_df = pd.read_csv('train.csv')
X_eval = pd.read_csv('test.csv')

# Separate the regression target from the feature columns.
y = train_df['SalePrice']
X = train_df.drop(columns=['SalePrice'])

## Clean up data for model fitting

In [None]:
from sklearn.model_selection import train_test_split

# Encode the training and submission features together.
# NOTE(review): create_one_hot_encoding lives in the local `cleaning` module
# and is not shown here; presumably it one-hot encodes categoricals so both
# frames end up with matching columns — confirm. It rebinds X and X_eval, so
# re-running this cell without re-reading the data encodes the frames again.
X, X_eval = cl.create_one_hot_encoding(X, y, X_eval)

# Hold out 20% of the training rows for validation, seeded for repeatability.
X_t, X_v, y_t, y_v = train_test_split(
    X, y, test_size=0.2, random_state=seed
)

## Create models

In [None]:
from sklearn.ensemble import RandomForestRegressor
from xgboost import XGBRegressor
from sklearn.ensemble import GradientBoostingRegressor

# Three tree-ensemble regressors keyed by a short name; later cells use the
# same keys to look up pipelines and per-model fit parameters. Each model uses
# 100 estimators and the shared seed, and the two scikit-learn models are
# configured for absolute-error objectives ('mae' / 'lad').
# NOTE(review): newer scikit-learn releases spell these options
# 'absolute_error' — confirm against the installed version before upgrading.
models = dict(
    rfr=RandomForestRegressor(
        n_estimators=100,
        criterion='mae',
        n_jobs=-1,
        random_state=seed,
    ),
    xgbr=XGBRegressor(
        n_estimators=100,
        random_state=seed,
    ),
    skgbr=GradientBoostingRegressor(
        loss='lad',
        n_estimators=100,
        random_state=seed,
    ),
)

## Wrap models in imputation pipelines

In [None]:
from sklearn.pipeline import make_pipeline
from sklearn.impute import SimpleImputer

# Put a default-configured SimpleImputer in front of every regressor, keeping
# the same keys as `models`.
estimators = {
    label: make_pipeline(SimpleImputer(), regressor)
    for label, regressor in models.items()
}

## Fit estimators to training data

In [None]:
# Extra keyword arguments for each pipeline's fit(), keyed like `estimators`.
# The 'xgbregressor__' prefix routes a parameter to the XGBRegressor step of
# the pipeline built by make_pipeline.
# NOTE(review): eval_set is handed over as raw .values, so it does not pass
# through the pipeline's SimpleImputer the way the training data does.
# NOTE(review): early_stopping_rounds (100) equals the model's n_estimators
# (100), so early stopping is unlikely to ever cut training short; the same
# validation split is also used for scoring later — confirm this is intended.
fit_params = dict(
    rfr={},
    xgbr=dict(
        xgbregressor__eval_set=[(X_v.values, y_v.values)],
        xgbregressor__eval_metric='mae',
        xgbregressor__early_stopping_rounds=100,
        xgbregressor__verbose=False,
    ),
    skgbr={},
)

In [None]:
# Train every pipeline on the training split, forwarding its model-specific
# fit keyword arguments (empty for the scikit-learn models).
for name in fit_params:
    estimators[name].fit(X_t, y_t, **fit_params[name])

In [None]:
from sklearn.metrics import mean_absolute_error

# Report the hold-out mean absolute error of each fitted pipeline, labelled
# with its model key so the scores can be told apart.
for name, estimator in estimators.items():
    y_pred = estimator.predict(X_v)

    # scikit-learn metrics take (y_true, y_pred); MAE is symmetric, so the
    # value is the same as with the arguments swapped.
    print(name, mean_absolute_error(y_v, y_pred))

## Get cross validation scores

### Make predictions on test set

In [None]:
# Predict on the submission features with the fitted XGBoost pipeline.
# `p` was never defined anywhere in this notebook, so this cell failed on a
# fresh kernel; 'xgbr' is used because the submission file written in the next
# cell is named xgbr_submission.csv.
y_pred_test = estimators['xgbr'].predict(X_eval)

In [None]:
# Two-column submission table: integer Id plus the predicted SalePrice,
# written to CSV without the DataFrame index.
out = pd.DataFrame(
    {
        'Id': X_eval['Id'].astype(int),
        'SalePrice': y_pred_test,
    }
)
out.to_csv('xgbr_submission.csv', index=False)

### Make partial dependence plots

In [None]:
from sklearn.ensemble.partial_dependence import plot_partial_dependence
from sklearn.ensemble import GradientBoostingRegressor

In [None]:
# Standalone gradient-boosting model that the partial dependence cells plot
# against. random_state=seed matches the models in `models`, so re-running the
# notebook reproduces the same fit.
# NOTE(review): this is fitted on X_t directly, without the SimpleImputer used
# in the pipelines — confirm X_t contains no missing values at this point.
bgbt = GradientBoostingRegressor(n_estimators=300, loss='lad',
                                 random_state=seed)
bgbt.fit(X_t, y_t)

In [None]:
# Columns to inspect, and the position of each one within X_t's columns.
important_features = [
    'LotArea',
    'BedroomAbvGr',
    'OverallCond',
    'TotRmsAbvGrd',
]
features_indices = list(map(X_t.columns.get_loc, important_features))

In [None]:
# Partial dependence for the first two features of interest. Features are
# given as their column positions in X_t, and feature_names is the full column
# list so each position is labelled with its real column name. The previous
# call passed literal indices [0, 1] with the four-name list as feature_names,
# which plotted columns 0 and 1 of X_t under the wrong labels.
plot_partial_dependence(bgbt, X_t, features_indices[:2],
                        feature_names=list(X_t.columns))

In [None]:
# Partial dependence for the last two features of interest. Features are
# given as their column positions in X_t, and feature_names is the full column
# list so each position is labelled with its real column name. The previous
# call passed literal indices [2, 3] with the four-name list as feature_names,
# which plotted columns 2 and 3 of X_t under the wrong labels.
plot_partial_dependence(bgbt, X_t, features_indices[2:],
                        feature_names=list(X_t.columns))