In [None]:
import pandas as pd
import numpy as np
from sklearn.metrics import mean_absolute_error

import cleaning as cl

### Read in data

In [None]:
# Load the training table; its 'SalePrice' column is split off as the target
# in the next cell. The path is relative to the notebook's working directory.
train_df = pd.read_csv('train.csv')
# Load the evaluation table that the final predictions are generated for.
X_eval = pd.read_csv('test.csv')

### Split training data into X and y

In [None]:
# Target vector: the sale price column of the training table.
y = train_df['SalePrice']
# Feature matrix: every remaining column (drop returns a new frame, so
# train_df itself is left untouched).
X = train_df.drop(columns=['SalePrice'])

## Use pipelines

In [None]:
from xgboost import XGBRegressor

# 1000 is an upper bound on boosting rounds: the fit cell below passes
# early-stopping arguments to the regressor.
model = XGBRegressor(n_estimators=1000)

# Presumably returns the pipeline wrapping `model` (it exposes named_steps,
# fit and predict below), the processed X / y / X_eval, and a train (_t) /
# validation (_v) split -- TODO confirm against cleaning.py.
# NOTE(review): X, y and X_eval are rebound here, so this cell is not
# idempotent; re-run the loading cells before executing it a second time.
(
    p,
    X, y, X_eval,
    X_t, X_v, y_t, y_v,
) = cl.prepare_data_and_pipeline(X, y, X_eval, model)

In [None]:
# Display the pipeline's step names; the fit cell below addresses the
# regressor step through the 'xgbregressor__' keyword prefix.
p.named_steps.keys()

In [None]:
# Fit-time keyword arguments for the pipeline's 'xgbregressor' step.
# Kept identical to the explicit keywords of the p.fit(...) call in the next
# cell (plain arrays for the eval set, verbose output off), so that
# p.fit(X_t, y_t, **xgb_fit_params) is an equivalent way to run the fit.
xgb_fit_params = {'xgbregressor__eval_set': [(X_v.values, y_v.values)],
                  'xgbregressor__eval_metric': 'mae',
                  'xgbregressor__early_stopping_rounds': 100,
                  'xgbregressor__verbose': False}

In [None]:
# Fit the pipeline on the training split. Keywords prefixed with
# 'xgbregressor__' are forwarded to that step's fit(): stop after 100 rounds
# without improvement in MAE on the validation split, without per-round logs.
# NOTE(review): the eval set is handed over as raw arrays; if earlier pipeline
# steps transform X, the validation data reaches XGBoost untransformed --
# confirm against the steps listed by p.named_steps.
p.fit(
    X_t,
    y_t,
    xgbregressor__early_stopping_rounds=100,
    xgbregressor__eval_metric='mae',
    xgbregressor__eval_set=[(X_v.values, y_v.values)],
    xgbregressor__verbose=False,
)

In [None]:
# Predict with the fitted pipeline on X_v (presumably the hold-out split
# returned by cl.prepare_data_and_pipeline); scored against y_v below.
y_pred = p.predict(X_v)

In [None]:
# Mean absolute error of the pipeline's predictions on the validation split.
# sklearn metrics take (y_true, y_pred) in that order; MAE happens to be
# symmetric, but keeping the documented order avoids silent errors if this
# line is later switched to an asymmetric metric.
print(mean_absolute_error(y_v, y_pred))

### Make partial dependence plots

In [None]:
from sklearn.ensemble.partial_dependence import plot_partial_dependence
from sklearn.ensemble import GradientBoostingRegressor

In [None]:
# Separate gradient-boosting model used only for the partial dependence plots
# below (the XGBoost pipeline `p` is not used for them).
# loss='lad' optimises absolute error, in line with the MAE reported above.
# random_state is pinned so the fitted trees, and therefore the plots, are
# reproducible across kernel restarts.
bgbt = GradientBoostingRegressor(n_estimators=300, loss='lad', random_state=0)
bgbt.fit(X_t, y_t)

In [None]:
# Columns to inspect in the partial dependence plots.
important_features = [
    'LotArea',
    'BedroomAbvGr',
    'OverallCond',
    'TotRmsAbvGrd',
]
# Positional index of each of those columns within X_t, in the same order.
features_indices = list(map(X_t.columns.get_loc, important_features))

In [None]:
# Partial dependence for the first two important features.
# `features` must hold positions in X_t and `feature_names` must name every
# column of X_t (feature_names[i] labels column i). Passing [0, 1] together
# with the four-element important_features list plotted columns 0 and 1 of
# X_t while labelling them 'LotArea' / 'BedroomAbvGr'.
plot_partial_dependence(bgbt, X_t, features_indices[:2],
                        feature_names=list(X_t.columns))

In [None]:
# Partial dependence for the last two important features.
# `features` must hold positions in X_t and `feature_names` must name every
# column of X_t (feature_names[i] labels column i). Passing [2, 3] together
# with the four-element important_features list plotted columns 2 and 3 of
# X_t while labelling them 'OverallCond' / 'TotRmsAbvGrd'.
plot_partial_dependence(bgbt, X_t, features_indices[2:],
                        feature_names=list(X_t.columns))

### Make predictions on test set

In [None]:
# Predict with the fitted pipeline on the evaluation set; these values are
# written out as the 'SalePrice' column of the submission file below.
y_pred_test = p.predict(X_eval)

In [None]:
# Assemble the submission table: one row per evaluation record, pairing its
# integer Id with the predicted sale price.
out = pd.DataFrame({
    'Id': X_eval['Id'].astype(int),
    'SalePrice': y_pred_test,
})
# index=False keeps the DataFrame index out of the CSV, leaving exactly the
# two columns above.
out.to_csv('xgbr_submission.csv', index=False)