In [None]:
import xgboost as xgb
import matplotlib.pyplot as plt
from matplotlib import pyplot, figure
from xgboost import XGBClassifier, plot_importance
from pandas import read_csv
from keras.losses import mean_squared_error
from sklearn.model_selection import train_test_split
from numpy import sqrt
import bayes_opt
from bayes_opt import BayesianOptimization

In [None]:
# Load the raw table and replace every missing value with 0.
df = read_csv('ucsbdata.csv').fillna(0)

# Boundary dates for the split. Both comparisons below are strict, so a row
# whose Index equals a boundary exactly ends up in neither subset.
validation_min = '2007-08-30'
initial_start = '2008-08-30'

# Validation window: rows strictly between the two boundaries.
validation_full = df[df['Index'] > validation_min]
validation = validation_full[validation_full['Index'] < initial_start]

# Training/testing pool: rows strictly after initial_start.
dataset = df[df['Index'] > initial_start]

# Features are every column except the predicted value (R) and the date (Index).
non_feature_cols = ['R', 'Index']
x = dataset.drop(columns=non_feature_cols)
y = dataset['R']
a = validation.drop(columns=non_feature_cols)
b = validation['R']

In [None]:
# Randomly hold out 20% of the rows in x / y for testing. The fixed
# random_state makes the split reproducible across kernel restarts.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.20,
    random_state=123,
)
# No xgb.DMatrix is built here: the previous version constructed one from the
# training split and discarded it immediately. The model is fitted straight
# from x_train / y_train, so that object was never used.

In [None]:
# Bayesian hyper-parameter search for the XGBoost regressor.
#
# BayesianOptimization needs two things:
#   * pbounds - a dict mapping each tunable parameter name to a
#               (lower, upper) tuple describing its search range;
#   * f       - a function that receives those parameters as keyword
#               arguments and returns one float, which the optimizer MAXIMISES.

# Search ranges. Each range contains the value the baseline model uses, so the
# baseline configuration is inside the search space.
# NOTE(review): the range widths are first guesses - revisit them once the
# scale of R and the first search results are known.
pbounds = {
    'colsample_bylevel': (0.5, 1.0),
    'colsample_bynode': (0.5, 1.0),
    'colsample_bytree': (0.5, 1.0),
    'gamma': (0.0, 0.5),
    'learning_rate': (0.01, 0.3),
    'min_child_weight': (1.0, 10.0),
    'n_estimators': (50, 500),
    'reg_alpha': (0.0, 1.0),
    'reg_lambda': (0.0, 5.0),
    'subsample': (0.5, 1.0),
}


def xgb_neg_rmse(**params):
    """Fit an XGBRegressor with ``params`` and return minus its validation RMSE.

    Parameters
    ----------
    **params : float
        One value per key of ``pbounds``, as proposed by the optimizer.

    Returns
    -------
    float
        The negated RMSE on the validation frame (``a`` / ``b``). It is negated
        because the optimizer maximises its target while RMSE should be
        minimised.

    Notes
    -----
    The candidate is trained on ``x_train`` / ``y_train`` and scored on the
    validation frame rather than on ``x_test``, so the test split is not used
    for model selection.
    """
    # The optimizer proposes floats for every dimension; the number of trees
    # has to be an integer.
    params['n_estimators'] = int(round(params['n_estimators']))
    candidate = xgb.XGBRegressor(objective='reg:squarederror',
                                 random_state=0,
                                 n_jobs=1,
                                 **params)
    candidate.fit(x_train, y_train)
    residuals = b.values - candidate.predict(a)
    return -float(sqrt((residuals ** 2).mean()))


optimizer = BayesianOptimization(
    f=xgb_neg_rmse,
    pbounds=pbounds,
    random_state=1,
)
# Creating the optimizer does not start the search. Run it with
# optimizer.maximize(init_points=..., n_iter=...); the best point found is
# then available as optimizer.max.

# Baseline model: XGBoost regressor with squared-error loss.
#
# base_score, booster and max_depth are not passed, so the library defaults
# apply. The deprecated aliases silent / nthread / seed are no longer passed
# either: they were all None (i.e. "not set"), and their replacements
# verbosity / n_jobs / random_state are given explicitly below.
xgb_model = xgb.XGBRegressor(
    colsample_bylevel=1,
    colsample_bynode=1,
    colsample_bytree=1,
    gamma=0,
    importance_type='weight',
    learning_rate=0.1,
    max_delta_step=0,
    min_child_weight=1,
    missing=None,
    n_estimators=100,
    n_jobs=1,
    objective='reg:squarederror',
    random_state=0,
    reg_alpha=0,
    reg_lambda=1,
    scale_pos_weight=1,
    subsample=1,
    verbosity=1,
)
# Train on the training split; the fitted estimator is the cell's output.
xgb_model.fit(x_train, y_train)

In [None]:
# Root-mean-squared error of the model on the held-out test split
# (goal: get below 0.005).
#
# Computed directly with NumPy. The Keras loss of the same name is meant for
# backend tensors and does not reliably give a plain float for numpy.sqrt,
# which is all that is needed here.
prediction = xgb_model.predict(x_test)
residuals = y_test.values - prediction
rmse = sqrt((residuals ** 2).mean())
rmse

In [None]:
# Bar chart of the 20 highest-ranked features of the fitted model, drawn on a
# 20 x 20 inch figure.
fig, ax = plt.subplots(figsize=(20, 20))
ax = xgb.plot_importance(xgb_model, max_num_features=20, ax=ax)

In [None]:
# Overlay the model's predictions and the true targets for the validation
# window. The first validation row is skipped in both features and targets.
validation_point, validation_actual = a.iloc[1:], b.iloc[1:]
predicted_actual = xgb_model.predict(validation_point)

# Separate figure/axes names so the importance plot's fig / ax stay intact.
val_fig, val_ax = plt.subplots(figsize=(15, 10))
val_ax.plot(predicted_actual, label='prediction')
val_ax.plot(validation_actual.values, label='actual values')
val_ax.legend()

In [None]:
#Compute R^2 for model