# XGBoost model

## Imports

In [12]:
# With help from https://machinelearningmastery.com/xgboost-for-regression/

import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import cross_val_score, train_test_split
from sklearn.model_selection import RepeatedKFold, GridSearchCV
from sklearn.metrics import mean_squared_error
from xgboost import XGBRegressor


In [13]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


### Data

In [14]:
# load the dataset
data = pd.read_csv('/content/drive/Othercomputers/My MacBook Air/Dropbox/ga/capstone/data/merged_data.csv', index_col=0)
data.head()


Unnamed: 0,wave_height,dominant_period,avg_period,water_temp,dominant_wave_direction_sin,dominant_wave_direction_cos,wave_height_3.0_h,dominant_period_3.0_h,avg_period_3.0_h,water_temp_3.0_h,...,dominant_period_12.0_h,avg_period_12.0_h,water_temp_12.0_h,dominant_wave_direction_sin_12.0_h,dominant_wave_direction_cos_12.0_h,score,wind_speed,gust_speed,wind_direction_sin,wind_direction_cos
2014-12-12 06:00:00,2.56,11.76,9.82,25.5,-0.681998,0.731354,2.5,13.33,9.83,25.6,...,14.29,9.75,25.6,-0.681998,0.731354,6.44,0.6,1.05,0.410719,0.911762
2014-12-12 06:30:00,2.46,13.33,9.98,25.5,-0.681998,0.731354,2.69,12.5,9.86,25.6,...,13.33,9.98,25.6,-0.71934,0.694658,7.266667,0.28,0.82,0.671721,-0.740805
2014-12-12 07:00:00,2.2,12.5,9.79,25.6,-0.681998,0.731354,2.5,13.33,9.81,25.6,...,14.29,9.83,25.6,-0.642788,0.766044,7.223333,0.2,0.78,0.972776,-0.231748
2014-12-12 07:30:00,2.33,11.76,9.75,25.6,-0.681998,0.731354,2.86,13.33,10.22,25.6,...,13.33,9.97,25.6,-0.669131,0.743145,15.18,0.28,0.7,0.829038,0.559193
2014-12-12 08:00:00,2.48,11.76,10.13,25.6,-0.529919,0.848048,2.52,13.33,9.6,25.5,...,13.33,10.4,25.6,-0.71934,0.694658,4.786667,0.52,0.9,-0.92321,0.384295


## X, y

In [15]:
# split data into input and output columns
X = data.drop(columns="score")
y = data["score"]

## Standard Scaling

In [16]:
ss = StandardScaler()
X_sc = ss.fit_transform(X)

## Train, test split

In [17]:
# train test split
X_train, X_test, y_train, y_test = train_test_split(X_sc, y, random_state=1331)

## Baseline Model RSME

In [18]:
y_baseline_preds = np.full_like(y, y.mean()) 
mean_squared_error(y, y_baseline_preds, squared=False)

3.1675113889842983

## XGBoost model, no hyperparameters. KFold cross validation

In [19]:
# define model
model = XGBRegressor()
# define model evaluation method
cv = RepeatedKFold(n_splits=10, n_repeats=3, random_state=1)
# evaluate model
scores = cross_val_score(
    model, X_train, y_train, scoring="neg_root_mean_squared_error", cv=cv, n_jobs=-1
)
# force scores to be positive
scores = np.absolute(scores)
print("Mean RMSE: %.3f (%.3f)" % (scores.mean(), scores.std()))


Mean RMSE: 2.980 (0.421)


## XGBoost model, GridSearchCV

Grid search over:
- n_estimators
- max_depth
- learning_rate

In [20]:
# define model
model = XGBRegressor(objective='reg:squarederror')


In [22]:
params = {
    'max_depth': [1, 3, 5],
    'n_estimators': [2_000, 5_000, 10_000],
    'learning_rate': [.1, .01, .001, .0001] 
}

In [23]:
gs_result = GridSearchCV(model, params, n_jobs=-1, scoring='neg_root_mean_squared_error')
gs_result.fit(X_train, y_train)

Best score: -2.887165790545826. Used these parameters: {'learning_rate': 0.001, 'max_depth': 1, 'n_estimators': 10000}
-3.134372 (0.192867) with: {'learning_rate': 0.1, 'max_depth': 1, 'n_estimators': 2000}
-3.352286 (0.234245) with: {'learning_rate': 0.1, 'max_depth': 1, 'n_estimators': 5000}
-3.491580 (0.257461) with: {'learning_rate': 0.1, 'max_depth': 1, 'n_estimators': 10000}
-3.159217 (0.222371) with: {'learning_rate': 0.1, 'max_depth': 3, 'n_estimators': 2000}
-3.159215 (0.222370) with: {'learning_rate': 0.1, 'max_depth': 3, 'n_estimators': 5000}
-3.159215 (0.222370) with: {'learning_rate': 0.1, 'max_depth': 3, 'n_estimators': 10000}
-3.038870 (0.114014) with: {'learning_rate': 0.1, 'max_depth': 5, 'n_estimators': 2000}
-3.038870 (0.114014) with: {'learning_rate': 0.1, 'max_depth': 5, 'n_estimators': 5000}
-3.038869 (0.114013) with: {'learning_rate': 0.1, 'max_depth': 5, 'n_estimators': 10000}
-2.893817 (0.127430) with: {'learning_rate': 0.01, 'max_depth': 1, 'n_estimators': 200

In [27]:
# Result summary
print(f"Best score: {np.absolute(gs_result.best_score_)}. Used these parameters: {gs_result.best_params_}")

# This part copied from machine learning mastery prints out all results to check where improvements can be made
means = np.absolute(gs_result.cv_results_['mean_test_score'])
stds = gs_result.cv_results_['std_test_score']
params = gs_result.cv_results_['params']
for mean, stdev, param in zip(means, stds, params):
    print("%f (%f) with: %r" % (mean, stdev, param))

Best score: 2.887165790545826. Used these parameters: {'learning_rate': 0.001, 'max_depth': 1, 'n_estimators': 10000}
3.134372 (0.192867) with: {'learning_rate': 0.1, 'max_depth': 1, 'n_estimators': 2000}
3.352286 (0.234245) with: {'learning_rate': 0.1, 'max_depth': 1, 'n_estimators': 5000}
3.491580 (0.257461) with: {'learning_rate': 0.1, 'max_depth': 1, 'n_estimators': 10000}
3.159217 (0.222371) with: {'learning_rate': 0.1, 'max_depth': 3, 'n_estimators': 2000}
3.159215 (0.222370) with: {'learning_rate': 0.1, 'max_depth': 3, 'n_estimators': 5000}
3.159215 (0.222370) with: {'learning_rate': 0.1, 'max_depth': 3, 'n_estimators': 10000}
3.038870 (0.114014) with: {'learning_rate': 0.1, 'max_depth': 5, 'n_estimators': 2000}
3.038870 (0.114014) with: {'learning_rate': 0.1, 'max_depth': 5, 'n_estimators': 5000}
3.038869 (0.114013) with: {'learning_rate': 0.1, 'max_depth': 5, 'n_estimators': 10000}
2.893817 (0.127430) with: {'learning_rate': 0.01, 'max_depth': 1, 'n_estimators': 2000}
2.934245

In [25]:
xgb_preds = gs_result.best_estimator_.predict(X_test)

In [26]:
mean_squared_error(y_test, xgb_preds)

6.59736082587745