# XGBoost model

## Imports

In [1]:
# With help from https://machinelearningmastery.com/xgboost-for-regression/

import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import cross_val_score, train_test_split
from sklearn.model_selection import RepeatedKFold, GridSearchCV
from sklearn.metrics import mean_squared_error
from xgboost import XGBRegressor


In [2]:
# Colab-only: mount Google Drive at /content/drive so the dataset and the
# results folder referenced by later cells are reachable from this runtime.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


### Data

In [3]:
# Local data (alternative to the Colab load in the next cell; uncomment to run outside Colab)

# data = pd.read_csv('../data/merged_data.csv')
# data.drop(columns='Unnamed: 0', inplace=True)
# data.head()


In [4]:
# Colab data: read the merged dataset from the mounted Google Drive and drop
# the 'Unnamed: 0' column (presumably an index that was saved with the CSV).
data = pd.read_csv(
    '/content/drive/Othercomputers/My MacBook Air/Dropbox/ga/capstone/data/merged_data.csv'
).drop(columns='Unnamed: 0')
data.head()


Unnamed: 0,wave_height,dominant_period,avg_period,dominant_wave_direction_sin,dominant_wave_direction_cos,wave_height_1.0_h,dominant_period_1.0_h,avg_period_1.0_h,dominant_wave_direction_sin_1.0_h,dominant_wave_direction_cos_1.0_h,...,wave_height_9.0_h,dominant_period_9.0_h,avg_period_9.0_h,dominant_wave_direction_sin_9.0_h,dominant_wave_direction_cos_9.0_h,score,wind_speed,gust_speed,wind_direction_sin,wind_direction_cos
0,2.56,11.76,9.82,-0.681998,0.731354,2.52,13.33,9.6,-0.669131,0.743145,...,2.94,13.33,10.56,-0.71934,0.694658,6.44,0.6,1.05,0.410719,0.911762
1,2.46,13.33,9.98,-0.681998,0.731354,2.4,13.33,9.99,-0.669131,0.743145,...,2.8,14.29,10.01,-0.731354,0.681998,7.266667,0.28,0.82,0.671721,-0.740805
2,2.2,12.5,9.79,-0.681998,0.731354,2.56,11.76,9.82,-0.681998,0.731354,...,2.71,13.33,9.84,-0.681998,0.731354,7.223333,0.2,0.78,0.972776,-0.231748
3,2.33,11.76,9.75,-0.681998,0.731354,2.46,13.33,9.98,-0.681998,0.731354,...,2.67,13.33,9.67,-0.694658,0.71934,15.18,0.28,0.7,0.829038,0.559193
4,2.48,11.76,10.13,-0.529919,0.848048,2.2,12.5,9.79,-0.681998,0.731354,...,2.65,14.29,9.92,-0.71934,0.694658,4.786667,0.52,0.9,-0.92321,0.384295


## X, y

In [5]:
# Separate the target ("score") from the predictors (every other column).
y = data["score"]
X = data.drop(columns=["score"])

## Standard Scaling

In [6]:
# Standardise every feature column of X. The scaler is fitted on all rows of
# X, i.e. before any train/test split has been made.
ss = StandardScaler()
X_sc = ss.fit(X).transform(X)

## Train, test split

In [7]:
# Train/test split, followed by scaling that is fitted on the training rows only.
#
# The split is made on the raw features `X` rather than on the `X_sc` array
# computed earlier: `X_sc` was standardised with statistics taken from every
# row, so splitting it would let test-set information leak into preprocessing.
# With the same `random_state` and the same number of rows, the same rows end
# up in each split as before.
X_train_raw, X_test_raw, y_train, y_test = train_test_split(X, y, random_state=1331)

# Fit the scaler on the training features, then apply that same transform to
# both splits. `ss` is rebound so it always refers to the scaler that matches
# `X_train` / `X_test`.
ss = StandardScaler().fit(X_train_raw)
X_train = ss.transform(X_train_raw)
X_test = ss.transform(X_test_raw)

## Baseline Model RMSE

In [8]:
# Naive baseline: predict the overall mean score for every observation. The
# baseline is computed over the full dataset (train and test rows together).
# dtype=float keeps the mean from being truncated if `y` were integer-typed.
y_baseline_preds = np.full_like(y, y.mean(), dtype=float)
# RMSE is taken as sqrt(MSE) instead of passing `squared=False`, which newer
# scikit-learn releases deprecated and then removed.
np.sqrt(mean_squared_error(y, y_baseline_preds))

3.1675113889842983

## Create list of dicts to save model results

In [9]:
# Accumulator for result records; each model appends one dict per reported score.
all_model_results = list()

## XGBoost model, GridSearchCV

Grid search over:
- n_estimators
- max_depth
- learning_rate

In [10]:
# Placeholder for this model's results; it is rebound to a list of records
# once the RMSE values have been computed.
xgb_results = dict()

# Base estimator: XGBoost regressor with the squared-error objective.
model = XGBRegressor(objective='reg:squarederror')

# Search space: 3 tree depths x 3 ensemble sizes x 4 learning rates.
xgb_params = dict(
    max_depth=[1, 3, 5],
    n_estimators=[2_000, 5_000, 10_000],
    learning_rate=[.1, .01, .001, .0001],
)

# Exhaustive grid search (GridSearchCV defaults for CV and scoring), fitted on
# the training split only. `fit` returns the search object itself.
xgb_gs = GridSearchCV(estimator=model, param_grid=xgb_params).fit(X_train, y_train)

# Report the fitted search's score on each split.
print(f"R-squared score on train set: {xgb_gs.score(X_train, y_train)}")
print(f"R-squared score on test set: {xgb_gs.score(X_test, y_test)}")


R-squared score on train set: 0.7143623421682606
R-squared score on test set: 0.0806700368483535


In [11]:
# Hyper-parameter combination that the grid search selected as best; kept in
# a variable so it can be attached to the saved results later.
xgb_best_params = xgb_gs.best_params_
xgb_best_params

{'learning_rate': 0.001, 'max_depth': 3, 'n_estimators': 5000}

### XGBoost Regression RMSE

In [12]:
# Training RMSE of the refitted best estimator. Computed as sqrt(MSE) because
# `mean_squared_error(..., squared=False)` is deprecated and later removed in
# newer scikit-learn releases.
xgb_train_rmse = np.sqrt(
    mean_squared_error(y_train, xgb_gs.best_estimator_.predict(X_train))
)
xgb_train_rmse


1.719407060938767

In [13]:
# Held-out RMSE of the refitted best estimator. Computed as sqrt(MSE) because
# `mean_squared_error(..., squared=False)` is deprecated and later removed in
# newer scikit-learn releases.
xgb_test_rmse = np.sqrt(
    mean_squared_error(y_test, xgb_gs.best_estimator_.predict(X_test))
)
xgb_test_rmse


2.8711189584814503

### Save model result for final comparison

In [14]:
# One record per split, sharing the model name and the winning grid-search
# parameters. Key order is kept as-is: a DataFrame built from these records
# takes its column order from the keys.
xgb_results = [
    {
        "Model Name": "XGBoost Regression",
        "Score Type": score_type,
        "Score": score,
        "Observations": xgb_best_params,
    }
    for score_type, score in (
        ("Train RMSE", xgb_train_rmse),
        ("Test RMSE", xgb_test_rmse),
    )
]

# Add this model's records to the shared accumulator and display it.
all_model_results.extend(xgb_results)
all_model_results

[{'Model Name': 'XGBoost Regression',
  'Observations': {'learning_rate': 0.001,
   'max_depth': 3,
   'n_estimators': 5000},
  'Score': 1.719407060938767,
  'Score Type': 'Train RMSE'},
 {'Model Name': 'XGBoost Regression',
  'Observations': {'learning_rate': 0.001,
   'max_depth': 3,
   'n_estimators': 5000},
  'Score': 2.8711189584814503,
  'Score Type': 'Test RMSE'}]

## Save results to csv

In [15]:
# pd.DataFrame(all_model_results).to_csv('../model-results/xgboost.csv')

In [16]:
# Persist the collected results to the results folder on the mounted Drive.
# No `index=False` is passed, so the DataFrame's default integer index is
# written to the file as well.
pd.DataFrame(all_model_results).to_csv('/content/drive/Othercomputers/My MacBook Air/Dropbox/ga/capstone/model-results/xgboost.csv')