In [5]:
!pip install xgboost

Collecting xgboost
  Obtaining dependency information for xgboost from https://files.pythonhosted.org/packages/75/dd/9afe0d9d0f61a5384c3932626a022e38c396a5d88e6f5345ad2f7b576747/xgboost-1.7.6-py3-none-win_amd64.whl.metadata
  Downloading xgboost-1.7.6-py3-none-win_amd64.whl.metadata (1.9 kB)
Downloading xgboost-1.7.6-py3-none-win_amd64.whl (70.9 MB)
   ---------------------------------------- 0.0/70.9 MB ? eta -:--:--
   ---------------------------------------- 0.3/70.9 MB 6.8 MB/s eta 0:00:11
   ---------------------------------------- 0.4/70.9 MB 8.5 MB/s eta 0:00:09
    --------------------------------------- 1.2/70.9 MB 8.5 MB/s eta 0:00:09
   - -------------------------------------- 2.3/70.9 MB 11.9 MB/s eta 0:00:06
   - -------------------------------------- 3.4/70.9 MB 14.6 MB/s eta 0:00:05
   -- ------------------------------------- 4.7/70.9 MB 16.5 MB/s eta 0:00:05
   --- ------------------------------------ 5.8/70.9 MB 17.6 MB/s eta 0:00:04
   --- ----------------------------

In [74]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import get_scorer_names

from xgboost import XGBRegressor
from sklearn.metrics import mean_squared_error, make_scorer
from sklearn.model_selection import cross_val_score

import warnings
import datetime
warnings.filterwarnings("ignore")

RSEED = 42

In [25]:
fdf = pd.read_csv('data/wrangled_data.csv')
fdf.columns

Index(['Unnamed: 0.1', 'Unnamed: 0', 'date_caught', 'capture_site', 'tag_2',
       'ccl_cm', 'ccw_cm', 'weight_kg', 'status', 'release_site', 'cm_beached',
       'cm_by hand', 'cm_collected floater', 'cm_fish trap', 'cm_jarife',
       'cm_longline', 'cm_net', 'cm_not_recorded', 'cm_speargun',
       'cm_stranded', 'cm_uzio', 'foraging_ground_0', 'foraging_ground_1',
       'cs_category_0', 'cs_category_1', 'cs_category_2', 'cs_category_3',
       'cs_category_4', 'species_0', 'species_1', 'species_2', 'species_3',
       'species_4', 'species_5', 'species_6', 'species_7', 'year',
       'week_of_year', 'year_woy', 'cs_category', 'type', 'capture_number'],
      dtype='object')

In [26]:
y =fdf['capture_number']
X = fdf.drop(['capture_number','date_caught'],axis=1)

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=RSEED,stratify=y)


In [27]:
#encode y values (to avoid xgboost error)
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()
y_train = le.fit_transform(y_train)

In [51]:
# Fit model to training data
model = XGBRegressor()
model.fit(X_train, y_train)
print(model)

XGBRegressor(base_score=None, booster=None, callbacks=None,
             colsample_bylevel=None, colsample_bynode=None,
             colsample_bytree=None, early_stopping_rounds=None,
             enable_categorical=False, eval_metric=None, feature_types=None,
             gamma=None, gpu_id=None, grow_policy=None, importance_type=None,
             interaction_constraints=None, learning_rate=None, max_bin=None,
             max_cat_threshold=None, max_cat_to_onehot=None,
             max_delta_step=None, max_depth=None, max_leaves=None,
             min_child_weight=None, missing=nan, monotone_constraints=None,
             n_estimators=100, n_jobs=None, num_parallel_tree=None,
             predictor=None, random_state=None, ...)


In [29]:
def evaluate_rmse(y_true, y_pred,ndigits=3):
   rmse = mean_squared_error(y_true, y_pred, squared=False)
   print("Number of predictions: ", len(y_pred))
   print("RMSE: ", round(rmse,ndigits))
   return rmse

In [52]:
y_predict = model.predict(X_test)
evaluate_rmse(y_test, y_predict)

Number of predictions:  5419
RMSE:  2.098


2.0977951577194403

In [62]:
scorer = make_scorer(mean_squared_error, squared=False)
xgbr = XGBRegressor()
print("CV scores:", cross_val_score(xgbr, X_train, y_train, cv=5, scoring=scorer,verbose=5,n_jobs=-1))


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 12 concurrent workers.


CV scores: [1.8845107  1.79361674 1.82645228 1.87646059 1.82223915]


[Parallel(n_jobs=-1)]: Done   2 out of   5 | elapsed:    2.2s remaining:    3.4s
[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:    2.3s finished


In [90]:
from sklearn.model_selection import RandomizedSearchCV

param_grid = {
    "learning_rate": [0.05, 0.10, 0.15, 0.20, 0.25],
    'max_depth' : [3,5,7,9,10],
    'gamma':[ 0.0, 0.1, 0.2, 0.3, 0.4],
    'min_child_weight': [ 1, 3, 5, 7, 10],
    'colsample_bytree':[ 0.2, 0.3, 0.4, 0.5, 0.6],
}

estimator = XGBRegressor(random_state=RSEED)

random_search = RandomizedSearchCV(estimator, param_distributions=param_grid, n_iter=10, scoring= 'neg_root_mean_squared_error',
                                   cv=5, verbose=5, random_state=RSEED, n_jobs=-1)

random_search.fit(X_train, y_train)


Fitting 5 folds for each of 10 candidates, totalling 50 fits


In [91]:
best_model = random_search.best_estimator_
final_pred = best_model.predict(X_test)

In [92]:
get_scorer_names()

['accuracy',
 'adjusted_mutual_info_score',
 'adjusted_rand_score',
 'average_precision',
 'balanced_accuracy',
 'completeness_score',
 'explained_variance',
 'f1',
 'f1_macro',
 'f1_micro',
 'f1_samples',
 'f1_weighted',
 'fowlkes_mallows_score',
 'homogeneity_score',
 'jaccard',
 'jaccard_macro',
 'jaccard_micro',
 'jaccard_samples',
 'jaccard_weighted',
 'matthews_corrcoef',
 'max_error',
 'mutual_info_score',
 'neg_brier_score',
 'neg_log_loss',
 'neg_mean_absolute_error',
 'neg_mean_absolute_percentage_error',
 'neg_mean_gamma_deviance',
 'neg_mean_poisson_deviance',
 'neg_mean_squared_error',
 'neg_mean_squared_log_error',
 'neg_median_absolute_error',
 'neg_negative_likelihood_ratio',
 'neg_root_mean_squared_error',
 'normalized_mutual_info_score',
 'positive_likelihood_ratio',
 'precision',
 'precision_macro',
 'precision_micro',
 'precision_samples',
 'precision_weighted',
 'r2',
 'rand_score',
 'recall',
 'recall_macro',
 'recall_micro',
 'recall_samples',
 'recall_weighted',

In [94]:
evaluate_rmse(final_pred,y_test)

Number of predictions:  5419
RMSE:  1.905


1.9047024391034801