In [None]:
!pip install xgboost

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import get_scorer_names

from xgboost import XGBRegressor
from sklearn.metrics import mean_squared_error, make_scorer
from sklearn.model_selection import cross_val_score

import warnings
import datetime
# Silence every warning category for the rest of the session.
# NOTE(review): this also hides deprecation warnings raised by the libraries used below.
warnings.filterwarnings("ignore")

# Seed passed as random_state to the train/test split, the tuned estimator and the randomized search.
RSEED = 42

In [None]:
# Read the wrangled data set and show its column names.
fdf = pd.read_csv(filepath_or_buffer='../data/wrangled_data.csv')
fdf.columns

In [None]:
# Target: the 'capture_number' column. Features: every other column except 'date_caught'.
X = fdf.drop(columns=['capture_number', 'date_caught'])
y = fdf['capture_number']

# Hold out 30% of the rows for testing, stratified on the target values.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=RSEED,
    stratify=y,
)


In [None]:
# Keep the regression target in its original units.
# The models below are XGBRegressor instances and their predictions are scored against
# the raw y_test, so label-encoding y_train alone would replace each value by its rank
# among the training values and put predictions and y_test on different scales.
from sklearn.preprocessing import LabelEncoder  # import retained; the encoder is deliberately not applied
y_train = np.asarray(y_train)

In [None]:
# Baseline: an XGBRegressor with default hyper-parameters, trained on the training split.
model = XGBRegressor().fit(X_train, y_train)
print(model)

In [None]:
def evaluate_rmse(y_true, y_pred, ndigits=3):
    """Print and return the root mean squared error of a set of predictions.

    Args:
        y_true: Observed target values.
        y_pred: Predicted target values, same length as ``y_true``.
        ndigits: Number of decimals used when printing the RMSE.

    Returns:
        The unrounded RMSE.
    """
    # Take the square root explicitly rather than passing ``squared=False``:
    # that keyword was deprecated in scikit-learn 1.4 and removed in 1.6.
    rmse = np.sqrt(mean_squared_error(y_true, y_pred))
    print("Number of predictions: ", len(y_pred))
    print("RMSE: ", round(rmse, ndigits))
    return rmse

In [None]:
# Score the baseline model on the held-out test split.
y_predict = model.predict(X_test)
evaluate_rmse(y_true=y_test, y_pred=y_predict)

In [None]:
# 5-fold cross-validated RMSE of a default XGBRegressor on the training split.
# The built-in scorer name is used instead of make_scorer(mean_squared_error, squared=False):
# the ``squared`` keyword was deprecated in scikit-learn 1.4 and removed in 1.6.
scorer = "neg_root_mean_squared_error"
xgbr = XGBRegressor()
# The built-in scorer yields negated RMSE (higher is better), so flip the sign for reporting.
cv_scores = -cross_val_score(xgbr, X_train, y_train, cv=5, scoring=scorer, verbose=5, n_jobs=-1)
print("CV scores:", cv_scores)


In [None]:
from sklearn.model_selection import RandomizedSearchCV

# Candidate values the randomized search samples from, per hyper-parameter.
param_grid = dict(
    learning_rate=[0.05, 0.10, 0.15, 0.20, 0.25],
    max_depth=[3, 5, 7, 9, 10],
    gamma=[0.0, 0.1, 0.2, 0.3, 0.4],
    min_child_weight=[1, 3, 5, 7, 10],
    colsample_bytree=[0.2, 0.3, 0.4, 0.5, 0.6],
)

estimator = XGBRegressor(random_state=RSEED)

# Try 10 random combinations, each scored with 5-fold cross-validated negated RMSE.
random_search = RandomizedSearchCV(
    estimator,
    param_distributions=param_grid,
    n_iter=10,
    scoring='neg_root_mean_squared_error',
    cv=5,
    verbose=5,
    random_state=RSEED,
    n_jobs=-1,
)

random_search.fit(X_train, y_train)


In [None]:
# Take the best estimator found by the randomized search and predict the test split with it.
best_model = random_search.best_estimator_
final_pred = best_model.predict(X_test)

In [None]:
# Show the scorer names scikit-learn accepts for a ``scoring=`` argument.
get_scorer_names()

In [None]:
# Test-set RMSE of the tuned model; argument order follows evaluate_rmse(y_true, y_pred).
evaluate_rmse(y_test, final_pred)