### Packages

In [1]:
from sklearn.model_selection import train_test_split
# ^^^ pyforest auto-imports - don't write above this line
import xgboost as xgb
#from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
import numpy as np
from sklearn.metrics import precision_score, recall_score, accuracy_score

#import warnings
#def ignore_warn(*args, **kwargs): pass warnings.warn = ignore_warn
import numpy as np 
import pandas as pd 
%matplotlib inline
import matplotlib.pyplot as plt 
import seaborn as sns
from scipy import stats
from scipy.stats import norm, skew
from sklearn import preprocessing
from sklearn.metrics import r2_score
from sklearn.metrics import mean_squared_error
from sklearn.linear_model import ElasticNetCV, ElasticNet
from xgboost import XGBRegressor, plot_importance 
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import StratifiedKFold
pd.set_option('display.float_format', lambda x: '{:.3f}'.format(x))

from sklearn import preprocessing
from sklearn.compose import ColumnTransformer
import category_encoders as ce
from sklearn.pipeline import Pipeline
from sklearn.metrics import f1_score

### Data Import, Exploration and Preparation

In [2]:
df = pd.read_csv('merged_data_with_total_agb_converted.csv')#load merged_data_with_total_agb_converted dataset

In [3]:
X = df[['species','aerial_data_height','area']] #Features (removed 'Genus','Family','score',)
y = df['agb'] #Target

In [4]:
#OneHotEncoding of String Variables
encoder = preprocessing.OneHotEncoder() #OneHotEncoder for all 'categorical' variables
onehot = encoder.fit(X)
X=onehot.transform(X)

In [5]:
X_train, X_test, Y_train, Y_test = train_test_split(X, y, test_size=0.3)

<IPython.core.display.Javascript object>

### XGBoost Initial Model Definitions

In [6]:
# Unsupervised Regression Predictor with default settings on GPU
predictor = xgb.XGBRegressor(tree_method='gpu_hist')
steps = 50  # number of training iterations

### XGBoost Initial Model Training and Testing

In [7]:
model = predictor.fit(X_train, Y_train,eval_set=[(X_test, Y_test)], verbose=False)
Y_train_pred = model.predict(X_train)
Y_pred = model.predict(X_test)

In [8]:
model

XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=1,
             colsample_bynode=1, colsample_bytree=1, gamma=0, gpu_id=0,
             importance_type='gain', interaction_constraints='',
             learning_rate=0.300000012, max_delta_step=0, max_depth=6,
             min_child_weight=1, missing=nan, monotone_constraints='()',
             n_estimators=100, n_jobs=12, num_parallel_tree=1, random_state=0,
             reg_alpha=0, reg_lambda=1, scale_pos_weight=1, subsample=1,
             tree_method='gpu_hist', validate_parameters=1, verbosity=None)

### XGBoost Initial Model Results

In [9]:
print('Train r2 score: ', r2_score(Y_train_pred, Y_train))
print('Test r2 score: ', r2_score(Y_test, Y_pred))
train_mse = mean_squared_error(Y_train_pred, Y_train)
test_mse = mean_squared_error(Y_pred, Y_test)
train_rmse = np.sqrt(train_mse)
test_rmse = np.sqrt(test_mse)
print('Train RMSE: %.4f' % train_rmse)
print('Test RMSE: %.4f' % test_rmse)

Train r2 score:  0.7361768792805197
Test r2 score:  0.42972132820109077
Train RMSE: 158096.0946
Test RMSE: 280498.7584


### XGBoost Hyperparameter Tuning

In [10]:
sorted(sklearn.metrics.SCORERS.keys())

['accuracy',
 'adjusted_mutual_info_score',
 'adjusted_rand_score',
 'average_precision',
 'balanced_accuracy',
 'completeness_score',
 'explained_variance',
 'f1',
 'f1_macro',
 'f1_micro',
 'f1_samples',
 'f1_weighted',
 'fowlkes_mallows_score',
 'homogeneity_score',
 'jaccard',
 'jaccard_macro',
 'jaccard_micro',
 'jaccard_samples',
 'jaccard_weighted',
 'max_error',
 'mutual_info_score',
 'neg_brier_score',
 'neg_log_loss',
 'neg_mean_absolute_error',
 'neg_mean_absolute_percentage_error',
 'neg_mean_gamma_deviance',
 'neg_mean_poisson_deviance',
 'neg_mean_squared_error',
 'neg_mean_squared_log_error',
 'neg_median_absolute_error',
 'neg_root_mean_squared_error',
 'normalized_mutual_info_score',
 'precision',
 'precision_macro',
 'precision_micro',
 'precision_samples',
 'precision_weighted',
 'r2',
 'rand_score',
 'recall',
 'recall_macro',
 'recall_micro',
 'recall_samples',
 'recall_weighted',
 'roc_auc',
 'roc_auc_ovo',
 'roc_auc_ovo_weighted',
 'roc_auc_ovr',
 'roc_auc_ovr_we

In [None]:
predictor = xgb.XGBRegressor()
parameters = {
    'eta': [0.05, 0.10, 0.15, 0.20, 0.25, 0.30],  # learning rates
    'n_estimators': [100, 200, 300, 500, 800, 1300, 2100],#Number of gradient boosted trees
    'max_depth': [3, 5, 8, 13, 21],#Maximum tree depth for base learners
    'gamma': [0.0, 0.1, 0.2, 0.3, 0.4],#Minimum loss reduction required to make a further partition 
    'min_child_weight': [1, 2, 3, 5, 8],#Minimum sum of instance weight
    'colsample_bytree' : [0.1, 0.2, 0.3, 0.5, 0.8]#Subsample ratio of columns when constructing each tree
}

grid = GridSearchCV(predictor, parameters, n_jobs=6, scoring='accuracy', cv=5)
grid.fit(X_train, Y_train)
grid.best_estimator_

In [71]:
grid.best_estimator_

XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=1,
             colsample_bynode=1, colsample_bytree=0.1, eta=0.05, gamma=0.0,
             gpu_id=-1, importance_type='gain', interaction_constraints='',
             learning_rate=0.0500000007, max_delta_step=0, max_depth=3,
             min_child_weight=1, missing=nan, monotone_constraints='()',
             n_estimators=100, n_jobs=12, num_parallel_tree=1, random_state=0,
             reg_alpha=0, reg_lambda=1, scale_pos_weight=1, subsample=1,
             tree_method='exact', validate_parameters=1, verbosity=None)

### Model  Results after Tuning

In [None]:
tuned_model = XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=1,
             colsample_bynode=1, colsample_bytree=0.1, eta=0.05, gamma=0.0,
             gpu_id=-1, importance_type='gain', interaction_constraints='',
             learning_rate=0.0500000007, max_delta_step=0, max_depth=3,
             min_child_weight=1,monotone_constraints='()',
             n_estimators=100, n_jobs=12, num_parallel_tree=1, random_state=0,
             reg_alpha=0, reg_lambda=1, scale_pos_weight=1, subsample=1,
             tree_method='exact', validate_parameters=1, verbosity=None)
tuned_model.fit(X_train, Y_train,  
             eval_set=[(X_test, Y_test)], verbose=False)
Y_train_pred = tuned_model.predict(X_train)
Y_pred = tuned_model.predict(X_test)

### Evaluation of Model

In [None]:
#predictions = model.predict(D_test) predictions was changed to Y_pred
best_predictions = np.asarray([np.argmax(line) for line in Y_pred])
print("Precision = {}".format(precision_score(Y_test, best_predictions, average='macro')))
print("Recall = {}".format(recall_score(Y_test, best_predictions, average='macro')))
print("Accuracy = {}".format(accuracy_score(Y_test, best_predictions)))

In [77]:
print('Train r2 score: ', r2_score(Y_train_pred, Y_train))
print('Test r2 score: ', r2_score(Y_test, Y_pred))
train_mse = mean_squared_error(Y_train_pred, Y_train)
test_mse = mean_squared_error(Y_pred, Y_test)
train_rmse = np.sqrt(train_mse)
test_rmse = np.sqrt(test_mse)
print('Train RMSE: %.4f' % train_rmse)
print('Test RMSE: %.4f' % test_rmse)

Train r2 score:  -1.4263773959185508
Test r2 score:  0.24504685232265455
Train RMSE: 184763.2089
Test RMSE: 234827.0653
