In [264]:
%load_ext autoreload
%autoreload 2

import sys
from importlib.machinery import SourceFileLoader

import joblib
import numpy as np
import pandas as pd
import xgboost as xgb
# For Hyperparameter optimization
from hpsklearn import HyperoptEstimator
from hyperopt import tpe, hp, fmin, space_eval, STATUS_OK, Trials

# TODO Jupyter working directory is /notebooks. Therefore importing "from src.data" isn't working. Manually passing --notebook-dir
# when running the notebook didn't work. Needs to be investigated and fixed. This is a workaround
process_test_data = SourceFileLoader('process_test_data', '../src/data/process_test_data.py').load_module()
visualize = SourceFileLoader('visualize', '../src/visualization/visualize.py').load_module()

# True if you want to run with the Kaggle train and test set for submission. Otherwise it will use split kaggle train data for model 
# optimization to calculate stats and parameter optimization
use_kaggle_data = False 
run_parameter_optimization = True
run_feature_selection = True

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


If the Kaggle data set is not being used for a submission, split the training data set into training (80%), testing (10%) and validation (10%) sets
and normalize the features using MinMaxScaler. Otherwise, load the full Kaggle data and predict on the Kaggle test set for submission.

In [265]:
if use_kaggle_data:
    X_train, y_train, X_test = process_test_data.load_kaggle_train_and_test_data('../data/raw/train.csv', '../data/raw/test.csv')
else:    
    X_train, y_train, X_test, y_test, X_valid, y_valid = \
    process_test_data.split_and_normalize('../data/raw/train.csv', '../data/processed')

Original train shape: (8000, 21)
Concat shape: (8000, 20)
Files written to: ../data/processed
X_train shape: (6400, 19)
y_train shape: (6400, 1)
X_test shape: (800, 20)
y_test shape: (800, 1)
X_valid shape: (800, 20)
y_valid shape: (800, 1)


Check details of the data if required

In [266]:
 # X_test.describe()

Selecting features using sequential feature selection if required

In [267]:
if run_feature_selection==True:
    num_of_features_to_select = 13
    features =  process_test_data.sequential_feature_selection('../data/raw/train.csv', num_of_features_to_select) # ['GP', 'MIN', 'FGM', '3P Made', 'OREB', 'BLK', 'TOV']
    X_train = X_train[features]
    # Appending Id column since it should be kept
    features.append('Id')
    X_test = X_test[features]

['GP' 'MIN' 'PTS' 'FGM' 'FGA' 'FG%' '3P Made' '3PA' 'OREB' 'DREB' 'REB'
 'BLK' 'TOV']


Running parameter optimization if required

In [268]:
# Defining pre-identified best parameters if parameter optimization is not going to run
hyp_params = { # kaggle score of 0.70342, with 13 features
'colsample_bytree': 0.75,
'grow_policy': 'lossguide',
'learning_rate': 0.2,
'max_delta_step': 2,
'max_depth': 1, 
'min_child_weight': 2,
'min_split_loss': 12,
'subsample': 0.95,
'booster': 'gbtree'     
}

def hyperparameter_tuning(space):
    from sklearn.model_selection import cross_val_score
    from sklearn.metrics import roc_auc_score
    
    xgboost = xgb.XGBClassifier(**space, eval_metric="auc", use_label_encoder=False) 
    acc = cross_val_score(xgboost, X_train, y_train, cv=10, scoring='roc_auc').mean()
    return{'loss': 1-acc, 'status': STATUS_OK }

if run_parameter_optimization==True:
    booster = ['gbtree', 'gblinear', 'dart']
    grow_policy = ['depthwise', 'lossguide']
    
    space = {
    "max_depth": hp.choice('max_depth', range(5, 20, 1)),
    "learning_rate": hp.quniform('learning_rate', 0.01, 0.5, 0.05),
    "min_child_weight": hp.quniform('min_child_weight', 1, 10, 1),
    "subsample": hp.quniform('subsample', 0.1, 1, 0.05),
    "colsample_bytree": hp.quniform('colsample_bytree', 0.1, 1.0, 0.05),
    "min_split_loss": hp.choice('min_split_loss', range(0, 20, 1)),
    "max_delta_step": hp.choice('max_delta_step', range(0, 10, 1)),
    "grow_policy": hp.choice("grow_policy", grow_policy),     
    "booster": booster[0]     
    }
    
    # Initialize trials object
    trials = Trials()
    
    best = fmin(fn=hyperparameter_tuning, space = space, algo=tpe.suggest, max_evals=100, trials=trials)
    
    hyp_params['max_depth'] = best['max_depth']
    hyp_params['learning_rate'] = best['learning_rate']
    hyp_params['min_child_weight'] = best['min_child_weight']
    hyp_params['subsample'] = best['subsample']
    hyp_params['colsample_bytree'] = best['colsample_bytree'] 
    hyp_params['min_split_loss'] = best['min_split_loss']
    hyp_params['max_delta_step'] = best['max_delta_step']
    hyp_params['grow_policy'] = grow_policy[best['grow_policy']]
    hyp_params['booster'] = booster[0]
    
    print("Best: {}".format(best))

100%|██████████████████████████████████████████████████████████████████████████████████████| 100/100 [10:21<00:00,  6.22s/trial, best loss: 0.2980653739630136]
Best: {'colsample_bytree': 0.8, 'grow_policy': 1, 'learning_rate': 0.2, 'max_delta_step': 3, 'max_depth': 10, 'min_child_weight': 9.0, 'min_split_loss': 13, 'subsample': 0.5}


Training the XGBoost model

In [269]:
xgboost = xgb.XGBClassifier(**hyp_params, use_label_encoder=False)

# Converting column y values to 1d array
xgboost.fit(X_train, y_train, eval_metric='auc')

XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=0.8,
              enable_categorical=False, gamma=13, gpu_id=-1,
              grow_policy='lossguide', importance_type=None,
              interaction_constraints='', learning_rate=0.2, max_delta_step=3,
              max_depth=10, min_child_weight=9.0, min_split_loss=13,
              missing=nan, monotone_constraints='()', n_estimators=100,
              n_jobs=8, num_parallel_tree=1, predictor='auto', random_state=0,
              reg_alpha=0, reg_lambda=1, scale_pos_weight=1, subsample=0.5,
              tree_method='exact', use_label_encoder=False,
              validate_parameters=1, ...)

Predicting using the trained XGBoost model

In [270]:
# Selecting columns to train
test_X = X_test.loc[:, 'GP':'TOV']
# Selecting Ids for CSV
test_X_Ids = X_test.loc[:,'Id']

if use_kaggle_data==True:
    # Predicting probabilities for kaggle submission and selecting probability of class 1.
    pred = xgboost.predict_proba(test_X)[:,1]  
else:
    # Predicting classes (1 or 0) for calculating accuracy
    pred = xgboost.predict(test_X) 
    # Probabilities for calculating ROC
    rf_probs = xgboost.predict_proba(test_X)[:,1]

# Data frame with ID for csv writing. In Kaggle mode pred will contains probabilities and else contains classes
result = pd.DataFrame(data = {'Id': test_X_Ids, 'TARGET_5Yrs': pred}) 
# Extracting values for calculating stats
result_values = result[['TARGET_5Yrs']] 

Saving the trained model (the result CSV file is written in the last cell when using the Kaggle dataset)

In [271]:
joblib.dump(xgboost, "../models/nuwan_xgboost_v16.joblib", compress=3)

['../models/nuwan_xgboost_v16.joblib']

Show stats related to the performance of the model if not using the Kaggle dataset; otherwise write the predictions to the submission CSV file

In [272]:
if use_kaggle_data==False:
    visualize.show_random_forest_stats(xgboost, test_X, y_test, rf_probs)
    # visualize.show_feature_importance(rf, X_train) # Uncomment to see feature importance if required
else:
    result.to_csv("../data/external/submission_nuwan_v16.csv", index = False)
    print("Kaggle dataset and no stats. Writing to a file.")

Average absolute error: 16.874999999999996%
ROC: 0.70496
