## Advanced Modelling of the Data

In [1]:
# first, let's reload the dataset
# we do need to import pandas, as it is also needed for display max_rows
import pandas as pd
import cudf
import cudf as dd
import numpy as np
import gc
import matplotlib
from cuml.preprocessing.model_selection import train_test_split
from cuml.metrics import confusion_matrix, roc_auc_score
# Backend switch that is forwarded to the feature-engineering helper as `gpu=use_gpu`.
use_gpu = True
%matplotlib inline

In [2]:
# We will have to test the building of the features 
# to try and make the model actually learn

In [3]:
# Read both raw splits in one pass (`dd` is the alias the import cell gives to cudf).
train, test = (
    dd.read_parquet(parquet_path)
    for parquet_path in ('raw_data/train.parquet', 'raw_data/test.parquet')
)

# Run the feature engineering

In [4]:
from basic_feature_engineering import basic_feature_engineering

In [5]:
# Run the shared feature-engineering helper on both splits; its third return
# value is used as the training target further down.
# NOTE(review): `train` and `test` are rebound to the helper's outputs, so
# re-running this cell without first re-running the load cell passes the
# already-processed frames back into the helper — confirm that is safe.
train, test, train_target = basic_feature_engineering(train, test, gpu=use_gpu)

In [6]:
# TODO - confirm - the positive and negative classes are unbalanced, so the
# negative-to-positive ratio is computed to be fed into xgb (scale_pos_weight).
negative_count = (train_target == 0).sum()
positive_count = (train_target == 1).sum()
ratio = negative_count / positive_count
ratio

11.387150050352467

## Building a more advanced XGB Model

Default train test split

In [7]:
# Hold out 30% of the engineered training data; the seed is fixed so the
# split is the same on every run.
split_options = {'test_size': 0.3, 'random_state': 42}
X_train, X_test, y_train, y_test = train_test_split(train, train_target, **split_options)

In [8]:
import xgboost as xgb
from sklearn.model_selection import StratifiedKFold

In [9]:
# Fold generator for the cross-validation loop, built with scikit-learn's
# default settings.
# NOTE(review): the number of splits comes from the library default, which
# depends on the installed scikit-learn version — consider pinning n_splits.
skf = StratifiedKFold()

In [10]:
# XGBoost parameters shared by every cross-validation fold.
# The original literal listed 'subsample' twice (both 0.5); a repeated key in a
# dict literal silently keeps only the last value, so it is declared once here.
cv_params = {
    'tree_method': 'gpu_hist',
    'max_depth': 6,
    'learning_rate': 0.05,
    'subsample': 0.5,
    'objective': 'binary:logistic',
    'eval_metric': 'auc',
    'scale_pos_weight': ratio,  # negative/positive class ratio computed earlier
    'gamma': 0.2,
}

Training loop over the `StratifiedKFold` splits

In [11]:
# Stratified K-fold cross-validation of `cv_params` on the training split.
# The cudf index and labels are converted to host-side Python lists before
# being handed to scikit-learn's splitter.
fold_scores = []  # best validation AUC reached in each fold
for i, (train_index, test_index) in enumerate(skf.split(X_train.index.to_arrow().tolist(), 
                                                        y_train.to_arrow().tolist())):
    print("Fold {0}".format(i))
    X_train_kf, X_valid_kf = X_train.iloc[train_index], X_train.iloc[test_index]
    y_train_kf, y_valid_kf = y_train.iloc[train_index], y_train.iloc[test_index]
    
    train_matrix_kf= xgb.DMatrix(X_train_kf, label=y_train_kf)
    val_matrix_kf = xgb.DMatrix(X_valid_kf, label=y_valid_kf)
    
    # 'valid' is the last entry in evals, so it is the set that drives early stopping.
    bst = xgb.train(params=cv_params, dtrain=train_matrix_kf, 
                evals=[(train_matrix_kf, 'train'), (val_matrix_kf, 'valid')], 
                num_boost_round=100, early_stopping_rounds=20, verbose_eval=20)
    # `bst` is overwritten on the next iteration, so keep this fold's score now.
    fold_scores.append(bst.best_score)

# Summarise the folds so the CV result is a single comparable number.
print("Mean valid-auc over {0} folds: {1:.5f}".format(len(fold_scores), np.mean(fold_scores)))

Fold 0
[0]	train-auc:0.71352	valid-auc:0.69314
[20]	train-auc:0.75283	valid-auc:0.73045
[40]	train-auc:0.76562	valid-auc:0.73821
[60]	train-auc:0.77633	valid-auc:0.74302
[80]	train-auc:0.78428	valid-auc:0.74518
[99]	train-auc:0.79064	valid-auc:0.74689
Fold 1
[0]	train-auc:0.71511	valid-auc:0.71271
[20]	train-auc:0.75183	valid-auc:0.73679
[40]	train-auc:0.76320	valid-auc:0.74272
[60]	train-auc:0.77470	valid-auc:0.74753
[80]	train-auc:0.78279	valid-auc:0.75105
[99]	train-auc:0.78943	valid-auc:0.75290
Fold 2
[0]	train-auc:0.71455	valid-auc:0.70772
[20]	train-auc:0.75122	valid-auc:0.73901
[40]	train-auc:0.76330	valid-auc:0.74521
[60]	train-auc:0.77417	valid-auc:0.74937
[80]	train-auc:0.78267	valid-auc:0.75172
[99]	train-auc:0.78883	valid-auc:0.75287
Fold 3
[0]	train-auc:0.71567	valid-auc:0.70026
[20]	train-auc:0.75284	valid-auc:0.72977
[40]	train-auc:0.76479	valid-auc:0.73688
[60]	train-auc:0.77571	valid-auc:0.74255
[80]	train-auc:0.78340	valid-auc:0.74501
[99]	train-auc:0.79020	valid-auc:

Once the tuning looks reasonably stable, we can train once on the whole training split (`X_train`), using the held-out split (`X_test`) for validation

In [12]:
# Parameters for the final fit: same as the CV parameters but with a learning
# rate ten times smaller. A new dict is built instead of aliasing `cv_params`,
# so `cv_params` is left untouched and re-running this cell does not divide
# the learning rate by 10 again.
full_cv_params = {**cv_params, 'learning_rate': cv_params['learning_rate'] / 10}

In [13]:
# Fit once on the full training split while monitoring AUC on the held-out split.
train_matrix = xgb.DMatrix(data=X_train, label=y_train)
val_matrix = xgb.DMatrix(data=X_test, label=y_test)

# Evaluation sets reported during training; the names appear in the log output.
watchlist = [(train_matrix, 'train'), (val_matrix, 'valid')]
final_bst = xgb.train(
    full_cv_params,
    train_matrix,
    num_boost_round=500,
    evals=watchlist,
    early_stopping_rounds=20,
    verbose_eval=50,
)

[0]	train-auc:0.71612	valid-auc:0.70022
[50]	train-auc:0.74041	valid-auc:0.72289
[100]	train-auc:0.74389	valid-auc:0.72523
[150]	train-auc:0.74716	valid-auc:0.72754
[200]	train-auc:0.75017	valid-auc:0.72914
[250]	train-auc:0.75299	valid-auc:0.73072
[300]	train-auc:0.75588	valid-auc:0.73228
[350]	train-auc:0.75880	valid-auc:0.73395
[400]	train-auc:0.76170	valid-auc:0.73556
[450]	train-auc:0.76459	valid-auc:0.73708
[499]	train-auc:0.76716	valid-auc:0.73842


In [14]:
# Raw model scores for the held-out split.
adv_y_pred = final_bst.predict(val_matrix)
# Binarise at a 0.5 cut-off: scores strictly above it become 1, everything else 0.
adv_y_final = (adv_y_pred > 0.5).astype(int)

In [15]:
# Quick look at the thresholded 0/1 labels.
adv_y_final

array([0, 0, 1, ..., 1, 1, 0])

# Final Assessments

In [16]:
# Confusion matrix of the thresholded predictions against the held-out labels.
# The lowercase 'int64' dtype name is used because the capitalised 'Int64'
# spelling is a deprecated NumPy numeric-style type code that newer NumPy
# releases reject.
confusion_matrix(y_test, adv_y_final.astype('int64'))

  confusion_matrix(y_test, adv_y_final.astype('Int64'))


Unnamed: 0,0,1
0,58659,26093
1,2551,4950


In [17]:
# ROC AUC on the held-out split, computed from the raw predicted scores
# rather than the thresholded 0/1 labels.
validation_auc = roc_auc_score(y_test, adv_y_pred)
validation_auc

0.738418459892273