## Advanced Modelling of the Data

In [35]:
# First, let's reload the dataset.
# pandas still has to be imported because it is also needed for display options such as max_rows.
import pandas as pd
import cudf
import cudf as dd
import numpy as np
import gc
import matplotlib
from cuml.preprocessing.model_selection import train_test_split
from cuml.metrics import confusion_matrix, roc_auc_score
%matplotlib inline

In [2]:
# We will have to test the building of the features 
# to try and make the model actually learn

In [3]:
# Read the raw train and test parquet files through the `dd` dataframe alias.
train, test = (
    dd.read_parquet('raw_data/train.parquet'),
    dd.read_parquet('raw_data/test.parquet'),
)

# Run the feature engineering

In [None]:
from basic_feature_engineering import basic_feature_engineering

In [4]:
# current treatment
def see_percent_missing_values(df):
    """Return the percentage of missing values per column, largest first.

    Parameters
    ----------
    df : DataFrame
        A frame exposing the pandas-style ``isnull``/``sum``/``shape`` API.

    Returns
    -------
    Series
        Indexed by column name; values are percentages rounded to one
        decimal place and sorted in descending order.
    """
    n_rows = df.shape[0]
    null_fraction = df.isnull().sum() / n_rows
    ranked = (null_fraction * 100).sort_values(ascending=False)
    return ranked.round(1)
# Rank the training columns by the percentage of values that are missing.
app_train_mis_values = see_percent_missing_values(train)
df_app_train_miss_values = dd.DataFrame({'columns': app_train_mis_values.index,
                                         'missing percent': app_train_mis_values.values})

# Columns with at least 40% missing values are dropped from both frames.
sparse_rows = df_app_train_miss_values['missing percent'] >= 40
sparse_column_names = df_app_train_miss_values[sparse_rows]['columns']
# NOTE: cudf is mostly a drop-in replacement for pandas, but some conversions
# differ slightly: a cudf Series is turned into a Python list through Arrow,
# a pandas Series through .tolist(). isinstance against the public
# cudf.DataFrame is used so subclasses are handled too.
if isinstance(df_app_train_miss_values, cudf.DataFrame):
    drop_columns = sparse_column_names.to_arrow().to_pylist()
else:
    drop_columns = sparse_column_names.tolist()

train = train.drop(drop_columns, axis=1)
test = test.drop(drop_columns, axis=1)

# Keep the label aside so that only features go through the encoding below.
train_target = train['TARGET']
train = train.drop('TARGET', axis=1)

# Basic dummy (one-hot) treatment.
# Train and test are concatenated first (train rows first) because dummifying
# them separately could produce columns that exist in only one of the two.
unified = dd.concat([train, test])
dummy_cols = unified.select_dtypes(['bool', 'O', 'category']).columns.tolist()
unified = dd.get_dummies(unified, columns=dummy_cols, dtype='int64')
## Splitting back into train and test (done further below, after the missing values are filled)


In [5]:
# Original note: XGB for pandas does not like the nullable Int64 dtype.
# Each Int64 column is filled with its mean truncated to an integer and then
# cast to plain int64.
nullable_int_cols = unified.select_dtypes('Int64').columns.tolist()
for col in nullable_int_cols:
    fill_value = int(unified[col].mean())
    unified[col] = unified[col].fillna(fill_value).astype('int64')

In [6]:
# Show the columns that still contain missing values. The per-column flag is
# computed once and reused as its own boolean mask.
cols_with_na = unified.isna().any()
cols_with_na[cols_with_na]

AMT_ANNUITY        True
AMT_GOODS_PRICE    True
EXT_SOURCE_2       True
EXT_SOURCE_3       True
dtype: bool

In [7]:
# Fill every column that still has missing values with 0.
# The per-column flag is computed once, and the column names are converted to a
# Python list the same way as for drop_columns: through Arrow for cudf,
# through .tolist() for pandas.
na_flags = unified.isna().any()
na_index = na_flags[na_flags].index
if isinstance(unified, cudf.DataFrame):
    na_columns = na_index.to_arrow().to_pylist()
else:
    na_columns = na_index.tolist()

for col in na_columns:
    unified[col] = unified[col].fillna(0)

In [8]:
# Split the combined frame back apart. The training rows were concatenated
# first, so the boundary is the number of training labels rather than a
# hard-coded row count that silently breaks when the input data changes.
n_train_rows = len(train_target)
train = unified[:n_train_rows]
test = unified[n_train_rows:]

In [9]:
# Display the (rows, columns) shape of the training slice.
train.shape

(307511, 243)

In [10]:
# TODO - confirm - the positive/negative classes are unbalanced, so the
# negative-to-positive ratio is computed here to feed into xgb
negatives = (train_target == 0).sum()
positives = (train_target == 1).sum()
ratio = negatives / positives
ratio

11.387150050352467

## Building a more advanced XGB Model

Default train test split

In [11]:
# Hold out 30% of the training rows; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    train,
    train_target,
    test_size=0.3,
    random_state=42,
)

In [12]:
import xgboost as xgb
from sklearn.model_selection import StratifiedKFold

In [13]:
# Pin the number of folds explicitly: the library default has changed between
# scikit-learn releases, so relying on it makes the CV depend on the installed version.
skf = StratifiedKFold(n_splits=5)

In [24]:
# XGBoost parameters shared by the cross-validation runs.
# ('subsample' used to be listed twice; a dict literal keeps only the last
# value for a repeated key, so it is now stated once.)
cv_params = {
    'tree_method': 'gpu_hist',
    'max_depth': 6,
    'learning_rate': 0.05,
    'subsample': 0.5,
    'objective': 'binary:logistic',
    'eval_metric': 'auc',
    'scale_pos_weight': ratio,  # negative/positive class ratio
    'gamma': 0.2,
}

Add a training loop over the StratifiedKFold splits

In [25]:
# Stratified K-fold cross-validation over the training split.
# The cudf index and labels are converted to Python lists (through Arrow)
# before being handed to the splitter, which only returns row positions.
row_ids = X_train.index.to_arrow().to_pylist()
fold_labels = y_train.to_arrow().to_pylist()

# Best validation score of each fold, so the run can be summarised afterwards
# instead of being read off the training log.
fold_scores = []

for i, (train_index, test_index) in enumerate(skf.split(row_ids, fold_labels)):
    print("Fold {0}".format(i))
    X_train_kf, X_valid_kf = X_train.iloc[train_index], X_train.iloc[test_index]
    y_train_kf, y_valid_kf = y_train.iloc[train_index], y_train.iloc[test_index]

    train_matrix_kf = xgb.DMatrix(X_train_kf, label=y_train_kf)
    val_matrix_kf = xgb.DMatrix(X_valid_kf, label=y_valid_kf)

    bst = xgb.train(params=cv_params, dtrain=train_matrix_kf,
                    evals=[(train_matrix_kf, 'train'), (val_matrix_kf, 'valid')],
                    num_boost_round=100, early_stopping_rounds=20, verbose_eval=20)

    # With early stopping enabled the booster records the best score seen on
    # the last evaluation set, which is 'valid' here.
    fold_scores.append(bst.best_score)

print("Mean valid AUC over {0} folds: {1:.5f}".format(len(fold_scores), np.mean(fold_scores)))

Fold 0
[0]	train-auc:0.71426	valid-auc:0.69691
[20]	train-auc:0.74964	valid-auc:0.73145
[40]	train-auc:0.76208	valid-auc:0.73889
[60]	train-auc:0.77402	valid-auc:0.74387
[80]	train-auc:0.78200	valid-auc:0.74629
[99]	train-auc:0.78744	valid-auc:0.74777
Fold 1
[0]	train-auc:0.71689	valid-auc:0.70880
[20]	train-auc:0.74935	valid-auc:0.73418
[40]	train-auc:0.76134	valid-auc:0.74004
[60]	train-auc:0.77177	valid-auc:0.74531
[80]	train-auc:0.77993	valid-auc:0.74809
[99]	train-auc:0.78541	valid-auc:0.74954
Fold 2
[0]	train-auc:0.71438	valid-auc:0.70856
[20]	train-auc:0.74866	valid-auc:0.73700
[40]	train-auc:0.76086	valid-auc:0.74337
[60]	train-auc:0.77159	valid-auc:0.74849
[80]	train-auc:0.77923	valid-auc:0.75072
[99]	train-auc:0.78550	valid-auc:0.75190
Fold 3
[0]	train-auc:0.71893	valid-auc:0.70896
[20]	train-auc:0.75027	valid-auc:0.73435
[40]	train-auc:0.76253	valid-auc:0.74050
[60]	train-auc:0.77289	valid-auc:0.74505
[80]	train-auc:0.78096	valid-auc:0.74753
[99]	train-auc:0.78666	valid-auc:

Once the tuning looks reasonably stable, we can train once on the full training split

In [18]:
# Copy the CV parameters instead of aliasing them: a plain assignment would make
# both names point at the same dict, so lowering the learning rate here would
# also change cv_params, and would divide it by 10 again on every re-run.
full_cv_params = dict(cv_params)
full_cv_params['learning_rate'] = cv_params['learning_rate'] / 10

In [19]:
# Train the final booster on the whole training split and monitor the held-out split.
train_matrix = xgb.DMatrix(X_train, label=y_train)
val_matrix = xgb.DMatrix(X_test, label=y_test)

final_bst = xgb.train(
    params=full_cv_params,
    dtrain=train_matrix,
    num_boost_round=500,
    early_stopping_rounds=20,
    evals=[(train_matrix, 'train'), (val_matrix, 'valid')],
    verbose_eval=50,
)

[0]	train-auc:0.71691	valid-auc:0.70097
[50]	train-auc:0.74118	valid-auc:0.72393
[100]	train-auc:0.74755	valid-auc:0.72804
[150]	train-auc:0.75342	valid-auc:0.73184
[200]	train-auc:0.75888	valid-auc:0.73510
[250]	train-auc:0.76394	valid-auc:0.73813
[300]	train-auc:0.76880	valid-auc:0.74071
[350]	train-auc:0.77305	valid-auc:0.74312
[400]	train-auc:0.77651	valid-auc:0.74459
[450]	train-auc:0.77947	valid-auc:0.74563
[499]	train-auc:0.78235	valid-auc:0.74670


In [31]:
# Raw model scores for the held-out validation matrix
adv_y_pred = final_bst.predict(val_matrix)
# Label a row 1 when its score is strictly above 0.5, otherwise 0
adv_y_final = (adv_y_pred > 0.5).astype(int)

In [32]:
# Display the thresholded 0/1 labels.
adv_y_final

array([0, 1, 0, ..., 0, 0, 0])

# Final Assessments

In [33]:
# adv_y_final is a NumPy array, so it needs the NumPy dtype name 'int64';
# the capitalised 'Int64' is a deprecated numeric-style type name in NumPy.
confusion_matrix(y_test, adv_y_final.astype('int64'))

  confusion_matrix(y_test, adv_y_final.astype('Int64'))


Unnamed: 0,0,1
0,39626,16820
1,1728,3328


In [36]:
# ROC AUC is scored on the raw predicted values (adv_y_pred), not on the thresholded labels.
roc_auc_score(y_test, adv_y_pred)

0.7466998100280762