# Part 1 of 3: Get key packages and data

In [2]:
#-- get key packages

import io
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import xgboost
from sklearn.metrics import confusion_matrix, accuracy_score, roc_curve, classification_report, roc_auc_score
from sklearn.model_selection import train_test_split, GridSearchCV

#-- make sure we can read in the file without having to scan for viruses

!pip install --upgrade google-api-python-client google-auth-httplib2 google-auth-oauthlib
from google.colab import auth
from googleapiclient.discovery import build
from googleapiclient.http import MediaIoBaseDownload

auth.authenticate_user()

def download_file_from_google_drive(file_id):
    drive_service = build('drive', 'v3')
    request = drive_service.files().get_media(fileId=file_id)
    downloaded = io.BytesIO()
    downloader = MediaIoBaseDownload(downloaded, request)
    done = False
    while done is False:
        status, done = downloader.next_chunk()
        print("Download %d%%." % int(status.progress() * 100))
    downloaded.seek(0)
    return downloaded

#-- get data

file_id_training = '1j1bCvwfs4RUU-IocMr6qwAiyvBC-YURd'
file_id_test = '1C9Y8ppzcqm3ZcZ1eGa4w7NurdV-EJh4D'

training_data = download_file_from_google_drive(file_id_training)
test_data = download_file_from_google_drive(file_id_test)

df_training = pd.read_csv(training_data)
df_test = pd.read_csv(test_data)

print('### Viewing the head of training and testing data')
print(df_training.head())
print('')
print(df_test.head())


Collecting google-api-python-client
  Downloading google_api_python_client-2.125.0-py2.py3-none-any.whl (12.5 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m12.5/12.5 MB[0m [31m56.8 MB/s[0m eta [36m0:00:00[0m
Collecting google-auth-httplib2
  Downloading google_auth_httplib2-0.2.0-py2.py3-none-any.whl (9.3 kB)
Installing collected packages: google-auth-httplib2, google-api-python-client
  Attempting uninstall: google-auth-httplib2
    Found existing installation: google-auth-httplib2 0.1.1
    Uninstalling google-auth-httplib2-0.1.1:
      Successfully uninstalled google-auth-httplib2-0.1.1
  Attempting uninstall: google-api-python-client
    Found existing installation: google-api-python-client 2.84.0
    Uninstalling google-api-python-client-2.84.0:
      Successfully uninstalled google-api-python-client-2.84.0
Successfully installed google-api-python-client-2.125.0 google-auth-httplib2-0.2.0
Download 34%.
Download 69%.
Download 100%.
Download 34%.
Download 

# Part 2 of 3: Try a model

In [3]:
#-- split data into training and testing portions

var_cols = [x for x in df_training.columns if x not in ['ID_code', 'target']]
X = df_training.loc[:, var_cols]
y = df_training.loc[:, 'target']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

test_set = [(X_test, y_test)]


In [9]:
print(df_training['target'].unique())

[0 1]


In [4]:
#-- view default parameters

dict_DefaultParams = xgboost.XGBClassifier().get_params()
print(dict_DefaultParams)


{'objective': 'binary:logistic', 'base_score': None, 'booster': None, 'callbacks': None, 'colsample_bylevel': None, 'colsample_bynode': None, 'colsample_bytree': None, 'device': None, 'early_stopping_rounds': None, 'enable_categorical': False, 'eval_metric': None, 'feature_types': None, 'gamma': None, 'grow_policy': None, 'importance_type': None, 'interaction_constraints': None, 'learning_rate': None, 'max_bin': None, 'max_cat_threshold': None, 'max_cat_to_onehot': None, 'max_delta_step': None, 'max_depth': None, 'max_leaves': None, 'min_child_weight': None, 'missing': nan, 'monotone_constraints': None, 'multi_strategy': None, 'n_estimators': None, 'n_jobs': None, 'num_parallel_tree': None, 'random_state': None, 'reg_alpha': None, 'reg_lambda': None, 'sampling_method': None, 'scale_pos_weight': None, 'subsample': None, 'tree_method': None, 'validate_parameters': None, 'verbosity': None}


In [5]:
#-- try a model

model_xgboost = xgboost.XGBClassifier(learning_rate=0.1,
                                      max_depth=5,
                                      n_estimators=5000,
                                      subsample=0.5,
                                      colsample_bytree=0.5,
                                      eval_metric='auc',
                                      verbosity=1)

model_xgboost.fit(X_train,
                  y_train,
                  early_stopping_rounds=10,
                  eval_set=test_set,
                  verbose=True)

y_preds_train = model_xgboost.predict_proba(X_train)[:, 1]
y_preds_test = model_xgboost.predict_proba(X_test)[:, 1]

auc_train = roc_auc_score(y_train, y_preds_train)
auc_test = roc_auc_score(y_test, y_preds_test)

print('### performance metrics for the model')
print(f'# auc_train: {auc_train:.3f}')
print(f'# auc_test: {auc_test:.3f}')





[0]	validation_0-auc:0.63056
[1]	validation_0-auc:0.66356
[2]	validation_0-auc:0.67492
[3]	validation_0-auc:0.68708
[4]	validation_0-auc:0.69386
[5]	validation_0-auc:0.70100
[6]	validation_0-auc:0.71445
[7]	validation_0-auc:0.72434
[8]	validation_0-auc:0.73182
[9]	validation_0-auc:0.73992
[10]	validation_0-auc:0.74229
[11]	validation_0-auc:0.74606
[12]	validation_0-auc:0.74978
[13]	validation_0-auc:0.75133
[14]	validation_0-auc:0.75790
[15]	validation_0-auc:0.76117
[16]	validation_0-auc:0.76515
[17]	validation_0-auc:0.76988
[18]	validation_0-auc:0.77291
[19]	validation_0-auc:0.77502
[20]	validation_0-auc:0.77609
[21]	validation_0-auc:0.77929
[22]	validation_0-auc:0.78212
[23]	validation_0-auc:0.78383
[24]	validation_0-auc:0.78690
[25]	validation_0-auc:0.79035
[26]	validation_0-auc:0.79246
[27]	validation_0-auc:0.79489
[28]	validation_0-auc:0.79577
[29]	validation_0-auc:0.79776
[30]	validation_0-auc:0.79890
[31]	validation_0-auc:0.80101
[32]	validation_0-auc:0.80358
[33]	validation_0-au

# Part 3 of 3: Perform hyperparameter tuning; try model with best parameters

In [20]:
#-- hyperparameter tuning using GridSearchCV

list_learning_rates = [0.02, 0.05, 0.1]
list_max_depth = [2, 3, 5]
list_n_estimators = [1000, 2000, 3000]

param_grid = {
    'learning_rate': list_learning_rates,
    'max_depth': list_max_depth,
    'n_estimators': list_n_estimators
}

#-- make a function to be used for scoring

def my_roc_auc_score(model, X, y):
  return roc_auc_score(y, model.predict_proba(X)[:, 1])

model_xgboost_hp = GridSearchCV(estimator=xgboost.XGBClassifier(subsample=0.5,
                                                               colsample_bytree=0.25,
                                                               eval_metric='auc',
                                                               use_label_encoder=False),
                               param_grid=param_grid,
                               cv=2,
                               scoring=my_roc_auc_score,
                               return_train_score=True,
                               verbose=4)

model_xgboost_hp.fit(X, y)

df_cv_results = pd.DataFrame(model_xgboost_hp.cv_results_)
df_cv_results = df_cv_results[['rank_test_score', 'mean_test_score', 'mean_train_score',
                               'param_learning_rate','param_max_depth', 'param_n_estimators']]

df_cv_results.sort_values(by='rank_test_score', inplace=True)

print('### View top 5 results after using GridSearchCV')
print(df_cv_results.head())


Fitting 2 folds for each of 27 candidates, totalling 54 fits
[CV 1/2] END learning_rate=0.02, max_depth=2, n_estimators=1000;, score=(train=0.879, test=0.855) total time=  56.9s
[CV 2/2] END learning_rate=0.02, max_depth=2, n_estimators=1000;, score=(train=0.876, test=0.861) total time=  57.2s
[CV 1/2] END learning_rate=0.02, max_depth=2, n_estimators=2000;, score=(train=0.905, test=0.877) total time= 1.9min
[CV 2/2] END learning_rate=0.02, max_depth=2, n_estimators=2000;, score=(train=0.901, test=0.883) total time= 1.9min
[CV 1/2] END learning_rate=0.02, max_depth=2, n_estimators=3000;, score=(train=0.917, test=0.886) total time= 2.8min
[CV 2/2] END learning_rate=0.02, max_depth=2, n_estimators=3000;, score=(train=0.913, test=0.892) total time= 2.8min
[CV 1/2] END learning_rate=0.02, max_depth=3, n_estimators=1000;, score=(train=0.907, test=0.869) total time= 1.1min
[CV 2/2] END learning_rate=0.02, max_depth=3, n_estimators=1000;, score=(train=0.903, test=0.875) total time= 1.1min
[CV

In [12]:
#-- try the best model

best_learning_rate = df_cv_results['learning_rate'].iloc[0]
best_max_depth = df_cv_results['max_depth'].iloc[0]
best_n_estimators = df_cv_results['n_estimators'].iloc[0]

model_xgboost = xgboost.XGBClassifier(learning_rate=best_learning_rate,
                                      max_depth=best_max_depth,
                                      n_estimators=best_n_estimators,
                                      subsample=0.5,
                                      colsample_bytree=0.25,
                                      eval_metric='auc',
                                      verbosity=1)

model_xgboost.fit(X_train,
                  y_train,
                  early_stopping_rounds=10,
                  eval_set=test_set,
                  verbose=False)

y_preds_train = model_xgboost.predict_proba(X_train)[:, 1]
y_preds_test = model_xgboost.predict_proba(X_test)[:, 1]

y_preds_binary_test = model_xgboost.predict(X_test)
model_class_report = classification_report(y_test, y_preds_binary_test)

auc_train = roc_auc_score(y_train, y_preds_train)
auc_test = roc_auc_score(y_test, y_preds_test)

print('### performance metrics for the model')
print(f'# auc_train: {auc_train:.3f}')
print(f'# auc_test: {auc_test:.3f}')
print('# classification report:')
print(model_class_report)


### performance metrics for the model
# auc_train: 0.915
# auc_test: 0.897
# classification report:
              precision    recall  f1-score   support

           0       0.92      0.99      0.96     35936
           1       0.83      0.28      0.42      4064

    accuracy                           0.92     40000
   macro avg       0.88      0.64      0.69     40000
weighted avg       0.91      0.92      0.90     40000

