In [1]:
%load_ext autoreload
%autoreload 2

# Getting Started Kaggle TPS Challenge with Tabular ML Toolkit

> A Tutorial to showcase usage of tabular_ml_toolkit (tmlt) library on Kaggle TPS Challenge Nov 2021.

> tabular_ml_toolkit is a helper library to jumpstart your machine learning project based on Tabular or Structured data.

> It comes with model parallelism and cutting edge hyperparameter search techniques.

> Under the hood TMLT uses optuna, xgboost and scikit-learn pipelines

## Install

`pip install -U tabular_ml_toolkit`

### How to Best Use tabular_ml_toolkit

Start with your favorite model and then just simply create **tmlt** with one API.

*Here we are using XGBClassifier, on  [Kaggle TPS Challenge (Nov 2021) data](https://www.kaggle.com/c/tabular-playground-series-nov-2021/data)*

In [2]:
from tabular_ml_toolkit.tmlt import *
from xgboost import XGBClassifier
import numpy as np
import gc

In [3]:
from sklearn.metrics import roc_auc_score, accuracy_score, log_loss

In [4]:
# Dataset file names and Paths
DIRECTORY_PATH = "/home/pankaj/kaggle_datasets/tpc_dec_2021/"
TRAIN_FILE = "train.csv"
TEST_FILE = "test.csv"
SAMPLE_SUB_FILE = "sample_submission.csv"
OUTPUT_PATH = "kaggle_tps_dec_output/"

#### Just point tmlt in the direction of your data

#### Let it know what are index and target columns in your tabular data

#### what kind of problem type you are trying to resolve?

In [5]:
# create tmlt
tmlt = TMLT().prepare_data(
    train_file_path= DIRECTORY_PATH + TRAIN_FILE,
    test_file_path= DIRECTORY_PATH + TEST_FILE,
    #make sure to use right index and target columns
    idx_col="Id",
    target="Cover_Type",
    random_state=42,
    problem_type="multi_class_classification",
#     nrows=4000
)


# tmlt supports only below task type:
    # "binary_classification"
    # "multi_label_classification"
    # "multi_class_classification"
    # "regression"

2021-12-18 19:11:22,313 INFO 8 cores found, model and data parallel processing should worked!
2021-12-18 19:11:37,395 INFO DataFrame Memory usage decreased to 274.66 Mb (83.9% reduction)
2021-12-18 19:11:40,013 INFO DataFrame Memory usage decreased to 67.71 Mb (83.9% reduction)
2021-12-18 19:11:40,303 INFO The least class label is :5 and value count is: 1
2021-12-18 19:11:40,307 INFO The Original X shape is: (4000000, 55)
2021-12-18 19:11:40,422 INFO The X shape after least class duplicates appends is: (4000021, 55)
2021-12-18 19:11:42,082 INFO PreProcessing will include target(s) encoding!
2021-12-18 19:11:42,190 INFO categorical columns are None, Preprocessing will done accordingly!


In [6]:
print(type(tmlt.dfl.X))
print(tmlt.dfl.X.shape)
print(type(tmlt.dfl.y))
print(tmlt.dfl.y.shape)
print(type(tmlt.dfl.X_test))
print(tmlt.dfl.X_test.shape)

<class 'pandas.core.frame.DataFrame'>
(4000021, 54)
<class 'numpy.ndarray'>
(4000021,)
<class 'pandas.core.frame.DataFrame'>
(1000000, 54)


In [7]:
tmlt.dfl.X.tail()

Unnamed: 0,Elevation,Aspect,Slope,Horizontal_Distance_To_Hydrology,Vertical_Distance_To_Hydrology,Horizontal_Distance_To_Roadways,Hillshade_9am,Hillshade_Noon,Hillshade_3pm,Horizontal_Distance_To_Fire_Points,...,Soil_Type31,Soil_Type32,Soil_Type33,Soil_Type34,Soil_Type35,Soil_Type36,Soil_Type37,Soil_Type38,Soil_Type39,Soil_Type40
4000016,2953,114,39,97,111,981,181,209,184,7633,...,0,0,0,0,0,0,0,0,0,0
4000017,2953,114,39,97,111,981,181,209,184,7633,...,0,0,0,0,0,0,0,0,0,0
4000018,2953,114,39,97,111,981,181,209,184,7633,...,0,0,0,0,0,0,0,0,0,0
4000019,2953,114,39,97,111,981,181,209,184,7633,...,0,0,0,0,0,0,0,0,0,0
4000020,2953,114,39,97,111,981,181,209,184,7633,...,0,0,0,0,0,0,0,0,0,0


In [8]:
import pandas as pd
print(dict(pd.Series(tmlt.dfl.y).value_counts()))

{1: 2262087, 0: 1468136, 2: 195712, 6: 62261, 5: 11426, 3: 377, 4: 22}


In [9]:
# gc.collect()

### Training


##### create train valid dataframes for quick preprocessing and training

In [10]:
%%time
# create train, valid split to evaulate model on valid dataset
X_train, X_valid,  y_train_np, y_valid_np =  tmlt.dfl.create_train_valid(valid_size=0.2)

CPU times: user 928 ms, sys: 161 µs, total: 929 ms
Wall time: 927 ms


In [11]:
print(X_train.shape)
print(y_train_np.shape)
print(X_valid.shape)
print(y_valid_np.shape)

# print(X_train.columns.to_list())

(3200016, 54)
(3200016,)
(800005, 54)
(800005,)


In [12]:
# check for class values see if both train and valid have same class labels
print(dict(pd.Series(y_train_np).value_counts()))
print(dict(pd.Series(y_valid_np).value_counts()))

{1: 1809706, 0: 1174364, 2: 156645, 6: 49832, 5: 9146, 3: 305, 4: 18}
{1: 452381, 0: 293772, 2: 39067, 6: 12429, 5: 2280, 3: 72, 4: 4}


##### Now PreProcess X_train, X_valid

NOTE: Preprocessing gives back numpy arrays for pandas dataframe

In [13]:
%%time
X_train_np,  X_valid_np = tmlt.pp_fit_transform(X_train, X_valid)

#after getting processed np arrays delete pandas df
del [X_train, X_valid]

print(type(X_train_np))
print(X_train_np.shape)
# print(X_train_np)
print(type(X_valid_np))
print(X_valid_np.shape)

<class 'numpy.ndarray'>
(3200016, 54)
<class 'numpy.ndarray'>
(800005, 54)
CPU times: user 2.34 s, sys: 464 ms, total: 2.81 s
Wall time: 2.81 s


In [14]:
gc.collect()

24

#### Create a base xgb classifier model with your best guess params

In [15]:
# xgb_params = {
#     # your best guess params
#     'learning_rate':0.2,
#     'eval_metric':'mlogloss',
#     # must for xgb classifier otherwise warning will be shown
#     'use_label_encoder':False,
#     # because 42 is the answer for all the randomness in this universe
#     'random_state':42,
#     #for GPU
#     'tree_method': 'gpu_hist',
#     'predictor': 'gpu_predictor',
# }

# xgb_model = XGBClassifier(**xgb_params)

In [16]:
xgb_params = {
    'use_label_encoder': False,
    'learning_rate': 0.031717385787086945,
    'n_estimators': 700,
    'reg_lambda': 0.0818505295760825,
    'reg_alpha': 0.003802319140125198,
    'subsample': 0.7189482058736735,
    'colsample_bytree': 0.5676906823895052,
    'max_depth': 4,
    'tree_method': 'gpu_hist',
    'gpu_id': 0,
    'predictor': 'gpu_predictor',
}

xgb_model = XGBClassifier(**xgb_params)

In [17]:
# %%time
# # Now do model training
# xgb_model.fit(X_train_np, y_train_np,
#               verbose=True,
#               #detect & avoid overfitting
#               eval_set=[(X_valid_np, y_valid_np)],
#               eval_metric="mlogloss",
#               early_stopping_rounds=300
#              )

In [18]:
%%time
# Now do model training
xgb_model.fit(X_train_np, y_train_np,
              verbose=True,
              #detect & avoid overfitting
              eval_set=[(X_valid_np, y_valid_np)],
              eval_metric="mlogloss",
              early_stopping_rounds=300
             )

[0]	validation_0-mlogloss:1.87675
[1]	validation_0-mlogloss:1.79388
[2]	validation_0-mlogloss:1.73743
[3]	validation_0-mlogloss:1.69880
[4]	validation_0-mlogloss:1.65096
[5]	validation_0-mlogloss:1.61699
[6]	validation_0-mlogloss:1.58500
[7]	validation_0-mlogloss:1.52524
[8]	validation_0-mlogloss:1.49505
[9]	validation_0-mlogloss:1.44225
[10]	validation_0-mlogloss:1.41825
[11]	validation_0-mlogloss:1.36990
[12]	validation_0-mlogloss:1.33800
[13]	validation_0-mlogloss:1.31350
[14]	validation_0-mlogloss:1.29060
[15]	validation_0-mlogloss:1.25053
[16]	validation_0-mlogloss:1.22203
[17]	validation_0-mlogloss:1.18464
[18]	validation_0-mlogloss:1.16007
[19]	validation_0-mlogloss:1.12589
[20]	validation_0-mlogloss:1.10305
[21]	validation_0-mlogloss:1.08530
[22]	validation_0-mlogloss:1.06283
[23]	validation_0-mlogloss:1.03844
[24]	validation_0-mlogloss:1.02350
[25]	validation_0-mlogloss:1.00498
[26]	validation_0-mlogloss:0.99047
[27]	validation_0-mlogloss:0.96967
[28]	validation_0-mlogloss:0.9

[231]	validation_0-mlogloss:0.19507
[232]	validation_0-mlogloss:0.19473
[233]	validation_0-mlogloss:0.19438
[234]	validation_0-mlogloss:0.19389
[235]	validation_0-mlogloss:0.19355
[236]	validation_0-mlogloss:0.19333
[237]	validation_0-mlogloss:0.19301
[238]	validation_0-mlogloss:0.19266
[239]	validation_0-mlogloss:0.19243
[240]	validation_0-mlogloss:0.19206
[241]	validation_0-mlogloss:0.19185
[242]	validation_0-mlogloss:0.19157
[243]	validation_0-mlogloss:0.19126
[244]	validation_0-mlogloss:0.19094
[245]	validation_0-mlogloss:0.19061
[246]	validation_0-mlogloss:0.19021
[247]	validation_0-mlogloss:0.18983
[248]	validation_0-mlogloss:0.18942
[249]	validation_0-mlogloss:0.18909
[250]	validation_0-mlogloss:0.18877
[251]	validation_0-mlogloss:0.18849
[252]	validation_0-mlogloss:0.18812
[253]	validation_0-mlogloss:0.18775
[254]	validation_0-mlogloss:0.18744
[255]	validation_0-mlogloss:0.18703
[256]	validation_0-mlogloss:0.18669
[257]	validation_0-mlogloss:0.18629
[258]	validation_0-mlogloss:

[459]	validation_0-mlogloss:0.15305
[460]	validation_0-mlogloss:0.15296
[461]	validation_0-mlogloss:0.15286
[462]	validation_0-mlogloss:0.15274
[463]	validation_0-mlogloss:0.15267
[464]	validation_0-mlogloss:0.15260
[465]	validation_0-mlogloss:0.15247
[466]	validation_0-mlogloss:0.15241
[467]	validation_0-mlogloss:0.15231
[468]	validation_0-mlogloss:0.15224
[469]	validation_0-mlogloss:0.15217
[470]	validation_0-mlogloss:0.15208
[471]	validation_0-mlogloss:0.15202
[472]	validation_0-mlogloss:0.15196
[473]	validation_0-mlogloss:0.15190
[474]	validation_0-mlogloss:0.15173
[475]	validation_0-mlogloss:0.15157
[476]	validation_0-mlogloss:0.15148
[477]	validation_0-mlogloss:0.15137
[478]	validation_0-mlogloss:0.15129
[479]	validation_0-mlogloss:0.15121
[480]	validation_0-mlogloss:0.15113
[481]	validation_0-mlogloss:0.15102
[482]	validation_0-mlogloss:0.15094
[483]	validation_0-mlogloss:0.15083
[484]	validation_0-mlogloss:0.15072
[485]	validation_0-mlogloss:0.15058
[486]	validation_0-mlogloss:

[687]	validation_0-mlogloss:0.13608
[688]	validation_0-mlogloss:0.13600
[689]	validation_0-mlogloss:0.13595
[690]	validation_0-mlogloss:0.13589
[691]	validation_0-mlogloss:0.13581
[692]	validation_0-mlogloss:0.13574
[693]	validation_0-mlogloss:0.13569
[694]	validation_0-mlogloss:0.13565
[695]	validation_0-mlogloss:0.13559
[696]	validation_0-mlogloss:0.13554
[697]	validation_0-mlogloss:0.13547
[698]	validation_0-mlogloss:0.13543
[699]	validation_0-mlogloss:0.13537
CPU times: user 1min 11s, sys: 744 ms, total: 1min 12s
Wall time: 1min 7s


In [19]:
%%time
#predict
preds = xgb_model.predict(X_valid_np)
preds_probs = xgb_model.predict_proba(X_valid_np)

# Metrics
auc = roc_auc_score(y_valid_np, preds_probs, multi_class='ovr')
acc = accuracy_score(y_valid_np, preds)
lg_loss = log_loss(y_valid_np, preds_probs)

print(f"AUC is : {auc} , log loss is: {lg_loss}, while Accuracy is : {acc} ")

AUC is : 0.9967271198825289 , log loss is: 0.13537339522625522, while Accuracy is : 0.9489115693026918 
CPU times: user 4.87 s, sys: 140 ms, total: 5.01 s
Wall time: 2.83 s


In [20]:
# gc.collect()

#### Let's do Optuna based HyperParameter search to get best params for fit

In [21]:
# **Just make sure to supply an output directory path so hyperparameter search is saved**
study = tmlt.do_xgb_optuna_optimization(X_train_np, y_train_np, X_valid_np, y_valid_np, 
                                        optuna_db_path=OUTPUT_PATH, opt_timeout=360, use_gpu=True, verbose=True)
print(study.best_trial)

2021-12-18 19:12:56,867 INFO Optimization Direction is: maximize
[32m[I 2021-12-18 19:12:57,199][0m A new study created in RDB with name: tmlt_autoxgb[0m
2021-12-18 19:12:57,428 INFO final params {'learning_rate': 0.08320384031444016, 'n_estimators': 150, 'reg_lambda': 0.09967427846688841, 'reg_alpha': 2.0919151416672363e-08, 'subsample': 0.9055617386349193, 'colsample_bytree': 0.7918001490881874, 'max_depth': 3, 'tree_method': 'gpu_hist', 'gpu_id': 0, 'predictor': 'gpu_predictor'}
2021-12-18 19:12:57,430 INFO Training Started!


Parameters: { "eval_set" } might not be used.

  This could be a false alarm, with some parameters getting used by language bindings but
  then being mistakenly passed down to XGBoost core, or some parameter actually being used
  but getting flagged wrongly here. Please open an issue if you find any such cases.




2021-12-18 19:13:11,331 INFO Training Ended!
2021-12-18 19:13:11,873 INFO accuracy_score: 0.9323116730520434
[32m[I 2021-12-18 19:13:11,919][0m Trial 0 finished with value: 0.9323116730520434 and parameters: {'learning_rate': 0.08320384031444016, 'n_estimators': 150, 'reg_lambda': 0.09967427846688841, 'reg_alpha': 2.0919151416672363e-08, 'subsample': 0.9055617386349193, 'colsample_bytree': 0.7918001490881874, 'max_depth': 3, 'early_stopping_rounds': 400}. Best is trial 0 with value: 0.9323116730520434.[0m
2021-12-18 19:13:12,098 INFO final params {'learning_rate': 0.09847921948714214, 'n_estimators': 70, 'reg_lambda': 0.0016673722721207227, 'reg_alpha': 10.72463097499535, 'subsample': 0.7347006549351925, 'colsample_bytree': 0.7877835708218863, 'max_depth': 8, 'tree_method': 'gpu_hist', 'gpu_id': 0, 'predictor': 'gpu_predictor'}
2021-12-18 19:13:12,099 INFO Training Started!


Parameters: { "eval_set" } might not be used.

  This could be a false alarm, with some parameters getting used by language bindings but
  then being mistakenly passed down to XGBoost core, or some parameter actually being used
  but getting flagged wrongly here. Please open an issue if you find any such cases.




2021-12-18 19:13:23,827 INFO Training Ended!
2021-12-18 19:13:24,348 INFO accuracy_score: 0.9544852844669721
[32m[I 2021-12-18 19:13:24,398][0m Trial 1 finished with value: 0.9544852844669721 and parameters: {'learning_rate': 0.09847921948714214, 'n_estimators': 70, 'reg_lambda': 0.0016673722721207227, 'reg_alpha': 10.72463097499535, 'subsample': 0.7347006549351925, 'colsample_bytree': 0.7877835708218863, 'max_depth': 8, 'early_stopping_rounds': 284}. Best is trial 1 with value: 0.9544852844669721.[0m
2021-12-18 19:13:24,560 INFO final params {'learning_rate': 0.07270263838593004, 'n_estimators': 70, 'reg_lambda': 4.158686107543348, 'reg_alpha': 4.832344216776012e-05, 'subsample': 0.7744254884262353, 'colsample_bytree': 0.5991862306184299, 'max_depth': 4, 'tree_method': 'gpu_hist', 'gpu_id': 0, 'predictor': 'gpu_predictor'}
2021-12-18 19:13:24,562 INFO Training Started!


Parameters: { "eval_set" } might not be used.

  This could be a false alarm, with some parameters getting used by language bindings but
  then being mistakenly passed down to XGBoost core, or some parameter actually being used
  but getting flagged wrongly here. Please open an issue if you find any such cases.




2021-12-18 19:13:32,573 INFO Training Ended!
2021-12-18 19:13:33,080 INFO accuracy_score: 0.9171005181217617
[32m[I 2021-12-18 19:13:33,111][0m Trial 2 finished with value: 0.9171005181217617 and parameters: {'learning_rate': 0.07270263838593004, 'n_estimators': 70, 'reg_lambda': 4.158686107543348, 'reg_alpha': 4.832344216776012e-05, 'subsample': 0.7744254884262353, 'colsample_bytree': 0.5991862306184299, 'max_depth': 4, 'early_stopping_rounds': 384}. Best is trial 1 with value: 0.9544852844669721.[0m
2021-12-18 19:13:33,291 INFO final params {'learning_rate': 0.060028759871909115, 'n_estimators': 200, 'reg_lambda': 0.02369049033959721, 'reg_alpha': 0.017469890276853566, 'subsample': 0.42167849406820923, 'colsample_bytree': 0.8558199886275242, 'max_depth': 8, 'tree_method': 'gpu_hist', 'gpu_id': 0, 'predictor': 'gpu_predictor'}
2021-12-18 19:13:33,292 INFO Training Started!


Parameters: { "eval_set" } might not be used.

  This could be a false alarm, with some parameters getting used by language bindings but
  then being mistakenly passed down to XGBoost core, or some parameter actually being used
  but getting flagged wrongly here. Please open an issue if you find any such cases.




2021-12-18 19:14:06,388 INFO Training Ended!
2021-12-18 19:14:07,042 INFO accuracy_score: 0.9589290066937082
[32m[I 2021-12-18 19:14:07,102][0m Trial 3 finished with value: 0.9589290066937082 and parameters: {'learning_rate': 0.060028759871909115, 'n_estimators': 200, 'reg_lambda': 0.02369049033959721, 'reg_alpha': 0.017469890276853566, 'subsample': 0.42167849406820923, 'colsample_bytree': 0.8558199886275242, 'max_depth': 8, 'early_stopping_rounds': 381}. Best is trial 3 with value: 0.9589290066937082.[0m
2021-12-18 19:14:07,277 INFO final params {'learning_rate': 0.22460180743878044, 'n_estimators': 150, 'reg_lambda': 3.144893773482e-05, 'reg_alpha': 0.00023758525471934383, 'subsample': 0.2640308356915845, 'colsample_bytree': 0.7501402977241696, 'max_depth': 7, 'tree_method': 'gpu_hist', 'gpu_id': 0, 'predictor': 'gpu_predictor'}
2021-12-18 19:14:07,279 INFO Training Started!


Parameters: { "eval_set" } might not be used.

  This could be a false alarm, with some parameters getting used by language bindings but
  then being mistakenly passed down to XGBoost core, or some parameter actually being used
  but getting flagged wrongly here. Please open an issue if you find any such cases.




2021-12-18 19:14:29,981 INFO Training Ended!
2021-12-18 19:14:30,586 INFO accuracy_score: 0.960716495521903
[32m[I 2021-12-18 19:14:30,611][0m Trial 4 finished with value: 0.960716495521903 and parameters: {'learning_rate': 0.22460180743878044, 'n_estimators': 150, 'reg_lambda': 3.144893773482e-05, 'reg_alpha': 0.00023758525471934383, 'subsample': 0.2640308356915845, 'colsample_bytree': 0.7501402977241696, 'max_depth': 7, 'early_stopping_rounds': 384}. Best is trial 4 with value: 0.960716495521903.[0m
2021-12-18 19:14:30,783 INFO final params {'learning_rate': 0.08751704194963776, 'n_estimators': 70, 'reg_lambda': 3.7769028139024638e-06, 'reg_alpha': 0.008098924177584226, 'subsample': 0.5573029584457505, 'colsample_bytree': 0.1442975801358519, 'max_depth': 1, 'tree_method': 'gpu_hist', 'gpu_id': 0, 'predictor': 'gpu_predictor'}
2021-12-18 19:14:30,784 INFO Training Started!


Parameters: { "eval_set" } might not be used.

  This could be a false alarm, with some parameters getting used by language bindings but
  then being mistakenly passed down to XGBoost core, or some parameter actually being used
  but getting flagged wrongly here. Please open an issue if you find any such cases.




2021-12-18 19:14:36,683 INFO Training Ended!
2021-12-18 19:14:37,164 INFO accuracy_score: 0.8955606527459203
[32m[I 2021-12-18 19:14:37,187][0m Trial 5 finished with value: 0.8955606527459203 and parameters: {'learning_rate': 0.08751704194963776, 'n_estimators': 70, 'reg_lambda': 3.7769028139024638e-06, 'reg_alpha': 0.008098924177584226, 'subsample': 0.5573029584457505, 'colsample_bytree': 0.1442975801358519, 'max_depth': 1, 'early_stopping_rounds': 232}. Best is trial 4 with value: 0.960716495521903.[0m
2021-12-18 19:14:37,377 INFO final params {'learning_rate': 0.1355290970920128, 'n_estimators': 200, 'reg_lambda': 4.718494577681341, 'reg_alpha': 0.00011104324313160557, 'subsample': 0.5895752319252384, 'colsample_bytree': 0.20930792992141864, 'max_depth': 9, 'tree_method': 'gpu_hist', 'gpu_id': 0, 'predictor': 'gpu_predictor'}
2021-12-18 19:14:37,378 INFO Training Started!


Parameters: { "eval_set" } might not be used.

  This could be a false alarm, with some parameters getting used by language bindings but
  then being mistakenly passed down to XGBoost core, or some parameter actually being used
  but getting flagged wrongly here. Please open an issue if you find any such cases.




2021-12-18 19:15:13,576 INFO Training Ended!
2021-12-18 19:15:14,330 INFO accuracy_score: 0.9510978056387147
[32m[I 2021-12-18 19:15:14,355][0m Trial 6 finished with value: 0.9510978056387147 and parameters: {'learning_rate': 0.1355290970920128, 'n_estimators': 200, 'reg_lambda': 4.718494577681341, 'reg_alpha': 0.00011104324313160557, 'subsample': 0.5895752319252384, 'colsample_bytree': 0.20930792992141864, 'max_depth': 9, 'early_stopping_rounds': 427}. Best is trial 4 with value: 0.960716495521903.[0m
2021-12-18 19:15:14,536 INFO final params {'learning_rate': 0.039131057793411916, 'n_estimators': 200, 'reg_lambda': 0.05898041689402695, 'reg_alpha': 0.18272546849762208, 'subsample': 0.5542545793591697, 'colsample_bytree': 0.37832542391291635, 'max_depth': 7, 'tree_method': 'gpu_hist', 'gpu_id': 0, 'predictor': 'gpu_predictor'}
2021-12-18 19:15:14,537 INFO Training Started!


Parameters: { "eval_set" } might not be used.

  This could be a false alarm, with some parameters getting used by language bindings but
  then being mistakenly passed down to XGBoost core, or some parameter actually being used
  but getting flagged wrongly here. Please open an issue if you find any such cases.




2021-12-18 19:15:42,179 INFO Training Ended!
2021-12-18 19:15:42,803 INFO accuracy_score: 0.9403553727789201
[32m[I 2021-12-18 19:15:42,833][0m Trial 7 finished with value: 0.9403553727789201 and parameters: {'learning_rate': 0.039131057793411916, 'n_estimators': 200, 'reg_lambda': 0.05898041689402695, 'reg_alpha': 0.18272546849762208, 'subsample': 0.5542545793591697, 'colsample_bytree': 0.37832542391291635, 'max_depth': 7, 'early_stopping_rounds': 397}. Best is trial 4 with value: 0.960716495521903.[0m
2021-12-18 19:15:43,020 INFO final params {'learning_rate': 0.0231794139366893, 'n_estimators': 200, 'reg_lambda': 0.05514868823726771, 'reg_alpha': 1.706506915453636e-06, 'subsample': 0.7763880301240808, 'colsample_bytree': 0.27947864438356645, 'max_depth': 7, 'tree_method': 'gpu_hist', 'gpu_id': 0, 'predictor': 'gpu_predictor'}
2021-12-18 19:15:43,020 INFO Training Started!


Parameters: { "eval_set" } might not be used.

  This could be a false alarm, with some parameters getting used by language bindings but
  then being mistakenly passed down to XGBoost core, or some parameter actually being used
  but getting flagged wrongly here. Please open an issue if you find any such cases.




2021-12-18 19:16:11,649 INFO Training Ended!
2021-12-18 19:16:12,289 INFO accuracy_score: 0.9118980506371835
[32m[I 2021-12-18 19:16:12,334][0m Trial 8 finished with value: 0.9118980506371835 and parameters: {'learning_rate': 0.0231794139366893, 'n_estimators': 200, 'reg_lambda': 0.05514868823726771, 'reg_alpha': 1.706506915453636e-06, 'subsample': 0.7763880301240808, 'colsample_bytree': 0.27947864438356645, 'max_depth': 7, 'early_stopping_rounds': 222}. Best is trial 4 with value: 0.960716495521903.[0m
2021-12-18 19:16:12,520 INFO final params {'learning_rate': 0.1495673382033785, 'n_estimators': 70, 'reg_lambda': 5.014993116284377e-05, 'reg_alpha': 8.058076988112284e-06, 'subsample': 0.3111559674675738, 'colsample_bytree': 0.29766445478666503, 'max_depth': 6, 'tree_method': 'gpu_hist', 'gpu_id': 0, 'predictor': 'gpu_predictor'}
2021-12-18 19:16:12,521 INFO Training Started!


Parameters: { "eval_set" } might not be used.

  This could be a false alarm, with some parameters getting used by language bindings but
  then being mistakenly passed down to XGBoost core, or some parameter actually being used
  but getting flagged wrongly here. Please open an issue if you find any such cases.




2021-12-18 19:16:23,427 INFO Training Ended!
2021-12-18 19:16:23,951 INFO accuracy_score: 0.9349541565365217
[32m[I 2021-12-18 19:16:23,991][0m Trial 9 finished with value: 0.9349541565365217 and parameters: {'learning_rate': 0.1495673382033785, 'n_estimators': 70, 'reg_lambda': 5.014993116284377e-05, 'reg_alpha': 8.058076988112284e-06, 'subsample': 0.3111559674675738, 'colsample_bytree': 0.29766445478666503, 'max_depth': 6, 'early_stopping_rounds': 222}. Best is trial 4 with value: 0.960716495521903.[0m
2021-12-18 19:16:24,175 INFO final params {'learning_rate': 0.23192766617210214, 'n_estimators': 150, 'reg_lambda': 4.4357194951413e-08, 'reg_alpha': 3.638560252776617e-08, 'subsample': 0.144390321769515, 'colsample_bytree': 0.9983270521195565, 'max_depth': 4, 'tree_method': 'gpu_hist', 'gpu_id': 0, 'predictor': 'gpu_predictor'}
2021-12-18 19:16:24,177 INFO Training Started!


Parameters: { "eval_set" } might not be used.

  This could be a false alarm, with some parameters getting used by language bindings but
  then being mistakenly passed down to XGBoost core, or some parameter actually being used
  but getting flagged wrongly here. Please open an issue if you find any such cases.




2021-12-18 19:16:39,890 INFO Training Ended!
2021-12-18 19:16:40,444 INFO accuracy_score: 0.9558877757014018
[32m[I 2021-12-18 19:16:40,475][0m Trial 10 finished with value: 0.9558877757014018 and parameters: {'learning_rate': 0.23192766617210214, 'n_estimators': 150, 'reg_lambda': 4.4357194951413e-08, 'reg_alpha': 3.638560252776617e-08, 'subsample': 0.144390321769515, 'colsample_bytree': 0.9983270521195565, 'max_depth': 4, 'early_stopping_rounds': 108}. Best is trial 4 with value: 0.960716495521903.[0m
2021-12-18 19:16:40,693 INFO final params {'learning_rate': 0.011032665300184525, 'n_estimators': 150, 'reg_lambda': 0.00015238647209723878, 'reg_alpha': 0.030847608009326028, 'subsample': 0.3206318888473396, 'colsample_bytree': 0.7796814725084281, 'max_depth': 9, 'tree_method': 'gpu_hist', 'gpu_id': 0, 'predictor': 'gpu_predictor'}
2021-12-18 19:16:40,693 INFO Training Started!


Parameters: { "eval_set" } might not be used.

  This could be a false alarm, with some parameters getting used by language bindings but
  then being mistakenly passed down to XGBoost core, or some parameter actually being used
  but getting flagged wrongly here. Please open an issue if you find any such cases.




2021-12-18 19:17:14,630 INFO Training Ended!
2021-12-18 19:17:15,251 INFO accuracy_score: 0.9407066205836213
[32m[I 2021-12-18 19:17:15,283][0m Trial 11 finished with value: 0.9407066205836213 and parameters: {'learning_rate': 0.011032665300184525, 'n_estimators': 150, 'reg_lambda': 0.00015238647209723878, 'reg_alpha': 0.030847608009326028, 'subsample': 0.3206318888473396, 'colsample_bytree': 0.7796814725084281, 'max_depth': 9, 'early_stopping_rounds': 495}. Best is trial 4 with value: 0.960716495521903.[0m
2021-12-18 19:17:15,469 INFO final params {'learning_rate': 0.03509235657461732, 'n_estimators': 200, 'reg_lambda': 1.5429314627735677e-07, 'reg_alpha': 1.9609323751003085, 'subsample': 0.33308819373036525, 'colsample_bytree': 0.9795787157619746, 'max_depth': 6, 'tree_method': 'gpu_hist', 'gpu_id': 0, 'predictor': 'gpu_predictor'}
2021-12-18 19:17:15,469 INFO Training Started!


Parameters: { "eval_set" } might not be used.

  This could be a false alarm, with some parameters getting used by language bindings but
  then being mistakenly passed down to XGBoost core, or some parameter actually being used
  but getting flagged wrongly here. Please open an issue if you find any such cases.




2021-12-18 19:17:38,402 INFO Training Ended!
2021-12-18 19:17:39,012 INFO accuracy_score: 0.9481565740214124
[32m[I 2021-12-18 19:17:39,049][0m Trial 12 finished with value: 0.9481565740214124 and parameters: {'learning_rate': 0.03509235657461732, 'n_estimators': 200, 'reg_lambda': 1.5429314627735677e-07, 'reg_alpha': 1.9609323751003085, 'subsample': 0.33308819373036525, 'colsample_bytree': 0.9795787157619746, 'max_depth': 6, 'early_stopping_rounds': 335}. Best is trial 4 with value: 0.960716495521903.[0m
2021-12-18 19:17:39,199 INFO final params {'learning_rate': 0.22622805120653375, 'n_estimators': 150, 'reg_lambda': 0.001672271663050091, 'reg_alpha': 0.006946534672337779, 'subsample': 0.17397266793143232, 'colsample_bytree': 0.582982867003998, 'max_depth': 8, 'tree_method': 'gpu_hist', 'gpu_id': 0, 'predictor': 'gpu_predictor'}
2021-12-18 19:17:39,200 INFO Training Started!


Parameters: { "eval_set" } might not be used.

  This could be a false alarm, with some parameters getting used by language bindings but
  then being mistakenly passed down to XGBoost core, or some parameter actually being used
  but getting flagged wrongly here. Please open an issue if you find any such cases.




2021-12-18 19:18:04,393 INFO Training Ended!
2021-12-18 19:18:05,010 INFO accuracy_score: 0.9604514971781426
[32m[I 2021-12-18 19:18:05,034][0m Trial 13 finished with value: 0.9604514971781426 and parameters: {'learning_rate': 0.22622805120653375, 'n_estimators': 150, 'reg_lambda': 0.001672271663050091, 'reg_alpha': 0.006946534672337779, 'subsample': 0.17397266793143232, 'colsample_bytree': 0.582982867003998, 'max_depth': 8, 'early_stopping_rounds': 326}. Best is trial 4 with value: 0.960716495521903.[0m
2021-12-18 19:18:05,240 INFO final params {'learning_rate': 0.242002833433415, 'n_estimators': 150, 'reg_lambda': 8.897609225819913e-06, 'reg_alpha': 0.00032079679576163105, 'subsample': 0.10316480197057348, 'colsample_bytree': 0.5573988234896001, 'max_depth': 6, 'tree_method': 'gpu_hist', 'gpu_id': 0, 'predictor': 'gpu_predictor'}
2021-12-18 19:18:05,240 INFO Training Started!


Parameters: { "eval_set" } might not be used.

  This could be a false alarm, with some parameters getting used by language bindings but
  then being mistakenly passed down to XGBoost core, or some parameter actually being used
  but getting flagged wrongly here. Please open an issue if you find any such cases.




2021-12-18 19:18:24,709 INFO Training Ended!
2021-12-18 19:18:25,315 INFO accuracy_score: 0.9587240079749502
[32m[I 2021-12-18 19:18:25,344][0m Trial 14 finished with value: 0.9587240079749502 and parameters: {'learning_rate': 0.242002833433415, 'n_estimators': 150, 'reg_lambda': 8.897609225819913e-06, 'reg_alpha': 0.00032079679576163105, 'subsample': 0.10316480197057348, 'colsample_bytree': 0.5573988234896001, 'max_depth': 6, 'early_stopping_rounds': 308}. Best is trial 4 with value: 0.960716495521903.[0m
2021-12-18 19:18:25,558 INFO final params {'learning_rate': 0.15083320474801157, 'n_estimators': 150, 'reg_lambda': 0.0008864272430670432, 'reg_alpha': 0.0015330465205195625, 'subsample': 0.21439902518585044, 'colsample_bytree': 0.6759549936116134, 'max_depth': 8, 'tree_method': 'gpu_hist', 'gpu_id': 0, 'predictor': 'gpu_predictor'}
2021-12-18 19:18:25,560 INFO Training Started!


Parameters: { "eval_set" } might not be used.

  This could be a false alarm, with some parameters getting used by language bindings but
  then being mistakenly passed down to XGBoost core, or some parameter actually being used
  but getting flagged wrongly here. Please open an issue if you find any such cases.




2021-12-18 19:18:51,351 INFO Training Ended!
2021-12-18 19:18:51,952 INFO accuracy_score: 0.9603652477172018
[32m[I 2021-12-18 19:18:51,998][0m Trial 15 finished with value: 0.9603652477172018 and parameters: {'learning_rate': 0.15083320474801157, 'n_estimators': 150, 'reg_lambda': 0.0008864272430670432, 'reg_alpha': 0.0015330465205195625, 'subsample': 0.21439902518585044, 'colsample_bytree': 0.6759549936116134, 'max_depth': 8, 'early_stopping_rounds': 471}. Best is trial 4 with value: 0.960716495521903.[0m
2021-12-18 19:18:52,200 INFO final params {'learning_rate': 0.17758705988155682, 'n_estimators': 150, 'reg_lambda': 7.029382609118037e-07, 'reg_alpha': 7.425489034807764e-07, 'subsample': 0.4261127288921614, 'colsample_bytree': 0.5056735481121135, 'max_depth': 5, 'tree_method': 'gpu_hist', 'gpu_id': 0, 'predictor': 'gpu_predictor'}
2021-12-18 19:18:52,202 INFO Training Started!


Parameters: { "eval_set" } might not be used.

  This could be a false alarm, with some parameters getting used by language bindings but
  then being mistakenly passed down to XGBoost core, or some parameter actually being used
  but getting flagged wrongly here. Please open an issue if you find any such cases.




2021-12-18 19:19:10,088 INFO Training Ended!
2021-12-18 19:19:10,652 INFO accuracy_score: 0.9553402791232555
[32m[I 2021-12-18 19:19:10,681][0m Trial 16 finished with value: 0.9553402791232555 and parameters: {'learning_rate': 0.17758705988155682, 'n_estimators': 150, 'reg_lambda': 7.029382609118037e-07, 'reg_alpha': 7.425489034807764e-07, 'subsample': 0.4261127288921614, 'colsample_bytree': 0.5056735481121135, 'max_depth': 5, 'early_stopping_rounds': 332}. Best is trial 4 with value: 0.960716495521903.[0m


FrozenTrial(number=4, values=[0.960716495521903], datetime_start=datetime.datetime(2021, 12, 18, 19, 14, 7, 108485), datetime_complete=datetime.datetime(2021, 12, 18, 19, 14, 30, 587883), params={'colsample_bytree': 0.7501402977241696, 'early_stopping_rounds': 384, 'learning_rate': 0.22460180743878044, 'max_depth': 7, 'n_estimators': 150, 'reg_alpha': 0.00023758525471934383, 'reg_lambda': 3.144893773482e-05, 'subsample': 0.2640308356915845}, distributions={'colsample_bytree': UniformDistribution(high=1.0, low=0.1), 'early_stopping_rounds': IntUniformDistribution(high=500, low=100, step=1), 'learning_rate': LogUniformDistribution(high=0.25, low=0.01), 'max_depth': IntUniformDistribution(high=9, low=1, step=1), 'n_estimators': CategoricalDistribution(choices=(70, 150, 200)), 'reg_alpha': LogUniformDistribution(high=100.0, low=1e-08), 'reg_lambda': LogUniformDistribution(high=100.0, low=1e-08), 'subsample': UniformDistribution(high=1.0, low=0.1)}, user_attrs={}, system_attrs={}, intermedi

##### now update the meta model with best params from study and then update the sklearn pipeline with this new model

In [22]:
xgb_params.update(study.best_trial.params)
print("xgb_params", xgb_params)
updated_xgb_model = XGBClassifier(**xgb_params)

xgb_params {'use_label_encoder': False, 'learning_rate': 0.22460180743878044, 'n_estimators': 150, 'reg_lambda': 3.144893773482e-05, 'reg_alpha': 0.00023758525471934383, 'subsample': 0.2640308356915845, 'colsample_bytree': 0.7501402977241696, 'max_depth': 7, 'tree_method': 'gpu_hist', 'gpu_id': 0, 'predictor': 'gpu_predictor', 'early_stopping_rounds': 384}


#### Let's Use K-Fold Training with best params

In [23]:
%%time
X_np, X_test_np = tmlt.pp_fit_transform(tmlt.dfl.X, tmlt.dfl.X_test)

#TODO: NEED TO THINK ABOUT IT
y_np = tmlt.dfl.y

CPU times: user 3.01 s, sys: 668 ms, total: 3.67 s
Wall time: 3.67 s


In [24]:
%%time
# k-fold training
xgb_model_metrics_score, xgb_model_test_preds = tmlt.do_kfold_training(X_np,
                                                                       y_np,
                                                                       X_test=X_test_np,
                                                                       n_splits=5,
                                                                       model=updated_xgb_model)

2021-12-18 19:19:15,428 INFO Training Started!


Parameters: { "early_stopping_rounds" } might not be used.

  This could be a false alarm, with some parameters getting used by language bindings but
  then being mistakenly passed down to XGBoost core, or some parameter actually being used
  but getting flagged wrongly here. Please open an issue if you find any such cases.




2021-12-18 19:19:39,192 INFO Training Finished!
2021-12-18 19:19:39,193 INFO Predicting Val Probablities!
2021-12-18 19:19:39,669 INFO Predicting Val Score!
2021-12-18 19:19:40,173 INFO fold: 1 accuracy_score : 0.9600802494984406
2021-12-18 19:19:40,173 INFO Predicting Test Probablities!
2021-12-18 19:19:41,443 INFO Training Started!


Parameters: { "early_stopping_rounds" } might not be used.

  This could be a false alarm, with some parameters getting used by language bindings but
  then being mistakenly passed down to XGBoost core, or some parameter actually being used
  but getting flagged wrongly here. Please open an issue if you find any such cases.




2021-12-18 19:20:05,108 INFO Training Finished!
2021-12-18 19:20:05,109 INFO Predicting Val Probablities!
2021-12-18 19:20:05,590 INFO Predicting Val Score!
2021-12-18 19:20:06,101 INFO fold: 2 accuracy_score : 0.960360198199009
2021-12-18 19:20:06,102 INFO Predicting Test Probablities!
2021-12-18 19:20:07,316 INFO Training Started!


Parameters: { "early_stopping_rounds" } might not be used.

  This could be a false alarm, with some parameters getting used by language bindings but
  then being mistakenly passed down to XGBoost core, or some parameter actually being used
  but getting flagged wrongly here. Please open an issue if you find any such cases.




2021-12-18 19:20:31,191 INFO Training Finished!
2021-12-18 19:20:31,192 INFO Predicting Val Probablities!
2021-12-18 19:20:31,668 INFO Predicting Val Score!
2021-12-18 19:20:32,171 INFO fold: 3 accuracy_score : 0.9598439507802461
2021-12-18 19:20:32,171 INFO Predicting Test Probablities!
2021-12-18 19:20:33,417 INFO Training Started!


Parameters: { "early_stopping_rounds" } might not be used.

  This could be a false alarm, with some parameters getting used by language bindings but
  then being mistakenly passed down to XGBoost core, or some parameter actually being used
  but getting flagged wrongly here. Please open an issue if you find any such cases.




2021-12-18 19:20:57,212 INFO Training Finished!
2021-12-18 19:20:57,213 INFO Predicting Val Probablities!
2021-12-18 19:20:57,695 INFO Predicting Val Score!
2021-12-18 19:20:58,207 INFO fold: 4 accuracy_score : 0.9600051999740001
2021-12-18 19:20:58,208 INFO Predicting Test Probablities!
2021-12-18 19:20:59,423 INFO Training Started!


Parameters: { "early_stopping_rounds" } might not be used.

  This could be a false alarm, with some parameters getting used by language bindings but
  then being mistakenly passed down to XGBoost core, or some parameter actually being used
  but getting flagged wrongly here. Please open an issue if you find any such cases.




2021-12-18 19:21:23,212 INFO Training Finished!
2021-12-18 19:21:23,213 INFO Predicting Val Probablities!
2021-12-18 19:21:23,729 INFO Predicting Val Score!
2021-12-18 19:21:24,243 INFO fold: 5 accuracy_score : 0.959718951405243
2021-12-18 19:21:24,244 INFO Predicting Test Probablities!
2021-12-18 19:21:24,943 INFO  Mean Metrics Results from all Folds are: {'accuracy_score': 0.9600017099713878}


CPU times: user 2min 39s, sys: 3.7 s, total: 2min 43s
Wall time: 2min 10s


In [25]:
# predict on test dataset
if xgb_model_test_preds is not None:
    print(xgb_model_test_preds.shape)

(1000000,)


In [26]:
# # take weighted average of both k-fold models predictions
# final_preds = ((0.45 * sci_model_preds) + (0.55* xgb_model_test_preds)) / 2
# print(final_preds.shape)

#### Create Kaggle Predictions

In [27]:
X_np, X_test_np = tmlt.pp_fit_transform(tmlt.dfl.X, tmlt.dfl.X_test)

In [28]:
xgb_model_test_preds = xgb_model.predict(X_test_np)
print(xgb_model_test_preds.shape)

(1000000,)


In [29]:
print(xgb_model_test_preds[343:349])
xgb_model_test_preds = xgb_model_test_preds + 1
print(xgb_model_test_preds[343:349])
print(type(xgb_model_test_preds))
pd.Series(xgb_model_test_preds).value_counts()

[0 1 0 1 1 1]
[1 2 1 2 2 2]
<class 'numpy.ndarray'>


2    489146
1    419986
3     90167
7       674
6        20
4         7
dtype: int64

In [31]:
submission_file_name = 'thur_dec_18_1924_submission.csv'

sub = pd.read_csv(DIRECTORY_PATH + SAMPLE_SUB_FILE)
sub['Cover_Type'] = xgb_model_test_preds

sub.to_csv(submission_file_name, index=False)
print(f"{submission_file_name} saved!")

thur_dec_18_1924_submission.csv saved!
