In [2]:
%load_ext autoreload
%autoreload 2

# Getting Started Kaggle TPS Challenge with Tabular ML Toolkit

> A Tutorial to showcase usage of tabular_ml_toolkit (tmlt) library on Kaggle TPS Challenge Nov 2021.

> tabular_ml_toolkit is a helper library to jumpstart your machine learning project based on Tabular or Structured data.

> It comes with model parallelism and cutting edge hyperparameter search techniques.

> Under the hood TMLT uses optuna, xgboost and scikit-learn pipelines

## Install

`pip install -U tabular_ml_toolkit`

### How to Best Use tabular_ml_toolkit

Start with your favorite model and then just simply create **tmlt** with one API.

*Here we are using XGBClassifier, on  [Kaggle TPS Challenge (Nov 2021) data](https://www.kaggle.com/c/tabular-playground-series-nov-2021/data)*

In [3]:
from tabular_ml_toolkit.tmlt import *
from xgboost import XGBClassifier
import numpy as np
import gc
import pandas as pd
from collections import Counter

In [4]:
from sklearn.metrics import roc_auc_score, accuracy_score, log_loss, balanced_accuracy_score

In [5]:
# Dataset file names and Paths
DIRECTORY_PATH = "/home/pankaj/kaggle_datasets/tpc_dec_2021/"
TRAIN_FILE = "train.csv"
TEST_FILE = "test.csv"
SAMPLE_SUB_FILE = "sample_submission.csv"
OUTPUT_PATH = "kaggle_tps_dec_output/"

#### Just point tmlt in the direction of your data

#### Let it know what are idx and target columns in your tabular data

#### what kind of problem type you are trying to resolve?

In [6]:
# create tmlt
tmlt = TMLT().prepare_data(
    train_file_path= DIRECTORY_PATH + TRAIN_FILE,
    test_file_path= DIRECTORY_PATH + TEST_FILE,
    #make sure to use right index and target columns
    idx_col="Id",
    target="Cover_Type",
    random_state=42,
    problem_type="multi_class_classification",
#     nrows=4000
)


# tmlt supports only below task type:
    # "binary_classification"
    # "multi_label_classification"
    # "multi_class_classification"
    # "regression"

2021-12-19 20:02:52,232 INFO 8 cores found, model and data parallel processing should worked!
2021-12-19 20:03:08,765 INFO DataFrame Memory usage decreased to 274.66 Mb (83.9% reduction)
2021-12-19 20:03:11,789 INFO DataFrame Memory usage decreased to 67.71 Mb (83.9% reduction)
2021-12-19 20:03:12,090 INFO The least class label is :5 and value count is: 1
2021-12-19 20:03:12,094 INFO The Original X shape is: (4000000, 55)
2021-12-19 20:03:12,208 INFO The X shape after least class duplicates appends is: (4000021, 55)
2021-12-19 20:03:13,889 INFO PreProcessing will include target(s) encoding!
2021-12-19 20:03:14,000 INFO categorical columns are None, Preprocessing will done accordingly!


In [7]:
print(type(tmlt.dfl.X))
print(tmlt.dfl.X.shape)
print(type(tmlt.dfl.y))
print(tmlt.dfl.y.shape)
print(type(tmlt.dfl.X_test))
print(tmlt.dfl.X_test.shape)

<class 'pandas.core.frame.DataFrame'>
(4000021, 54)
<class 'numpy.ndarray'>
(4000021,)
<class 'pandas.core.frame.DataFrame'>
(1000000, 54)


In [8]:
tmlt.dfl.X

Unnamed: 0,Elevation,Aspect,Slope,Horizontal_Distance_To_Hydrology,Vertical_Distance_To_Hydrology,Horizontal_Distance_To_Roadways,Hillshade_9am,Hillshade_Noon,Hillshade_3pm,Horizontal_Distance_To_Fire_Points,...,Soil_Type31,Soil_Type32,Soil_Type33,Soil_Type34,Soil_Type35,Soil_Type36,Soil_Type37,Soil_Type38,Soil_Type39,Soil_Type40
0,3189,40,8,30,13,3270,206,234,193,4873,...,0,0,0,0,0,0,0,0,0,0
1,3026,182,5,280,29,3270,233,240,106,5423,...,0,0,0,0,0,0,0,0,0,0
2,3106,13,7,351,37,2914,208,234,137,5269,...,0,0,0,0,0,0,0,0,0,0
3,3022,276,13,192,16,3034,207,238,156,2866,...,0,0,0,0,0,0,0,0,0,0
4,2906,186,13,266,22,2916,231,231,154,2642,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4000016,2953,114,39,97,111,981,181,209,184,7633,...,0,0,0,0,0,0,0,0,0,0
4000017,2953,114,39,97,111,981,181,209,184,7633,...,0,0,0,0,0,0,0,0,0,0
4000018,2953,114,39,97,111,981,181,209,184,7633,...,0,0,0,0,0,0,0,0,0,0
4000019,2953,114,39,97,111,981,181,209,184,7633,...,0,0,0,0,0,0,0,0,0,0


In [9]:
%%time
print(dict(pd.Series(tmlt.dfl.y).value_counts()))

{1: 2262087, 0: 1468136, 2: 195712, 6: 62261, 5: 11426, 3: 377, 4: 22}
CPU times: user 13 ms, sys: 226 µs, total: 13.2 ms
Wall time: 13.2 ms


In [10]:
# %%time
# print(Counter(tmlt.dfl.y).items())

In [11]:
gc.collect()

0

### PreProcess X, y and X_test and apply SMOTEENN combine technique (oversample+undersample) to resample imbalance classses


In [12]:
%%time
X_np, y_np, X_test_np = tmlt.pp_fit_transform(tmlt.dfl.X, tmlt.dfl.y, tmlt.dfl.X_test)
print(X_np.shape)
print(type(X_np))
print(y_np.shape)
print(type(y_np))
print(X_test_np.shape)
print(type(X_test_np))

(4000021, 54)
<class 'numpy.ndarray'>
(4000021,)
<class 'numpy.ndarray'>
(1000000, 54)
<class 'numpy.ndarray'>
CPU times: user 3.34 s, sys: 656 ms, total: 3.99 s
Wall time: 4 s


In [13]:
print(dict(pd.Series(y_np).value_counts()))

{1: 2262087, 0: 1468136, 2: 195712, 6: 62261, 5: 11426, 3: 377, 4: 22}


In [14]:
gc.collect()

48


### PreProcess train, valid dataset before training

In [15]:
%%time
# create train, valid split to evaulate model on valid dataset
X_train_np, X_valid_np,  y_train_np, y_valid_np =  tmlt.dfl.create_train_valid(X_np, y_np, valid_size=0.2)

CPU times: user 1.87 s, sys: 75.7 ms, total: 1.94 s
Wall time: 1.94 s


In [16]:
print(X_train_np.shape)
print(type(X_train_np))
print(y_train_np.shape)
print(type(y_train_np))
print(X_valid_np.shape)
print(type(X_valid_np))
print(y_valid_np.shape)
print(type(y_valid_np))

(3200016, 54)
<class 'numpy.ndarray'>
(3200016,)
<class 'numpy.ndarray'>
(800005, 54)
<class 'numpy.ndarray'>
(800005,)
<class 'numpy.ndarray'>


In [35]:
# check for class values see if both train and valid have same class labels
print(dict(pd.Series(y_train_np).value_counts()))
print(dict(pd.Series(y_valid_np).value_counts()))

{1: 1809706, 0: 1174364, 2: 156645, 6: 49832, 5: 9146, 3: 305, 4: 18}
{1: 452381, 0: 293772, 2: 39067, 6: 12429, 5: 2280, 3: 72, 4: 4}


### Training

#### Create a base xgb classifier model with your best guess params

In [36]:
# xgb_params = {
#     'use_label_encoder': False,
#     'learning_rate': 0.22460180743878044,
#     'n_estimators': 150,
#     'reg_lambda': 3.144893773482e-05,
#     'reg_alpha': 0.00023758525471934383,
#     'subsample': 0.2640308356915845,
#     'colsample_bytree': 0.7501402977241696,
#     'max_depth': 7,
#     'tree_method': 'gpu_hist',
#     'gpu_id': 0,
#     'predictor': 'gpu_predictor',
#     'early_stopping_rounds': 384
# }
# xgb_model = XGBClassifier(**xgb_params)

In [37]:
# %%time
# # Now do model training
# xgb_model.fit(X_train_np, y_train_np,
#               verbose=True,
#               #detect & avoid overfitting
#               eval_set=[(X_valid_np, y_valid_np)],
#               eval_metric="mlogloss",
#               early_stopping_rounds=300
#              )

In [38]:
# %%time
# #predict
# preds = xgb_model.predict(X_valid_np)
# preds_probs = xgb_model.predict_proba(X_valid_np)

# # Metrics
# auc = roc_auc_score(y_valid_np, preds_probs, multi_class='ovr')
# acc = accuracy_score(y_valid_np, preds)
# lg_loss = log_loss(y_valid_np, preds_probs)
# imbalance classes metrics
# bas = balanced_accuracy_score(y_valid_np, preds)

# print(f"AUC is : {auc} , log loss is: {lg_loss}, Accuracy is : {acc} While Balance Accuracy Score is: {bas}")

Balance Accuracy Score is: 0.7567083002652698

In [39]:
# gc.collect()

### let's train using imbalance BalanceBaggingClassifier

In [40]:
# from imblearn.ensemble import BalancedBaggingClassifier
# from imblearn.ensemble import BalancedRandomForestClassifier
# from sklearn.tree import DecisionTreeClassifier

In [41]:
# %%time

# #model
# # bbc_model = BalancedBaggingClassifier(base_estimator=DecisionTreeClassifier(), sampling_strategy='auto',
# #                                 replacement=False, random_state=42)

# # model = BalancedBaggingClassifier(base_estimator=DecisionTreeClassifier(),
# #                                       #sampling_strategy='auto', replacement=False,
# #                                       random_state=42)

# model = BalancedRandomForestClassifier(n_estimators=1000, random_state=42)

# # training
# model.fit(X_train_np, y_train_np)
# gc.collect()

In [42]:
# %%time
# #predict
# preds = model.predict(X_valid_np)
# preds_probs = model.predict_proba(X_valid_np)

# # Metrics
# auc = roc_auc_score(y_valid_np, preds_probs, multi_class='ovr')
# acc = accuracy_score(y_valid_np, preds)
# lg_loss = log_loss(y_valid_np, preds_probs)
# #imbalance classes metrics
# bas = balanced_accuracy_score(y_valid_np, preds)

# print(f"AUC is : {auc} , log loss is: {lg_loss}, Accuracy is : {acc} While Balance Accuracy Score is: {bas}")
# gc.collect()

For n_estimators=100
While Balance Accuracy Score is: 0.8467083002652698

In [43]:
# %%time
# # Get Test Predictions
# model_test_preds = model.predict(X_test_np)
# print(model_test_preds.shape)
# gc.collect()

### For Meta Ensemble Models Training

#### Base Model 1: linear SVM model

In [44]:
# from sklearn.svm import LinearSVC

In [45]:
# %%time

# # OOF training and prediction on both train and test dataset by a given model
# #choose model
# linear_oof_model = LinearSVC(tol=1e-7, penalty='l2', dual=False, max_iter=2000, random_state=42)

# #fit and predict
# linear_oof_model_preds, linear_oof_model_test_preds = tmlt.do_oof_kfold_train_preds(n_splits=5,
#                                                                                     model=linear_oof_model,
#                                                                                     X = X_np,
#                                                                                     y = y_np,
#                                                                                     X_test = X_test_np)

# if linear_oof_model_preds is not None:
#     print(linear_oof_model_preds.shape)

# if linear_oof_model_test_preds is not None:    
#     print(linear_oof_model_test_preds.shape)

In [46]:
# gc.collect()

#### Base Model 2: Logistic Regression Model

In [47]:
# from sklearn.linear_model import LogisticRegression

In [48]:
# %%time

# # OOF training and prediction on both train and test dataset by a given model

# #choose model
# log_oof_model = LogisticRegression(multi_class='multinomial', solver='lbfg', random_state=42)

# #fit and predict
# log_oof_model_preds, log_oof_model_test_preds = tmlt.do_oof_kfold_train_preds(n_splits=5,
#                                                                                     model=log_oof_model,
#                                                                                     X = X_np,
#                                                                                     y = y_np,
#                                                                                     X_test = X_test_np)
# if log_oof_model_preds is not None:
#     print(log_oof_model_preds.shape)

# if log_oof_model_test_preds is not None:    
#     print(log_oof_model_test_preds.shape)

In [49]:
# gc.collect()

#### Base Model 3: SKLearn MLP

In [50]:
# from sklearn.neural_network import MLPClassifier

In [51]:
# %%time

# # OOF training and prediction on both train and test dataset by a given model

# #choose model
# mlp_oof_model = MLPClassifier(max_iter=1000, early_stopping=True)

# #update the model on sklearn pipeline
# # tmlt = tmlt.update_model(mlp_oof_model)

# # # lets see updated sklearn pipeline with new model
# # tmlt.spl

# #fit and predict
# mlp_oof_model_preds, mlp_oof_model_test_preds = tmlt.do_oof_kfold_train_preds(n_splits=5,
#                                                                                     model=mlp_oof_model,
#                                                                                     X = X_np,
#                                                                                     y = y_np,
#                                                                                     X_test = X_test_np)
# if mlp_oof_model_preds is not None:
#     print(mlp_oof_model_preds.shape)

# if mlp_oof_model_test_preds is not None:    
#     print(mlp_oof_model_test_preds.shape)

#### Base Model 4: TabNet

In [52]:
from pytorch_tabnet.tab_model import TabNetClassifier

In [53]:
%%time

# OOF training and prediction on both train and test dataset by a given model

#TOD0: Add tabnet_params and pass through do_oof_kfold_train_preds, keep it optional for non tabnet models
# GOTO tmlt update method with tabnet_params
tabnet_params = {
    'max_epochs': 10,
    'patience': 2,
    'batch_size': 4096*8*tmlt.IDEAL_CPU_CORES,
    'virtual_batch_size' : 512*8*tmlt.IDEAL_CPU_CORES
}

#choose model
tabnet_oof_model = TabNetClassifier(optimizer_params=dict(lr=0.02), verbose=1)

#fit and predict
tabnet_oof_model_preds, tabnet_oof_model_test_preds = tmlt.do_oof_kfold_train_preds(n_splits=5,
                                                                                    model=tabnet_oof_model,
                                                                                    X = X_np,
                                                                                    y = y_np,
                                                                                    X_test = X_test_np,
                                                                                    tabnet_params=tabnet_params)
gc.collect()

if tabnet_oof_model_preds is not None:
    print(tabnet_oof_model_preds.shape)

if tabnet_oof_model_test_preds is not None:
    print(tabnet_oof_model_test_preds.shape)

Device used : cuda


2021-12-19 20:28:46,445 INFO Training Started!


epoch 0  | loss: 1.25043 | val_0_logloss: 0.48621 |  0:00:20s
epoch 1  | loss: 0.31447 | val_0_logloss: 0.27513 |  0:00:40s
epoch 2  | loss: 0.24494 | val_0_logloss: 0.22531 |  0:01:01s
Stop training because you reached max_epochs = 3 with best_epoch = 2 and best_val_0_logloss = 0.22531
Best weights from best epoch are automatically used!


2021-12-19 20:30:07,351 INFO Training Finished!
2021-12-19 20:30:12,909 INFO fold: 1 OOF Model Metrics: 0.22530653664576633!
2021-12-19 20:30:17,018 INFO Training Started!


epoch 0  | loss: 0.22735 | val_0_logloss: 0.20195 |  0:00:20s
epoch 1  | loss: 0.18652 | val_0_logloss: 0.17483 |  0:00:40s
epoch 2  | loss: 0.16905 | val_0_logloss: 0.16774 |  0:01:00s
Stop training because you reached max_epochs = 3 with best_epoch = 2 and best_val_0_logloss = 0.16774
Best weights from best epoch are automatically used!


2021-12-19 20:31:34,942 INFO Training Finished!
2021-12-19 20:31:40,538 INFO fold: 2 OOF Model Metrics: 0.16773542057211552!
2021-12-19 20:31:44,674 INFO Training Started!


epoch 0  | loss: 0.17882 | val_0_logloss: 0.16435 |  0:00:20s
epoch 1  | loss: 0.15725 | val_0_logloss: 0.15378 |  0:00:40s
epoch 2  | loss: 0.14599 | val_0_logloss: 0.14168 |  0:01:01s
Stop training because you reached max_epochs = 3 with best_epoch = 2 and best_val_0_logloss = 0.14168
Best weights from best epoch are automatically used!


2021-12-19 20:33:03,108 INFO Training Finished!
2021-12-19 20:33:08,735 INFO fold: 3 OOF Model Metrics: 0.14167650704649876!
2021-12-19 20:33:12,871 INFO Training Started!


epoch 0  | loss: 0.14903 | val_0_logloss: 0.14054 |  0:00:20s
epoch 1  | loss: 0.13449 | val_0_logloss: 0.13128 |  0:00:40s
epoch 2  | loss: 0.12812 | val_0_logloss: 0.12455 |  0:01:01s
Stop training because you reached max_epochs = 3 with best_epoch = 2 and best_val_0_logloss = 0.12455
Best weights from best epoch are automatically used!


2021-12-19 20:34:31,055 INFO Training Finished!
2021-12-19 20:34:36,694 INFO fold: 4 OOF Model Metrics: 0.12454858805122924!
2021-12-19 20:34:40,854 INFO Training Started!


epoch 0  | loss: 0.14    | val_0_logloss: 0.12935 |  0:00:20s
epoch 1  | loss: 0.12309 | val_0_logloss: 0.12216 |  0:00:40s
epoch 2  | loss: 0.11894 | val_0_logloss: 0.12147 |  0:01:00s
Stop training because you reached max_epochs = 3 with best_epoch = 2 and best_val_0_logloss = 0.12147
Best weights from best epoch are automatically used!


2021-12-19 20:35:58,632 INFO Training Finished!
2021-12-19 20:36:04,209 INFO fold: 5 OOF Model Metrics: 0.12146745566406761!
2021-12-19 20:36:07,736 INFO Mean OOF Model Metrics: 0.15614690159593547!


(4000021,)
(1000000,)
CPU times: user 8min 14s, sys: 9.11 s, total: 8min 23s
Wall time: 7min 22s


#### Now add back based models predictions to X and X_test

In [None]:
# # add based model oof predictions back to X and X_test before Meta model training
# tmlt.dfl.X["linear_preds"] = linear_oof_model_preds
# tmlt.dfl.X_test["linear_preds"] = linear_oof_model_test_preds

# print(tmlt.dfl.X.shape)
# print(tmlt.dfl.X_test.shape)

In [None]:
# # add based model oof predictions back to X and X_test before Meta model training
# tmlt.dfl.X["log_reg_preds"] = log_oof_model_preds
# tmlt.dfl.X_test["log_reg_preds"] = log_oof_model_test_preds

# print(tmlt.dfl.X.shape)
# print(tmlt.dfl.X_test.shape)

In [None]:
# # add based model oof predictions back to X and X_test before Meta model training
# tmlt.dfl.X["mlp_preds"] = mlp_oof_model_preds
# tmlt.dfl.X_test["mlp_preds"] = mlp_oof_model_test_preds

# print(tmlt.dfl.X.shape)
# print(tmlt.dfl.X_test.shape)

In [54]:
# add based model oof predictions back to X and X_test before Meta model training
tmlt.dfl.X["tabnet_preds"] = tabnet_oof_model_preds
tmlt.dfl.X_test["tabnet_preds"] = tabnet_oof_model_test_preds

print(tmlt.dfl.X.shape)
print(tmlt.dfl.X_test.shape)

(4000021, 55)
(1000000, 55)


#### now just update the tmlt with this new X and X_test

In [55]:
tmlt = tmlt.update_dfl(X=tmlt.dfl.X, y=tmlt.dfl.y, X_test=tmlt.dfl.X_test)

2021-12-19 20:36:19,655 INFO categorical columns are None, Preprocessing will done accordingly!


#### For META Model Training

##### Now PreProcess X_train, X_valid

NOTE: Preprocessing gives back numpy arrays for pandas dataframe

In [56]:
%%time
X_np, y_np, X_test_np = tmlt.pp_fit_transform(tmlt.dfl.X, tmlt.dfl.y, tmlt.dfl.X_test)

print(X_np.shape)
print(type(X_np))
print(y_np.shape)
print(type(y_np))
print(X_test_np.shape)
print(type(X_test_np))

(4000021, 55)
<class 'numpy.ndarray'>
(4000021,)
<class 'numpy.ndarray'>
(1000000, 55)
<class 'numpy.ndarray'>
CPU times: user 21.9 s, sys: 1.68 s, total: 23.6 s
Wall time: 23.6 s


##### create train valid dataframes for training

In [57]:
%%time
# create train, valid split to evaulate model on valid dataset
X_train_np, X_valid_np,  y_train_np, y_valid_np =  tmlt.dfl.create_train_valid(X_np, y_np, valid_size=0.2)

print(X_train_np.shape)
print(type(X_train_np))
print(y_train_np.shape)
print(type(y_train_np))
print(X_valid_np.shape)
print(type(X_valid_np))
print(y_valid_np.shape)
print(type(y_valid_np))

(3200016, 55)
<class 'numpy.ndarray'>
(3200016,)
<class 'numpy.ndarray'>
(800005, 55)
<class 'numpy.ndarray'>
(800005,)
<class 'numpy.ndarray'>
CPU times: user 1.89 s, sys: 124 ms, total: 2.01 s
Wall time: 2.01 s


In [58]:
xgb_params = {
    'use_label_encoder': False,
    'learning_rate': 0.22460180743878044,
    'n_estimators': 1500,
    'reg_lambda': 3.144893773482e-05,
    'reg_alpha': 0.00023758525471934383,
    'subsample': 0.2640308356915845,
    'colsample_bytree': 0.7501402977241696,
    'max_depth': 7,
    'tree_method': 'gpu_hist',
    'gpu_id': 0,
    'predictor': 'gpu_predictor',
    'early_stopping_rounds': 384
}
xgb_model = XGBClassifier(**xgb_params)

In [61]:
%%time
# Now do model training
xgb_model.fit(X_train_np, y_train_np,
              verbose=False,
              #detect & avoid overfitting
              eval_set=[(X_valid_np, y_valid_np)],
              eval_metric="mlogloss",
              early_stopping_rounds=300
             )

#predict
preds = xgb_model.predict(X_valid_np)
preds_probs = xgb_model.predict_proba(X_valid_np)

# Metrics
auc = roc_auc_score(y_valid_np, preds_probs, multi_class='ovr')
acc = accuracy_score(y_valid_np, preds)

print(f"AUC is : {auc} while Accuracy is : {acc} ")

Parameters: { "early_stopping_rounds" } might not be used.

  This could be a false alarm, with some parameters getting used by language bindings but
  then being mistakenly passed down to XGBoost core, or some parameter actually being used
  but getting flagged wrongly here. Please open an issue if you find any such cases.


AUC is : 0.9984581458222804 while Accuracy is : 0.9606552459047131 
CPU times: user 1min 50s, sys: 829 ms, total: 1min 51s
Wall time: 1min 45s


In [64]:
%%time
# Get Test Predictions
single_xgb_model_test_preds = xgb_model.predict(X_test_np)
print(single_xgb_model_test_preds.shape)
gc.collect()

(1000000,)
CPU times: user 2.27 s, sys: 83.3 ms, total: 2.36 s
Wall time: 989 ms


50

### WOW!!!!

#### For Meta Model, Let's do Optuna based HyperParameter search to get best params for fit

In [62]:
# # **Just make sure to supply an output directory path so hyperparameter search is saved**
# study = tmlt.do_xgb_optuna_optimization(X_train_np, y_train, X_valid_np, y_valid, optuna_db_path=OUTPUT_PATH, opt_timeout=360)
# print(study.best_trial)

##### now update the meta model with best params from study and then update the sklearn pipeline with this new model

In [63]:
# xgb_params.update(study.best_trial.params)
# xgb_params.update({'n_estimators': 1500})
# print("xgb_params", xgb_params)
# updated_xgb_model = XGBClassifier(**xgb_params)

#### Let's Use K-Fold Training with best params

In [71]:
%%time
# k-fold training
xgb_model_metrics_score, xgb_model_test_preds = tmlt.do_kfold_training(X_np,
                                                                       y_np,
                                                                       X_test=X_test_np,
                                                                       n_splits=5,
                                                                       model=xgb_model)
gc.collect()

2021-12-19 20:41:15,228 INFO Training Started!


Parameters: { "early_stopping_rounds" } might not be used.

  This could be a false alarm, with some parameters getting used by language bindings but
  then being mistakenly passed down to XGBoost core, or some parameter actually being used
  but getting flagged wrongly here. Please open an issue if you find any such cases.




2021-12-19 20:45:08,161 INFO Training Finished!
2021-12-19 20:45:08,161 INFO Predicting Val Probablities!
2021-12-19 20:45:09,887 INFO Predicting Val Score!
2021-12-19 20:45:11,630 INFO fold: 1 accuracy_score : 0.9565015218654883
2021-12-19 20:45:11,630 INFO Predicting Test Scores!
2021-12-19 20:45:14,436 INFO Training Started!


Parameters: { "early_stopping_rounds" } might not be used.

  This could be a false alarm, with some parameters getting used by language bindings but
  then being mistakenly passed down to XGBoost core, or some parameter actually being used
  but getting flagged wrongly here. Please open an issue if you find any such cases.




2021-12-19 20:49:05,940 INFO Training Finished!
2021-12-19 20:49:05,941 INFO Predicting Val Probablities!
2021-12-19 20:49:07,656 INFO Predicting Val Score!
2021-12-19 20:49:09,386 INFO fold: 2 accuracy_score : 0.9593527032364838
2021-12-19 20:49:09,387 INFO Predicting Test Scores!
2021-12-19 20:49:12,203 INFO Training Started!


Parameters: { "early_stopping_rounds" } might not be used.

  This could be a false alarm, with some parameters getting used by language bindings but
  then being mistakenly passed down to XGBoost core, or some parameter actually being used
  but getting flagged wrongly here. Please open an issue if you find any such cases.




2021-12-19 20:53:05,990 INFO Training Finished!
2021-12-19 20:53:05,990 INFO Predicting Val Probablities!
2021-12-19 20:53:07,708 INFO Predicting Val Score!
2021-12-19 20:53:09,440 INFO fold: 3 accuracy_score : 0.9589389553052234
2021-12-19 20:53:09,440 INFO Predicting Test Scores!
2021-12-19 20:53:12,239 INFO Training Started!


Parameters: { "early_stopping_rounds" } might not be used.

  This could be a false alarm, with some parameters getting used by language bindings but
  then being mistakenly passed down to XGBoost core, or some parameter actually being used
  but getting flagged wrongly here. Please open an issue if you find any such cases.




2021-12-19 20:57:04,648 INFO Training Finished!
2021-12-19 20:57:04,648 INFO Predicting Val Probablities!
2021-12-19 20:57:06,357 INFO Predicting Val Score!
2021-12-19 20:57:08,078 INFO fold: 4 accuracy_score : 0.9589502052489738
2021-12-19 20:57:08,079 INFO Predicting Test Scores!
2021-12-19 20:57:10,851 INFO Training Started!


Parameters: { "early_stopping_rounds" } might not be used.

  This could be a false alarm, with some parameters getting used by language bindings but
  then being mistakenly passed down to XGBoost core, or some parameter actually being used
  but getting flagged wrongly here. Please open an issue if you find any such cases.




2021-12-19 21:01:02,420 INFO Training Finished!
2021-12-19 21:01:02,421 INFO Predicting Val Probablities!
2021-12-19 21:01:04,114 INFO Predicting Val Score!
2021-12-19 21:01:05,835 INFO fold: 5 accuracy_score : 0.9588439557802211
2021-12-19 21:01:05,836 INFO Predicting Test Scores!
2021-12-19 21:01:08,026 INFO  Mean Metrics Results from all Folds are: {'accuracy_score': 0.9585174682872781}


CPU times: user 20min 25s, sys: 6.16 s, total: 20min 31s
Wall time: 19min 53s


In [72]:
# predict on test dataset
if xgb_model_test_preds is not None:
    print(xgb_model_test_preds.shape)

(1000000,)


In [None]:
# # take weighted average of both k-fold models predictions
# final_preds = ((0.45 * sci_model_preds) + (0.55* xgb_model_test_preds)) / 2
# print(final_preds.shape)

#### Create Kaggle Predictions

In [79]:
test_preds = xgb_model_test_preds
print(type(test_preds))

<class 'numpy.ndarray'>


In [80]:
print(f"{dict(pd.Series(test_preds).value_counts())}")

{1.0: 494233, 0.0: 370376, 2.0: 69298, 6.0: 10159, 0.8: 7402, 0.2: 7188, 1.2: 5994, 0.6000000000000001: 5437, 0.4: 5235, 5.0: 2254, 1.7999999999999998: 2174, 2.4: 2105, 4.8: 2096, 1.4: 1965, 3.5999999999999996: 1722, 1.6: 1721, 2.6: 1254, 2.5999999999999996: 1027, 3.8: 949, 4.4: 943, 3.1999999999999997: 919, 1.8: 874, 1.4000000000000001: 860, 1.5999999999999999: 819, 1.2000000000000002: 700, 1.8000000000000003: 578, 4.2: 347, 3.4000000000000004: 211, 3.2: 190, 3.0: 162, 2.2: 120, 2.6000000000000005: 116, 2.1999999999999997: 115, 3.4: 99, 2.8000000000000003: 84, 2.8: 74, 3.6: 56, 1.9999999999999998: 54, 4.0: 46, 2.4000000000000004: 25, 4.6: 7, 3.0000000000000004: 4, 3.8000000000000003: 4, 2.9999999999999996: 4}


In [81]:
test_preds

array([1., 1., 1., ..., 1., 0., 2.])

In [91]:
test_preds_round = np.around(test_preds).dtype(int)
test_preds_round[:1000]

TypeError: 'numpy.dtype[float64]' object is not callable

In [88]:
print(f"{dict(pd.Series(test_preds_round).value_counts())}")

{1.0: 516591, 0.0: 382799, 2.0: 77883, 6.0: 10159, 5.0: 4357, 3.0: 4144, 4.0: 4067}


In [86]:
# target encoding changes 1 to 7 classes to 0 to 6
test_preds_final = test_preds_round + 1
test_preds_final

array([2., 2., 2., ..., 2., 1., 3.])

In [89]:
print(f"{dict(pd.Series(test_preds_final).value_counts())}")

{2.0: 516591, 1.0: 382799, 3.0: 77883, 7.0: 10159, 6.0: 4357, 4.0: 4144, 5.0: 4067}


In [78]:
submission_file_name = 'sun_dec_19_2109_submission.csv'

sub = pd.read_csv(DIRECTORY_PATH + SAMPLE_SUB_FILE)
sub['Cover_Type'] = test_preds

sub.to_csv(submission_file_name, index=False)
print(f"{submission_file_name} saved!")

sun_dec_19_2109_submission.csv saved!
