## Install

`pip install -U tabular_ml_toolkit`

*Here we train a TabNet base model and an XGBClassifier meta model on the [Kaggle TPS Challenge (Dec 2021) data](https://www.kaggle.com/c/tabular-playground-series-dec-2021/data)*

In [1]:
# Star import stays first so the explicit imports below win any name clash.
from tabular_ml_toolkit.tmlt import *

import gc
import os

import numpy as np
import pandas as pd
from xgboost import XGBClassifier

In [2]:
from sklearn.metrics import roc_auc_score, accuracy_score, log_loss

In [3]:
# Dataset file names and paths.
# The data directory can be overridden with the TMLT_DATA_DIR environment
# variable; when it is unset the original local path is used, so existing
# setups keep working unchanged.
DIRECTORY_PATH = os.path.join(
    os.environ.get("TMLT_DATA_DIR", "/home/pankaj/kaggle_datasets/tpc_dec_2021"),
    "",  # empty last component guarantees the trailing separator that `DIRECTORY_PATH + FILE` relies on
)
TRAIN_FILE = "train.csv"
TEST_FILE = "test.csv"
SAMPLE_SUB_FILE = "sample_submission.csv"
OUTPUT_PATH = "kaggle_tps_dec_output/"

In [4]:
# Build the tmlt object from the train and test CSV files.
prepare_data_kwargs = dict(
    train_file_path=DIRECTORY_PATH + TRAIN_FILE,
    test_file_path=DIRECTORY_PATH + TEST_FILE,
    # index and target column names must match the dataset
    idx_col="Id",
    target="Cover_Type",
    random_state=42,
    problem_type="multi_class_classification",
)
tmlt = TMLT().prepare_data(**prepare_data_kwargs)

2021-12-20 16:31:16,286 INFO 8 cores found, model and data parallel processing should worked!
2021-12-20 16:31:32,266 INFO DataFrame Memory usage decreased to 274.66 Mb (83.9% reduction)
2021-12-20 16:31:34,993 INFO DataFrame Memory usage decreased to 67.71 Mb (83.9% reduction)
2021-12-20 16:31:35,288 INFO The least class label is :5 and value count is: 1
2021-12-20 16:31:35,293 INFO The Original X shape is: (4000000, 55)
2021-12-20 16:31:35,408 INFO The X shape after least class duplicates appends is: (4000021, 55)
2021-12-20 16:31:37,071 INFO PreProcessing will include target(s) encoding!
2021-12-20 16:31:37,180 INFO categorical columns are None, Preprocessing will done accordingly!


In [5]:
# Show the container type and shape of the features, target and test features.
for part in (tmlt.dfl.X, tmlt.dfl.y, tmlt.dfl.X_test):
    print(type(part))
    print(part.shape)

<class 'pandas.core.frame.DataFrame'>
(4000021, 54)
<class 'numpy.ndarray'>
(4000021,)
<class 'pandas.core.frame.DataFrame'>
(1000000, 54)


In [6]:
# Per-class row counts of the target held by the data loader.
target_counts = pd.Series(tmlt.dfl.y).value_counts()
print(dict(target_counts))

{1: 2262087, 0: 1468136, 2: 195712, 6: 62261, 5: 11426, 3: 377, 4: 22}


#### PreProcess X, y and X_test

<!-- and apply the SMOTEENN combined technique (oversampling + undersampling) to resample imbalanced classes -->


In [7]:
# Run tmlt's preprocessing on the features, target and test features.
X_np, y_np, X_test_np = tmlt.pp_fit_transform(tmlt.dfl.X, tmlt.dfl.y, tmlt.dfl.X_test)

# Report the shape and container type of each returned object.
for arr in (X_np, y_np, X_test_np):
    print(arr.shape)
    print(type(arr))

(4000021, 54)
<class 'numpy.ndarray'>
(4000021,)
<class 'numpy.ndarray'>
(1000000, 54)
<class 'numpy.ndarray'>


In [8]:
# Per-class row counts of the preprocessed target.
y_np_counts = pd.Series(y_np).value_counts()
print(dict(y_np_counts))

{1: 2262087, 0: 1468136, 2: 195712, 6: 62261, 5: 11426, 3: 377, 4: 22}


In [9]:
# Force a garbage-collection pass before training; the displayed value is
# the number of unreachable objects found.
gc.collect()

48

### For Meta Ensemble Models Training

#### Base Model 1: TabNet

In [10]:
from pytorch_tabnet.tab_model import TabNetClassifier

In [11]:
# Out-of-fold (OOF) training: obtain TabNet predictions for both the train and
# the test set so they can be fed to the meta model as an extra feature.
# NOTE(review): in the logged run every fold after the first starts from a much
# lower loss than fold 1 did, which looks like the same model instance keeping
# its weights across folds - confirm inside do_oof_kfold_train_preds, since
# that would leak validation rows into the OOF predictions.

# TabNet fit settings; both batch sizes scale with tmlt.IDEAL_CPU_CORES.
cpu_cores = tmlt.IDEAL_CPU_CORES
tabnet_params = dict(
    max_epochs=10,
    patience=2,
    batch_size=4096*6*cpu_cores,
    virtual_batch_size=512*6*cpu_cores,
)

# base model
tabnet_oof_model = TabNetClassifier(optimizer_params={"lr": 0.02}, verbose=1)

# train per fold and predict
tabnet_oof_model_preds, tabnet_oof_model_test_preds = tmlt.do_oof_kfold_train_preds(
    n_splits=5,
    model=tabnet_oof_model,
    X=X_np,
    y=y_np,
    X_test=X_test_np,
    tabnet_params=tabnet_params,
)
gc.collect()

# print the shape of whichever prediction arrays were returned
for oof_output in (tabnet_oof_model_preds, tabnet_oof_model_test_preds):
    if oof_output is not None:
        print(oof_output.shape)

Device used : cuda


2021-12-20 16:31:42,277 INFO Training Started!


epoch 0  | loss: 1.64566 | val_0_logloss: 0.7088  |  0:00:19s
epoch 1  | loss: 0.42927 | val_0_logloss: 0.3567  |  0:00:39s
epoch 2  | loss: 0.30277 | val_0_logloss: 0.31855 |  0:00:59s
epoch 3  | loss: 0.27035 | val_0_logloss: 0.27764 |  0:01:19s
epoch 4  | loss: 0.2512  | val_0_logloss: 0.24447 |  0:01:39s
epoch 5  | loss: 0.23766 | val_0_logloss: 0.23266 |  0:01:58s
epoch 6  | loss: 0.2281  | val_0_logloss: 0.22314 |  0:02:18s
epoch 7  | loss: 0.21536 | val_0_logloss: 0.21099 |  0:02:38s
epoch 8  | loss: 0.20126 | val_0_logloss: 0.21066 |  0:02:58s
epoch 9  | loss: 0.19007 | val_0_logloss: 0.18278 |  0:03:17s
Stop training because you reached max_epochs = 10 with best_epoch = 9 and best_val_0_logloss = 0.18278
Best weights from best epoch are automatically used!


2021-12-20 16:35:19,302 INFO Training Finished!
2021-12-20 16:35:24,966 INFO fold: 1 OOF Model Metrics: 0.1827772870072086!
2021-12-20 16:35:29,120 INFO Training Started!


epoch 0  | loss: 0.21469 | val_0_logloss: 0.18665 |  0:00:19s
epoch 1  | loss: 0.17476 | val_0_logloss: 0.16753 |  0:00:39s
epoch 2  | loss: 0.16039 | val_0_logloss: 0.15348 |  0:00:58s
epoch 3  | loss: 0.1508  | val_0_logloss: 0.14996 |  0:01:18s
epoch 4  | loss: 0.14364 | val_0_logloss: 0.14239 |  0:01:38s
epoch 5  | loss: 0.13699 | val_0_logloss: 0.13611 |  0:01:57s
epoch 6  | loss: 0.13403 | val_0_logloss: 0.15121 |  0:02:17s
epoch 7  | loss: 0.12962 | val_0_logloss: 0.12613 |  0:02:36s
epoch 8  | loss: 0.12704 | val_0_logloss: 0.12459 |  0:02:56s
epoch 9  | loss: 0.12414 | val_0_logloss: 0.12519 |  0:03:16s
Stop training because you reached max_epochs = 10 with best_epoch = 8 and best_val_0_logloss = 0.12459
Best weights from best epoch are automatically used!


2021-12-20 16:39:01,776 INFO Training Finished!
2021-12-20 16:39:07,405 INFO fold: 2 OOF Model Metrics: 0.12459373409085422!
2021-12-20 16:39:11,501 INFO Training Started!


epoch 0  | loss: 0.1597  | val_0_logloss: 0.13646 |  0:00:19s
epoch 1  | loss: 0.13051 | val_0_logloss: 0.12656 |  0:00:39s
epoch 2  | loss: 0.12371 | val_0_logloss: 0.12304 |  0:00:58s
epoch 3  | loss: 0.11951 | val_0_logloss: 0.11906 |  0:01:18s
epoch 4  | loss: 0.11653 | val_0_logloss: 0.11701 |  0:01:37s
epoch 5  | loss: 0.11451 | val_0_logloss: 0.11349 |  0:01:57s
epoch 6  | loss: 0.1125  | val_0_logloss: 0.11324 |  0:02:16s
epoch 7  | loss: 0.11117 | val_0_logloss: 0.1102  |  0:02:36s
epoch 8  | loss: 0.10994 | val_0_logloss: 0.1094  |  0:02:55s
epoch 9  | loss: 0.10875 | val_0_logloss: 0.11873 |  0:03:15s
Stop training because you reached max_epochs = 10 with best_epoch = 8 and best_val_0_logloss = 0.1094
Best weights from best epoch are automatically used!


2021-12-20 16:42:43,502 INFO Training Finished!
2021-12-20 16:42:49,171 INFO fold: 3 OOF Model Metrics: 0.10939878690928634!
2021-12-20 16:42:53,340 INFO Training Started!


epoch 0  | loss: 0.13354 | val_0_logloss: 0.11966 |  0:00:19s
epoch 1  | loss: 0.11402 | val_0_logloss: 0.11186 |  0:00:39s
epoch 2  | loss: 0.1095  | val_0_logloss: 0.10926 |  0:00:58s
epoch 3  | loss: 0.10741 | val_0_logloss: 0.10641 |  0:01:18s
epoch 4  | loss: 0.1081  | val_0_logloss: 0.10683 |  0:01:38s
epoch 5  | loss: 0.10605 | val_0_logloss: 0.10861 |  0:01:57s

Early stopping occurred at epoch 5 with best_epoch = 3 and best_val_0_logloss = 0.10641
Best weights from best epoch are automatically used!


2021-12-20 16:45:07,781 INFO Training Finished!
2021-12-20 16:45:13,410 INFO fold: 4 OOF Model Metrics: 0.10640530068344113!
2021-12-20 16:45:17,495 INFO Training Started!


epoch 0  | loss: 0.13787 | val_0_logloss: 0.12041 |  0:00:19s
epoch 1  | loss: 0.1122  | val_0_logloss: 0.11299 |  0:00:39s
epoch 2  | loss: 0.10739 | val_0_logloss: 0.12172 |  0:00:59s
epoch 3  | loss: 0.10551 | val_0_logloss: 0.10786 |  0:01:18s
epoch 4  | loss: 0.10471 | val_0_logloss: 0.10572 |  0:01:38s
epoch 5  | loss: 0.10364 | val_0_logloss: 0.11127 |  0:01:57s
epoch 6  | loss: 0.10259 | val_0_logloss: 0.10323 |  0:02:17s
epoch 7  | loss: 0.10212 | val_0_logloss: 0.10639 |  0:02:36s
epoch 8  | loss: 0.10176 | val_0_logloss: 0.10419 |  0:02:56s

Early stopping occurred at epoch 8 with best_epoch = 6 and best_val_0_logloss = 0.10323
Best weights from best epoch are automatically used!


2021-12-20 16:48:30,510 INFO Training Finished!
2021-12-20 16:48:36,165 INFO fold: 5 OOF Model Metrics: 0.1032339081470943!
2021-12-20 16:48:39,778 INFO Mean OOF Model Metrics: 0.1252818033675769!


(4000021,)
(1000000,)


#### Now add the base model's predictions back to X and X_test

In [13]:
# Attach the base model's OOF / test predictions as a new "tabnet_preds"
# column on X and X_test before the meta model is trained.
for frame, base_preds in (
    (tmlt.dfl.X, tabnet_oof_model_preds),
    (tmlt.dfl.X_test, tabnet_oof_model_test_preds),
):
    frame["tabnet_preds"] = base_preds

print(tmlt.dfl.X.shape)
print(tmlt.dfl.X_test.shape)

(4000021, 55)
(1000000, 55)


#### Now update tmlt with the new X and X_test

In [14]:
# Hand the current X, y and X_test back to tmlt and keep the object it returns.
tmlt = tmlt.update_dfl(X=tmlt.dfl.X, y=tmlt.dfl.y, X_test=tmlt.dfl.X_test)

2021-12-20 16:48:41,714 INFO categorical columns are None, Preprocessing will done accordingly!


#### For META Model Training

##### Now PreProcess updated X, y, X_test

NOTE: Preprocessing returns NumPy arrays, even when it is given pandas DataFrames.

In [15]:
# Re-run tmlt's preprocessing on the updated X, y and X_test.
X_np, y_np, X_test_np = tmlt.pp_fit_transform(tmlt.dfl.X, tmlt.dfl.y, tmlt.dfl.X_test)

# Report the shape and container type of each returned object.
for arr in (X_np, y_np, X_test_np):
    print(arr.shape)
    print(type(arr))

(4000021, 55)
<class 'numpy.ndarray'>
(4000021,)
<class 'numpy.ndarray'>
(1000000, 55)
<class 'numpy.ndarray'>


In [39]:
# Force a garbage-collection pass before the hyperparameter search; the
# displayed value is the number of unreachable objects found.
gc.collect()

0

#### For the meta model, let's run an Optuna-based hyperparameter search to find the best parameters for fitting

In [40]:
%%time
# create train, valid split to evaulate model on valid dataset
X_train_np, X_valid_np,  y_train_np, y_valid_np =  tmlt.dfl.create_train_valid(X_np, y_np, valid_size=0.2)

print(X_train_np.shape)
print(type(X_train_np))
print(y_train_np.shape)
print(type(y_train_np))
print(X_valid_np.shape)
print(type(X_valid_np))
print(y_valid_np.shape)
print(type(y_valid_np))

(3200016, 55)
<class 'numpy.ndarray'>
(3200016,)
<class 'numpy.ndarray'>
(800005, 55)
<class 'numpy.ndarray'>
(800005,)
<class 'numpy.ndarray'>
CPU times: user 1.92 s, sys: 92 ms, total: 2.01 s
Wall time: 2.01 s


In [46]:
# Optuna hyperparameter search for the XGBoost meta model.
# Supply an output directory via optuna_db_path so the search is saved.
study = tmlt.do_xgb_optuna_optimization(
    X_train_np,
    y_train_np,
    X_valid_np,
    y_valid_np,
    optuna_db_path=OUTPUT_PATH,
    opt_timeout=720,
)
best_trial = study.best_trial
print(best_trial)

2021-12-20 19:40:02,965 INFO Optimization Direction is: maximize
[32m[I 2021-12-20 19:40:02,980][0m Using an existing study with name 'tmlt_autoxgb' instead of creating a new one.[0m
2021-12-20 19:40:03,296 INFO final params {'learning_rate': 0.08927538038963782, 'n_estimators': 70, 'reg_lambda': 1.3729952949458818e-06, 'reg_alpha': 1.660148933599981e-06, 'subsample': 0.12798062790415102, 'colsample_bytree': 0.6236595537308991, 'max_depth': 3, 'tree_method': 'hist', 'booster': 'gbtree', 'gamma': 5.642432394837139e-05, 'grow_policy': 'lossguide'}
2021-12-20 19:40:03,297 INFO Training Started!


Parameters: { "eval_set" } might not be used.

  This could be a false alarm, with some parameters getting used by language bindings but
  then being mistakenly passed down to XGBoost core, or some parameter actually being used
  but getting flagged wrongly here. Please open an issue if you find any such cases.




2021-12-20 19:41:12,732 INFO Training Ended!
2021-12-20 19:41:13,502 INFO accuracy_score: 0.9462415859900876
[32m[I 2021-12-20 19:41:13,537][0m Trial 1 finished with value: 0.9462415859900876 and parameters: {'learning_rate': 0.08927538038963782, 'n_estimators': 70, 'reg_lambda': 1.3729952949458818e-06, 'reg_alpha': 1.660148933599981e-06, 'subsample': 0.12798062790415102, 'colsample_bytree': 0.6236595537308991, 'max_depth': 3, 'early_stopping_rounds': 424, 'tree_method': 'hist', 'booster': 'gbtree', 'gamma': 5.642432394837139e-05, 'grow_policy': 'lossguide'}. Best is trial 1 with value: 0.9462415859900876.[0m
2021-12-20 19:41:13,789 INFO final params {'learning_rate': 0.03498153801071581, 'n_estimators': 70, 'reg_lambda': 8.689474220867741e-07, 'reg_alpha': 0.0001144337197787049, 'subsample': 0.7678175053610523, 'colsample_bytree': 0.7144759497123172, 'max_depth': 7, 'tree_method': 'approx', 'booster': 'gbtree', 'gamma': 9.826303946378805e-06, 'grow_policy': 'lossguide'}
2021-12-20 

Parameters: { "eval_set" } might not be used.

  This could be a false alarm, with some parameters getting used by language bindings but
  then being mistakenly passed down to XGBoost core, or some parameter actually being used
  but getting flagged wrongly here. Please open an issue if you find any such cases.




2021-12-20 20:13:19,376 INFO Training Ended!
2021-12-20 20:13:21,039 INFO accuracy_score: 0.9520665495840651
[32m[I 2021-12-20 20:13:21,069][0m Trial 2 finished with value: 0.9520665495840651 and parameters: {'learning_rate': 0.03498153801071581, 'n_estimators': 70, 'reg_lambda': 8.689474220867741e-07, 'reg_alpha': 0.0001144337197787049, 'subsample': 0.7678175053610523, 'colsample_bytree': 0.7144759497123172, 'max_depth': 7, 'early_stopping_rounds': 111, 'tree_method': 'approx', 'booster': 'gbtree', 'gamma': 9.826303946378805e-06, 'grow_policy': 'lossguide'}. Best is trial 2 with value: 0.9520665495840651.[0m


FrozenTrial(number=2, values=[0.9520665495840651], datetime_start=datetime.datetime(2021, 12, 20, 19, 41, 13, 543204), datetime_complete=datetime.datetime(2021, 12, 20, 20, 13, 21, 40156), params={'booster': 'gbtree', 'colsample_bytree': 0.7144759497123172, 'early_stopping_rounds': 111, 'gamma': 9.826303946378805e-06, 'grow_policy': 'lossguide', 'learning_rate': 0.03498153801071581, 'max_depth': 7, 'n_estimators': 70, 'reg_alpha': 0.0001144337197787049, 'reg_lambda': 8.689474220867741e-07, 'subsample': 0.7678175053610523, 'tree_method': 'approx'}, distributions={'booster': CategoricalDistribution(choices=('gbtree', 'gblinear')), 'colsample_bytree': UniformDistribution(high=1.0, low=0.1), 'early_stopping_rounds': IntUniformDistribution(high=500, low=100, step=1), 'gamma': LogUniformDistribution(high=1.0, low=1e-08), 'grow_policy': CategoricalDistribution(choices=('depthwise', 'lossguide')), 'learning_rate': LogUniformDistribution(high=0.25, low=0.01), 'max_depth': IntUniformDistribution

In [47]:
# Base (non-tuned) XGBoost settings. They are defined here explicitly because
# no earlier cell creates `xgb_params`, so the cell previously only worked with
# leftover kernel state. These are the keys of the recorded run that do not
# come from the Optuna study.
xgb_params = {
    'use_label_encoder': False,
    'gpu_id': 0,
    'predictor': 'gpu_predictor',
}
# Overlay the best hyperparameters found by the search.
xgb_params.update(study.best_trial.params)
# Optional manual override, e.g.: xgb_params.update({'n_estimators': 1500})
print("xgb_params", xgb_params)
updated_xgb_model = XGBClassifier(**xgb_params)

xgb_params {'use_label_encoder': False, 'learning_rate': 0.03498153801071581, 'n_estimators': 70, 'reg_lambda': 8.689474220867741e-07, 'reg_alpha': 0.0001144337197787049, 'subsample': 0.7678175053610523, 'colsample_bytree': 0.7144759497123172, 'max_depth': 7, 'tree_method': 'approx', 'gpu_id': 0, 'predictor': 'gpu_predictor', 'early_stopping_rounds': 111, 'booster': 'gbtree', 'gamma': 9.826303946378805e-06, 'grow_policy': 'lossguide'}


#### Let's Use K-Fold Training with best params

In [48]:
%%time
# k-fold training
xgb_model_metrics_score, xgb_model_test_preds = tmlt.do_kfold_training(X_np,
                                                                       y_np,
                                                                       X_test=X_test_np,
                                                                       n_splits=5,
                                                                       model=xgb_model)
gc.collect()

2021-12-20 20:13:22,169 INFO Training Started!


Parameters: { "early_stopping_rounds" } might not be used.

  This could be a false alarm, with some parameters getting used by language bindings but
  then being mistakenly passed down to XGBoost core, or some parameter actually being used
  but getting flagged wrongly here. Please open an issue if you find any such cases.




2021-12-20 20:17:10,145 INFO Training Finished!
2021-12-20 20:17:10,146 INFO Predicting Val Probablities!
2021-12-20 20:17:11,832 INFO Predicting Val Score!
2021-12-20 20:17:13,534 INFO fold: 1 accuracy_score : 0.9551077805763714
2021-12-20 20:17:13,535 INFO Predicting Test Scores!
2021-12-20 20:17:16,293 INFO Training Started!


Parameters: { "early_stopping_rounds" } might not be used.

  This could be a false alarm, with some parameters getting used by language bindings but
  then being mistakenly passed down to XGBoost core, or some parameter actually being used
  but getting flagged wrongly here. Please open an issue if you find any such cases.




2021-12-20 20:21:05,119 INFO Training Finished!
2021-12-20 20:21:05,120 INFO Predicting Val Probablities!
2021-12-20 20:21:06,816 INFO Predicting Val Score!
2021-12-20 20:21:08,525 INFO fold: 2 accuracy_score : 0.9586202068989655
2021-12-20 20:21:08,526 INFO Predicting Test Scores!
2021-12-20 20:21:11,276 INFO Training Started!


Parameters: { "early_stopping_rounds" } might not be used.

  This could be a false alarm, with some parameters getting used by language bindings but
  then being mistakenly passed down to XGBoost core, or some parameter actually being used
  but getting flagged wrongly here. Please open an issue if you find any such cases.




2021-12-20 20:25:00,295 INFO Training Finished!
2021-12-20 20:25:00,295 INFO Predicting Val Probablities!
2021-12-20 20:25:01,971 INFO Predicting Val Score!
2021-12-20 20:25:03,670 INFO fold: 3 accuracy_score : 0.9590564547177264
2021-12-20 20:25:03,671 INFO Predicting Test Scores!
2021-12-20 20:25:06,434 INFO Training Started!


Parameters: { "early_stopping_rounds" } might not be used.

  This could be a false alarm, with some parameters getting used by language bindings but
  then being mistakenly passed down to XGBoost core, or some parameter actually being used
  but getting flagged wrongly here. Please open an issue if you find any such cases.




2021-12-20 20:28:55,225 INFO Training Finished!
2021-12-20 20:28:55,226 INFO Predicting Val Probablities!
2021-12-20 20:28:56,923 INFO Predicting Val Score!
2021-12-20 20:28:58,643 INFO fold: 4 accuracy_score : 0.9592277038614807
2021-12-20 20:28:58,643 INFO Predicting Test Scores!
2021-12-20 20:29:01,411 INFO Training Started!


Parameters: { "early_stopping_rounds" } might not be used.

  This could be a false alarm, with some parameters getting used by language bindings but
  then being mistakenly passed down to XGBoost core, or some parameter actually being used
  but getting flagged wrongly here. Please open an issue if you find any such cases.




2021-12-20 20:32:50,323 INFO Training Finished!
2021-12-20 20:32:50,324 INFO Predicting Val Probablities!
2021-12-20 20:32:52,008 INFO Predicting Val Score!
2021-12-20 20:32:53,715 INFO fold: 5 accuracy_score : 0.9588027059864701
2021-12-20 20:32:53,716 INFO Predicting Test Scores!
2021-12-20 20:32:55,905 INFO  Mean Metrics Results from all Folds are: {'accuracy_score': 0.9581629704082029}


CPU times: user 20min 4s, sys: 6.21 s, total: 20min 10s
Wall time: 19min 34s


0

2021-12-20 17:08:30,867 INFO Training Finished!
2021-12-20 17:08:30,868 INFO Predicting Val Probablities!
2021-12-20 17:08:32,559 INFO Predicting Val Score!
2021-12-20 17:08:34,262 INFO fold: 5 accuracy_score : 0.9588027059864701
2021-12-20 17:08:34,263 INFO Predicting Test Scores!
2021-12-20 17:08:36,422 INFO  Mean Metrics Results from all Folds are: {'accuracy_score': 0.9581629704082029}

In [49]:
# predict on test dataset
if xgb_model_test_preds is not None:
    print(xgb_model_test_preds.shape)

(1000000,)


#### Create Kaggle Predictions

In [51]:
# tabnet_oof_model_test_preds

In [52]:
# Use the meta model's test predictions for the submission.
test_preds = xgb_model_test_preds
print(type(test_preds))

<class 'numpy.ndarray'>


In [53]:
# Round the predictions to the nearest integer class index.
# NOTE(review): needing to round implies `test_preds` holds non-integer values,
# presumably class labels averaged over folds - confirm in do_kfold_training;
# averaging nominal class indices can produce classes no fold predicted.
test_preds_round = np.around(test_preds).astype(int)
# Display the first 1000 rounded predictions.
test_preds_round[:1000]

array([1, 1, 1, 1, 1, 2, 1, 0, 1, 2, 1, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 1,
       0, 1, 2, 1, 1, 2, 1, 1, 2, 1, 1, 0, 2, 1, 0, 0, 0, 0, 1, 1, 1, 1,
       0, 1, 0, 1, 1, 0, 1, 1, 1, 0, 1, 1, 0, 1, 0, 2, 1, 1, 0, 0, 1, 1,
       2, 1, 0, 1, 1, 0, 0, 1, 0, 0, 0, 1, 0, 1, 0, 1, 1, 1, 1, 1, 0, 1,
       1, 1, 0, 0, 0, 1, 0, 1, 1, 2, 0, 0, 0, 1, 0, 0, 1, 0, 1, 0, 1, 0,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 2, 3, 6, 1, 0, 1, 1,
       1, 1, 6, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 6, 1, 0, 0, 1, 0, 1, 0, 0,
       1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 0, 0, 0, 0, 0, 2, 0, 0, 1, 0, 0,
       0, 1, 0, 0, 1, 1, 6, 1, 1, 1, 0, 1, 0, 1, 1, 0, 1, 1, 0, 0, 1, 1,
       1, 0, 0, 1, 1, 0, 4, 3, 0, 1, 1, 0, 1, 1, 1, 0, 0, 1, 0, 1, 1, 1,
       0, 5, 0, 1, 1, 0, 0, 1, 0, 1, 5, 1, 1, 1, 0, 0, 0, 0, 1, 1, 0, 0,
       1, 1, 1, 1, 1, 2, 0, 1, 1, 0, 0, 0, 2, 1, 0, 0, 0, 0, 1, 0, 1, 1,
       0, 1, 1, 0, 0, 1, 1, 1, 1, 0, 0, 1, 1, 1, 2, 1, 0, 0, 0, 1, 1, 0,
       0, 1, 1, 0, 1, 0, 1, 0, 1, 1, 0, 1, 1, 1, 1,

In [54]:
# Per-class counts of the rounded test predictions.
pred_counts = pd.Series(test_preds_round).value_counts()
print(dict(pred_counts))

{1: 516503, 0: 383003, 2: 77484, 6: 9688, 5: 4584, 4: 4464, 3: 4274}


In [55]:
# Target encoding maps the original classes 1..7 to 0..6, so shift back by one.
# The value is recomputed from `test_preds` (instead of incrementing
# `test_preds_round` in place) so that re-running this cell cannot shift the
# labels a second time.
test_preds_round = np.around(test_preds).astype(int) + 1
print(type(test_preds_round))

<class 'numpy.ndarray'>


In [59]:
# Per-class counts of the test predictions after shifting the labels.
shifted_pred_counts = pd.Series(test_preds_round).value_counts()
print(dict(shifted_pred_counts))

{2: 516503, 1: 383003, 3: 77484, 7: 9688, 6: 4584, 5: 4464, 4: 4274}


In [58]:
submission_file_name = 'mon_dec_20_2209_submission.csv'

# Start from the sample submission and replace its target column with our
# predictions, then write the result next to the notebook.
sub = pd.read_csv(DIRECTORY_PATH + SAMPLE_SUB_FILE)
sub = sub.assign(Cover_Type=test_preds_round)

sub.to_csv(submission_file_name, index=False)
print(f"{submission_file_name} saved!")

mon_dec_20_2209_submission.csv saved!
