In [1]:
%load_ext autoreload
%autoreload 2

# Getting Started with the Kaggle TPS Challenge using Tabular ML Toolkit

> A tutorial showcasing the tabular_ml_toolkit (tmlt) library on the Kaggle TPS Challenge, Dec 2021.

> tabular_ml_toolkit is a helper library to jumpstart your machine learning project based on Tabular or Structured data.

> It comes with model parallelism and cutting edge hyperparameter search techniques.

> Under the hood TMLT uses optuna, xgboost and scikit-learn pipelines

## Install

`pip install -U tabular_ml_toolkit`

### How to Best Use tabular_ml_toolkit

Start with your favorite model and then just simply create **tmlt** with one API.

*Here we are using XGBClassifier on the [Kaggle TPS Challenge (Dec 2021) data](https://www.kaggle.com/c/tabular-playground-series-dec-2021/data)*

In [2]:
from tabular_ml_toolkit.tmlt import *
from xgboost import XGBClassifier
import numpy as np
import gc

In [3]:
from sklearn.metrics import roc_auc_score, accuracy_score

In [4]:
# Dataset file names and paths.
# The data directory can be overridden with the TPS_DATA_DIR environment
# variable so the notebook is not tied to one machine's home directory; when
# the variable is unset the original location is used.
import os

DIRECTORY_PATH = os.path.join(
    os.environ.get("TPS_DATA_DIR", "/home/pankaj/kaggle_datasets/tpc_dec_2021/"),
    "",  # guarantees a trailing separator: file paths are built by string concatenation
)
TRAIN_FILE = "train.csv"
TEST_FILE = "test.csv"
SAMPLE_SUB_FILE = "sample_submission.csv"
OUTPUT_PATH = "kaggle_tps_dec_output/"

#### Just point tmlt in the direction of your data

#### Let it know what are idx and target columns in your tabular data

#### Tell it what kind of problem you are trying to solve

In [5]:
# Build the tmlt object from the train and test CSV files.
# problem_type must be one of the task types tmlt supports:
#   "binary_classification"
#   "multi_label_classification"
#   "multi_class_classification"
#   "regression"
tmlt = TMLT().prepare_data(
    train_file_path=f"{DIRECTORY_PATH}{TRAIN_FILE}",
    test_file_path=f"{DIRECTORY_PATH}{TEST_FILE}",
    problem_type="multi_class_classification",
    # make sure to use the right index and target columns
    idx_col="Id",
    target="Cover_Type",
    random_state=42,
    # nrows=4000
)

2021-12-15 17:02:33,026 INFO 8 cores found, model and data parallel processing should worked!
2021-12-15 17:02:49,147 INFO DataFrame Memory usage decreased to 274.66 Mb (83.9% reduction)
2021-12-15 17:02:51,956 INFO DataFrame Memory usage decreased to 67.71 Mb (83.9% reduction)
2021-12-15 17:02:52,264 INFO The least class label is :5 and value count is: 1
2021-12-15 17:02:52,268 INFO The time took to concat 12 rows: 0.001493215560913086
2021-12-15 17:02:52,268 INFO The X shape BEFORE append is: (4000000, 55)
2021-12-15 17:02:52,385 INFO The time took to append 1 dataframe to existing one!: 0.11675643920898438
2021-12-15 17:02:52,386 INFO The X shape AFTER append is: (4000012, 55)
2021-12-15 17:02:54,021 INFO PreProcessing will include target(s) encoding!
2021-12-15 17:02:54,129 INFO categorical columns are None, Preprocessing will done accordingly!


In [6]:
# Container type followed by shape, one value per line, for the training
# features, the target and the test features held by tmlt's data loader.
print(type(tmlt.dfl.X), tmlt.dfl.X.shape, sep="\n")
print(type(tmlt.dfl.y), tmlt.dfl.y.shape, sep="\n")
print(type(tmlt.dfl.X_test), tmlt.dfl.X_test.shape, sep="\n")

<class 'pandas.core.frame.DataFrame'>
(4000012, 54)
<class 'numpy.ndarray'>
(4000012,)
<class 'pandas.core.frame.DataFrame'>
(1000000, 54)


In [7]:
# Show the training feature frame; the notebook renders a truncated preview of its rows and columns.
tmlt.dfl.X

Unnamed: 0,Elevation,Aspect,Slope,Horizontal_Distance_To_Hydrology,Vertical_Distance_To_Hydrology,Horizontal_Distance_To_Roadways,Hillshade_9am,Hillshade_Noon,Hillshade_3pm,Horizontal_Distance_To_Fire_Points,...,Soil_Type31,Soil_Type32,Soil_Type33,Soil_Type34,Soil_Type35,Soil_Type36,Soil_Type37,Soil_Type38,Soil_Type39,Soil_Type40
0,3189,40,8,30,13,3270,206,234,193,4873,...,0,0,0,0,0,0,0,0,0,0
1,3026,182,5,280,29,3270,233,240,106,5423,...,0,0,0,0,0,0,0,0,0,0
2,3106,13,7,351,37,2914,208,234,137,5269,...,0,0,0,0,0,0,0,0,0,0
3,3022,276,13,192,16,3034,207,238,156,2866,...,0,0,0,0,0,0,0,0,0,0
4,2906,186,13,266,22,2916,231,231,154,2642,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4000007,2953,114,39,97,111,981,181,209,184,7633,...,0,0,0,0,0,0,0,0,0,0
4000008,2953,114,39,97,111,981,181,209,184,7633,...,0,0,0,0,0,0,0,0,0,0
4000009,2953,114,39,97,111,981,181,209,184,7633,...,0,0,0,0,0,0,0,0,0,0
4000010,2953,114,39,97,111,981,181,209,184,7633,...,0,0,0,0,0,0,0,0,0,0


In [8]:
import pandas as pd
# Count how many samples fall under each target label stored in tmlt.dfl.y.
print(dict(pd.Series(tmlt.dfl.y).value_counts()))

{1: 2262087, 0: 1468136, 2: 195712, 6: 62261, 5: 11426, 3: 377, 4: 13}


In [9]:
# Run a full garbage-collection pass; the cell output is gc.collect()'s return value.
gc.collect()

0

### Training


##### create train valid dataframes for quick preprocessing and training

In [10]:
%%time
# create train, valid split to evaluate model on valid dataset
# valid_size=0.2 — presumably the fraction of rows held out for validation; TODO confirm against tmlt docs
X_train, X_valid,  y_train, y_valid =  tmlt.dfl.create_train_valid(valid_size=0.2)

CPU times: user 984 ms, sys: 0 ns, total: 984 ms
Wall time: 983 ms


In [11]:
# Shapes of the train / validation features and targets, one per line.
print(X_train.shape, y_train.shape, X_valid.shape, y_valid.shape, sep="\n")

# print(X_train.columns.to_list())

(3200009, 54)
(3200009,)
(800003, 54)
(800003,)


In [12]:
# check for class values see if both train and valid have same class labels
print(dict(pd.Series(y_train).value_counts()))
print(dict(pd.Series(y_valid).value_counts()))

{1: 1809818, 0: 1174197, 2: 156714, 6: 49866, 5: 9111, 3: 293, 4: 10}
{1: 452269, 0: 293939, 2: 38998, 6: 12395, 5: 2315, 3: 84, 4: 3}


##### Now PreProcess X_train, X_valid

NOTE: Preprocessing returns NumPy arrays, not pandas DataFrames.

In [13]:
%%time
# Run tmlt's preprocessing over the train and validation feature splits.
X_train_np, X_valid_np = tmlt.pp_fit_transform(X_train, X_valid)

# Confirm what came back: container type and shape of each transformed split,
# then the container types of the validation and training targets.
print(type(X_train_np), X_train_np.shape, sep="\n")
print(type(X_valid_np), X_valid_np.shape, sep="\n")
print(type(y_valid), type(y_train), sep="\n")

<class 'numpy.ndarray'>
(3200009, 54)
<class 'numpy.ndarray'>
(800003, 54)
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
CPU times: user 2.47 s, sys: 536 ms, total: 3.01 s
Wall time: 3.01 s


In [14]:
# Run a full garbage-collection pass; the cell output is gc.collect()'s return value.
gc.collect()

48

#### Create a base xgb classifier model with your best guess params

In [15]:
# Baseline classifier settings: a first best guess, not tuned.
xgb_params = dict(
    learning_rate=0.1,
    eval_metric='mlogloss',
    # required for the xgb classifier, otherwise a warning is shown
    use_label_encoder=False,
    # fixed seed for repeatable runs
    random_state=42,
    # train and predict on the GPU
    tree_method='gpu_hist',
    predictor='gpu_predictor',
)

xgb_model = XGBClassifier(**xgb_params)

In [16]:
%%time
# Train the baseline model, scoring mlogloss on the validation split so that
# overfitting can be detected through early stopping.
# NOTE(review): the xgb_params dict used to build xgb_model sets no
# n_estimators, so if XGBoost's default of 100 boosting rounds applies, a
# patience of 300 rounds can never trigger early stopping — confirm the
# intended values.
xgb_model.fit(
    X_train_np,
    y_train,
    eval_set=[(X_valid_np, y_valid)],
    eval_metric="mlogloss",
    early_stopping_rounds=300,
    verbose=False,
)

CPU times: user 19.3 s, sys: 548 ms, total: 19.9 s
Wall time: 14.9 s


In [17]:
%%time
# Score the baseline on the validation split: class probabilities feed the
# one-vs-rest ROC AUC, hard labels feed the accuracy.
preds_probs = xgb_model.predict_proba(X_valid_np)
preds = xgb_model.predict(X_valid_np)

acc = accuracy_score(y_true=y_valid, y_pred=preds)
auc = roc_auc_score(y_true=y_valid, y_score=preds_probs, multi_class='ovr')

print(f"AUC is : {auc} while Accuracy is : {acc} ")

AUC is : 0.997025867656879 while Accuracy is : 0.95200267998995 
CPU times: user 4.23 s, sys: 147 ms, total: 4.37 s
Wall time: 2.19 s


In [18]:
# Run a full garbage-collection pass; the cell output is gc.collect()'s return value.
gc.collect()

0

### For Meta Ensemble Models Training

##### Make sure to PreProcess the data

In [19]:
%%time
# Preprocess the full training features together with the test features; the
# resulting arrays are what the out-of-fold base-model training consumes.
X_np, X_test_np = tmlt.pp_fit_transform(tmlt.dfl.X, tmlt.dfl.X_test)
# Targets are taken exactly as stored on tmlt's data loader.
y_np = tmlt.dfl.y

CPU times: user 2.73 s, sys: 668 ms, total: 3.4 s
Wall time: 3.4 s


#### Base Model 1: linear SVM model

In [20]:
# from sklearn.svm import LinearSVC

In [21]:
# %%time

# # OOF training and prediction on both train and test dataset by a given model
# #choose model
# linear_oof_model = LinearSVC(tol=1e-7, penalty='l2', dual=False, max_iter=2000, random_state=42)

# #fit and predict
# linear_oof_model_preds, linear_oof_model_test_preds = tmlt.do_oof_kfold_train_preds(n_splits=5,
#                                                                                     model=linear_oof_model,
#                                                                                     X = X_np,
#                                                                                     y = y_np,
#                                                                                     X_test = X_test_np)

# if linear_oof_model_preds is not None:
#     print(linear_oof_model_preds.shape)

# if linear_oof_model_test_preds is not None:    
#     print(linear_oof_model_test_preds.shape)

In [22]:
# gc.collect()

#### Base Model 2: Logistic Regression Model

In [23]:
# from sklearn.linear_model import LogisticRegression

In [24]:
# %%time

# # OOF training and prediction on both train and test dataset by a given model

# #choose model
# log_oof_model = LogisticRegression(multi_class='multinomial', solver='lbfgs', random_state=42)

# #fit and predict
# log_oof_model_preds, log_oof_model_test_preds = tmlt.do_oof_kfold_train_preds(n_splits=5,
#                                                                                     model=log_oof_model,
#                                                                                     X = X_np,
#                                                                                     y = y_np,
#                                                                                     X_test = X_test_np)
# if log_oof_model_preds is not None:
#     print(log_oof_model_preds.shape)

# if log_oof_model_test_preds is not None:    
#     print(log_oof_model_test_preds.shape)

In [25]:
# gc.collect()

#### Base Model 3: SKLearn MLP

In [26]:
# from sklearn.neural_network import MLPClassifier

In [27]:
# %%time

# # OOF training and prediction on both train and test dataset by a given model

# #choose model
# mlp_oof_model = MLPClassifier(max_iter=1000, early_stopping=True)

# #update the model on sklearn pipeline
# # tmlt = tmlt.update_model(mlp_oof_model)

# # # lets see updated sklearn pipeline with new model
# # tmlt.spl

# #fit and predict
# mlp_oof_model_preds, mlp_oof_model_test_preds = tmlt.do_oof_kfold_train_preds(n_splits=5,
#                                                                                     model=mlp_oof_model,
#                                                                                     X = X_np,
#                                                                                     y = y_np,
#                                                                                     X_test = X_test_np)
# if mlp_oof_model_preds is not None:
#     print(mlp_oof_model_preds.shape)

# if mlp_oof_model_test_preds is not None:    
#     print(mlp_oof_model_test_preds.shape)

#### Base Model 4: TabNet

In [28]:
from pytorch_tabnet.tab_model import TabNetClassifier

In [29]:
%%time

# Out-of-fold (OOF) training and prediction on both the train and the test
# dataset with the chosen model.
# NOTE(review): the output saved with this notebook ends in
# "ValueError: Classification metrics can't handle a mix of multiclass and
# continuous targets", so neither result variable was assigned in that run.

# choose model
tabnet_oof_model = TabNetClassifier(optimizer_params={"lr": 0.02}, verbose=1)

# fit and predict
tabnet_oof_model_preds, tabnet_oof_model_test_preds = tmlt.do_oof_kfold_train_preds(
    n_splits=5,
    model=tabnet_oof_model,
    X=X_np,
    y=y_np,
    X_test=X_test_np,
)

# Print the shape of whichever prediction arrays were returned.
for oof_output in (tabnet_oof_model_preds, tabnet_oof_model_test_preds):
    if oof_output is not None:
        print(oof_output.shape)

Device used : cuda


2021-12-15 17:03:20,336 INFO Training Started!


epoch 0  | loss: 0.53432 | val_0_logloss: 0.24197 |  0:00:23s
epoch 1  | loss: 0.2018  | val_0_logloss: 0.17972 |  0:00:47s
epoch 2  | loss: 0.16707 | val_0_logloss: 0.15196 |  0:01:10s
epoch 3  | loss: 0.14941 | val_0_logloss: 0.14011 |  0:01:33s
epoch 4  | loss: 0.14068 | val_0_logloss: 0.13515 |  0:01:56s
Stop training because you reached max_epochs = 5 with best_epoch = 4 and best_val_0_logloss = 0.13515
Best weights from best epoch are automatically used!


2021-12-15 17:05:37,806 INFO Training Finished!


ValueError: Classification metrics can't handle a mix of multiclass and continuous targets

#### Now add the base models' predictions back to X and X_test

In [30]:
# # add based model oof predictions back to X and X_test before Meta model training
# tmlt.dfl.X["linear_preds"] = linear_oof_model_preds
# tmlt.dfl.X_test["linear_preds"] = linear_oof_model_test_preds

# print(tmlt.dfl.X.shape)
# print(tmlt.dfl.X_test.shape)

In [31]:
# # add based model oof predictions back to X and X_test before Meta model training
# tmlt.dfl.X["log_reg_preds"] = log_oof_model_preds
# tmlt.dfl.X_test["log_reg_preds"] = log_oof_model_test_preds

# print(tmlt.dfl.X.shape)
# print(tmlt.dfl.X_test.shape)

In [32]:
# # add based model oof predictions back to X and X_test before Meta model training
# tmlt.dfl.X["mlp_preds"] = mlp_oof_model_preds
# tmlt.dfl.X_test["mlp_preds"] = mlp_oof_model_test_preds

# print(tmlt.dfl.X.shape)
# print(tmlt.dfl.X_test.shape)

In [33]:
# Attach the TabNet base-model predictions as a new "tabnet_preds" column on
# both X and X_test ahead of meta-model training, then show the new shapes.
tmlt.dfl.X["tabnet_preds"] = tabnet_oof_model_preds
tmlt.dfl.X_test["tabnet_preds"] = tabnet_oof_model_test_preds

print(tmlt.dfl.X.shape, tmlt.dfl.X_test.shape, sep="\n")

NameError: name 'tabnet_oof_model_preds' is not defined

In [None]:
# now just update the tmlt with this new X and X_test

In [None]:
# Hand the current X / y / X_test held on the data loader back to tmlt via update_dfl.
tmlt = tmlt.update_dfl(X=tmlt.dfl.X, y=tmlt.dfl.y, X_test=tmlt.dfl.X_test)

#### For META Model Training

##### create train valid dataframes for quick preprocessing and training

In [None]:
%%time
# Split again into train / validation sets so the meta model can be evaluated
# on the validation dataset, then show the shape of each piece.
X_train, X_valid, y_train, y_valid = tmlt.dfl.create_train_valid(valid_size=0.2)

print(X_train.shape, y_train.shape, X_valid.shape, y_valid.shape, sep="\n")

# print(X_train.columns.to_list())

##### Now PreProcess X_train, X_valid

NOTE: Preprocessing returns NumPy arrays, not pandas DataFrames.

In [None]:
%%time
# Run tmlt's preprocessing over the meta-model train and validation splits.
X_train_np, X_valid_np = tmlt.pp_fit_transform(X_train, X_valid)

# Confirm what came back: container type and shape of each transformed split,
# then the container types of the validation and training targets.
print(type(X_train_np), X_train_np.shape, sep="\n")
print(type(X_valid_np), X_valid_np.shape, sep="\n")
print(type(y_valid), type(y_train), sep="\n")

In [None]:
# xgb_params = {
#     'objective': 'binary:logistic', 
#     'use_label_encoder': False,
#     'n_estimators': 40000,
#     'learning_rate': 0.18515462875481553,
#     'subsample': 0.97, 
#     'colsample_bytree': 0.32,
#     'max_depth': 1,
#     'booster': 'gbtree',
#     'gamma': 0.2, 
#     'tree_method': 'gpu_hist',
#     'reg_lambda': 0.11729916523488974, 
#     'reg_alpha': 0.6318827156945853,
#     'random_state': 42,
#     'n_jobs': 4, 
#     'min_child_weight': 256,
#     #for GPU
# #     'tree_method': 'gpu_hist',
# #     'predictor': 'gpu_predictor',
#     }

In [None]:
# Hyperparameters for the meta model.
xgb_params = {
    'learning_rate': 0.21761562020600114,
    'eval_metric': 'mlogloss',
    'use_label_encoder': False,
    'random_state': 42,
    'booster': 'gblinear',
    'colsample_bytree': 0.1027132584989078,
    'early_stopping_rounds': 171,
    'max_depth': 6,
    'n_estimators': 7000,
    'reg_alpha': 9.583579660175245e-06,
    'reg_lambda': 9.238315962782784e-05,
    'subsample': 0.4464473710560276,
    'tree_method': 'approx',
    #for GPU
#     'tree_method': 'gpu_hist',
#     'predictor': 'gpu_predictor',
}

# Rebuild the classifier from the parameters above. Without this, `xgb_model`
# would still be the baseline estimator created earlier in the notebook and
# the meta-model fit in the next cell would silently ignore these settings.
xgb_model = XGBClassifier(**xgb_params)

In [None]:
%%time
# Fit xgb_model on the preprocessed training split, tracking mlogloss on both
# the training and the validation split to detect and avoid overfitting.
xgb_model.fit(
    X_train_np,
    y_train,
    eval_set=[(X_train_np, y_train), (X_valid_np, y_valid)],
    eval_metric="mlogloss",
    early_stopping_rounds=300,
    verbose=False,
)

# Validation predictions: class probabilities for the one-vs-rest ROC AUC,
# hard labels for the accuracy.
preds_probs = xgb_model.predict_proba(X_valid_np)
preds = xgb_model.predict(X_valid_np)

acc = accuracy_score(y_true=y_valid, y_pred=preds)
auc = roc_auc_score(y_true=y_valid, y_score=preds_probs, multi_class='ovr')

print(f"AUC is : {auc} while Accuracy is : {acc} ")

### WOW!!!!

#### For Meta Model, Let's do Optuna based HyperParameter search to get best params for fit

In [None]:
# **Just make sure to supply an output directory path so hyperparameter search is saved**
# opt_timeout=360 — presumably the time budget for the search in seconds; TODO confirm against tmlt docs
study = tmlt.do_xgb_optuna_optimization(optuna_db_path=OUTPUT_PATH, opt_timeout=360)
# Show the best trial found by the study.
print(study.best_trial)

##### now update the meta model with best params from study and then update the sklearn pipeline with this new model

In [None]:
# Overlay the best trial's parameters onto xgb_params (in place), print the
# merged settings, and build a fresh classifier from them.
xgb_params.update(study.best_trial.params)
print("xgb_params", xgb_params)
updated_xgb_model = XGBClassifier(**xgb_params)

#### Let's Use K-Fold Training with best params

In [None]:
%%time
# Preprocess the full training features together with the test features; the
# resulting arrays are what the k-fold training consumes.
X_np, X_test_np = tmlt.pp_fit_transform(tmlt.dfl.X, tmlt.dfl.X_test)
# Targets are taken exactly as stored on tmlt's data loader.
y_np = tmlt.dfl.y

In [None]:
%%time
# k-fold training (5 splits) with the classifier built from the tuned parameters
xgb_model_metrics_score, xgb_model_test_preds = tmlt.do_kfold_training(
    X_np,
    y_np,
    model=updated_xgb_model,
    n_splits=5,
    X_test=X_test_np,
)

In [None]:
# predict on test dataset
if xgb_model_test_preds is not None:
    print(xgb_model_test_preds.shape)

In [None]:
# # take weighted average of both k-fold models predictions
# final_preds = ((0.45 * sci_model_preds) + (0.55* xgb_model_test_preds)) / 2
# print(final_preds.shape)

#### Create Kaggle Predictions

In [None]:
# sub = pd.read_csv(DIRECTORY_PATH + SAMPLE_SUB_FILE)
# sub['target'] = xgb_model_test_preds
# sub.to_csv('submission.csv', index=False)