# Getting Started Kaggle TPS Challenge with Tabular ML Toolkit

> A Tutorial to showcase usage of tabular_ml_toolkit library on Kaggle TPS Challenge Nov 2021.

> tabular_ml_toolkit is a superfast helper library to speedup your machine learning project based on Tabular or Structured data.

> It comes with model parallelism and cutting edge hyperparameter tuning techniques.

## Install

`pip install -U tabular_ml_toolkit`

## How to Best Use tabular_ml_toolkit

Start with your favorite model and then just simply create MLPipeline with one API.

*You can use MLPipeline to quickly train any model which supports scikit-lear fit and transform methods.*

*For example, Here we are using LogisticRegression from Scikit-Learn, on  [Kaggle TPS Challenge (Nov 2021) data](https://www.kaggle.com/c/tabular-playground-series-nov-2021/data)*

In [1]:
from tabular_ml_toolkit.tmlt import *
from sklearn.linear_model import LogisticRegression
# from sklearn.neural_network import BernoulliRBM, MLPClassifier
from sklearn.metrics import roc_auc_score, accuracy_score
import pandas as pd
import numpy as np

# for visualizing pipeline
from sklearn import set_config
set_config(display="diagram")

# just to measure fit performance
import time



In [2]:
# Dataset file names and Paths
DIRECTORY_PATH = "/Users/pankajmathur/kaggle_datasets/tps_nov_2021/"
TRAIN_FILE = "train.csv"
TEST_FILE = "test.csv"
SAMPLE_SUB_FILE = "sample_submission.csv"

In [3]:
# # create scikit-learn ml model
scikit_model = LogisticRegression(random_state=42)

In [4]:
# createm ml pipeline for scikit-learn model
tmlt = TMLT().prepare_data_for_training(
    train_file_path= DIRECTORY_PATH + TRAIN_FILE,
    #test_file_path= DIRECTORY_PATH + TEST_FILE,
    #make sure to use right index and target column
    idx_col="id",
    target="target",
    model=scikit_model,
    random_state=42,
    problem_type="classification")

2021-11-20 01:16:06,989 INFO 8 cores found, parallel processing is enabled!
2021-11-20 01:16:25,134 INFO DataFrame Memory usage decreased to 119.59 Mb (74.4% reduction)
2021-11-20 01:16:25,134 INFO No test_file_path given, so training will continue without it!
2021-11-20 01:16:27,488 INFO PreProcessing will include target(s) encoding!
2021-11-20 01:16:27,505 INFO categorical columns are None, Preprocessing will done accordingly!


In [5]:
tmlt.spl

In [6]:
print(type(tmlt.dfl.y))
# print(tmlt.dfl.y.values[10])
# print(type(tmlt.dfl.y.values[10]))
tmlt.dfl.y

<class 'numpy.ndarray'>


array([0, 0, 0, ..., 0, 1, 1])

In [7]:
error!

SyntaxError: invalid syntax (2390782538.py, line 1)

In [None]:
tmlt.dfl.create_train_valid(valid_size=0.2)

In [None]:
# Quick check on dataframe shapes
print(f"X_train shape is {tmlt.dfl.X_train.shape}" )
print(f"X_valid shape is {tmlt.dfl.X_valid.shape}" )
print(f"y_train shape is {tmlt.dfl.y_train.shape}")
print(f"y_valid shape is {tmlt.dfl.y_valid.shape}")

In [None]:
# Fit
start = time.time()
# Now fit
tmlt.spl.fit(tmlt.dfl.X_train, tmlt.dfl.y_train)
end = time.time()
print("Fit Time:", end - start)

#predict
preds = tmlt.spl.predict(tmlt.dfl.X_valid)
preds_probs = tmlt.spl.predict_proba(tmlt.dfl.X_valid)[:, 1]

# Metrics
auc = roc_auc_score(tmlt.dfl.y_valid, preds_probs)
acc = accuracy_score(tmlt.dfl.y_valid, preds)

print(f"AUC is : {auc} while Accuracy is : {acc} ")

In [None]:
error1

##### Let's do cross validation to get more clear picture on dataset

In [None]:
# cross_validation
scores = tmlt.do_cross_validation(cv=5, scoring='roc_auc')
print("scores:", scores)
print("Average auc score:", scores.mean())

##### Let's do HyperParams Search using Tune Grid Search

In [None]:
tmlt.spl

In [None]:
from sklearn.preprocessing import MinMaxScaler, StandardScaler
from sklearn.impute import SimpleImputer

In [None]:
# let's tune data preprocessing and model hyperparams
param_grid = {
    "preprocessor__num_cols__scaler": [StandardScaler(), MinMaxScaler()],
    "preprocessor__num_cols__imputer": [SimpleImputer(strategy='median'),
                                                 SimpleImputer(strategy='most_frequent')],
#     'model__solver': ['lbfgs', 'saga', 'newton-cg', 'sag', 'liblinear'],
    'model__max_iter': [100, 1000]
}

start = time.time()
# Now do tune grid search
tune_search = tmlt.do_tune_grid_search(param_grid=param_grid,
                                       cv=5,
                                       scoring='roc_auc',
                                      early_stopping=False,
                                      time_budget_s=60)
end = time.time()
print("Grid Search Time:", end - start)

print("Best params:")
print(tune_search.best_params_)

print(f"Internal CV Metrics score: {(tune_search.best_score_):.3f}")

**Now, Let's update the PreProcessor and Model with best params**

In [None]:
pp_params = tmlt.get_preprocessor_best_params(tune_search)
tmlt.update_preprocessor(**pp_params)
tmlt.spl

In [None]:
model_params = tmlt.get_model_best_params(tune_search)
scikit_model = RandomForestRegressor(**model_params)
tmlt.update_model(scikit_model)
tmlt.spl

In [None]:
# Fit
start = time.time()
# Now fit
tmlt.spl.fit(tmlt.dfl.X_train, tmlt.dfl.y_train)
end = time.time()
print("Fit Time:", end - start)

#predict
preds = tmlt.spl.predict(tmlt.dfl.X_valid)
preds_probs = tmlt.spl.predict_proba(tmlt.dfl.X_valid)[:, 1]

# Metrics
auc = roc_auc_score(tmlt.dfl.y_valid, preds_probs)
acc = accuracy_score(tmlt.dfl.y_valid, preds)

print(f"AUC is : {auc} while Accuracy is : {acc} ")

#### Let's Use K-Fold Training

In [8]:
# check current pipeline
tmlt.spl

In [9]:
# fit and predict
sci_model_k_fold, sci_model_metrics_score = tmlt.do_kfold_training(n_splits=5, metrics=roc_auc_score)
print("mean metrics score:", np.mean(sci_model_metrics_score))

2021-11-20 01:16:40,388 INFO fold: 1 , roc_auc_score: 0.7358018099857513
2021-11-20 01:16:51,835 INFO fold: 2 , roc_auc_score: 0.7356148010371628
2021-11-20 01:17:03,251 INFO fold: 3 , roc_auc_score: 0.7377649766620092
2021-11-20 01:17:14,913 INFO fold: 4 , roc_auc_score: 0.7378358812062005
2021-11-20 01:17:26,483 INFO fold: 5 , roc_auc_score: 0.7386355213553095
2021-11-20 01:17:26,487 INFO  mean metrics score: 0.7371305980492867


UnboundLocalError: local variable 'test_preds' referenced before assignment

In [None]:
print(f"Mean metrics_score is : {round(np.mean(metrics_score)*100,2)}")

In [None]:
# predict
sci_model_preds = tmlt.do_k_fold_prediction(k_fold=sci_model_k_fold)

print(sci_model_preds.shape)

In background `prepare_data_for_training` and `prepare_data_for_cv`  methods loads your input data into Pandas DataFrame, seprates X(features) and y(target).

Then `prepare_data_for_training` methods split X(features) into X_train, y_train, X_valid and y_valid DataFrames.
However, `prepare_data_for_cv`  method do not split but let's cross validation split internally X and y DataFrames.

Then both methods preprocess all numerical and categorical type data found in these DataFrames using scikit-learn pipelines. Then it bundle preprocessed data with your given model and return an MLPipeline object, this class instance has dataframeloader, preprocessor and scikit-lean pipeline instances, so you can call fit methods on X_train and y_train and predict methods on X_valid or X_test.

Please check detail documentation and source code for more details.

*NOTE: If you want to customize data and preprocessing steps you can do so by using `DataFrameLoader` and `PreProessor` classes. Check detail documentations for these classes for more options.*

#### Let's Use XGBosst on MLPipeline

*You can also use MLPipeline with XGBoost model, Just make sure to install XGBooost first depending upon your OS.*

*After that all steps remains same. Here is example using XGBRegressor with [Melbourne Home Sale price data](https://www.kaggle.com/estrotococo/home-data-for-ml-course)*

In [None]:
#!pip install -U xgboost

In [None]:
# Best way to install xgboost if you are on macosx and windows machine is using conda
# !conda install -c conda-forge xgboost

In [None]:
# set xgb_params
xgb_params = {
#     'learning_rate': 0.3,
#     'max_depth': 9,
#     'booster': 'gbtree',
    'eval_metric': 'auc',
    'use_label_encoder': False,
    'random_state': 42,
    # for GPU
    #     'tree_method': 'gpu_hist',
#     'predictor': 'gpu_predictor'
}

In [None]:
from xgboost import XGBClassifier
# create xgb Classifier model
xgb_model = XGBClassifier(**xgb_params)

In [None]:
# Update pipeline with xgb model
tmlt.update_model(xgb_model)
tmlt.spl

In [None]:
# Fit
start = time.time()
# Now fit
tmlt.spl.fit(tmlt.dfl.X_train, tmlt.dfl.y_train)
end = time.time()
print("Fit Time:", end - start)

#predict
preds = tmlt.spl.predict(tmlt.dfl.X_valid)
preds_probs = tmlt.spl.predict_proba(tmlt.dfl.X_valid)[:, 1]

# Metrics
auc = roc_auc_score(tmlt.dfl.y_valid, preds_probs)
acc = accuracy_score(tmlt.dfl.y_valid, preds)

print(f"AUC is : {auc} while Accuracy is : {acc} ")

##### Let's do HyperParmas search for XGBClassifier

In [None]:
# let's tune data preprocessing and model hyperparams
param_grid = {
#     "preprocessor__num_cols__scaler": [StandardScaler(), MinMaxScaler()],
#     "preprocessor__low_card_cat_cols__imputer": [SimpleImputer(strategy='constant'),
#                                                  SimpleImputer(strategy='most_frequent')],
    'model__booster': ['gbtree', 'gblinear'],
#     'model__max_iter': [100, 1000]
}

start = time.time()
# Now do tune grid search
tune_search = tmlt.do_tune_grid_search(param_grid=param_grid,
                                       cv=3,
                                       scoring='roc_auc',
                                      early_stopping=False)
end = time.time()
print("Grid Search Time:", end - start)

print("Best params:")
print(tune_search.best_params_)

print(f"Internal CV Metrics score: {(tune_search.best_score_):.3f}")

Indeed CV score was increased using hyper parameter tunning, let's use best_params to do k-fold training

#### Let's Use K-Fold Training for xgb model

In [None]:
# set new xgb_params
xgb_params = {
#     'learning_rate': 0.05,
    'booster': 'gbtree',
    'eval_metric': 'auc',
    'use_label_encoder': False,
    'random_state': 42,
    # for GPU
    #     'tree_method': 'gpu_hist',
#     'predictor': 'gpu_predictor'
}

In [None]:
# Update pipeline with xgb model
tmlt.update_model(xgb_model)
tmlt.spl

In [None]:
# fit and predict
xgb_k_fold, xgb_k_metrics_score = tmlt.do_k_fold_training(n_splits=5, metrics=roc_auc_score)
print("mean metrics score:", np.mean(xgb_k_metrics_score))

In [None]:
print(f"Mean metrics_score is : {round(np.mean(xgb_k_metrics_score)*100,2)}")

In [None]:
# predict
xgb_preds = tmlt.do_k_fold_prediction(k_fold=xgb_k_fold)

print(xgb_preds.shape)

In [None]:
# take weighted average of both k-fold models predictions
final_preds = ((0.45 * sci_model_preds) + (0.55* xgb_pred)) / 2
print(final_preds.shape)

#### Create Kaggle Predictions

In [None]:
sub = pd.read_csv(DIRECTORY_PATH + SAMPLE_SUB_FILE)
sub['target'] = final_preds
sub.to_csv('submission.csv', index=False)

In [None]:
# hide
# run the script to build 

from nbdev.export import notebook2script; notebook2script()