In [1]:
# default_exp Kaggle_TPS_Tutorial

# Getting Started Kaggle TPS Challenge with Tabular ML Toolkit

> A Tutorial to showcase usage of Tabular_ML_Toolkit library on Kaggle TPS Challenge Nov 2021

## Install

`pip install -U tabular_ml_toolkit`

## How to Best Use tabular_ml_toolkit

Start with your favorite model, then simply create an MLPipeline with a single API call.

*You can use MLPipeline to quickly train any model that supports the scikit-learn `fit` and `predict` methods.*

*For example, here we use `LogisticRegression` from scikit-learn on the [Kaggle TPS Challenge (Nov 2021) data](https://www.kaggle.com/c/tabular-playground-series-nov-2021/data).*

In [2]:
#export
from tabular_ml_toolkit.MLPipeline import *
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score, accuracy_score
import pandas as pd
import numpy as np

In [3]:
# Dataset location and file names.
# The data directory can be overridden with the TPS_DATA_DIR environment
# variable so the notebook is not tied to one machine; when the variable is
# unset the original local path is used, so existing behavior is unchanged.
import os

# os.path.join(path, "") guarantees a trailing separator, because later cells
# build file paths with plain string concatenation (DIRECTORY_PATH + TRAIN_FILE).
DIRECTORY_PATH = os.path.join(
    os.environ.get("TPS_DATA_DIR", "/Users/pankajmathur/kaggle_datasets/tps_nov_2021/"),
    "",
)
TRAIN_FILE = "train.csv"
TEST_FILE = "test.csv"
SAMPLE_SUB_FILE = "sample_submission.csv"

In [4]:
# Baseline scikit-learn classifier that the MLPipeline will wrap.
scikit_model = LogisticRegression(
    random_state=42,
    solver='liblinear',
)

In [5]:
# Build the MLPipeline around the scikit-learn model.
# idx_col and target must name the index and label columns of the CSV files.
sci_ml_pl = MLPipeline().prepare_data_for_training(
    model=scikit_model,
    train_file_path=f"{DIRECTORY_PATH}{TRAIN_FILE}",
    test_file_path=f"{DIRECTORY_PATH}{TEST_FILE}",
    idx_col="id",
    target="target",
    valid_size=0.2,
    random_state=42,
)

both high_card_cat_cols and low_card_cat_cols are None


In [7]:
# Ask scikit-learn to render estimators as a diagram rather than plain text,
# then show the pipeline held by the MLPipeline object.
from sklearn import set_config

set_config(display="diagram")
# Bare last expression: the notebook displays it as the cell output.
sci_ml_pl.spl

In [8]:
# Report the shape of each train/validation split held by the data loader.
print(
    f"X_train shape is {sci_ml_pl.dfl.X_train.shape}",
    f"X_valid shape is {sci_ml_pl.dfl.X_valid.shape}",
    f"y_train shape is {sci_ml_pl.dfl.y_train.shape}",
    f"y_valid shape is {sci_ml_pl.dfl.y_valid.shape}",
    sep="\n",
)

X_train shape is (480000, 100)
X_valid shape is (120000, 100)
y_train shape is (480000,)
y_valid shape is (120000,)


In [9]:
# Train the pipeline on the training split.
sci_ml_pl.spl.fit(
    sci_ml_pl.dfl.X_train,
    sci_ml_pl.dfl.y_train,
)

# Score the validation split: hard labels first, then probabilities.
features_valid = sci_ml_pl.dfl.X_valid
preds = sci_ml_pl.spl.predict(features_valid)
# Keep only column 1 of the probability matrix
# (presumably the positive class; verify against the target encoding).
preds_probs = sci_ml_pl.spl.predict_proba(features_valid)[:, 1]

In [10]:
# Confirm that labels and probabilities have the same, expected shape.
print(
    f"preds shape is {preds.shape}",
    f"preds_probs shape is {preds_probs.shape}",
    sep="\n",
)

preds shape is (120000,)
preds_probs shape is (120000,)


In [11]:
# Validation metrics: accuracy is computed from the hard labels,
# AUC from the predicted probabilities.
acc = accuracy_score(y_true=sci_ml_pl.dfl.y_valid, y_pred=preds)
auc = roc_auc_score(y_true=sci_ml_pl.dfl.y_valid, y_score=preds_probs)

print(f"AUC is : {auc} while Accuracy is : {acc} ")

AUC is : 0.7506636457905501 while Accuracy is : 0.7385833333333334 


In the background, the `prepare_data_for_training` and `prepare_data_for_cv` methods load your input data into a Pandas DataFrame and separate X (features) from y (target).

Then the `prepare_data_for_training` method splits X and y into X_train, y_train, X_valid and y_valid.
However, the `prepare_data_for_cv` method does not split the data; it leaves X and y whole so that cross-validation can split them internally.

Both methods then preprocess all numerical and categorical data found in these DataFrames using scikit-learn pipelines. They then bundle the preprocessing with your given model and return an MLPipeline object. This instance holds the dataframeloader, preprocessor and scikit-learn pipeline instances, so you can call fit on X_train and y_train and predict on X_valid or X_test.

Please check the detailed documentation and source code for more information.

*NOTE: If you want to customize the data loading and preprocessing steps, you can do so with the `DataFrameLoader` and `PreProcessor` classes. Check the detailed documentation for these classes for more options.*

#### Let's Use XGBoost on MLPipeline

*You can also use MLPipeline with an XGBoost model; just make sure to install XGBoost first, in the way appropriate for your OS.*

*After that, all steps remain the same. Here is an example using XGBClassifier on the same [Kaggle TPS Challenge (Nov 2021) data](https://www.kaggle.com/c/tabular-playground-series-nov-2021/data).*

In [12]:
#!pip install -U xgboost

In [13]:
# Best way to install xgboost if you are on macosx and windows machine is using conda
# !conda install -c conda-forge xgboost

In [14]:
# # set xgb_params
# xgb_params = {
#     'n_estimators': 1000,
#     'learning_rate': 0.01,
# #     'max_depth': 9,
#     'booster': 'gbtree',
#     'eval_metric': 'auc',
# #     'tree_method': 'gpu_hist',
# #     'predictor': 'gpu_predictor',
#     'use_label_encoder': False,
#     'random_state': 42
# }

In [15]:
# from xgboost import XGBClassifier
# # create xgb Classifier model
# xgb_model = XGBClassifier(**xgb_params)

In [16]:
# # createm ml pipeline for xgb model
# xgb_ml_pl = MLPipeline().prepare_data_for_training(
#     train_file_path= DIRECTORY_PATH+TRAIN_FILE,
#     test_file_path= DIRECTORY_PATH+TEST_FILE,
#     #make sure to use right index and target column
#     idx_col="id",
#     target="target",
#     model=xgb_model,
#     random_state=42,
#     valid_size=0.2)

In [17]:
# # for visualizing pipeline
# from sklearn import set_config

# set_config(display="diagram")
# xgb_ml_pl.spl

In [18]:
# # Fit & Predict
# xgb_ml_pl.spl.fit(xgb_ml_pl.dfl.X_train,
#                               xgb_ml_pl.dfl.y_train)
# preds = xgb_ml_pl.spl.predict(xgb_ml_pl.dfl.X_valid)
# preds_probs = xgb_ml_pl.spl.predict_proba(xgb_ml_pl.dfl.X_valid)[:, 1]

In [19]:
# # Metrics
# auc = roc_auc_score(xgb_ml_pl.dfl.y_valid, preds_probs)
# acc = accuracy_score(xgb_ml_pl.dfl.y_valid, preds)

# print(f"AUC is : {auc} while Accuracy is : {acc} ")

#### Let's do Grid Search for Hyperparameter Tuning for Scikit Model on our MLPipeline

In [20]:
# # create blanket scikit-learn LogisticRegression model
# scikit_model = LogisticRegression(solver='liblinear', random_state=42)

# # createm ml pipeline for scikit-learn model
# sci_ml_pl = MLPipeline().prepare_data_for_training(
#     train_file_path= DIRECTORY_PATH+TRAIN_FILE,
#     test_file_path= DIRECTORY_PATH+TEST_FILE,
#     idx_col="id",
#     target="target",
#     model=scikit_model,
#     random_state=42)

In [21]:
# # for visualizing pipeline
# from sklearn import set_config

# set_config(display="diagram")
# sci_ml_pl.spl

In [22]:
# param_grid = {
# #     "preprocessor__num__imputer__strategy": ["constant", "mean", "median"],
# #     "preprocessor__low_cad_cat__imputer__strategy": ["most_frequent", "constant"],
#     "model__solver": ["newton-cg", "lbfgs", "liblinear"]
# }

# grid_search = sci_ml_pl.do_grid_search(param_grid=param_grid, cv=3,
#                                        scoring='roc_auc')

# print("Best params:")
# print(grid_search.best_params_)

# print(f"Internal CV Metrics score: {grid_search.best_score_:.3f}")

#### Let's Use K-Fold Training with best params from grid search

In [23]:
# LogisticRegression for K-fold training. The grid-search cell above is
# commented out, so these are the same settings as the baseline model.
scikit_model = LogisticRegression(random_state=42, solver='liblinear')

In [24]:
# Rebuild the MLPipeline around the fresh model for K-fold training.
# NOTE(review): valid_size is not passed here, so the library default applies;
# confirm that is what K-fold training should use.
sci_ml_pl = MLPipeline().prepare_data_for_training(
    model=scikit_model,
    train_file_path=f"{DIRECTORY_PATH}{TRAIN_FILE}",
    test_file_path=f"{DIRECTORY_PATH}{TEST_FILE}",
    idx_col="id",
    target="target",
    random_state=42,
)

both high_card_cat_cols and low_card_cat_cols are None


In [25]:
# Show the rebuilt pipeline as a diagram.
# The import and set_config call repeat an earlier cell; they are kept so this
# cell also works when run on its own.
from sklearn import set_config

set_config(display="diagram")
# Bare last expression: the notebook displays it as the cell output.
sci_ml_pl.spl

In [27]:
# print(len(sci_ml_pl.dfl.final_cols))
# sci_ml_pl.dfl.final_cols

In [None]:
# Run 15-fold training, scoring with roc_auc_score.
# metrics_score is presumably one score per fold (verify in do_k_fold_training).
k_fold, metrics_score = sci_ml_pl.do_k_fold_training(
    metrics=roc_auc_score,
    n_splits=15,
)
print("mean metrics score:", np.mean(metrics_score))

In [None]:
# Same mean score, expressed as a percentage rounded to two decimals.
mean_score_pct = round(np.mean(metrics_score) * 100, 2)
print(f"Mean metrics_score is : {mean_score_pct}")

#### Create Kaggle Predictions

In [None]:
# Generate predictions from the K-fold training result.
# NOTE(review): presumably this predicts on the test file passed to
# prepare_data_for_training and combines the per-fold models; confirm in
# do_k_fold_prediction.
preds = sci_ml_pl.do_k_fold_prediction(k_fold=k_fold)

# Shape check before the predictions are written into the submission file.
print(preds.shape)

In [None]:
# Fill the sample submission's target column with our predictions and save it.
# NOTE(review): assumes preds is in the same row order as the sample
# submission file; confirm before uploading.
sub = pd.read_csv(DIRECTORY_PATH + SAMPLE_SUB_FILE).assign(target=preds)
sub.to_csv('submission.csv', index=False)

In [None]:
# hide
# Run nbdev's notebook-to-script export.
# NOTE(review): presumably this writes the `#export` cells to the module named
# by `default_exp`; confirm against the nbdev version in use.

from nbdev.export import notebook2script; notebook2script()