In [None]:
# default_exp Kaggle_TPS_Tutorial

# Getting Started Kaggle TPS Challenge with Tabular ML Toolkit

> A Tutorial to showcase usage of tabular_ml_toolkit library on Kaggle TPS Challenge Nov 2021.

> tabular_ml_toolkit is a superfast helper library to speedup your machine learning project based on Tabular or Structured data.

> It comes with model parallelism and cutting edge hyperparameter tuning techniques.

## Install

`pip install -U tabular_ml_toolkit`

## How to Best Use tabular_ml_toolkit

Start with your favorite model and then just simply create MLPipeline with one API.

*You can use MLPipeline to quickly train any model which supports scikit-learn fit and transform methods.*

*For example, Here we are using LogisticRegression from Scikit-Learn, on  [Kaggle TPS Challenge (Nov 2021) data](https://www.kaggle.com/c/tabular-playground-series-nov-2021/data)*

In [None]:
from tabular_ml_toolkit.MLPipeline import *
from sklearn.linear_model import LogisticRegression
# from sklearn.neural_network import BernoulliRBM, MLPClassifier
from sklearn.metrics import roc_auc_score, accuracy_score
import pandas as pd
import numpy as np

# for visualizing pipeline
from sklearn import set_config
set_config(display="diagram")

# just to measure fit performance
import time

In [None]:
# Dataset file names and paths.
# The dataset directory can be overridden with the TPS_DATA_DIR environment
# variable so the notebook is not tied to one machine; when the variable is
# unset, the original location is used.
import os

DIRECTORY_PATH = os.path.join(
    os.environ.get("TPS_DATA_DIR", "/Users/pamathur/kaggle_datasets/tps_nov_2021/"),
    "",  # guarantees a trailing separator, because file names are appended with `+`
)
TRAIN_FILE = "train.csv"
TEST_FILE = "test.csv"
SAMPLE_SUB_FILE = "sample_submission.csv"

In [None]:
# create the scikit-learn ml model (fixed random_state for repeatable runs)
scikit_model = LogisticRegression(random_state=42)

In [None]:
# Create the ml pipeline for the scikit-learn model.
# Make sure `idx_col` and `target` name the right index and target columns
# of the train/test CSV files.
tmlt = MLPipeline().prepare_data_for_training(
    train_file_path=f"{DIRECTORY_PATH}{TRAIN_FILE}",
    test_file_path=f"{DIRECTORY_PATH}{TEST_FILE}",
    idx_col="id",
    target="target",
    model=scikit_model,
    random_state=42,
)

In [None]:
# show the pipeline object held by the MLPipeline instance
tmlt.spl

In [None]:
# hold out 20% of the training data as a validation split
tmlt.dfl.create_train_valid(valid_size=0.2)

In [None]:
# Quick check on the shapes of the train/validation features and targets
print(f"X_train shape is {tmlt.dfl.X_train.shape}" )
print(f"X_valid shape is {tmlt.dfl.X_valid.shape}" )
print(f"y_train shape is {tmlt.dfl.y_train.shape}")
print(f"y_valid shape is {tmlt.dfl.y_valid.shape}")

X_train shape is (480000, 100)
X_valid shape is (120000, 100)
y_train shape is (480000,)
y_valid shape is (120000,)


In [None]:
# Fit the pipeline on the training split and time it.
# perf_counter() is a monotonic, high-resolution clock, so the measured
# duration cannot be distorted by system clock adjustments.
start = time.perf_counter()
tmlt.spl.fit(tmlt.dfl.X_train, tmlt.dfl.y_train)
end = time.perf_counter()
print("Fit Time:", end - start)

# Predict hard labels, and take the second predict_proba column as the score
preds = tmlt.spl.predict(tmlt.dfl.X_valid)
preds_probs = tmlt.spl.predict_proba(tmlt.dfl.X_valid)[:, 1]

# Metrics: AUC is computed from the probabilities, accuracy from the labels
auc = roc_auc_score(tmlt.dfl.y_valid, preds_probs)
acc = accuracy_score(tmlt.dfl.y_valid, preds)

print(f"AUC is : {auc} while Accuracy is : {acc} ")

Fit Time: 24.178173065185547
AUC is : 0.7506636452348747 while Accuracy is : 0.7385833333333334 


### Fit Time: 59.285420179367065
AUC is : 0.7550680676073396 while Accuracy is : 0.7422916666666667 

In [None]:
# Let's see the metric names sklearn accepts for `scoring=`.
# `sklearn.metrics.SCORERS` was removed in scikit-learn 1.3 in favour of
# `get_scorer_names()`; fall back to the old registry so the cell still runs
# on older installations that lack the new function.
import sklearn.metrics

try:
    scorer_names = sklearn.metrics.get_scorer_names()
except AttributeError:
    scorer_names = sklearn.metrics.SCORERS.keys()
sorted(scorer_names)

['accuracy',
 'adjusted_mutual_info_score',
 'adjusted_rand_score',
 'average_precision',
 'balanced_accuracy',
 'completeness_score',
 'explained_variance',
 'f1',
 'f1_macro',
 'f1_micro',
 'f1_samples',
 'f1_weighted',
 'fowlkes_mallows_score',
 'homogeneity_score',
 'jaccard',
 'jaccard_macro',
 'jaccard_micro',
 'jaccard_samples',
 'jaccard_weighted',
 'max_error',
 'mutual_info_score',
 'neg_brier_score',
 'neg_log_loss',
 'neg_mean_absolute_error',
 'neg_mean_absolute_percentage_error',
 'neg_mean_gamma_deviance',
 'neg_mean_poisson_deviance',
 'neg_mean_squared_error',
 'neg_mean_squared_log_error',
 'neg_median_absolute_error',
 'neg_root_mean_squared_error',
 'normalized_mutual_info_score',
 'precision',
 'precision_macro',
 'precision_micro',
 'precision_samples',
 'precision_weighted',
 'r2',
 'rand_score',
 'recall',
 'recall_macro',
 'recall_micro',
 'recall_samples',
 'recall_weighted',
 'roc_auc',
 'roc_auc_ovo',
 'roc_auc_ovo_weighted',
 'roc_auc_ovr',
 'roc_auc_ovr_we

##### Let's do cross validation

In [None]:
# cross validation with 5 folds, scored with ROC AUC
scores = tmlt.do_cross_validation(cv=5, scoring='roc_auc')
print("scores:", scores)
# average of the per-fold scores
print("Average auc score:", scores.mean())

scores: [0.7444891  0.74708114 0.7553452  0.72882056 0.74817016]
Average auc score: 0.7447812310949615


##### Let's do HyperParams Search using Tune Grid Search

    These are LogisticRegression params that can be tuned:
    
    
    penalty='l2',
    *,
    dual=False,
    tol=0.0001,
    C=1.0,
    fit_intercept=True,
    intercept_scaling=1,
    class_weight=None,
    random_state=None,
    solver='lbfgs',
    max_iter=100,
    multi_class='auto',
    verbose=0,
    warm_start=False,
    n_jobs=None,
    l1_ratio=None,

In [None]:
# Let's tune data preprocessing and model hyperparams.
# Commented-out entries are further options that can be added to the grid.
param_grid = {
#     "preprocessor__num_cols__scaler": [StandardScaler(), MinMaxScaler()],
#     "preprocessor__low_card_cat_cols__imputer": [SimpleImputer(strategy='constant'),
#                                                  SimpleImputer(strategy='most_frequent')],
#     'model__solver': ['lbfgs', 'saga', 'newton-cg', 'sag', 'liblinear'],
    'model__max_iter': [100, 1000]
}

# Time the search with a monotonic clock so the reported duration cannot be
# distorted by system clock adjustments.
start = time.perf_counter()
tune_search = tmlt.do_tune_grid_search(param_grid=param_grid,
                                       cv=5,
                                       scoring='roc_auc',
                                       early_stopping=False)
end = time.perf_counter()
print("Grid Search Time:", end - start)

print("Best params:")
print(tune_search.best_params_)

print(f"Internal CV Metrics score: {(tune_search.best_score_):.3f}")



Grid Search Time: 174.76603078842163
Best params:
{'model__max_iter': 100}
Internal CV Metrics score: 0.745


Grid Search Time: 476.33784890174866
Best params:
{'model__solver': 'sag'}
Internal CV Metrics score: 0.745

Now, Let's update the Model with best params

For example, switch the numerical scaler to MinMaxScaler instead of StandardScaler,

and use solver = 'sag'.

In [None]:
# from sklearn.preprocessing import MinMaxScaler

# tmlt.update_preprocessor(num_scaler=MinMaxScaler())
# # check the udpated pipeline
# tmlt.spl

In [None]:
# Rebuild the LogisticRegression model with the 'sag' solver
scikit_model_tuned = LogisticRegression(random_state=42,
                                       solver='sag')

# Update pipeline with new model
tmlt.update_model(scikit_model_tuned)
# show the updated pipeline
tmlt.spl

In [None]:
# Fit the updated pipeline on the training split and time it.
# perf_counter() is a monotonic, high-resolution clock, so the measured
# duration cannot be distorted by system clock adjustments.
start = time.perf_counter()
tmlt.spl.fit(tmlt.dfl.X_train, tmlt.dfl.y_train)
end = time.perf_counter()
print("Fit Time:", end - start)

# Predict hard labels, and take the second predict_proba column as the score
preds = tmlt.spl.predict(tmlt.dfl.X_valid)
preds_probs = tmlt.spl.predict_proba(tmlt.dfl.X_valid)[:, 1]

# Metrics: AUC is computed from the probabilities, accuracy from the labels
auc = roc_auc_score(tmlt.dfl.y_valid, preds_probs)
acc = accuracy_score(tmlt.dfl.y_valid, preds)

print(f"AUC is : {auc} while Accuracy is : {acc} ")

Fit Time: 23.976211071014404
AUC is : 0.7506636005030084 while Accuracy is : 0.7386 


#### Let's Use K-Fold Training

In [None]:
# check the current pipeline (updated with the tuned LogisticRegression above)
tmlt.spl

In [None]:
# K-fold training with 10 splits, scored with roc_auc_score.
# The call returns two values: the first is passed to do_k_fold_prediction
# later on, the second holds the metric scores that are averaged below.
sci_model_k_fold, sci_model_metrics_score = tmlt.do_k_fold_training(n_splits=10, metrics=roc_auc_score)
print("mean metrics score:", np.mean(sci_model_metrics_score))

KeyboardInterrupt: 

In [None]:
# Report the mean k-fold score as a percentage, rounded to two decimals.
# `sci_model_metrics_score` is the score collection returned by
# do_k_fold_training; no variable named `metrics_score` exists in this notebook.
print(f"Mean metrics_score is : {round(np.mean(sci_model_metrics_score)*100,2)}")

In [None]:
# predict using the k-fold result returned by do_k_fold_training
sci_model_preds = tmlt.do_k_fold_prediction(k_fold=sci_model_k_fold)

# sanity check on the shape of the predictions
print(sci_model_preds.shape)

In the background, the `prepare_data_for_training` and `prepare_data_for_cv` methods load your input data into a Pandas DataFrame and separate X (features) from y (target).

Then the `prepare_data_for_training` method splits X (features) and y (target) into X_train, y_train, X_valid and y_valid DataFrames.
However, the `prepare_data_for_cv` method does not split the data itself; it lets cross validation split the X and y DataFrames internally.

Then both methods preprocess all numerical and categorical data found in these DataFrames using scikit-learn pipelines. They then bundle the preprocessed data with your given model and return an MLPipeline object. This class instance has dataframeloader, preprocessor and scikit-learn pipeline instances, so you can call fit methods on X_train and y_train and predict methods on X_valid or X_test.

Please check detail documentation and source code for more details.

*NOTE: If you want to customize the data and preprocessing steps, you can do so by using the `DataFrameLoader` and `PreProcessor` classes. Check the detailed documentation for these classes for more options.*

#### Let's Use XGBoost on MLPipeline

*You can also use MLPipeline with an XGBoost model. Just make sure to install XGBoost first, depending upon your OS.*

*After that, all steps remain the same. Here is an example using XGBClassifier on the same [Kaggle TPS Challenge (Nov 2021) data](https://www.kaggle.com/c/tabular-playground-series-nov-2021/data)*

In [None]:
#!pip install -U xgboost

In [None]:
# Best way to install xgboost if you are on macosx and windows machine is using conda
# !conda install -c conda-forge xgboost

In [None]:
# Set xgb_params for the first XGBClassifier run.
# Commented-out entries are optional settings that are not passed to the model.
xgb_params = {
#     'learning_rate': 0.3,
#     'max_depth': 9,
#     'booster': 'gbtree',
    'eval_metric': 'auc',
    'use_label_encoder': False,
    'random_state': 42,
    # for GPU (uncomment the two entries below):
#     'tree_method': 'gpu_hist',
#     'predictor': 'gpu_predictor'
}

In [None]:
# xgboost must be installed separately before this import will work
from xgboost import XGBClassifier
# create xgb Classifier model from the xgb_params dictionary
xgb_model = XGBClassifier(**xgb_params)

In [None]:
# Update pipeline with xgb model (replaces the LogisticRegression model)
tmlt.update_model(xgb_model)
# show the updated pipeline
tmlt.spl

In [None]:
# Fit the XGBoost pipeline on the training split and time it.
# perf_counter() is a monotonic, high-resolution clock, so the measured
# duration cannot be distorted by system clock adjustments.
start = time.perf_counter()
tmlt.spl.fit(tmlt.dfl.X_train, tmlt.dfl.y_train)
end = time.perf_counter()
print("Fit Time:", end - start)

# Predict hard labels, and take the second predict_proba column as the score
preds = tmlt.spl.predict(tmlt.dfl.X_valid)
preds_probs = tmlt.spl.predict_proba(tmlt.dfl.X_valid)[:, 1]

# Metrics: AUC is computed from the probabilities, accuracy from the labels
auc = roc_auc_score(tmlt.dfl.y_valid, preds_probs)
acc = accuracy_score(tmlt.dfl.y_valid, preds)

print(f"AUC is : {auc} while Accuracy is : {acc} ")

Fit Time: 237.51949214935303
AUC is : 0.7306965219658297 while Accuracy is : 0.6909 


##### Let's do HyperParams search for XGBClassifier

In [None]:
# Let's tune data preprocessing and model hyperparams for the XGBoost model.
# Commented-out entries are further options that can be added to the grid.
param_grid = {
#     "preprocessor__num_cols__scaler": [StandardScaler(), MinMaxScaler()],
#     "preprocessor__low_card_cat_cols__imputer": [SimpleImputer(strategy='constant'),
#                                                  SimpleImputer(strategy='most_frequent')],
    'model__booster': ['gbtree', 'gblinear'],
#     'model__max_iter': [100, 1000]
}

# Time the search with a monotonic clock so the reported duration cannot be
# distorted by system clock adjustments.
start = time.perf_counter()
tune_search = tmlt.do_tune_grid_search(param_grid=param_grid,
                                       cv=3,
                                       scoring='roc_auc',
                                       early_stopping=False)
end = time.perf_counter()
print("Grid Search Time:", end - start)

print("Best params:")
print(tune_search.best_params_)

print(f"Internal CV Metrics score: {(tune_search.best_score_):.3f}")



[2m[36m(_Trainable pid=74442)[0m Parameters: { "colsample_bylevel", "colsample_bynode", "colsample_bytree", "gamma", "interaction_constraints", "max_delta_step", "max_depth", "min_child_weight", "monotone_constraints", "num_parallel_tree", "predictor", "subsample", "tree_method" } might not be used.
[2m[36m(_Trainable pid=74442)[0m 
[2m[36m(_Trainable pid=74442)[0m   This could be a false alarm, with some parameters getting used by language bindings but
[2m[36m(_Trainable pid=74442)[0m   then being mistakenly passed down to XGBoost core, or some parameter actually being used
[2m[36m(_Trainable pid=74442)[0m   but getting flagged wrongly here. Please open an issue if you find any such cases.
[2m[36m(_Trainable pid=74442)[0m 
[2m[36m(_Trainable pid=74442)[0m 


SIGINT received (e.g. via Ctrl+C), ending Ray Tune run. This will try to checkpoint the experiment state one last time. Press CTRL+C one more time (or send SIGINT/SIGKILL/SIGTERM) to skip. 
Trials did not complete: [_Trainable_c7eac_00000]
Experiment has been interrupted, but the most recent state was saved. You can continue running this experiment by passing `resume=True` to `tune.run()`


Parameters: { "colsample_bylevel", "colsample_bynode", "colsample_bytree", "gamma", "interaction_constraints", "max_delta_step", "max_depth", "min_child_weight", "monotone_constraints", "num_parallel_tree", "predictor", "subsample", "tree_method" } might not be used.

  This could be a false alarm, with some parameters getting used by language bindings but
  then being mistakenly passed down to XGBoost core, or some parameter actually being used
  but getting flagged wrongly here. Please open an issue if you find any such cases.


Grid Search Time: 131.64094710350037
Best params:
{'model__booster': 'gblinear'}
Internal CV Metrics score: 0.742


Indeed, the CV score increased with hyperparameter tuning, so let's use best_params to do k-fold training.

#### Let's Use K-Fold Training for xgb model

In [None]:
# Set new xgb_params for k-fold training, with the booster chosen explicitly.
# NOTE(review): the grid search output above reports 'gblinear' as the best
# booster, while 'gbtree' is set here - confirm which one is intended.
xgb_params = {
#     'learning_rate': 0.05,
    'booster': 'gbtree',
    'eval_metric': 'auc',
    'use_label_encoder': False,
    'random_state': 42,
    # for GPU (uncomment the two entries below):
#     'tree_method': 'gpu_hist',
#     'predictor': 'gpu_predictor'
}

In [None]:
# Rebuild the classifier so the current xgb_params actually take effect;
# re-using the previously constructed `xgb_model` would silently keep the
# parameters it was created with.
xgb_model = XGBClassifier(**xgb_params)

# Update pipeline with xgb model
tmlt.update_model(xgb_model)
# show the updated pipeline
tmlt.spl

In [None]:
# K-fold training with 5 splits for the xgb model, scored with roc_auc_score.
# The call returns two values: the first is passed to do_k_fold_prediction
# later on, the second holds the metric scores that are averaged below.
xgb_k_fold, xgb_k_metrics_score = tmlt.do_k_fold_training(n_splits=5, metrics=roc_auc_score)
print("mean metrics score:", np.mean(xgb_k_metrics_score))

In [None]:
# mean k-fold score as a percentage, rounded to two decimals
print(f"Mean metrics_score is : {round(np.mean(xgb_k_metrics_score)*100,2)}")

In [None]:
# predict
xgb_preds = tmlt.do_k_fold_prediction(k_fold=xgb_k_fold)

print(xgb_preds.shape)

In [None]:
# Take the weighted average of both k-fold models' predictions.
# The weights (0.45 + 0.55) already sum to 1, so no further normalisation is
# needed; dividing the sum by 2 would halve every blended prediction.
# `xgb_preds` is the name assigned by the xgb k-fold prediction cell.
final_preds = (0.45 * sci_model_preds) + (0.55 * xgb_preds)
print(final_preds.shape)

#### Create Kaggle Predictions

In [None]:
# Load the sample submission, set its `target` column to the blended
# predictions, and write the submission file without the row index.
sub = pd.read_csv(DIRECTORY_PATH + SAMPLE_SUB_FILE).assign(target=final_preds)
sub.to_csv('submission.csv', index=False)

In [None]:
# hide
# Run the nbdev export script to build the library from the notebooks.
# NOTE(review): presumably the output module name comes from the
# `default_exp` directive in the first cell - confirm against the nbdev docs.

from nbdev.export import notebook2script; notebook2script()