In [None]:
# default_exp Optuna_Tutorial

# Getting Started Kaggle TPS Challenge with Tabular ML Toolkit

> A Tutorial to showcase usage of Tabular_ML_Toolkit and Optuna on Kaggle TPS Challenge Nov 2021

## Install

`pip install -U tabular_ml_toolkit`

## How to Best Use tabular_ml_toolkit

Start with your favorite model and then just simply create MLPipeline with one API.

*You can use MLPipeline to quickly train any model that supports the scikit-learn fit and transform methods.*

*For example, here we use LogisticRegression from scikit-learn on the [Kaggle TPS Challenge (Nov 2021) data](https://www.kaggle.com/c/tabular-playground-series-nov-2021/data).*

In [None]:
from tabular_ml_toolkit.MLPipeline import *
import sklearn.model_selection
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score, accuracy_score
import pandas as pd
import numpy as np
# import optuna

In [None]:
# Dataset file names and paths.
# The data directory can be overridden with the TPS_DATA_DIR environment
# variable so the notebook is not tied to a single machine; the original
# local path is kept as the default.
import os

DIRECTORY_PATH = os.path.join(
    os.environ.get("TPS_DATA_DIR", "/Users/pankajmathur/kaggle_datasets/tps_nov_2021/"),
    "",  # joining with "" guarantees a trailing separator for `DIRECTORY_PATH + <file>`
)
TRAIN_FILE = "train.csv"
TEST_FILE = "test.csv"
SAMPLE_SUB_FILE = "sample_submission.csv"

In [None]:
# Starter scikit-learn model: logistic regression using the liblinear solver
# with a fixed random seed.
scikit_model = LogisticRegression(
    random_state=42,
    solver="liblinear",
)
# Alternative starter model:
# scikit_model = RandomForestClassifier(random_state=42)

In [None]:
# Build the cross-validation MLPipeline around the scikit-learn model.
# idx_col and target must name the index and label columns of the CSV files.
sci_ml_pl = MLPipeline().prepare_data_for_cv(
    model=scikit_model,
    train_file_path=f"{DIRECTORY_PATH}{TRAIN_FILE}",
    test_file_path=f"{DIRECTORY_PATH}{TEST_FILE}",
    idx_col="id",
    target="target",
    cv_cols_type="all",
    random_state=42,
)

In [None]:
# class Objective(object):
#     def __init__(self, dataframeloader):
#         self.X = dataframeloader.X_cv[1000:2000]
#         self.y = dataframeloader.y[1000:2000]

#     def __call__(self, trial):
#         x, y = self.X, self.y

# #         classifier_name = trial.suggest_categorical("classifier", ["LogisticRegression", "RandomForest"])
# #         if classifier_name == "LogisticRegression":
#         log_reg_solver = trial.suggest_categorical("log_reg_solver",
#                                                    ['liblinear','lbfgs','newton-cg','sag','saga'])
#         classifier_obj = LogisticRegression(
#             solver = log_reg_solver,
#             random_state = 42)
# #         else:
# #             rf_max_depth = trial.suggest_int("rf_max_depth", 2, 32, log=True)
# #             classifier_obj = RandomForestClassifier(
# #                 max_depth = rf_max_depth,
# #                 n_estimators = 10,
# #                 random_state = 42)

#         score = sklearn.model_selection.cross_val_score(classifier_obj, x, y, n_jobs=-1, cv=3)
#         accuracy = score.mean()
#         return accuracy

In [None]:
# # Load the dataset in advance for reusing it each trial execution.
# objective = Objective(sci_ml_pl.dataframeloader)

# # iris = sklearn.datasets.load_iris()
# # objective = Objective(iris)

In [None]:
# study = optuna.create_study(direction="maximize")
# study.optimize(objective, n_trials=100)
# print(study.best_trial)

In [None]:
# import optuna
# import joblib
# import dask.distributed
# import dask_optuna

# def objective(trial):
#     x = trial.suggest_uniform("x", -10, 10)
#     return (x - 2) ** 2

# with dask.distributed.Client() as client:
#     # Create a study using Dask-compatible storage
#     storage = dask_optuna.DaskStorage()
#     study = optuna.create_study(storage=storage)
#     # Optimize in parallel on your Dask cluster
#     with joblib.parallel_backend("dask"):
#         study.optimize(objective, n_trials=100, n_jobs=-1)
#     print(f"best_params = {study.best_params}")

TypeError: Can't instantiate abstract class DaskStorage with abstract methods get_study_directions, set_study_directions, set_trial_values

In [None]:
# Contour plot of the Optuna study.
# The cells that import optuna and create `study` are currently commented out,
# so this cell is guarded to keep "Restart & Run All" from failing with a
# NameError when no study exists.
# NOTE(review): 'classifier' is only suggested in the commented-out part of the
# objective - confirm the params list once the study cells are re-enabled.
if "study" in globals():
    import optuna

    optuna.visualization.plot_contour(study, params=['classifier', 'log_reg_solver']).show()
else:
    print("No Optuna `study` found - skipping the contour plot.")

In [None]:
# Optimization-history plot of the Optuna study.
# The cells that import optuna and create `study` are currently commented out,
# so this cell is guarded to keep "Restart & Run All" from failing with a
# NameError when no study exists.
if "study" in globals():
    import optuna

    optuna.visualization.plot_optimization_history(study).show()
else:
    print("No Optuna `study` found - skipping the optimization history plot.")

In [None]:
# Slice plot of the Optuna study.
# The cells that import optuna and create `study` are currently commented out,
# so this cell is guarded to keep "Restart & Run All" from failing with a
# NameError when no study exists.
# NOTE(review): 'rf_max_depth' is only suggested in the commented-out part of
# the objective - confirm the params list once the study cells are re-enabled.
if "study" in globals():
    import optuna

    optuna.visualization.plot_slice(study, params=['log_reg_solver', 'rf_max_depth']).show()
else:
    print("No Optuna `study` found - skipping the slice plot.")

#### Now create the model with best params from study

In [None]:
# Model configured with the best params from the study.
scikit_model = LogisticRegression(solver='liblinear', random_state=42)

# Wrap the model in a train/validation pipeline. The cells that follow read
# X_train / X_valid / y_train / y_valid from the dataframeloader; a pipeline
# prepared with prepare_data_for_cv leaves those as None, which makes the
# shape check fail with AttributeError.
sci_ml_pl = MLPipeline().prepare_data_for_training(
    train_file_path=DIRECTORY_PATH + TRAIN_FILE,
    test_file_path=DIRECTORY_PATH + TEST_FILE,
    # make sure to use right index and target column
    idx_col="id",
    target="target",
    model=scikit_model,
    random_state=42,
    valid_size=0.2)

In [None]:
# Switch scikit-learn's estimator display to the diagram view so the
# pipeline below is rendered as a diagram.
from sklearn import set_config
set_config(display='diagram')

# Last expression of the cell: displayed by Jupyter.
sci_ml_pl.scikit_pipeline

In [None]:
# Sanity check: print the shape of every train/validation split held by the
# dataframeloader, in the order X_train, X_valid, y_train, y_valid.
for split_name in ("X_train", "X_valid", "y_train", "y_valid"):
    split_frame = getattr(sci_ml_pl.dataframeloader, split_name)
    print(f"{split_name} shape is {split_frame.shape}")


AttributeError: 'NoneType' object has no attribute 'shape'

In [None]:
# Fit the pipeline on the training split, then predict labels and
# positive-class probabilities (column 1) on the validation split.
sci_loader = sci_ml_pl.dataframeloader
sci_pipeline = sci_ml_pl.scikit_pipeline

sci_pipeline.fit(sci_loader.X_train, sci_loader.y_train)
preds = sci_pipeline.predict(sci_loader.X_valid)
preds_probs = sci_pipeline.predict_proba(sci_loader.X_valid)[:, 1]

In [None]:
# Sanity check: shapes of the predicted labels and predicted probabilities.
for label, values in (("preds", preds), ("preds_probs", preds_probs)):
    print(f"{label} shape is {values.shape}")

In [None]:
# Validation metrics: AUC is computed from the probabilities, accuracy from
# the predicted labels.
sci_y_valid = sci_ml_pl.dataframeloader.y_valid
auc = roc_auc_score(sci_y_valid, preds_probs)
acc = accuracy_score(sci_y_valid, preds)

print(f"AUC is : {auc} while Accuracy is : {acc} ")

In the background, the `prepare_data_for_training` and `prepare_data_for_cv` methods load your input data into a Pandas DataFrame and separate X (features) from y (target).

Then the `prepare_data_for_training` method splits the data into the X_train, y_train, X_valid and y_valid DataFrames.
However, the `prepare_data_for_cv` method does not split the data; it leaves the X and y DataFrames whole so that cross-validation can split them internally.

Both methods then preprocess all numerical and categorical data found in these DataFrames using scikit-learn pipelines. They then bundle the preprocessed data with your given model and return an MLPipeline object. This class instance holds the dataframeloader, preprocessor and scikit-learn pipeline instances, so you can call the fit method on X_train and y_train and the predict method on X_valid or X_test.

Please check the detailed documentation and source code for more details.

*NOTE: If you want to customize the data and preprocessing steps, you can do so by using the `DataFrameLoader` and `PreProcessor` classes. Check the detailed documentation for these classes for more options.*

#### Let's Use XGBoost on MLPipeline

*You can also use MLPipeline with an XGBoost model; just make sure to install XGBoost first, following the instructions for your OS.*

*After that, all steps remain the same. Here is an example using XGBClassifier on the same [Kaggle TPS Challenge (Nov 2021) data](https://www.kaggle.com/c/tabular-playground-series-nov-2021/data).*

In [None]:
#!pip install -U xgboost

In [None]:
# Best way to install xgboost if you are on macosx and windows machine is using conda
# !conda install -c conda-forge xgboost

In [None]:
# Hyperparameters passed to the XGBoost classifier.
# The commented-out entries are optional settings (tree depth, GPU training).
xgb_params = dict(
    n_estimators=1000,
    learning_rate=0.01,
    # max_depth=9,
    booster="gbtree",
    eval_metric="auc",
    # tree_method="gpu_hist",
    # predictor="gpu_predictor",
    use_label_encoder=False,
    random_state=42,
)

In [None]:
from xgboost import XGBClassifier

# Instantiate the XGBoost classifier from the hyperparameter dict above.
xgb_model = XGBClassifier(
    **xgb_params
)

In [None]:
# Build the train/validation MLPipeline around the XGBoost model; 20% of the
# training data is held out as the validation split.
# idx_col and target must name the index and label columns of the CSV files.
xgb_ml_pl = MLPipeline().prepare_data_for_training(
    model=xgb_model,
    train_file_path=f"{DIRECTORY_PATH}{TRAIN_FILE}",
    test_file_path=f"{DIRECTORY_PATH}{TEST_FILE}",
    idx_col="id",
    target="target",
    valid_size=0.2,
    random_state=42,
)

In [None]:
# Switch scikit-learn's estimator display to the diagram view so the
# XGBoost pipeline below is rendered as a diagram.
from sklearn import set_config
set_config(display='diagram')

# Last expression of the cell: displayed by Jupyter.
xgb_ml_pl.scikit_pipeline

In [None]:
# Fit the XGBoost pipeline on the training split, then predict labels and
# positive-class probabilities (column 1) on the validation split.
xgb_loader = xgb_ml_pl.dataframeloader
xgb_pipeline = xgb_ml_pl.scikit_pipeline

xgb_pipeline.fit(xgb_loader.X_train, xgb_loader.y_train)
preds = xgb_pipeline.predict(xgb_loader.X_valid)
preds_probs = xgb_pipeline.predict_proba(xgb_loader.X_valid)[:, 1]

In [None]:
# Validation metrics for the XGBoost pipeline: AUC is computed from the
# probabilities, accuracy from the predicted labels.
xgb_y_valid = xgb_ml_pl.dataframeloader.y_valid
auc = roc_auc_score(xgb_y_valid, preds_probs)
acc = accuracy_score(xgb_y_valid, preds)

print(f"AUC is : {auc} while Accuracy is : {acc} ")

#### Let's do Grid Search for Hyperparameter Tuning for Scikit Model on our MLPipeline

In [None]:
# Baseline scikit-learn LogisticRegression model used as the grid-search
# starting point.
scikit_model = LogisticRegression(
    random_state=42,
    solver="liblinear",
)

# Build the cross-validation MLPipeline for that model.
# cv_cols_type accepts: all | num | cat
sci_ml_pl = MLPipeline().prepare_data_for_cv(
    model=scikit_model,
    train_file_path=f"{DIRECTORY_PATH}{TRAIN_FILE}",
    test_file_path=f"{DIRECTORY_PATH}{TEST_FILE}",
    idx_col="id",
    target="target",
    cv_cols_type="all",
    random_state=42,
)

In [None]:
# Switch scikit-learn's estimator display to the diagram view so the
# grid-search pipeline below is rendered as a diagram.
from sklearn import set_config
set_config(display='diagram')

# Last expression of the cell: displayed by Jupyter.
sci_ml_pl.scikit_pipeline

In [None]:
# Hyperparameter grid: only the LogisticRegression solver is searched here.
# The commented-out entries show how preprocessing steps could be tuned too.
param_grid = {
#     "preprocessor__num__imputer__strategy": ["constant", "mean", "median"],
#     "preprocessor__low_cad_cat__imputer__strategy": ["most_frequent", "constant"],
    "model__solver": ["newton-cg", "lbfgs", "liblinear"]
}

# 5-fold grid search scored by ROC AUC.
grid_search = sci_ml_pl.do_grid_search(param_grid=param_grid, cv=5,
                                       scoring='roc_auc')

print("Best params:")
print(grid_search.best_params_)

# 'roc_auc' is a greater-is-better score, so best_score_ is reported as-is.
# Multiplying by -1 is only appropriate for negated error scorers (neg_*) and
# would print a negative AUC here.
print(f"Internal CV Metrics score: {grid_search.best_score_:.3f}")

#### Let's Use K-Fold Training with best params from grid search

In [None]:
# LogisticRegression model configured with the best params from grid search.
scikit_model = LogisticRegression(
    random_state=42,
    solver="liblinear",
)

In [None]:
# Build the k-fold MLPipeline around the scikit-learn model.
sci_ml_pl = MLPipeline().prepare_data_for_k_fold(
    model=scikit_model,
    train_file_path=f"{DIRECTORY_PATH}{TRAIN_FILE}",
    test_file_path=f"{DIRECTORY_PATH}{TEST_FILE}",
    idx_col="id",
    target="target",
    random_state=42,
)

In [None]:
# Switch scikit-learn's estimator display to the diagram view so the
# k-fold pipeline below is rendered as a diagram.
from sklearn import set_config
set_config(display='diagram')

# Last expression of the cell: displayed by Jupyter.
sci_ml_pl.scikit_pipeline

In [None]:
# Peek at the first five rows of the cross-validation feature frame.
sci_ml_pl.dataframeloader.X_cv.head(n=5)

# To inspect the target instead:
# sci_ml_pl.dataframeloader.y.head()

Unnamed: 0_level_0,f0,f1,f2,f3,f4,f5,f6,f7,f8,f9,...,f90,f91,f92,f93,f94,f95,f96,f97,f98,f99
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,0.106643,3.59437,132.804,3.18428,0.081971,1.18859,3.73238,2.26627,2.09959,0.01233,...,0.010739,1.09862,0.013331,-0.011715,0.052759,0.0654,4.21125,1.97877,0.085974,0.240496
1,0.125021,1.67336,76.5336,3.37825,0.0994,5.09366,1.27562,-0.471318,4.54594,0.037706,...,0.135838,3.46017,0.017054,0.124863,0.154064,0.606848,-0.267928,2.57786,-0.020877,0.024719
2,0.03633,1.49747,233.546,2.19435,0.026914,3.12694,5.05687,3.84946,1.80187,0.056995,...,0.11731,4.883,0.085222,0.032396,0.116092,-0.001688,-0.520069,2.14112,0.124464,0.148209
3,-0.014077,0.246,779.967,1.89064,0.006948,1.53112,2.698,4.51733,4.50332,0.123494,...,-0.015347,3.47439,-0.017103,-0.0081,0.062013,0.041193,0.511657,1.9686,0.040017,0.044873
4,-0.003259,3.71542,156.128,2.14772,0.018284,2.09859,4.15492,-0.038236,3.37145,0.034166,...,0.013781,1.91059,-0.042943,0.105616,0.125072,0.037509,1.04379,1.07481,-0.012819,0.072798


In [None]:
# Number of columns the dataframeloader kept as final columns.
n_final_cols = len(sci_ml_pl.dataframeloader.final_cols)
print(n_final_cols)
# To list the column names themselves:
# sci_ml_pl.dataframeloader.final_cols

100


In [None]:
# Run 10-fold training scored with roc_auc_score; the call returns the k-fold
# object and the per-fold metric scores.
k_fold_result = sci_ml_pl.do_k_fold_training(metrics=roc_auc_score, n_splits=10)
k_fold, metrics_score = k_fold_result
print("mean metrics score:", np.mean(metrics_score))

fold: 1 , roc_auc_score: 0.7391650397657262
fold: 2 , roc_auc_score: 0.7326477012689827
fold: 3 , roc_auc_score: 0.7375102681452796
fold: 4 , roc_auc_score: 0.739240317272354
fold: 5 , roc_auc_score: 0.7361418197089848
fold: 6 , roc_auc_score: 0.7378693659707141
fold: 7 , roc_auc_score: 0.7371973875563819
fold: 8 , roc_auc_score: 0.737605489427764
fold: 9 , roc_auc_score: 0.7382445308505611
fold: 10 , roc_auc_score: 0.7378412635681217
mean metrics score: 0.737346318353487


In [None]:
# Mean fold score expressed as a percentage, rounded to two decimals.
mean_score_pct = round(np.mean(metrics_score) * 100, 2)
print(f"Mean metrics_score is : {mean_score_pct}")

Mean metrics_score is : 73.73


#### Create Kaggle Predictions

In [None]:
# Predict with the k-fold object returned by do_k_fold_training.
preds = sci_ml_pl.do_k_fold_prediction(
    k_fold=k_fold,
)

# Sanity check on the prediction shape.
print(preds.shape)

In [None]:
# Load the sample submission, overwrite its target column with the
# predictions and write the Kaggle submission file without the index.
sample_sub_path = f"{DIRECTORY_PATH}{SAMPLE_SUB_FILE}"
sub = pd.read_csv(sample_sub_path).assign(target=preds)
sub.to_csv('submission.csv', index=False)

In [None]:
# hide
# Export this notebook's code with nbdev.
from nbdev.export import notebook2script

notebook2script()