In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
# default_exp utility

In [3]:
# export
from tabular_ml_toolkit.dataframeloader import *
from tabular_ml_toolkit.preprocessor import *
from tabular_ml_toolkit.logger import *
from tabular_ml_toolkit.xgb_optuna_objective import *

In [4]:
# export
# hide

import pandas as pd
import numpy as np
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder, StandardScaler, MinMaxScaler
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.metrics import roc_auc_score, accuracy_score, log_loss, f1_score, precision_score, recall_score
from sklearn.model_selection import cross_val_score, GridSearchCV, StratifiedKFold

# for Optuna
import optuna

#for XGB
import xgboost

#for TabNet
from pytorch_tabnet.tab_model import TabNetClassifier, TabNetRegressor
from pytorch_tabnet.multitask import TabNetMultiTaskClassifier

# for finding n_jobs in all sklearn estimators
from sklearn.utils import all_estimators
import inspect

# Just to compare fit times
import time

# for os specific settings
import os

In [5]:
# export

#helper method to find ideal cpu cores
def find_ideal_cpu_cores():
    """Pick how many CPU cores to use for parallel processing.

    Returns:
        int | None: ``os.cpu_count() - 1`` when more than two cores are
        reported; ``None`` when two or fewer cores are reported or when the
        core count cannot be determined.
    """
    # Query once: os.cpu_count() may return None when the count is
    # undetermined, and comparing None with an int raises TypeError.
    cpu_count = os.cpu_count()
    if cpu_count is not None and cpu_count > 2:
        # Leave one core free for everything else running on the machine.
        ideal_cpu_cores = cpu_count - 1
        logger.info(f"{cpu_count} cores found, model and data parallel processing should worked!")
    else:
        ideal_cpu_cores = None
        logger.info(f"{cpu_count} cores found, model and data parallel processing may NOT worked!")
    return ideal_cpu_cores

#Helper method to find all sklearn estimators with support for parallelism aka n_jobs
def check_has_n_jobs():
    """List estimator names whose constructor signature has an ``n_jobs`` parameter.

    Returns:
        list[str]: always starts with 'XGBRegressor' and 'XGBClassifier',
        followed by the name of every ``(name, class)`` entry from
        ``all_estimators()`` whose signature includes ``n_jobs``.
    """
    parallel_capable = ['XGBRegressor', 'XGBClassifier']
    parallel_capable.extend(
        estimator_name
        for estimator_name, estimator_cls in all_estimators()
        if 'n_jobs' in inspect.signature(estimator_cls).parameters
    )
    return parallel_capable

In [6]:
# #export

# def use_modin():
    
#     #settings for modin
#     import ray
#     ray.init()
#     import os
#     os.environ["MODIN_ENGINE"] = "ray"
#     import modin.pandas as pd

In [7]:
# export

def fetch_tabnet_params_for_problem_type(problem_type):
    """Resolve the TabNet settings that go with a problem type.

    Args:
        problem_type: one of "binary_classification",
            "multi_label_classification", "multi_class_classification"
            or "regression".

    Returns:
        tuple: ``(tabnet_model, val_preds_metrics, eval_metric, direction)``
        where ``tabnet_model`` is the (uninstantiated) TabNet class,
        ``val_preds_metrics`` is the scoring function used on validation
        predictions, ``eval_metric`` is the metric name string and
        ``direction`` is "maximize" or "minimize".

    Raises:
        NotImplementedError: for any other ``problem_type``.
    """
    if problem_type == "binary_classification":
        tabnet_model = TabNetClassifier
        direction = "maximize"
        eval_metric = "auc"
        val_preds_metrics = roc_auc_score

    # NOTE(review): multi-label maps to TabNetClassifier and multi-class to
    # TabNetMultiTaskClassifier -- confirm these two choices are not swapped.
    elif problem_type == "multi_label_classification":
        tabnet_model = TabNetClassifier
        direction = "maximize"
        eval_metric = "auc"
        val_preds_metrics = accuracy_score

    elif problem_type == "multi_class_classification":
        tabnet_model = TabNetMultiTaskClassifier
        direction = "minimize"
        eval_metric = "logloss"
        val_preds_metrics = accuracy_score

    elif problem_type == "regression":
        # The imported class is TabNetRegressor; the previous name
        # `TabNetRegression` does not exist and raised NameError here.
        tabnet_model = TabNetRegressor
        direction = "minimize"
        eval_metric = "rmse"
        val_preds_metrics = mean_absolute_error
    else:
        raise NotImplementedError(f"Unsupported problem_type: {problem_type!r}")

    return tabnet_model, val_preds_metrics, eval_metric, direction

def fetch_xgb_params_for_problem_type(problem_type):
    """Resolve the XGBoost settings that go with a problem type.

    Args:
        problem_type: one of "binary_classification",
            "multi_label_classification", "multi_class_classification"
            or "regression".

    Returns:
        tuple: ``(xgb_model, val_preds_metrics, eval_metric, direction)`` --
        the (uninstantiated) XGBoost class, the scoring function used on
        validation predictions, the metric name string, and "maximize" or
        "minimize".

    Raises:
        NotImplementedError: for any other ``problem_type``.
    """
    # Each branch returns (model class, validation metric, eval metric, direction).
    if problem_type == "binary_classification":
        return xgboost.XGBClassifier, roc_auc_score, "auc", "maximize"

    if problem_type == "multi_label_classification":
        return xgboost.XGBClassifier, accuracy_score, "auc", "maximize"

    if problem_type == "multi_class_classification":
        return xgboost.XGBClassifier, accuracy_score, "mlogloss", "minimize"

    if problem_type == "regression":
        return xgboost.XGBRegressor, mean_absolute_error, "rmse", "minimize"

    raise NotImplementedError

def fetch_skl_params_for_problem_type(problem_type):
    """Resolve the validation metric and direction for a problem type.

    Args:
        problem_type: one of "binary_classification",
            "multi_label_classification", "multi_class_classification"
            or "regression".

    Returns:
        tuple: ``(val_preds_metrics, direction)`` -- the scoring function used
        on validation predictions and "maximize" or "minimize".

    Raises:
        NotImplementedError: for any other ``problem_type``.
    """
    if problem_type == "binary_classification":
        return roc_auc_score, "maximize"

    if problem_type == "multi_label_classification":
        return accuracy_score, "maximize"

    if problem_type == "multi_class_classification":
        # NOTE(review): "minimize" is paired with accuracy_score here --
        # confirm this is intended by whoever consumes `direction`.
        return accuracy_score, "minimize"

    if problem_type == "regression":
        return mean_absolute_error, "minimize"

    raise NotImplementedError


def kfold_dict_mean(kfold_metrics_results):
    """Average per-fold metric dicts key by key.

    Args:
        kfold_metrics_results: sized sequence of dicts, one per fold, mapping
            a metric name to its value for that fold.

    Returns:
        dict: for every key seen in any fold, the sum of that key's values
        divided by the total number of folds. A key missing from some folds
        is therefore still divided by the full fold count.
    """
    averaged = {}
    for fold_metrics in kfold_metrics_results:
        for metric_name, metric_value in fold_metrics.items():
            # Each fold contributes value / n_folds, so the running sum is the mean.
            fold_share = metric_value / len(kfold_metrics_results)
            if metric_name not in averaged:
                averaged[metric_name] = fold_share
            else:
                averaged[metric_name] += fold_share

    return averaged

In [13]:
def clip_splits(train, val, data=None, target_col="target"):
    """Drop split positions whose target class is missing from the other split.

    A class that occurs in only one of the two splits is "outstanding": its
    positions are removed from the split that contains it, so the returned
    train and validation positions cover the same set of target classes.
    Membership is compared as sets in both directions, so splits with the same
    number of classes but different classes are clipped as well.

    Args:
        train: positional (``iloc``) row indices of the training split.
        val: positional (``iloc``) row indices of the validation split.
        data: DataFrame holding the target column. When omitted, the
            notebook-level DataFrame ``temp`` is used, which is what this
            function read before the parameter existed.
        target_col: name of the target column in ``data``.

    Returns:
        tuple: ``(train, val)``. A split that needed clipping is returned as a
        new list; a split that needed no clipping is returned unchanged.
    """
    if data is None:
        # Backward-compatible fallback to the notebook-global frame.
        data = temp

    train_targets = data[target_col].iloc[train]
    val_targets = data[target_col].iloc[val]

    train_classes = set(train_targets.unique())
    val_classes = set(val_targets.unique())

    # Classes present on one side only.
    train_only = train_classes - val_classes
    val_only = val_classes - train_classes

    if train_only:
        keep_mask = ~np.isin(train_targets.to_numpy(), list(train_only))
        train = [position for position, keep in zip(train, keep_mask) if keep]

    if val_only:
        keep_mask = ~np.isin(val_targets.to_numpy(), list(val_only))
        val = [position for position, keep in zip(val, keep_mask) if keep]

    return train, val

In [14]:
# # helper method for updating preprocessor in pipeline

# # to create params value dict from grid_search object
# def get_preprocessor_best_params_from_grid_search(grid_search_object:object):
#     pp_best_params = {}
#     for k in grid_search_object.best_params_:
#         #print(k)
#         if 'preprocessor' in k:
#             key = k.split('__')[1] + "__" + k.split('__')[2] 
#             pp_best_params[key] = grid_search_object.best_params_[k]
#     return pp_best_params

# # helper method for update_model
# def get_model_best_params_from_grid_search(grid_search_object:object):
#     model_best_params = {}
#     for k in grid_search_object.best_params_:
#         #print(k)
#         if 'model' in k:
#             key = k.split('__')[1]
#             model_best_params[key] = grid_search_object.best_params_[k]
#     return model_best_params

In [15]:
# hide
# Run nbdev's notebook2script(); the output below lists each notebook it converted.

from nbdev.export import notebook2script; notebook2script()

Converted 00_dataframeloader.ipynb.
Converted 01_preprocessor.ipynb.
Converted 02_tmlt.ipynb.
Converted 04_xgb_optuna_objective.ipynb.
Converted Kaggle_TPS_Dec_Tutorial.ipynb.
Converted Kaggle_TPS_Nov_Tutorial.ipynb.
Converted index.ipynb.
Converted logger.ipynb.
Converted utility.ipynb.
Converted xgb_tabular_ml_toolkit.ipynb.
