In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
# default_exp utility

In [3]:
# export
from tabular_ml_toolkit.dataframeloader import *
from tabular_ml_toolkit.preprocessor import *
from tabular_ml_toolkit.logger import *
from tabular_ml_toolkit.optuna_objective import *

In [4]:
# export
# hide
import pandas as pd
import numpy as np
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder, StandardScaler, MinMaxScaler
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.metrics import roc_auc_score, accuracy_score, log_loss, f1_score, precision_score, recall_score
from sklearn.model_selection import cross_val_score, GridSearchCV, StratifiedKFold
# for Optuna
import optuna
#for XGB
import xgboost

# for finding n_jobs in all sklearn estimators
from sklearn.utils import all_estimators
import inspect

# Just to compare fit times
import time

# for os specific settings
import os

In [5]:
# export

#helper method to find ideal cpu cores
def find_ideal_cpu_cores():
    """Pick a worker count that leaves one CPU core free.

    Returns:
        int | None: ``os.cpu_count() - 1`` when more than two cores are
        reported, otherwise ``None`` (also when the core count cannot be
        determined).
    """
    # os.cpu_count() returns None when the count is undetermined; query it
    # once so the comparison, the arithmetic and the log message all agree,
    # and so that None never reaches the ``> 2`` comparison (TypeError).
    cpu_count = os.cpu_count()
    if cpu_count is not None and cpu_count > 2:
        ideal_cpu_cores = cpu_count - 1
        logger.info(f"{cpu_count} cores found, model and data parallel processing should worked!")
    else:
        ideal_cpu_cores = None
        logger.info(f"{cpu_count} cores found, model and data parallel processing may NOT worked!")
    return ideal_cpu_cores

#Helper method to find all sklearn estimators with support for parallelism aka n_jobs
def check_has_n_jobs():
    """List estimator names whose constructor accepts an ``n_jobs`` argument.

    The result starts with the two XGBoost names hard-coded here, followed by
    the name of every ``(name, class)`` entry from ``all_estimators()`` whose
    signature exposes an ``n_jobs`` parameter.

    Returns:
        list[str]: estimator names, in the order described above.
    """
    parallel_capable = ['XGBRegressor', 'XGBClassifier']
    parallel_capable.extend(
        name
        for name, estimator in all_estimators()
        if 'n_jobs' in inspect.signature(estimator).parameters
    )
    return parallel_capable

In [None]:
# #export

# def use_modin():
    
#     #settings for modin
#     import ray
#     ray.init()
#     import os
#     os.environ["MODIN_ENGINE"] = "ray"
#     import modin.pandas as pd

In [6]:
# export


# Inspired by the AutoXGB library; thanks to https://github.com/abhishekkrthakur

def fetch_params_for_problem_type(problem_type):
    """Return the XGBoost model class and evaluation settings for a problem type.

    Args:
        problem_type: one of ``"classification"``,
            ``"multi_class_classification"`` or ``"regression"``.

    Returns:
        tuple: ``(xgb_model, val_preds_metrics, eval_metric, direction)`` where
        ``xgb_model`` is the xgboost estimator class, ``val_preds_metrics`` is a
        fresh list of sklearn metric functions, ``eval_metric`` is the metric
        name string and ``direction`` is always ``"minimize"``.

    Raises:
        NotImplementedError: if ``problem_type`` is not one of the supported
            values; the message names the offending value.
    """
    # Every supported eval metric (logloss, mlogloss, rmse) is a loss.
    direction = "minimize"

    if problem_type in ("classification", "multi_class_classification"):
        xgb_model = xgboost.XGBClassifier
        # Binary and multi-class differ only in the eval metric name.
        eval_metric = "logloss" if problem_type == "classification" else "mlogloss"
        val_preds_metrics = [log_loss, roc_auc_score, accuracy_score, f1_score, precision_score, recall_score]
    elif problem_type == "regression":
        xgb_model = xgboost.XGBRegressor
        eval_metric = "rmse"
        val_preds_metrics = [mean_absolute_error, mean_squared_error, r2_score]
    else:
        # Same exception type as before, now with a message the caller can act on.
        raise NotImplementedError(
            f"Unsupported problem_type {problem_type!r}; expected one of: "
            "'classification', 'multi_class_classification', 'regression'"
        )

    return xgb_model, val_preds_metrics, eval_metric, direction

def kfold_dict_mean(kfold_metrics_results):
    """Average per-fold metric dictionaries into a single dictionary.

    Each value is divided by the total number of folds and the shares are
    summed per key, so a key absent from some folds still has its total
    divided by the full fold count.

    Args:
        kfold_metrics_results: sequence of dicts, one per fold, mapping a
            metric name to its value.

    Returns:
        dict: metric name -> accumulated ``value / number_of_folds``.
    """
    fold_count = len(kfold_metrics_results)
    averaged = {}
    for fold_scores in kfold_metrics_results:
        for metric_name, score in fold_scores.items():
            share = score / fold_count
            try:
                averaged[metric_name] += share
            except KeyError:
                # First time this metric is seen: start its running total.
                averaged[metric_name] = share

    return averaged

In [7]:
# Five sample metric dictionaries, each with the same six metric keys.
# NOTE(review): presumably one dict per CV fold, kept as a fixture for trying
# out kfold_dict_mean by hand — nothing in this cell actually calls it; confirm.
test_list = [

{'log_loss': 0.6621403547748923, 'roc_auc_score': 0.6182978723404255, 'accuracy_score': 0.605, 'f1_score': 0.3629032258064516, 'precision_score': 0.5421686746987951, 'recall_score': 0.2727272727272727},

{'log_loss': 0.6640257256105542, 'roc_auc_score': 0.6078916827852998, 'accuracy_score': 0.61875, 'f1_score': 0.38383838383838387, 'precision_score': 0.5757575757575758, 'recall_score': 0.2878787878787879},

{'log_loss': 0.662113243713975, 'roc_auc_score': 0.6047582205029014, 'accuracy_score': 0.6225, 'f1_score': 0.37860082304526754, 'precision_score': 0.5897435897435898, 'recall_score': 0.2787878787878788},

{'log_loss': 0.6577297036349773, 'roc_auc_score': 0.6376876944582225, 'accuracy_score': 0.63375, 'f1_score': 0.40325865580448067, 'precision_score': 0.61875, 'recall_score': 0.2990936555891239},

{'log_loss': 0.6580264708772302, 'roc_auc_score': 0.6333331186106584, 'accuracy_score': 0.60875, 'f1_score': 0.37274549098196397, 'precision_score': 0.5535714285714286, 'recall_score': 0.2809667673716012}

]

[

{'log_loss': 0.6621403547748923, 'roc_auc_score': 0.6182978723404255, 'accuracy_score': 0.605, 'f1_score': 0.3629032258064516, 'precision_score': 0.5421686746987951, 'recall_score': 0.2727272727272727},

{'log_loss': 0.6640257256105542, 'roc_auc_score': 0.6078916827852998, 'accuracy_score': 0.61875, 'f1_score': 0.38383838383838387, 'precision_score': 0.5757575757575758, 'recall_score': 0.2878787878787879},

{'log_loss': 0.662113243713975, 'roc_auc_score': 0.6047582205029014, 'accuracy_score': 0.6225, 'f1_score': 0.37860082304526754, 'precision_score': 0.5897435897435898, 'recall_score': 0.2787878787878788},

{'log_loss': 0.6577297036349773, 'roc_auc_score': 0.6376876944582225, 'accuracy_score': 0.63375, 'f1_score': 0.40325865580448067, 'precision_score': 0.61875, 'recall_score': 0.2990936555891239},

{'log_loss': 0.6580264708772302, 'roc_auc_score': 0.6333331186106584, 'accuracy_score': 0.60875, 'f1_score': 0.37274549098196397, 'precision_score': 0.5535714285714286, 'recall_score': 0.2809667673716012}

] 

In [8]:
# # helper method for updating preprocessor in pipeline

# # to create params value dict from grid_search object
# def get_preprocessor_best_params_from_grid_search(grid_search_object:object):
#     pp_best_params = {}
#     for k in grid_search_object.best_params_:
#         #print(k)
#         if 'preprocessor' in k:
#             key = k.split('__')[1] + "__" + k.split('__')[2] 
#             pp_best_params[key] = grid_search_object.best_params_[k]
#     return pp_best_params

# # helper method for update_model
# def get_model_best_params_from_grid_search(grid_search_object:object):
#     model_best_params = {}
#     for k in grid_search_object.best_params_:
#         #print(k)
#         if 'model' in k:
#             key = k.split('__')[1]
#             model_best_params[key] = grid_search_object.best_params_[k]
#     return model_best_params

In [9]:
# hide
# run the script to build 

from nbdev.export import notebook2script; notebook2script()

Converted 00_dataframeloader.ipynb.
Converted 01_preprocessor.ipynb.
Converted 02_tmlt.ipynb.
Converted 04_optuna_objective.ipynb.
Converted 13_Kaggle_TPS_Tutorial.ipynb.
Converted BAK_index.ipynb.
Converted index.ipynb.
Converted logger.ipynb.
Converted utility.ipynb.
