## import packages

In [None]:
import sys
import time
from pathlib import Path
import os

import pandas as pd
import numpy as np

from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import TargetEncoder
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.metrics import log_loss

from sklearn.linear_model import SGDClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import StackingClassifier
from sklearn.svm import SVC
from sklearn.ensemble import VotingClassifier

from sklearn.model_selection import KFold

from sklearn.model_selection import GridSearchCV

from sklearn.calibration import CalibrationDisplay

import matplotlib.pyplot as plt
import seaborn as sns

import datetime
import pickle 

from sklearn.model_selection import StratifiedKFold

## set up for imports of .py modules by adding path to sys.path

In [None]:
# Put the notebook's working directory on the module search path so that the
# project-local `utils` package imported in the next cell can be resolved.
path = str(Path(os.getcwd()))
print(path)
# insert at position 1 so the existing first entry of sys.path keeps priority
sys.path.insert(1, path)

## import python modules

In [None]:
import utils.sml_utils as sml_utils
import utils.bin_class_utils as class_utils
import utils.glue_old_to_new as gotn
import utils.assign_and_lab_utils as al_utils
import utils.classification_utils as class_utils_2
import utils.reg_model_selection_utils as reg_ms_utils

## helpful functions

In [None]:
print('no functions')  # all helpful functions should go in this cell to preserve the cell count

## parameters

In [None]:
# All user-tunable settings for the notebook live in this cell; later cells only
# read these names.

# path to data
path_to_data = 'second_data_set/synth_2_class_5000_0_5_0_5_15_6_4_1_4_0_0_0_4_w_noms.csv'
task = 'classification'

# step 1 parameters - check for missingness in target vector
target_attr = 'y'

# step 2 parameters
# (none - label binarization in step 2 needs no configuration)

# step 3 parameters - train/test split
test_size = 0.20
train_test_split_random_state = 42

# step 4 parameters - train/validation split
train_validation_split = False  # False = skip step 4 (k-fold cross validation is used instead)
val_size = 0.30
train_validation_split_random_state = 42

# step 4.25 parameters - train/probability calibration split
train_prob_cal_split = True  # used to calibrate classifier probability
train_prob_cal_split_random_state = 42
train_prob_cal_split_size = 0.20

# step 4.5 parameters - train/classification threshold tuning split
train_class_threshold_tune_split = True  # used to set the classification threshold
train_class_threshold_tune_split_random_state = 42
train_class_threshold_tune_split_size = 0.20

# step 5 parameters - identify attributes with missingness above threshold
missingness_threshold = 0.20

# step 11 parameters - build a composite estimator
sgd_class_random_state = 42
target_encoder_random_state = 42
class_weight = 'balanced'
rf_random_state = 42
dtc_stub_random_state = 42
adaboost_random_state = 42
svc_random_state = 42  # NOTE(review): no SVC is built in step 11 of this notebook - appears unused

# step 12 parameters - model selection / model assessment cross validation parameters
scoring = ['neg_log_loss']  # must be in a list even if only one scoring metric
kwargs = {'return_indices': False}  # if true the indices of the cross validation split are returned
max_iter = 1000  # max number of epochs for SGDClassifier

# step 12 parameters - maximal control k-fold cross validation splitter parameters
kfold_n_splits = 10  # number of folds in k-fold cross validation
kfold_shuffle = True
kfold_random_state = 42

# step 14 parameters - tune hyperparameters of short-listed composite estimators
gs_cv_kfold_n_splits = 10
gs_cv_kfold_shuffle = True
gs_cv_kfold_random_state = 24  # deliberately different from kfold_random_state above
show_all_params = True  # True = print every best hyperparameter, not only those searched over more than one value
speed_up = False  # if True random stratified sample taken before GridSearchCV
frac = 0.10  # fraction taken for speed up
spd_up_random_state = 42  # speed up random state

# step 16 parameters - calibrate the classifiers
calibrate_classifiers = True
frac_of_val_for_cal = 0.5  # fraction of the calibration set used to calibrate; the remainder (1 - frac) validates the calibration

# step 18 parameters - model selection - configured for no model selection
# With both flags False the step 18 cell stops the run via sys.exit; set exactly one to True.
script_select = False  # default = False - True if you want to let the script select the model with the lowest log loss out of GridSearchCV
hand_select = False  # default = False - if True you must select a model by hand and identify the model row index in grid_search_cv_results_df
hand_select_index = None # do not change this line
if hand_select:
    # template placeholder: replace None with the chosen row index when hand_select is True
    hand_select_index = None  # default = None - if hand select enter best model index from grid_search_cv_results_df

# step 19 - tune classification threshold
# Leaving this as None makes the step 19 guard cell stop the run via sys.exit.
classification_threshold = None  # default = None

## set up to time script run time

In [None]:
start = time.time()

## read in the data and get the size of the data

In [None]:
df = pd.read_csv(path_to_data)
print(df.shape)
df.head()

## out of pipeline preprocessing

### These operations cannot be completed in the scikit-learn pipeline.

### They should be identified and passed on to the data engineer as tasks conducted during extract/transform/load (ETL) if the model goes to production.

## 1. check for missingness in target vector and dedup

In [None]:
# Rows with a missing target cannot be used for supervised learning, so drop them.
# The shape is printed before and after to show how many rows were removed.
print(df.shape)
df = df.dropna(subset=target_attr)
print(df.shape)

In [None]:
# Remove rows that are exact duplicates across all columns (drop_duplicates defaults);
# the shape is printed before and after to show how many rows were removed.
print(df.shape)
df = df.drop_duplicates()
print(df.shape)

## 2. label binarize

In [None]:
# Only a target that pandas infers to hold strings is passed to the binarizer;
# any other target is left untouched and a notice is printed.
if pd.api.types.infer_dtype(df[target_attr]) != 'string':
    print('df[target_attr] is not a string attribute')
else:
    df, le_name_mapping = class_utils.label_binarize_binary(df, target_attr, print_results=True)

## 3. train/test split

In [None]:
# Stratified train/test split. Only the train design matrix and train target are
# returned to the notebook.
# NOTE(review): presumably the helper persists the held-out test set itself (the
# notebook never receives it) - confirm in sml_utils.perform_the_train_test_split.
train_cap_x_df, train_y_df = \
    sml_utils.perform_the_train_test_split(
    df, 
    test_size, 
    train_test_split_random_state, 
    val=False,
    stratify=True
)

## 4. train/validation split

In [None]:
# Optional validation split carved out of the current train set (off by default).
# The helper takes one frame, so features and target are concatenated again first.
if train_validation_split:
    train_cap_x_df, train_y_df = \
        sml_utils.perform_the_train_test_split(
            pd.concat([train_cap_x_df, train_y_df], axis=1), 
            val_size, 
            train_validation_split_random_state, 
            val=True,
            stratify=True
    )

## 4.25 train / probability calibration split

In [None]:
# Carve a probability-calibration set out of the current train set.
# NOTE(review): step 16 reads 'prob_cal_set_df.csv' from disk; presumably that file is
# written by this helper call when prob_cal=True - confirm in sml_utils.
if train_prob_cal_split:
    train_cap_x_df, train_y_df = \
        sml_utils.perform_the_train_test_split(
            pd.concat([train_cap_x_df, train_y_df], axis=1), 
            train_prob_cal_split_size, 
            train_prob_cal_split_random_state, 
            prob_cal=True,
            stratify=True
    )

## 4.5 train / classification threshold tuning split

In [None]:
# Carve a classification-threshold tuning set out of the current train set.
# NOTE(review): step 19 reads 'class_thresh_set_df.csv' from disk; presumably that file
# is written by this helper call when classification_threshold=True - confirm in sml_utils.
if train_class_threshold_tune_split:
    train_cap_x_df, train_y_df = \
        sml_utils.perform_the_train_test_split(
            pd.concat([train_cap_x_df, train_y_df], axis=1), 
            train_class_threshold_tune_split_size, 
            train_class_threshold_tune_split_random_state, 
            classification_threshold=True,
            stratify=True
    )

## 5. identify attributes with  missingness above threshold

In [None]:
# Missingness is assessed on the train set only; the resulting drop list feeds
# ml_ignore_list in step 8.
return_dict = sml_utils.get_missingness(train_cap_x_df, missingness_threshold)
missingness_drop_list = return_dict['missingness_drop_list']

## 6. identify non machine learning attributes

In [None]:
sml_utils.check_for_complete_unique_attrs(train_cap_x_df)

In [None]:
non_ml_attr_list = ['attr_0']

## 7. identify attributes to exclude from machine learning

In [None]:
train_cap_x_df.columns

In [None]:
ml_attr_drop_list = []

## 8. establish machine learning attribute configuration

In [None]:
ml_ignore_list = missingness_drop_list + non_ml_attr_list + ml_attr_drop_list
ml_ignore_list

In [None]:
train_cap_x_df.columns

In [None]:
df.dtypes

In [None]:
# identify the remaining numerical attributes to be used in machine learning and enter them into the
# numerical_attr list below.

numerical_attr = ['attr_1', 'attr_2', 'attr_4', 'attr_7', 'attr_8', 'attr_9', 'attr_11', 'attr_13', 'attr_15']

# identify the remaining nominal attributes to be used in machine learning and enter them into the
# nominal_attr list below.

nominal_attr = ['attr_3', 'attr_6', 'attr_10', 'attr_12', 'attr_14']

# got them all? every column of train_cap_x_df must appear in exactly one of the three lists.
# A count-only check passes when a name is mistyped but the totals still match, so the names
# are compared too; a count mismatch with no name mismatch means an attribute is listed twice.
_configured_attrs = ml_ignore_list + nominal_attr + numerical_attr
_unassigned_attrs = [attr for attr in train_cap_x_df.columns if attr not in _configured_attrs]
_unknown_attrs = [attr for attr in _configured_attrs if attr not in train_cap_x_df.columns]
if _unassigned_attrs or _unknown_attrs or len(_configured_attrs) != train_cap_x_df.shape[1]:
    raise ValueError(
        f'attribute configuration does not match train_cap_x_df: '
        f'columns not assigned to any list: {_unassigned_attrs}; '
        f'configured attributes that are not columns: {_unknown_attrs}; '
        f'configured count: {len(_configured_attrs)}; column count: {train_cap_x_df.shape[1]}'
    )

print(f'ml_ignore_list: {ml_ignore_list}')
print(f'\nnumerical_attr: {numerical_attr}')
print(f'nominal_attr: {nominal_attr}')

print(f'\nnumber of machine learning attributes: {len(numerical_attr) + len(nominal_attr)}')
print(f'\nnumerical_attr and nominal_attr: {numerical_attr + nominal_attr}')

## 9. assess target attribute imbalance

In [None]:
train_y_df[target_attr].unique().tolist()

In [None]:
train_y_df[target_attr].value_counts(normalize=True)

## 10. steps to deal with target attribute imbalance if required

To be completed later.

## 11. build default composite estimators

### build a dictionary of default estimators

In [None]:
# Candidate classifiers as (name, estimator) pairs. Most arguments restate values the
# author marks as defaults; a trailing comment such as "# None" records the value that
# was replaced. estimator_dict below picks which candidates are surveyed and tuned.
estimators_list = [

    ('SGDClassifier', SGDClassifier(
        loss='log_loss',  # 'hinge'
        penalty='l2', 
        alpha=0.0001, 
        l1_ratio=0.15, 
        fit_intercept=True, 
        max_iter=max_iter, 
        tol=0.001, 
        shuffle=True, 
        verbose=0, 
        epsilon=0.1, 
        n_jobs=-1,  # None
        random_state=sgd_class_random_state,  # None
        learning_rate='optimal', 
        eta0=0.0, 
        power_t=0.5, 
        early_stopping=True,  # False,
        validation_fraction=0.1, 
        n_iter_no_change=5, 
        class_weight=class_weight,  # None
        warm_start=False, 
        average=False)
    ),

    ('RandomForestClassifier', RandomForestClassifier(
        n_estimators=100,
        criterion='gini', 
        max_depth=None, 
        min_samples_split=2, 
        min_samples_leaf=1, 
        min_weight_fraction_leaf=0.0, 
        max_features='sqrt', 
        max_leaf_nodes=None, 
        min_impurity_decrease=0.0, 
        bootstrap=True, 
        oob_score=False, 
        n_jobs=-1,  # None, 
        random_state=rf_random_state,  # None
        verbose=0, 
        warm_start=False, 
        class_weight=class_weight,  # None
        ccp_alpha=0.0, 
        max_samples=None, 
        monotonic_cst=None)
    ),

    # AdaBoost over depth-1 decision trees (max_depth=1 below)
    # NOTE(review): the `algorithm` argument is deprecated in recent scikit-learn
    # releases - confirm against the installed version before upgrading.
    ('AdaBoostClassifier', AdaBoostClassifier(
        estimator=DecisionTreeClassifier(
            criterion='gini', 
            splitter='best', 
            max_depth=1,  # None
            min_samples_split=2, 
            min_samples_leaf=1, 
            min_weight_fraction_leaf=0.0, 
            max_features=None, 
            random_state=dtc_stub_random_state,  # None
            max_leaf_nodes=None, 
            min_impurity_decrease=0.0, 
            class_weight=class_weight, 
            ccp_alpha=0.0, 
            monotonic_cst=None
        ),
        n_estimators=50, 
        learning_rate=1.0, 
        algorithm='SAMME',
        random_state=adaboost_random_state)  # None
    )

]

# Estimators that actually enter the survey and the grid search. Uncomment an entry to
# include it; the keys must match the keys of param_grids defined in step 14.
estimator_dict = {
    
    estimators_list[0][0]: estimators_list[0][1],  # SGDClassifier

    # estimators_list[1][0]: estimators_list[1][1],  # RandomForestClassifier
    
    # estimators_list[2][0]: estimators_list[2][1],  # AdaBoostClassifier

    # 'VotingClassifier': VotingClassifier(
    #     estimators=[
    #         (estimators_list[0][0], estimators_list[0][1]),  # SGDClassifier
    #         (estimators_list[1][0], estimators_list[1][1]),  # RandomForestClassifier
    #         (estimators_list[2][0], estimators_list[2][1])  # AdaBoostClassifier
    #     ],
    #     voting='soft',  # 'hard'
    #     weights=[1.0, 1.0, 1.0],  # None, 
    #     n_jobs=-1,  # None
    #     flatten_transform=True, 
    #     verbose=False
    # )
}

### build a preprocessing pipeline

In [None]:
# Numerical branch: impute missing values (SimpleImputer defaults; the strategy is
# searched over in step 14), then standardize.
numerical_transformer = Pipeline(
    steps=[("imputer", SimpleImputer()), ("scaler", StandardScaler())]
)

In [None]:
# Nominal branch: fill gaps with the most frequent category, target-encode the
# categories (binary target, internal 5-fold shuffled cross fitting), then standardize.
nominal_transformer = Pipeline(
    steps=[
        ("imputer", SimpleImputer(strategy='most_frequent')),
        (
            "target_encoder",
            TargetEncoder(
                categories='auto',
                target_type='binary',
                smooth='auto',
                cv=5,
                shuffle=True,
                random_state=target_encoder_random_state,
            ),
        ),
        ("scaler", StandardScaler()),
    ]
)

In [None]:
# Route each attribute group through its own branch. Columns named in neither list
# are not passed to the estimator (ColumnTransformer's default remainder handling).
preprocessor = ColumnTransformer(
    transformers=[
        ('nominal', nominal_transformer, nominal_attr),
        ('numerical', numerical_transformer, numerical_attr),
    ]
)

## 12. survey (fit and evaluate with cost function and ranking metrics)  default composite estimators

### survey candidate default models by fitting them on the whole train set

In [None]:
# Fit every candidate in estimator_dict (with the shared preprocessor) on the whole
# train set; the fitted estimators are reused by the cross validation cells below.
return_dict = sml_utils.model_survey_fit(preprocessor, estimator_dict, train_cap_x_df, train_y_df)
trained_estimator_dict = return_dict['trained_estimator_dict']

### estimate the test error rate using k-fold cross validation - use StratifiedKFold splitter

In [None]:
# stratified k-fold splitter configured from the step 12 parameters
splitter = StratifiedKFold(n_splits=kfold_n_splits, shuffle=kfold_shuffle, random_state=kfold_random_state)
splitter.get_n_splits(train_cap_x_df, train_y_df[target_attr])

# cross validate the default composite estimators on the cost function in `scoring`
sml_utils.model_survey_cross_val_and_analysis(
    preprocessor=preprocessor,
    estimator_dict=estimator_dict,
    train_cap_x_df=train_cap_x_df,
    train_y_df=train_y_df,
    scoring=scoring,
    splitter=splitter,
    target_attr=target_attr,
    trained_estimator_dict=trained_estimator_dict,
    task=task,
    **kwargs
)

### get the ranking metrics

In [None]:
# Rebuild the same stratified k-fold splitter (same seed, so the same folds as the
# cost-function survey above) and score the default estimators on ranking metrics.
# get the maximal control k-fold cross validation splitter
splitter = StratifiedKFold(
    n_splits=kfold_n_splits,
    shuffle=kfold_shuffle,
    random_state=kfold_random_state
)
# the return value (the number of folds) is not used here
splitter.get_n_splits(train_cap_x_df, train_y_df[target_attr])

# perform cross validation on models
sml_utils.model_survey_cross_val_and_analysis(
    preprocessor=preprocessor, 
    estimator_dict=estimator_dict, 
    train_cap_x_df=train_cap_x_df, 
    train_y_df=train_y_df, 
    scoring=['average_precision', 'roc_auc'], 
    splitter=splitter, 
    target_attr=target_attr, 
    trained_estimator_dict=trained_estimator_dict, 
    task=task,
    **kwargs
)

## 13. short list default composite estimators

### We are going to promote all models.

In [None]:
estimator_dict.keys()

In [None]:
# del estimator_dict['ElasticNet']

# estimator_dict

## 14. tune hyperparameters of short-listed composite estimators

### demonstrate the numpy logspace function

In [None]:
list(np.logspace(0.7, 2, num=20))

### demonstrate the numpy arange function

In [None]:
list(np.arange(0.0, 1.1, step=0.1))

### demonstrate python range function

In [None]:
list(range(5, 96, 15))

### set up the hyperparameter space for the grid search

In [None]:
# Hyperparameter search space for GridSearchCV, one grid per estimator name.
#
# Every value must be a NON-EMPTY list: scikit-learn rejects an empty list for any
# parameter when the grid is validated, so a placeholder `[]` stops the grid search
# before any model is fitted. The values below are small starter grids; widen or
# shift them after inspecting the flexibility plots.

# preprocessing choices shared by every estimator's grid
preproc_param_grid = {
    'preprocessor__numerical__imputer__strategy': ['mean', 'median'],
    'preprocessor__nominal__target_encoder__smooth': ['auto']
}

sgd_classifier_param_grid = preproc_param_grid | {

    'estimator__penalty': ['l2', 'l1', 'elasticnet'],
    'estimator__alpha': [0.0001, 0.001, 0.01],
    'estimator__l1_ratio': [0.15, 0.5]  # only has an effect when penalty='elasticnet'

}

rf_classifier_param_grid = preproc_param_grid | {

    'estimator__n_estimators': [100],
    'estimator__max_depth': [5, 10, None],
    'estimator__min_samples_leaf': [1, 5],
    'estimator__max_features': ['sqrt'],
    'estimator__max_samples': [None]

}

adaboost_classifier_param_grid = preproc_param_grid | {

    'estimator__estimator__max_depth': [1, 2],
    'estimator__estimator__min_samples_leaf': [1],
    'estimator__n_estimators': [50, 100],
    'estimator__learning_rate': [0.5, 1.0]

}

# The voting grid is the product over all three members, so each hyperparameter is
# pinned to the single value used when the member was built in step 11; add values
# sparingly to keep the number of fits manageable.
voting_classifier_param_grid = preproc_param_grid | {

    'estimator__SGDClassifier__penalty': ['l2'],
    'estimator__SGDClassifier__alpha': [0.0001],
    'estimator__SGDClassifier__l1_ratio': [0.15],

    'estimator__RandomForestClassifier__n_estimators': [100],
    'estimator__RandomForestClassifier__max_depth': [None],
    'estimator__RandomForestClassifier__min_samples_leaf': [1],
    'estimator__RandomForestClassifier__max_features': ['sqrt'],
    'estimator__RandomForestClassifier__max_samples': [None],

    'estimator__AdaBoostClassifier__estimator__max_depth': [1],
    'estimator__AdaBoostClassifier__estimator__min_samples_leaf': [1],
    'estimator__AdaBoostClassifier__n_estimators': [50],
    'estimator__AdaBoostClassifier__learning_rate': [1.0]
}

# keyed by the estimator names used in estimator_dict
param_grids = {
    'SGDClassifier': sgd_classifier_param_grid,
    'RandomForestClassifier': rf_classifier_param_grid,
    'AdaBoostClassifier': adaboost_classifier_param_grid,
    'VotingClassifier': voting_classifier_param_grid
}

### perform the grid search cross validation

In [None]:
# Tune every short-listed estimator with GridSearchCV over its grid in param_grids.
# The fitted GridSearchCV objects are kept in tuned_estimator_dict, keyed by estimator name.

# get the maximal control k-fold cross validation splitter
splitter = StratifiedKFold(
    n_splits=gs_cv_kfold_n_splits,
    shuffle=gs_cv_kfold_shuffle,
    random_state=gs_cv_kfold_random_state
)
# the return value (the number of folds) is not used here
splitter.get_n_splits(train_cap_x_df, train_y_df[target_attr])

# collect grid search cv results here
tuned_estimator_dict = {}

# NOTE: speed_up REPLACES train_cap_x_df / train_y_df with the sample, so every later
# cell in the notebook also works on the reduced train set.
if speed_up:
    print(f'before speed_up train_cap_x_df.shape: {train_cap_x_df.shape}')
    print(f'before speed_up train_y_df.value_counts():\n{train_y_df.value_counts()}')
    train_cap_x_df, train_y_df = sml_utils.sample_data_objects_for_speed_up(train_cap_x_df, train_y_df, frac=frac, 
                                                                            random_state=spd_up_random_state)
    print(f'\nafter speed_up train_cap_x_df.shape: {train_cap_x_df.shape}')
    print(f'after speed_up train_y_df.value_counts():\n{train_y_df.value_counts()}')

for estimator_name, estimator in estimator_dict.items():
    
    print(f'\n{estimator_name}')

    # the step names 'preprocessor' and 'estimator' are the prefixes used by the
    # keys of the parameter grids
    composite_estimator = \
    Pipeline(steps=[
            ('preprocessor', preprocessor),
            ('estimator', estimator)
        ]
    )
    
    # param_grids must contain an entry for every key of estimator_dict
    grid_search_cv = GridSearchCV(
        estimator=composite_estimator, 
        param_grid=param_grids[estimator_name], 
        scoring=scoring,  # in preparation for multi metric evaluation scoring variable is a list
        n_jobs=None, 
        refit=scoring[0],  # when scoring is a list we must specify which scoring method is used for the refit
        cv=splitter, 
        verbose=0, 
        pre_dispatch='2*n_jobs', 
        error_score=np.nan, 
        return_train_score=True
    )
    gs_start = time.time()
    # ravel() turns the single-column target frame into the 1-d array expected by fit
    grid_search_cv.fit(train_cap_x_df, train_y_df.values.ravel())
    gs_end = time.time()
    print(f'   GridSearchCV run time for {estimator_name}: {(gs_end - gs_start)/60:.3f} minutes')
        
    tuned_estimator_dict[estimator_name] = grid_search_cv

### check out the flexibility plots of the grid search cross validation

In [None]:
# Draw one flexibility plot per tuned estimator and gather the per-estimator result
# frames into a single survey table, gs_survey_results_df.
#
# The frames are collected in a list and concatenated once: growing a DataFrame with
# pd.concat inside the loop re-copies all earlier rows on every iteration and starts
# from an empty frame, which pandas has deprecated as a concat input.
results_frames = []
for estimator_name, grid_search_cv in tuned_estimator_dict.items():
    print('\n\n')
    return_dict = sml_utils.plot_flexibility(
        grid_search_cv=grid_search_cv,
        estimator_name=estimator_name,
        scoring=scoring
    )
    # assign() returns a new frame, so the frame owned by the helper is not mutated
    results_df = return_dict['results_df'].assign(estimator_name=estimator_name)
    # move estimator_name to the first column
    results_df = results_df[['estimator_name'] + [attr for attr in results_df if attr != 'estimator_name']]
    results_frames.append(results_df)

gs_survey_results_df = pd.concat(results_frames, axis=0) if results_frames else pd.DataFrame()

gs_survey_results_df = gs_survey_results_df.sort_values(['score', 'best_test_score'])
gs_survey_results_df = gs_survey_results_df.reset_index(drop=True)
gs_survey_results_df

### alternative model selection - use gs_survey_results_df to pick an alternative model from the grid search and add it to tuned_estimator_dict

In [None]:
# Optional: pick an alternative (non-best) grid point for one estimator and, if
# requested, add it to the flow. Both switches are off by default, so this cell does
# nothing unless explore_alt_model is set to True.
explore_alt_model = False
add_alternative_model_to_flow = False

if explore_alt_model:
    
    # IPython magics: reload edited utility modules without restarting the kernel
    # (this cell only runs inside IPython/Jupyter)
    %load_ext autoreload
    %autoreload 2
    
    return_dict = reg_ms_utils.alternative_model_selection(
        trained_estimator_dict, 
        tuned_estimator_dict,
        train_cap_x_df, 
        train_y_df,
        gs_survey_results_df,
        scoring,
        param_grids,
        demo_reg_flow=False,  # True = demo flow and False = develop algo to get alternative model
        estimator='AdaBoostClassifier',  # the estimator you want an alternative choice for
        frac_count=0.2,  # frac_count * number of grid points = how many gridpoints in a row the train and test bands are separated
        num_std=1.0,  # specifies the width of the train and test bands in units of std dev on flex plot
        man_flex_plot_index=250, # None,  # None  # = None means algo used - if integer then you must visually inspect flex plot and specify flex_index to get the hyperparameters of the model you want
        add_alternative_model_to_flow=add_alternative_model_to_flow
    )

    # adopt the helper's updated objects only when the alternative model should join the flow
    if add_alternative_model_to_flow:
        tuned_estimator_dict = return_dict['tuned_estimator_dict']
        gs_survey_results_df = return_dict['gs_survey_results_df']
        param_grids = return_dict['param_grids']

In [None]:
gs_survey_results_df

In [None]:
# gs_survey_results_df = gs_survey_results_df.loc[gs_survey_results_df.estimator_name != 'AlternativeRandomForestClassifier', :]
# gs_survey_results_df

## 15. evaluate (cost function and ranking metrics) tuned composite estimators

### use cross validation for model selection

In [None]:
# stratified k-fold splitter configured from the step 12 parameters
splitter = StratifiedKFold(n_splits=kfold_n_splits, shuffle=kfold_shuffle, random_state=kfold_random_state)
splitter.get_n_splits(train_cap_x_df, train_y_df[target_attr])

# cross validate the tuned composite estimators on the cost function in `scoring`
sml_utils.model_tuning_cross_val_and_analysis(
    tuned_estimator_dict=tuned_estimator_dict,
    train_cap_x_df=train_cap_x_df,
    train_y_df=train_y_df,
    scoring=scoring,
    splitter=splitter,
    target_attr=target_attr,
    task=task
)

### get the ranking metrics

In [None]:
# Score the tuned estimators on the ranking metrics using the same folds as above.
# The grouped scores are kept because the hand-off to the older code base (the cell
# that builds grid_search_cv_results_df) needs them.

# get the maximal control k-fold cross validation splitter
splitter = StratifiedKFold(
    n_splits=kfold_n_splits,
    shuffle=kfold_shuffle,
    random_state=kfold_random_state
)
# the return value (the number of folds) is not used here
splitter.get_n_splits(train_cap_x_df, train_y_df[target_attr])

# perform cross validation on models
return_dict = sml_utils.model_tuning_cross_val_and_analysis(
    tuned_estimator_dict=tuned_estimator_dict, 
    train_cap_x_df=train_cap_x_df, 
    train_y_df=train_y_df, 
    scoring=['average_precision', 'roc_auc'], 
    splitter=splitter, 
    target_attr=target_attr,
    task=task,
    return_=True
)
tuned_model_ranking_cv_scores_grouped_df = return_dict['cv_scores_grouped_df']

### check out the best estimator hyperparameters for each estimator

In [None]:
# Print the best hyperparameter combination found by GridSearchCV for every row of
# gs_survey_results_df.
print('best hyperparameters for each estimator\n')

for index, row in gs_survey_results_df.iterrows():

    # The f-string is double-quoted because its replacement fields contain single
    # quotes; reusing the outer quote character inside a replacement field is a
    # SyntaxError on Python versions before 3.12.
    print(f"\nestimator_name: {row['estimator_name']}; score: {row['score']}")

    param_grids_ = param_grids[row['estimator_name']]
    for hyperparameter_name, hyperparameter_value in row['grid_search_cv'].best_params_.items():

        # show_all_params prints everything; otherwise only hyperparameters that were
        # actually searched over (more than one candidate value) are printed
        if show_all_params or len(param_grids_[hyperparameter_name]) > 1:
            print(f'   hyperparameter_name: {hyperparameter_name}; hyperparameter_value: {hyperparameter_value}')

In [None]:
# get the minimum test score from the GridSearchCV
# NOTE(review): taking the minimum assumes best_test_score is stored as a loss (lower is
# better) rather than the raw neg_log_loss score - confirm in sml_utils.plot_flexibility.
min_test_score = gs_survey_results_df['best_test_score'].min()
print(f'\nMinimum test score from GridSearchCV: {min_test_score}')

# rows that attain the minimum; the first one wins if several tie
best_rows = gs_survey_results_df.loc[gs_survey_results_df['best_test_score'] == min_test_score]

# get the name of the estimator with the minimum test score
estimator_name = best_rows['estimator_name'].iloc[0]
print(f'\nThe estimator with minimum test score from GridSearchCV is considered the best model. It is: {estimator_name}')

# get the best estimator (refit on the whole train set by GridSearchCV)
best_model_name = best_rows['estimator_name'].values[0]
best_model = best_rows['grid_search_cv'].values[0].best_estimator_

print('\nNote that the best estimator from a GridSearchCV is not necessarily the best model.\n'
      '\nThe best model will come from a GridSearchCV that was conducted with a parameter grid\n'
      'that covered the range of parameters to give the best model.\n'
      '\nTypically several GridSearchCV runs are required to assure yourself the the range of\n'
      'parameters can give the best model')

### evaluate tuned composite estimators

In [None]:
# Log loss of best_model on the data it was trained on. This is a training error,
# not an estimate of the test error.
log_loss_best_model_on_train_set = log_loss(train_y_df, best_model.predict_proba(train_cap_x_df))

print(f'best_model is the trained estimator that performed the best in GridSearchCV.\n'
      f'\nIt was trained on the whole train set using the hyperparameter combination\n'
      f'that gave the lowest estimate of test error rate in cross validation.\n'
      f'\nThe log loss of the best_model when prediction on the whole train set is {log_loss_best_model_on_train_set}.')

## beyond this point we will start using portions of an older code base, so this cell is dedicated to meeting the requirements of that code base

In [None]:
grid_search_cv_results_df = gotn.prep_gs_survey_results_df_for_calibration(gs_survey_results_df, tuned_model_ranking_cv_scores_grouped_df)
grid_search_cv_results_df

## 16. calibrate classifier if required

In [None]:
# Calibrate the predicted probabilities of the tuned estimators on data that was held
# out of training (the probability calibration set from step 4.25), validate the
# calibration on the remainder of that set, and add the calibrated estimators to
# grid_search_cv_results_df.
if calibrate_classifiers:
    
    # class_eval_dict:
    #    key = name of function in classification_utils.py
    #    value = [bool, function kwargs]  bool = True then call function
    print_plots = False
    binary = True
    class_eval_dict={
        'binary': binary,
        'scoring': 'average_precision',
        'get_precision_recall_curves': [True, 
                                            {
                                                'print_prc': print_plots, 
                                                'print_prd': print_plots,
                                            }
                                       ],
        'get_roc_curve': [True, 
                              {
                                  'print_roc': print_plots,
                              }
                         ]
    }

    # load the data for probability calibration; the last column is the target
    prob_cal_set_df = pd.read_csv('prob_cal_set_df.csv').set_index(keys='index')
    prob_cal_set_df.index.name = None
    prob_cal_set_cap_x_df, prob_cal_set_y_df = prob_cal_set_df.iloc[:, :-1], prob_cal_set_df.iloc[:, -1].to_frame()
    # the first cal_count rows calibrate; the remaining rows validate the calibration
    cal_count = int(frac_of_val_for_cal * prob_cal_set_cap_x_df.shape[0])
    cal_val_count = prob_cal_set_cap_x_df.shape[0] - cal_count

    # calibrate the probability
    cal_grid_search_cv_results_df = al_utils.calibrate_estimators(
        estimator_names=grid_search_cv_results_df.estimator.to_list(),  # list(tuned_estimator_dict.keys()), 
        grid_search_cv_results_df=grid_search_cv_results_df,
        cal_cap_x_df=prob_cal_set_cap_x_df.iloc[:cal_count,:],  # calibrate on unseen data
        cal_y_df=prob_cal_set_y_df.iloc[:cal_count,:],  # calibrate on unseen data
        calibration_data_set_name='first half of prob_cal_set_df', 
        validation_cap_x_df=prob_cal_set_cap_x_df.iloc[cal_count:,:],  # use unseen data to validate calibration 
        validation_y_df=prob_cal_set_y_df.iloc[cal_count:,:],  # use unseen data to validate calibration 
        validation_data_set_name='second half of prob_cal_set_df',
        class_eval_dict=class_eval_dict,
        model_selection_stage='tuned', 
        method='isotonic',  # 'sigmoid' or 'isotonic'
        ensemble=True
    )

    # clean up estimator names
    estimator_names = al_utils.get_estimator_names_helper(
        grid_search_cv_results_df, 
        cal_grid_search_cv_results_df
    )

    # add probability calibration data to the data frame
    grid_search_cv_results_df = pd.concat([cal_grid_search_cv_results_df, grid_search_cv_results_df], axis=0)

    # The calibration frames exist only on this branch, so they are released here;
    # deleting them outside the `if` raises NameError when calibration is switched off.
    del prob_cal_set_df, prob_cal_set_cap_x_df, prob_cal_set_y_df

else:
    # Step 19 needs estimator_names even when calibration is skipped.
    # NOTE(review): assumes the uncalibrated names in grid_search_cv_results_df are what
    # the step 19 helpers expect - confirm against al_utils.get_estimator_names_helper.
    estimator_names = grid_search_cv_results_df.estimator.to_list()

grid_search_cv_results_df

## the resampling method in old code base is validation data set method. in the new code base we use k-fold cross validation. we will replace the numbers from the old method with numbers from the new method for consistency

In [None]:
# Recompute average precision and ROC AUC for every row of grid_search_cv_results_df
# (the calibrated estimators too, when calibration ran) with k-fold cross validation,
# then write those numbers into the frame.
# NOTE(review): the dict passed here maps names to the 'best_estimator' column, whereas
# the earlier call passed GridSearchCV objects - presumably the helper accepts both; confirm.

# get the maximal control k-fold cross validation splitter
splitter = StratifiedKFold(
    n_splits=kfold_n_splits,
    shuffle=kfold_shuffle,
    random_state=kfold_random_state
)
# the return value (the number of folds) is not used here
splitter.get_n_splits(train_cap_x_df, train_y_df[target_attr])

# perform cross validation on models
return_dict = sml_utils.model_tuning_cross_val_and_analysis(
    tuned_estimator_dict=dict(zip(grid_search_cv_results_df.estimator, grid_search_cv_results_df.best_estimator)),
    train_cap_x_df=train_cap_x_df, 
    train_y_df=train_y_df, 
    scoring=['average_precision', 'roc_auc'], 
    splitter=splitter, 
    target_attr=target_attr,
    task=task,
    return_=True
)
tuned_model_ranking_cv_scores_grouped_df = return_dict['cv_scores_grouped_df']

grid_search_cv_results_df = gotn.add_ave_precision_and_roc_auc(grid_search_cv_results_df, tuned_model_ranking_cv_scores_grouped_df)
grid_search_cv_results_df

## 17. check for false discoveries

### shuffle the target and do cross validation to understand if we have a real or false discovery

In [None]:
# Shuffle the target (fixed seed) and cross validate again; this reuses the `splitter`
# built in the previous code cell and the cost function in `scoring`.
sml_utils.check_for_false_discoveries(
    tuned_estimator_dict, 
    train_cap_x_df, 
    train_y_df, 
    scoring, 
    splitter, 
    target_attr, 
    shuffle_target=True,
    shuffle_target_random_state=42, 
    gs_survey_results_df=gs_survey_results_df,
    task=task
)

## 18. select a model

In [None]:
composite_estimator = None

### You can let the script select the best model by setting composite_estimator = best_model below. This will select the model with the lowest log loss coming out of GridSearchCV. This is done in cell 43.

### Or you can hand select the model you want to promote from the list of models below. 

### This is helpful if you want to select a model based on average precision or roc auc. It is also helpful if you want to select a model that has had its probability calibrated.

In [None]:
grid_search_cv_results_df

In [None]:
print(f'currently the best model name based on log loss coming out of GridSearchCV is {best_model_name}')

In [None]:
# Resolve which model is promoted:
#   script_select -> keep best_model / best_model_name chosen from GridSearchCV
#   hand_select   -> take the row hand_select_index of grid_search_cv_results_df
#   neither       -> stop the run and ask for a selection mode
if script_select:
    print('you have opted for using the best_model and best_model_name assigned in cell 47 using log loss of uncalibrated '
          'model')
elif not hand_select:
    sys.exit('go to cell 5 lines 65 to 69 to set up model selection')
elif hand_select_index is None or hand_select_index not in grid_search_cv_results_df.index:
    sys.exit(f'{hand_select_index} is not a valid index - go to cell 5 line 69 and enter a valid grid_search_cv_results_df'
             f' index to indicate the model you want to select')
else:
    best_model_name = grid_search_cv_results_df.loc[hand_select_index, 'estimator']
    best_model = grid_search_cv_results_df.loc[hand_select_index, 'best_estimator']

print(f'\nthe best model name is {best_model_name}')

## 19. tune classification threshold

### scan over classification thresholds and pick the threshold that minimizes the most costly errors - recall or precision

In [None]:
# Load the classification-threshold tuning set; the 'index' column restores the
# original row labels and the last column is the target.
class_thresh_set_df = pd.read_csv('class_thresh_set_df.csv').set_index(keys='index')
class_thresh_set_df.index.name = None
class_thresh_set_cap_x_df = class_thresh_set_df.iloc[:, :-1]
class_thresh_set_y_df = class_thresh_set_df.iloc[:, -1].to_frame()

In [None]:
# Coarse scan: evaluate the selected model at thresholds 0.0, 0.1, ..., 1.0.
# linspace states the number of points explicitly; with a fractional step, arange's
# length depends on floating-point rounding of (stop - start) / step.
class_threshold_list = np.linspace(0.0, 1.0, num=11)
thresh_class_perf_dict = \
    class_utils_2.class_thresh_metrics_class_perf_assess_binary(
        best_model_name=best_model_name, 
        estimator_names=estimator_names, 
        grid_search_cv_results_df=grid_search_cv_results_df, 
        cap_x_df=class_thresh_set_cap_x_df, 
        y_df=class_thresh_set_y_df, 
        class_threshold_list=class_threshold_list, 
        cvs_compute=False, 
        cvs_print=False, 
        data_set_name='class_thresh_set_df', 
        model_selection_stage='tuned'
)

In [None]:
# Fine scan for the error-versus-threshold plot: thresholds 0.00, 0.01, ..., 1.00.
# linspace states the number of points explicitly; with a fractional step, arange's
# length depends on floating-point rounding of (stop - start) / step.
class_threshold_list = np.linspace(0.0, 1.0, num=101)
class_utils_2.plot_errors_as_a_function_of_classification_threshold(
    best_model_name=best_model_name, 
    estimator_names=estimator_names, 
    grid_search_cv_results_df=grid_search_cv_results_df, 
    cap_x_df=class_thresh_set_cap_x_df, 
    y_df=class_thresh_set_y_df, 
    class_threshold_list=class_threshold_list, 
    data_set_name='class_thresh_set_df',
    model_selection_stage='tuned'
)

### use bootstrapping to understand how metrics will vary for future data sets from the same data generation process

In [None]:
# The cells below need a concrete threshold; stop here until one is set in the
# parameters cell.
if classification_threshold is not None:
    print(f'the classification threshold is {classification_threshold}')
else:
    sys.exit(f'{classification_threshold} is not a valid classification threshold - go to cell 5 line 72 to set a classification threshold')

In [None]:
# Bootstrap (20 resamples, per the helper's name without refitting the model) of
# precision/recall on the threshold tuning set at the chosen classification threshold.
class_utils_2.precision_recall_bootstrap_no_refit_binary(
    best_model_name=best_model_name,
    estimator_names=estimator_names, 
    grid_search_cv_results_df=grid_search_cv_results_df,
    cap_x_df=class_thresh_set_cap_x_df,
    y_df=class_thresh_set_y_df, 
    n_bootstrap=20,
    data_set_name='class_thresh_set_df', 
    model_selection_stage='tuned',
    classification_threshold=classification_threshold
)

In [None]:
# Bootstrap (20 resamples, per the helper's name without refitting the model) of the
# ROC curve on the threshold tuning set at the chosen classification threshold.
class_utils_2.roc_curve_bootstrap_no_refit_binary(
    best_model_name=best_model_name,
    estimator_names=estimator_names, 
    grid_search_cv_results_df=grid_search_cv_results_df, 
    cap_x_df=class_thresh_set_cap_x_df, 
    y_df=class_thresh_set_y_df, 
    n_bootstrap=20,
    data_set_name='class_thresh_set_df', 
    model_selection_stage='tuned',
    classification_threshold=classification_threshold
)

### take a final look at this model's performance when tuned to the classification threshold

In [None]:
# Final report for the selected model on the threshold tuning set at the chosen
# classification threshold: classification report, confusion matrix and curves.
# Cross-validated scores are switched off (cvs_compute=False).
class_perf_dict = class_utils_2.classification_performance(
    trained_classifier=best_model, 
    cap_x_df=class_thresh_set_cap_x_df, 
    y_df=class_thresh_set_y_df.values.ravel(),  # 1-d array rather than a single-column frame
    classification_threshold=classification_threshold,
    binary=True,
    # https://scikit-learn.org/stable/modules/model_evaluation.html
    cvs_scoring_dict={
        'accuracy': 'accuracy',
        'precision': 'precision',
        'recall': 'recall',
        'f1': 'f1'
    },
    cr_digits=4,
    cr_print=True,  # print classification report
    cm_print=True,  # print confusion matrix
    cvs_compute=False,  # compute cross_val_scores (classification threshold = 0.5 always)
    cvs_print=True,  # print cross_val_scores (classification threshold = 0.5 always) - ignored if cvs_compute=False
    prc_print=True,  # print precision and recall curves as a function of classification threshold
    prd_print=True,  # print precision recall curves
    roc_print=True,  # print roc curve
    data_set_name='class_thresh_set_df', 
    model_selection_stage='tuned'
)

In [None]:
del class_thresh_set_df, class_thresh_set_cap_x_df, class_thresh_set_y_df

## serialize model and classification threshold

In [None]:
# Timestamp used as the prefix of the serialized model file name, in the form
# YYYY_MM_DD_HH_MM_SS_ff (ff = hundredths of a second).
now = datetime.datetime.now()
# strftime always emits six microsecond digits, so dropping the last four leaves the
# hundredths. str(now) omits the fractional part entirely when microsecond == 0, in
# which case slicing off four characters would cut into the time itself.
date_time_prefix = now.strftime('%Y_%m_%d_%H_%M_%S_%f')[:-4]

date_time_prefix

In [None]:
best_estimator_file_name = date_time_prefix + '_model' + '.pkl'

best_estimator_file_name

In [None]:
# best_estimator = \
#     grid_search_cv_results_df.loc[grid_search_cv_results_df.estimator == best_model, 'best_estimator'].iloc[0]

model_dict = {
    'classification_threshold': classification_threshold,
    'best_model_name': best_model_name,
    'best_model': best_model
}

In [None]:
with open(best_estimator_file_name, 'wb') as f:
    pickle.dump(model_dict, f)

## evaluate model on the test set

This should be done in an independent notebook.

## check out script run time

In [None]:
end = time.time()
print(f'script run time: {(end - start)/60} minutes')