# Projet 7 - Implementation of a scoring model
# Notebook - Global variables

<h1>Table of Contents<span class="tocSkip"></span></h1>
<div class="toc"><ul class="toc-item"><li><span><a href="#Projet-7---Implementation-of-a-scoring-model" data-toc-modified-id="Projet-7---Implementation-of-a-scoring-model-1">Projet 7 - Implementation of a scoring model</a></span></li><li><span><a href="#Notebook---Global-variables" data-toc-modified-id="Notebook---Global-variables-2">Notebook - Global variables</a></span></li><li><span><a href="#I)-Importation-of-required-libraries" data-toc-modified-id="I)-Importation-of-required-libraries-3">I) Importation of required libraries</a></span></li><li><span><a href="#II)-Global-variables" data-toc-modified-id="II)-Global-variables-4">II) Global variables</a></span></li></ul></div>

# I) Importation of required libraries

In [1]:
### File management ###

# Files' path.
import os.path

# Save and load files.
import csv
import pickle


### Data manipulations ###

import numpy as np
from numpy import set_printoptions # Saving full data when exporting to csv format.
import pandas as pd


### Date & time ###

# Time measurement and datetime management.
import datetime as dt
from time import time


### Warnings removal ###

import warnings
warnings.filterwarnings('ignore')
warnings.simplefilter(action='ignore', category=FutureWarning)


### Data visualizations ###

from pprint import pprint
import matplotlib.pyplot as plt
import seaborn as sns


### Additional common libraries ###

from numpy import argmax, argmin
import math
from random import sample as py_rd_sp # Python random sampling.

# These allow converting the SHAP values from log-odds to probabilities.
import copy
from scipy.special import expit # Opposed of logit.


### sklearn tools & libraries ###

from sklearn.preprocessing import MinMaxScaler, StandardScaler, RobustScaler
from sklearn.model_selection import StratifiedKFold, GridSearchCV, RandomizedSearchCV, cross_val_predict, cross_val_score
from sklearn.metrics import roc_curve, auc, roc_auc_score, precision_recall_curve, fbeta_score, confusion_matrix
from sklearn.metrics import make_scorer # Allow to make a sklearn custom scorer (For the custom job score).


### Imbalanced data management ###

from imblearn.pipeline import Pipeline # NB: imbalearn.pipeline.Pipeline allows to properly deal the SMOTE on the train set and avoid the validation/test sets.
from sklearn.preprocessing import StandardScaler
from imblearn.over_sampling import SMOTENC # NB: SMOTENC can manage categorial features while SMOTE cannot.


### Bayesian hyperparameter tuning ###

# Hyperopt modules.
from hyperopt import STATUS_OK # Check if the objective function returned a valid value (Mandatory).

# Methods for the domain space, algorithm optimization, save the trials history, bayesian optimization.
from hyperopt import hp, tpe, Trials, fmin, pyll



# II) Global variables

In [1]:
# Initialize the default cross validation method to use
# (5 stratified folds, shuffled with a fixed seed for reproducibility).
SKF_5 = StratifiedKFold(5, shuffle=True, random_state=0)

# NOTE(review): the settings below are commented out in this notebook, yet
# GET_CSV_FILE (as well as EXPORTS_MODELS_DIR_PATH and PKL_MODELS_FILE) are read
# further down. They must therefore already exist in the kernel namespace
# before this cell runs, otherwise it raises a NameError.
# Presumably they are set by the notebook that runs this one - TODO confirm.

# True: Allows hyperparameter tuning, False: Get the results stored from the last hyperparameters tuning.
#HT = True

# For imbalanced data use weight or data sampling.
#IMB_PROCESS = 'Weight' #'Resp'

# Global common scaler to use.
#SCALER = MinMaxScaler()

# Reload the stored models table if it exists (True) or start over with a new empty one (False).
#GET_CSV_FILE = True

# Set and initialize the main scorer used for the models comparisons.
MAIN_SCORER_TRAIN_LABEL = 'Job_score_train'
MAIN_SCORER_TEST_LABEL = 'Job_score_test'
MAIN_SCORER_VAL = 0

# Columns of the dataframe in which all relevant models' information is stored
# (best hyperparameters, scores...). 'Model_labels' is used as the index.
l_COL_LABELS = ['Model_labels', 'Models',
                'yhat_train', 'yhat_test',
                'Best_proba_threshold_train', 'Best_proba_threshold_test',
                'Job_score_train', 'Job_score_test',
                'AUROC_scores_train', 'AUROC_scores_test',
                'F-bêta_score_train', 'F-bêta_score_test',
                'Process_time_train (s)', 'Process_time_test (s)'
               ]


def _new_models_frame():
    """Return an empty models table made of a single placeholder row of None.

    NB: The placeholder row built with np.full((1, len(l_COL_LABELS)), None)
        forces every column's dtype to object. Otherwise, np.nan values appearing
        in the first row would convert their columns to float64 and prevent
        storing objects such as np.array in them later on.
        The placeholder row is expected to be removed by the code that adds
        the first complete entry (not visible in this notebook).
    """
    placeholder_row = np.full((1, len(l_COL_LABELS)), None)
    return pd.DataFrame(data=placeholder_row, columns=l_COL_LABELS).set_index('Model_labels')


# Location of the pickle file holding the models table.
_MODELS_FILE_PATH = os.path.join(EXPORTS_MODELS_DIR_PATH, PKL_MODELS_FILE)

if GET_CSV_FILE:
    try:
        # NB: Unpickling can execute arbitrary code; only load files produced by this project.
        df_MODELS = pd.read_pickle(_MODELS_FILE_PATH)

    except Exception as load_error:
        # Catch Exception rather than using a bare except so that
        # KeyboardInterrupt / SystemExit are not swallowed, and report why
        # the stored table could not be loaded before it gets replaced.
        print("No csv models informations were found. A new one is created...")
        print(f"(Reason: {load_error!r})")
        df_MODELS = _new_models_frame()
        df_MODELS.to_pickle(_MODELS_FILE_PATH)
        print('Done !')

else:
    print("Creation of a new csv file to store models informations...")
    df_MODELS = _new_models_frame()
    df_MODELS.to_pickle(_MODELS_FILE_PATH)
    print('Done !')

# DataFrame.info() prints its summary itself and returns None, so it is called
# directly instead of being wrapped in display() (which would also show 'None').
df_MODELS.info()

'# Initialize the default cross validation method to use.\nSKF_5 = StratifiedKFold(5, shuffle=True, random_state=0)\n\n# True: Allows hyperprameter tuning, False: Get the results stored from the last hyperparameters tuning.\nHT = True\n\n# For imbalanced data use weight or data sampling.\nIMB_PROCESS = \'Weight\' #\'Resp\'\n\n# Global common scaler to use.\nSCALER = MinMaxScaler()\n\n# Update the csv file containing the training information and scores of the model or not (True = update).\nGET_CSV_FILE = True\n\n# Set and initialize the main scorer used for the models comparisons.\nMAIN_SCORER_TRAIN_LABEL = \'Job_score_train\'\nMAIN_SCORER_TEST_LABEL = \'Job_score_test\'\nMAIN_SCORER_VAL = 0\n\n# Load/create and initialize the dataframe in which store all relevant models\' information (best hyperparameters, scores...).\n# NB: In case of the creation of the file data=np.full((1,len(l_COL_LABELS)), None) to force dtypes as objects\n#     until one of the next added entries (rows) are full