# Initial Configs


In [None]:
# Reload imported modules automatically before each cell executes, so edits to
# the project's local `src` package are picked up without restarting the kernel.
%reload_ext autoreload
%autoreload 2
# Render matplotlib figures inline in the cell output.
%matplotlib inline

# Imports

In [None]:
from __future__ import division
#import libraries
# from datetime import datetime, timedelta, date
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
from scipy import stats
from scipy.stats import randint as sp_randint
from scipy.stats import uniform as sp_uniform
import swifter
import seaborn as sns

#do not show warnings
import warnings
warnings.filterwarnings("ignore")

#import plotly for visualization
import chart_studio.plotly as py
import plotly.offline as pyoff
import plotly.graph_objs as go
import plotly.express as px
import sys
from IPython.core.display import display, HTML
sys.path.append('..')
pyoff.init_notebook_mode()

from pycaret.classification import *
from sklearn.metrics import accuracy_score, precision_score, f1_score, recall_score, classification_report
from sklearn.metrics import plot_confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_curve
from bokeh.resources import INLINE
import bokeh.io
from bokeh import *

from tqdm import tqdm_notebook as tqdm

In [None]:
from src.visualization.visualize import plot_scatter_segment, pareto_gen
from src.utils.frame_utils import stratified_df, date_time_features, reduce_mem_usage
from src.utils.cluster_utils import order_cluster
from src.utils.eval_utils import evaluate_all

# Package configs

In [None]:
# Show every column when a DataFrame is displayed.
pd.set_option('display.max_columns', None)
# Stretch the notebook container to the full browser width.
display(HTML("<style>.container { width:100% !important; }</style>"))
# Route DataFrame.plot through the 'pandas_bokeh' backend (the package must be
# installed; it is not imported explicitly in this notebook).
pd.set_option('plotting.backend', 'pandas_bokeh')
# Send Bokeh output to the notebook using the INLINE resources.
bokeh.io.output_notebook(INLINE)

In [None]:
# ts = calendar.timegm(time.gmtime())
# dt_object = datetime.fromtimestamp(ts)
# date = dt_object.strftime('%m_%d_%Y')

# Data

## Loads

In [None]:
# Load the pre-split train ("TREINO") and test ("TESTE") parquet files.
# NOTE(review): the file names suggest SMOTE-NC resampling was applied upstream;
# confirm whether the test file was resampled too.
df_train = pd.read_parquet('../data/processed/dataset_SMOTENC_TREINO.parquet')
df_test = pd.read_parquet('../data/processed/dataset_SMOTENC_TESTE.parquet')
# Alternative (disabled): load the full dataset and split it in this notebook.
# df_datatran = pd.read_parquet('../data/processed/dataset_v1.parquet')
# df_datatran['data_inversa'] = pd.to_datetime(df_datatran['data_inversa'])
# df_datatran.head()

# Modelling Classification via PyCaret

You can see the documentation in [Pycaret Classification](https://pycaret.org/classification/)

In [None]:
# for i in range(6):
#     print('Cluster:', i)
#     print('Qtde amostras =', df_test.loc[df_test['cluster_coords']==i,:].shape[0])
#     print('Qtde amostras 0 =', df_test.loc[(df_test['cluster_coords']==i) & (df_test['contem_vitima_fatal']==0),:].shape[0])
#     print('Qtde amostras 1 =', df_test.loc[(df_test['cluster_coords']==i) & (df_test['contem_vitima_fatal']==1),:].shape[0])
#     print('% amostras 0 =', df_test.loc[(df_test['cluster_coords']==i) & (df_test['contem_vitima_fatal']==0),:].shape[0]/df_test.loc[(df_test['cluster_coords']==i),:].shape[0])
#     print('% amostras 1 =', df_test.loc[(df_test['cluster_coords']==i) & (df_test['contem_vitima_fatal']==1),:].shape[0]/df_test.loc[(df_test['cluster_coords']==i),:].shape[0])

In [None]:
# Restrict both splits to the rows whose `cluster_coords` value is 1.
df_train = df_train[df_train['cluster_coords'].eq(1)]
df_test = df_test[df_test['cluster_coords'].eq(1)]

In [None]:
# df_train = df_train.loc[df_train['uf']=='MG',:]
# df_test = df_test.loc[df_test['uf']=='MG',:]

In [None]:
# df_train, df_test = train_test_split(df_datatran, stratify=df_datatran['contem_vitima_fatal'], random_state=33, test_size=0.20)

In [None]:
df_train.columns

In [None]:
# Columns passed to setup(ignore_features=...) so they are not used as predictors.
ignore_columns = ['id', 'cluster_coords']

In [None]:
# Categorical predictors, passed to setup(categorical_features=...).
# 'cluster_coords' is intentionally not listed here (entry disabled).
cat_columns = [
    'dia_semana',
    'uf',
    'fase_dia',
    'sentido_via',
    'condicao_metereologica',
    'tipo_pista',
    'tracado_via',
    'uso_solo',
    'em_janela_feriado',
]

In [None]:
# Numeric predictors, passed to setup(numeric_features=...).
num_columns = [
    'risco',
    'risco_morte',
    'pessoas',
    'coordenada_x',
    'coordenada_y',
    'coordenada_z',
]

In [None]:
# Number of cross-validation folds, reused by setup(), compare_models(),
# create_model() and ensemble_model() below.
folds = 5

In [None]:
# Initialise the PyCaret classification experiment: df_train is the training
# data, df_test is supplied explicitly as test data, and the target is
# 'contem_vitima_fatal'. A fixed session_id is passed for reproducibility.
# NOTE(review): because test_data is given, train_size (and presumably
# data_split_stratify) should have no effect — confirm against the PyCaret docs.
# NOTE(review): several arguments used here (silent, ignore_low_variance,
# combine_rare_levels, feature_interaction, ...) look specific to PyCaret 2.x —
# confirm the pinned PyCaret version.
exp_reg = setup(data=df_train,
                test_data=df_test,
                target = 'contem_vitima_fatal',
                numeric_features = num_columns,
                categorical_features = cat_columns,
                ignore_features = ignore_columns,
                normalize=True,
                pca=False,
                create_clusters=False,
                fix_imbalance=False,
                data_split_stratify=True,
                ignore_low_variance=True,
                transformation=False,
                train_size=0.8,
                combine_rare_levels=True,
                fold=folds,
                rare_level_threshold=0.10,
                feature_ratio=False,
                feature_interaction=False,
                feature_selection=True,
                feature_selection_method='boruta',
                remove_multicollinearity=True,
                remove_perfect_collinearity=True,
                remove_outliers=False,
                polynomial_features=False,
                session_id=123,
#                 log_experiment=True,
                experiment_name='Predict Fatal Victim',
#                 log_plots=True,
#                 log_profile=False,
#                 log_data=True,
                silent=True,
                verbose=True,
                profile=False,
               )

In [None]:
# get_config("y_train")

In [None]:
models()

## Comparing All Models

Comparing all models to evaluate performance is the recommended starting point for modeling once the setup is completed (unless you know exactly what kind of model you need, which is often not the case). This function trains all models in the model library and scores them using stratified cross validation for metric evaluation. The output prints a score grid that shows average Accuracy, Recall, Precision, F1, Kappa, and MCC across the folds (10 by default; this notebook passes `fold=folds`) along with training times.

In [None]:
%%time
# Train the model library (minus the excluded estimators), rank the candidates
# by AUC using `folds`-fold cross validation and keep the top 3 in `best`
# (used later by stack_models as best[0] / best[1:]).
best = compare_models(
    sort='AUC',
    exclude=['knn', 'ridge', 'svm', 'lr', 'ada', 'lda', 'nb', 'qda'],
    fold=folds,
    n_select=3,
    turbo=True)

Note: The AUC metric is not available for Multiclass classification however the column will still be shown with zero values to maintain consistency between the Binary Classification and Multiclass Classification display grids.

## Create a Model

`create_model` is the most granular function in PyCaret and is often the foundation behind most of the PyCaret functionalities. As the name suggests this function trains and evaluates a model using cross validation that can be set with fold parameter. The output prints a score grid that shows Accuracy, Recall, Precision, F1, Kappa and MCC by fold.

In [None]:
# Train and cross-validate a CatBoost classifier with default hyperparameters.
model_cat = create_model('catboost', fold=folds)

In [None]:
# Score the CatBoost model; no `data` argument is passed, so this presumably
# uses the hold-out/test set registered in setup() — confirm.
predict_model(model_cat)

In [None]:
# Train and cross-validate the 'et' estimator with default hyperparameters.
model_et = create_model('et', fold=folds)

In [None]:
# Train and cross-validate the 'rf' estimator with default hyperparameters.
model_rf = create_model('rf', fold=folds)

In [None]:
# Train and cross-validate a LightGBM classifier with default hyperparameters.
model_lgbm = create_model('lightgbm', fold=folds)

In [None]:
# Train and cross-validate an XGBoost classifier with default hyperparameters.
model_xgb = create_model('xgboost', fold=folds)

## Tune a Model

When a model is created using the `create_model()` function, it is trained with the default hyperparameters. To tune the hyperparameters, use the `tune_model()` function, which automatically searches a pre-defined search space using Random Grid Search. The output prints a score grid that shows Accuracy, AUC, Recall, Precision, F1, Kappa, and MCC by fold for the best model. To use a custom search grid, pass the `custom_grid` parameter to `tune_model()` (as done for the LightGBM model below).

In [None]:
# Custom random-search space for LightGBM. Lists are sampled as discrete
# choices; the scipy distributions are sampled per iteration
# (sp_uniform(loc, scale) covers the interval [loc, loc + scale]).
# NOTE(review): 'learning_rate' contains 5e-1 between 1e-2 and 1e-1 — possibly
# meant to be 5e-2; confirm.
param_test ={'learning_rate' : [1e-5, 1e-3, 1e-2, 5e-1, 1e-1],
             'n_estimators' : sp_randint(10, 1000),
             'num_leaves': sp_randint(6, 75), 
             'min_child_samples': sp_randint(100, 500), 
             'min_child_weight': [1e-5, 1e-3, 1e-2, 1e-1, 1, 1e1, 1e2, 1e3, 1e4],
             'subsample': sp_uniform(loc=0.2, scale=0.8), 
             'colsample_bytree': sp_uniform(loc=0.4, scale=0.6),
             'reg_alpha': [0, 1e-1, 1, 2, 5, 7, 10, 50, 100],
             'reg_lambda': [0, 1e-1, 1, 5, 10, 20, 50, 100]}

# Run 100 search iterations optimising AUC; choose_better=True is requested so
# the untuned model is kept if tuning does not improve on it.
tuned_model_lgbm = tune_model(model_lgbm, n_iter=100, custom_grid=param_test, optimize='AUC', choose_better=True)

The `tune_model()` function is a random grid search of hyperparameters over a pre-defined search space.

In [None]:
# Tune CatBoost over PyCaret's pre-defined search space (100 iterations, AUC).
tuned_model_cat = tune_model(model_cat, n_iter=100, optimize='AUC', choose_better=True)

## Ensemble Model

In [None]:
# Ensemble the tuned CatBoost model (no `method` is passed; presumably the
# default is bagging, as the variable name suggests — confirm).
bagging_tuned_model_cat = ensemble_model(tuned_model_cat, fold=folds)

In [None]:
# Ensemble the tuned LightGBM model (no `method` is passed; presumably the
# default is bagging, as the variable name suggests — confirm).
bagging_tuned_model_lgbm = ensemble_model(tuned_model_lgbm, fold=folds)

## Stack Model

In [None]:
# Stack the models selected by compare_models(n_select=3): the top-ranked model
# is the meta learner and the remaining ones are the base estimators.
stacked_models = stack_models(estimator_list = best[1:], meta_model = best[0])

## Plot a Model

Before model finalization, the `plot_model()` function can be used to analyze the performance across different aspects such as AUC, confusion_matrix, decision boundary etc. This function takes a trained model object and returns a plot based on the test / hold-out set.

In [None]:
# Confusion matrix of the untuned LightGBM model.
plot_model(model_lgbm, 'confusion_matrix')

In [None]:
# Classification report plot of the untuned LightGBM model.
plot_model(model_lgbm, 'class_report')

In [None]:
# AUC plot of the untuned LightGBM model.
plot_model(model_lgbm, 'auc')

In [None]:
# 'pr' plot (presumably the precision-recall curve) of the untuned LightGBM model.
plot_model(model_lgbm, 'pr')

In [None]:
# 'error' plot of the untuned LightGBM model.
plot_model(model_lgbm, 'error')

In [None]:
# Calibration plot of the untuned LightGBM model, before calibration.
plot_model(model_lgbm, 'calibration')

In [None]:
# Calibrate the LightGBM model's probabilities using the 'isotonic' method.
calibrated_lgbm = calibrate_model(model_lgbm, method='isotonic')

In [None]:
# Calibration plot of the calibrated model, for comparison with the plot of the
# uncalibrated model_lgbm.
plot_model(calibrated_lgbm, 'calibration')

In [None]:
# 'threshold' plot of the untuned LightGBM model.
plot_model(model_lgbm, 'threshold')

In [None]:
# 'feature' plot (presumably feature importance) of the untuned LightGBM model.
plot_model(model_lgbm, 'feature')

## Optimize Threshold

In [None]:
from itertools import product # Library used to find the closest values between 2 arrays

In [None]:
# Score the CatBoost model with a probability threshold of 0.0; the resulting
# 'Score' column is what the threshold sweep below reads.
# NOTE(review): with a 0.0 threshold every row is presumably labelled positive,
# so only 'Score' (not 'Label') is meaningful in this frame — confirm.
model_predicted_test = predict_model(model_cat, probability_threshold=0.0)

In [None]:
# Sweep candidate decision thresholds over the scored predictions in
# `model_predicted_test` and report two "best" thresholds:
#   1. the threshold that maximises F1, and
#   2. the threshold where precision and recall are closest to each other.
f1_score_array, precision_score_array, recall_score_array = [], [], []
thresholds_list = np.linspace(0, 1, 101)

# Loop-invariant: cast the target to int once instead of on every iteration.
model_predicted_test['contem_vitima_fatal'] = model_predicted_test['contem_vitima_fatal'].astype(int)
target_values = model_predicted_test['contem_vitima_fatal']

# NOTE(review): assumes 'Score' holds the positive-class probability — confirm
# for the PyCaret version in use.
for threshold in tqdm(thresholds_list):
    # Vectorised comparison; gives the same labels as a row-wise apply().
    model_predicted_test['y_pred'] = (model_predicted_test['Score'] >= threshold).astype(int)
    predicted_labels = model_predicted_test['y_pred']

    f1_score_array.append(f1_score(target_values, predicted_labels))
    precision_score_array.append(precision_score(target_values, predicted_labels))
    recall_score_array.append(recall_score(target_values, predicted_labels))

# --- Best threshold by F1 -------------------------------------------------
best_f1_index = int(np.argmax(f1_score_array))
plt.plot(thresholds_list, f1_score_array, label='F1')
plt.axvline(x=thresholds_list[best_f1_index])
plt.xlabel('threshold')
plt.title(f'Best threshold based on F1-Score = {thresholds_list[best_f1_index]}')
plt.legend()
plt.show()

# --- Best threshold by precision/recall break-even --------------------------
# Precision and recall must be compared at the SAME threshold index. Pairing
# values taken from different thresholds can select a pair that never occurs
# together, leaving no common index to look up.
precision_values = np.asarray(precision_score_array, dtype=float)
recall_values = np.asarray(recall_score_array, dtype=float)
pr_gap = np.abs(precision_values - recall_values)
# When no true positive is predicted both metrics are 0; that trivial tie is
# not a real break-even point, so it is excluded from the search.
pr_gap[(precision_values == 0) & (recall_values == 0)] = np.inf
indice_best_threshold = int(np.argmin(pr_gap))

plt.plot(thresholds_list, precision_score_array, label='precision')
plt.plot(thresholds_list, recall_score_array, label='recall')
plt.axvline(x=thresholds_list[indice_best_threshold])
plt.xlabel('threshold')
plt.title(f'Best threshold based on Precision x Recall = {thresholds_list[indice_best_threshold]}')
plt.legend()
plt.show()

## Predict Model

In [None]:
def generate_metrics(df, y_true, y_pred, average='macro', labels = [0, 1]):
    """Print a classification report comparing two columns of ``df``.

    Parameters
    ----------
    df : DataFrame holding the true and predicted labels.
    y_true : name of the column with the true labels.
    y_pred : name of the column with the predicted labels.
    average : currently unused; kept so existing callers keep working.
    labels : labels to include in the report. They are converted to ``str``
        because both label columns are compared as strings.

    Side effects
    ------------
    Adds (or overwrites) the string columns ``'y_true'`` and ``'y_pred'`` on
    ``df`` and prints the report; returns ``None``.
    """
    df['y_true'] = df[y_true].astype(str)
    df['y_pred'] = df[y_pred].astype(str)

    # The columns above are strings, so the requested labels must be strings
    # too; otherwise ints such as 0/1 never match the values '0'/'1'.
    report_labels = [str(label) for label in labels]

    print(classification_report(df['y_true'], df['y_pred'], labels=report_labels))

In [None]:
# tuned_model_xgb_final = finalize_model(tuned_model_xgb)

In [None]:
# Rebuild df_train as the rows of `new_df` (minus ignore_columns) whose index
# values appear in PyCaret's X_train, matched through a left merge on 'index'.
# NOTE(review): `new_df` is not defined anywhere in the visible notebook, so
# this cell fails on a fresh "Restart & Run All" — confirm which frame it
# should be (presumably the data passed to setup()).
df_train = get_config("X_train").reset_index()[['index']].merge(new_df.drop(ignore_columns, axis=1).reset_index(), how='left', on='index')
# Drop the helper merge key.
del df_train['index']

In [None]:
# Score the random-forest model on the training rows rebuilt above.
model_predicted_train = predict_model(model_rf, data=df_train)

In [None]:
# Score the random-forest model without `data` (presumably on the hold-out set
# registered in setup() — confirm).
model_predicted_test = predict_model(model_rf)

In [None]:
# Classification reports for the predictions currently held in
# model_predicted_train / model_predicted_test (random forest at this point).
print('------------------- Train Metrics -------------------')
generate_metrics(model_predicted_train, 'contem_vitima_fatal', 'Label')

print('------------------- Test Metrics -------------------')
generate_metrics(model_predicted_test, 'contem_vitima_fatal', 'Label')

In [None]:
# Score the LightGBM model on the training rows (overwrites the RF predictions).
model_predicted_train = predict_model(model_lgbm, data=df_train)

In [None]:
# Score the LightGBM model without `data` (presumably on the hold-out set —
# confirm).
model_predicted_test = predict_model(model_lgbm)

In [None]:
# Classification reports for the predictions currently held in
# model_predicted_train / model_predicted_test (LightGBM at this point).
print('------------------- Train Metrics -------------------')
generate_metrics(model_predicted_train, 'contem_vitima_fatal', 'Label')

print('------------------- Test Metrics -------------------')
generate_metrics(model_predicted_test, 'contem_vitima_fatal', 'Label')

In [None]:
# Score the CatBoost model on the training rows using the default decision
# threshold, consistent with the CatBoost test prediction and with the RF and
# LightGBM cells. A probability_threshold of 0.0 would label every row as
# positive and make the train metrics report meaningless.
model_predicted_train = predict_model(model_cat, data=df_train)

In [None]:
# Score the CatBoost model without `data` (presumably on the hold-out set —
# confirm).
model_predicted_test = predict_model(model_cat)

In [None]:
# Classification reports for the predictions currently held in
# model_predicted_train / model_predicted_test (CatBoost at this point).
print('------------------- Train Metrics -------------------')
generate_metrics(model_predicted_train, 'contem_vitima_fatal', 'Label')

print('------------------- Test Metrics -------------------')
generate_metrics(model_predicted_test, 'contem_vitima_fatal', 'Label')

In [None]:
# Plotly histogram of the 'TARGET' column of `tx_data`.
# NOTE(review): `tx_data` is not defined anywhere in the visible notebook and no
# other cell uses a 'TARGET' column — this looks like a leftover from another
# notebook and fails on a fresh run; confirm before keeping it.
plot_data = [
    go.Histogram(
        x = tx_data['TARGET'],
        orientation='v',
        name='Segmentos'
    )
]

# Fixed 700x300 figure size.
plot_layout = go.Layout(
        width=700,
        height=300
    )
fig = go.Figure(data=plot_data, layout=plot_layout)
pyoff.iplot(fig)

In [None]:
# Evaluate true vs. predicted labels with the project helper evaluate_all.
# NOTE(review): `lgbm_final_pred_test` is not defined anywhere in the visible
# notebook, so this cell fails on a fresh run — confirm the intended frame.
r = evaluate_all(lgbm_final_pred_test['y_true'], lgbm_final_pred_test['y_pred'])

In [None]:
# Export the prediction frame to CSV without the index.
# NOTE(review): `lgbm_final_pred_test` is not defined in the visible notebook,
# and the file name ("Predict Revenue Purchase") does not match this
# experiment ('Predict Fatal Victim') — presumably a leftover; confirm.
lgbm_final_pred_test.to_csv("../data/processed/Predict Revenue Purchase_LGBM_129.csv", index=False)

## Finalize and Save Model

In [None]:
# Finalize the random-forest model (per the PyCaret docs this presumably refits
# it on the full dataset, hold-out included — confirm).
model_rf_finalized = finalize_model(model_rf)

In [None]:
# Persist the finalized random-forest pipeline. Saving `model_rf` instead would
# discard the result of the finalize_model() call in the previous cell.
save_model(model_rf_finalized, "../models/model_rf_02_02_2021", verbose=True)

In [None]:
# Score the 'et' model (no `data` passed) and print its classification report.
model_predicted_et = predict_model(model_et)

print('------------------- et -------------------')
generate_metrics(model_predicted_et, 'contem_vitima_fatal', 'Label')

In [None]:
# Score the 'rf' model (no `data` passed) and print its classification report.
# Uses model_rf, not model_rf_finalized.
model_predicted_rf = predict_model(model_rf)

print('------------------- rf -------------------')
generate_metrics(model_predicted_rf, 'contem_vitima_fatal', 'Label')

In [None]:
# Score the 'dt' model (no `data` passed) and print its classification report.
# NOTE(review): `model_dt` is never created in the visible notebook (there is no
# create_model('dt') cell), so this cell fails on a fresh run — confirm.
model_predicted_dt = predict_model(model_dt)

print('------------------- dt -------------------')
generate_metrics(model_predicted_dt, 'contem_vitima_fatal', 'Label')

In [None]:
# Score the CatBoost model (no `data` passed) and print its classification report.
model_predicted_cat = predict_model(model_cat)

print('------------------- cat -------------------')
generate_metrics(model_predicted_cat, 'contem_vitima_fatal', 'Label')

In [None]:
# Score the LightGBM model (no `data` passed) and print its classification report.
model_predicted_lgbm = predict_model(model_lgbm)

print('------------------- lgbm -------------------')
generate_metrics(model_predicted_lgbm, 'contem_vitima_fatal', 'Label')

In [None]:
# Score the XGBoost model (no `data` passed) and print its classification report.
model_predicted_xgb = predict_model(model_xgb)

print('------------------- xgb -------------------')
generate_metrics(model_predicted_xgb, 'contem_vitima_fatal', 'Label')