# Initial Configs


In [None]:
# Re-import edited project modules automatically before each cell executes.
%reload_ext autoreload
%autoreload 2
# Render matplotlib figures inside the notebook output.
%matplotlib inline

# Imports

In [None]:
from __future__ import division
#import libraries
from datetime import datetime, timedelta,date
import pandas as pd
from sklearn.metrics import classification_report, confusion_matrix
import matplotlib.pyplot as plt
import numpy as np
from scipy import stats
import seaborn as sns

#do not show warnings
import warnings
warnings.filterwarnings("ignore")

#import plotly for visualization
import chart_studio.plotly as py
import plotly.offline as pyoff
import plotly.graph_objs as go
import plotly.express as px
import sys
from IPython.core.display import display, HTML
sys.path.append('..')
pyoff.init_notebook_mode()

#import machine learning related libraries
from sklearn.svm import SVC, SVR
import xgboost as xgb
import lightgbm as lgb
import mlflow
from sklearn.model_selection import KFold, cross_val_score, train_test_split
from bokeh.resources import INLINE
import bokeh.io
from bokeh import *

In [None]:
from src.visualization.visualize import plot_scatter_segment, pareto_gen
from src.utils.frame_utils import stratified_df, date_time_features, reduce_mem_usage
from src.utils.cluster_utils import order_cluster
from src.utils.eval_utils import evaluate_all

# Package configs

In [None]:
# Do not truncate the column list when a DataFrame is displayed.
pd.set_option('display.max_columns', None)
# Inject CSS that stretches the notebook's `.container` element to full width.
display(HTML("<style>.container { width:100% !important; }</style>"))
# Route DataFrame.plot() through the pandas_bokeh plotting backend.
pd.set_option('plotting.backend', 'pandas_bokeh')
# Enable Bokeh notebook output using the INLINE resources object.
bokeh.io.output_notebook(INLINE)

In [None]:
# ts = calendar.timegm(time.gmtime())
# dt_object = datetime.fromtimestamp(ts)
# date = dt_object.strftime('%m_%d_%Y')

# Data

## Loads

In [None]:
# Read the processed dataset and parse `data_inversa` into datetimes.
df_datatran = pd.read_parquet('../data/processed/dataset_v1.parquet')
df_datatran = df_datatran.assign(
    data_inversa=pd.to_datetime(df_datatran['data_inversa'])
)
df_datatran.head()

# Modelling Classification via PyCaret

See the [PyCaret classification documentation](https://pycaret.org/classification/) for details.

In [None]:
from pycaret.classification import *

In [None]:
# Stratified 80/20 hold-out split on the target column; the seed is fixed so
# the split is reproducible.
df_train, df_test = train_test_split(
    df_datatran,
    test_size=0.20,
    random_state=33,
    stratify=df_datatran['contem_vitima_fatal'],
)

In [None]:
# Columns removed from the frame before it is handed to PyCaret / H2O / Keras.
# Each name appears exactly once ('ano' used to be listed twice).
drop_columns = [
    'id',
    'br',
    'km',
    'sentido_via',
    'classificacao_acidente',
    'causa_acidente',
    'ano',
    'data_inversa',
    'horario',
    'mortos',
    'regional',
    'delegacia',
    'uop',
]

In [None]:
# Total number of missing values remaining once the excluded columns are dropped.
df_train.drop(drop_columns, axis=1).isnull().sum().sum()

In [None]:
import scipy.stats
import swifter

In [None]:
# z-scores of every numeric column that survives `drop_columns`.
# NOTE(review): the target column is not in `drop_columns`, so if it is stored
# as a number it is z-scored (and later filtered on) as well — confirm intended.
z_score = stats.zscore(
    df_train
    .drop(drop_columns, axis=1)
    .select_dtypes(include=[np.number])
)

In [None]:
# Keep only rows whose numeric features all lie within 3 standard deviations.
abs_z_scores = np.abs(z_score)
# A zero-variance column produces NaN z-scores; `NaN < 3` is False, which would
# silently discard every row. Treat NaN as "not an outlier" instead.
within_limits = (abs_z_scores < 3) | np.isnan(abs_z_scores)
filtered_entries = within_limits.all(axis=1)
new_df = df_train[filtered_entries]

In [None]:
# Compare row counts after and before the z-score filter.
print(new_df.shape)
print(df_train.shape)

In [None]:
# Preview the columns that remain once `drop_columns` is removed.
df_train.drop(drop_columns, axis=1).head()

In [None]:
# Features declared as categorical for PyCaret and one-hot encoded later on.
# 'br', 'km', 'causa_acidente', 'classificacao_acidente' and 'sentido_via' are
# intentionally absent: they are listed in `drop_columns`.
cat_columns = [
    'dia_semana',
    'uf',
    'municipio',
    'tipo_acidente',
    'fase_dia',
    'condicao_metereologica',
    'tipo_pista',
    'tracado_via',
    'uso_solo',
]

In [None]:
# Features declared as numeric for PyCaret.
num_columns = [
    'pessoas',
    'feridos_leves',
    'feridos_graves',
    'ilesos',
    'ignorados',
    'feridos',
    'veiculos',
]

In [None]:
# Number of cross-validation folds passed to PyCaret's setup().
folds = 5

In [None]:
# Initialise the PyCaret classification experiment on the z-score-filtered
# training frame, with 'contem_vitima_fatal' as the target.
# NOTE(review): the result is bound to `exp_reg` although this is a
# classification setup, and `remove_outliers=True` runs on data that was already
# outlier-filtered above — confirm both are intended.
exp_reg = setup(new_df.drop(drop_columns, axis=1), target = 'contem_vitima_fatal', 
                numeric_features = num_columns,
                categorical_features = cat_columns,
                # --- preprocessing switches ---
                normalize=True, pca=False, 
                create_clusters=False,
                fix_imbalance=True,
                data_split_stratify=True,
                ignore_low_variance=True, 
                transformation=False, 
                # PyCaret holds out a further 20% of `new_df` on top of `df_test`.
                train_size=0.8, 
                combine_rare_levels=True,
                fold=folds,
                rare_level_threshold=0.10,
                feature_ratio=False,
                feature_interaction=False,
                feature_selection=True,
                remove_multicollinearity=True, 
                remove_perfect_collinearity=True, 
                remove_outliers=True, 
                polynomial_features=False,
                # Fixed seed so the experiment is reproducible.
                session_id=123,
                # --- experiment logging ---
                log_experiment=True,
                experiment_name='Predict Fatal Victim v1',
                log_plots=True,
                log_profile=False,
                log_data=True,
                silent=False,
                verbose=True,
                profile=False,
               )

In [None]:
# get_config("X_train").columns

In [None]:
# get_config("X_train")

In [None]:
# List the estimators available in PyCaret's model library.
# NOTE(review): a later cell rebinds the name `models` to a list, so this call
# only works if it runs before that cell.
models()

## Comparing All Models

Comparing all models to evaluate performance is the recommended starting point for modeling once the setup is completed (unless you know exactly what kind of model you need, which is often not the case). This function trains all models in the model library and scores them using stratified cross-validation for metric evaluation. The output prints a score grid that shows average Accuracy, Recall, Precision, F1, Kappa, and MCC across the folds (10 by default; this notebook sets `fold=5` in `setup`) along with training times.

In [None]:
%%time
# Rank the library's models by Recall and keep the top three, so `best` is a
# list of three estimators (later reused by stack_models).
best = compare_models(
    sort='Recall',
    #exclude=['knn', 'lda', 'ada', 'qda', 'ridge', 'mlp', 'svm', 'rbfsvm'],
    #fold=7,
    n_select=3,
    turbo=True)

Note: The AUC metric is not available for Multiclass classification however the column will still be shown with zero values to maintain consistency between the Binary Classification and Multiclass Classification display grids.

## Create a Model

`create_model` is the most granular function in PyCaret and is often the foundation behind most of the PyCaret functionalities. As the name suggests this function trains and evaluates a model using cross validation that can be set with fold parameter. The output prints a score grid that shows Accuracy, Recall, Precision, F1, Kappa and MCC by fold.

In [None]:
# Baseline models with default hyperparameters, one per cell.
# 'et' is PyCaret's id for the Extra Trees classifier.
model_et = create_model('et')

In [None]:
# XGBoost baseline.
model_xgb = create_model('xgboost')

In [None]:
# LightGBM baseline.
model_lgbm = create_model('lightgbm')

In [None]:
# CatBoost baseline.
model_cat = create_model('catboost')

## Tune a Model

When a model is created using the `create_model()` function, it is trained with the default hyperparameters. To tune the hyperparameters, use the `tune_model()` function. It automatically tunes the hyperparameters of a model using a random grid search over a pre-defined search space. The output prints a score grid that shows Accuracy, AUC, Recall, Precision, F1, Kappa, and MCC by fold for the best model. To use a custom search grid, pass the `custom_grid` parameter to `tune_model` (see the LightGBM tuning cell below).

In [None]:
# 20-iteration search optimising Precision; choose_better=True keeps the
# untuned model when tuning does not improve on it.
# NOTE(review): compare_models ranked by Recall while tuning optimises
# Precision — confirm which metric is the real objective.
tuned_model_et = tune_model(model_et, n_iter=20, optimize='Precision', choose_better=True)

In [None]:
# Same search settings for the XGBoost baseline.
tuned_model_xgb = tune_model(model_xgb, n_iter=20, optimize='Precision', choose_better=True)

In [None]:
# The search space below samples from scipy distributions. The aliases are
# imported here because no import cell in this notebook brings them into scope,
# so the cell used to fail with NameError on a fresh kernel.
from scipy.stats import randint as sp_randint, uniform as sp_uniform

# Custom LightGBM search space: lists are sampled uniformly from their values,
# scipy distributions are sampled from directly.
param_test = {'learning_rate': [1e-5, 1e-3, 1e-2, 5e-1, 1e-1],
              'n_estimators': sp_randint(10, 1000),
              'num_leaves': sp_randint(6, 75),
              'min_child_samples': sp_randint(100, 500),
              'min_child_weight': [1e-5, 1e-3, 1e-2, 1e-1, 1, 1e1, 1e2, 1e3, 1e4],
              # uniform(loc, scale) spans [loc, loc + scale]: 0.2-1.0 and 0.4-1.0.
              'subsample': sp_uniform(loc=0.2, scale=0.8),
              'colsample_bytree': sp_uniform(loc=0.4, scale=0.6),
              'reg_alpha': [0, 1e-1, 1, 2, 5, 7, 10, 50, 100],
              'reg_lambda': [0, 1e-1, 1, 5, 10, 20, 50, 100]}

# 50 random draws from the custom grid, optimising Precision.
tuned_model_lgbm = tune_model(model_lgbm, n_iter=50, custom_grid=param_test, optimize='Precision', choose_better=True)

In [None]:
# 20-iteration search for CatBoost, optimising Precision.
tuned_model_cat = tune_model(model_cat, n_iter=20, optimize='Precision', choose_better=True)

The `tune_model()` function is a random grid search of hyperparameters over a pre-defined search space.

In [None]:
# Wrap each tuned model with ensemble_model, evaluated on 7 folds rather than
# the 5 configured in setup().
# Presumably bagging, as the variable names suggest — TODO confirm the default
# `method` of the installed PyCaret version.
bagging_tuned_model_et = ensemble_model(tuned_model_et, fold=7)

In [None]:
bagging_tuned_model_xgb = ensemble_model(tuned_model_xgb, fold=7)

In [None]:
bagging_tuned_model_lgbm = ensemble_model(tuned_model_lgbm, fold=7)

In [None]:
bagging_tuned_model_cat = ensemble_model(tuned_model_cat, fold=7)

In [None]:
# Stack the compare_models winners: the top-ranked model is the meta-learner,
# the remaining selected models are the base estimators.
stacked_models = stack_models(estimator_list = best[1:], meta_model = best[0])

## Plot a Model

Before model finalization, the `plot_model()` function can be used to analyze the performance across different aspects such as AUC, confusion_matrix, decision boundary etc. This function takes a trained model object and returns a plot based on the test / hold-out set.

In [None]:
# Confusion matrix of the tuned XGBoost model.
plot_model(tuned_model_xgb, 'confusion_matrix')

In [None]:
# `bagging_model` is never defined in this notebook and 'residuals' is a
# regression plot; show the ROC/AUC curve of the bagged XGBoost model instead.
# NOTE(review): confirm which bagged model and which diagnostic were intended.
plot_model(bagging_tuned_model_xgb, 'auc')

In [None]:
# Class prediction error of the bagged XGBoost model (`bagging_model` is never
# defined in this notebook).
# NOTE(review): confirm that the XGBoost ensemble is the intended model.
plot_model(bagging_tuned_model_xgb, 'error')

In [None]:
# Feature-importance plot for the baseline LightGBM model.
plot_model(model_lgbm, 'feature')

In [None]:
# Models to evaluate on PyCaret's hold-out set. Only names created earlier in
# this notebook are listed (`bagging_model` and `tuned_model` never were).
# NOTE(review): this rebinds `models`, shadowing PyCaret's models() helper for
# the rest of the session; the name is kept because the loop below iterates it.
models = [bagging_tuned_model_xgb, tuned_model_xgb, model_lgbm, tuned_model_lgbm]

In [None]:
from sklearn.metrics import accuracy_score, precision_score, f1_score, recall_score, classification_report
from sklearn.metrics import plot_confusion_matrix

In [None]:
def generate_metrics(df, y_true, y_pred, average='macro', labels=None):
    """Print a scikit-learn classification report for a prediction frame.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding both the ground-truth and the predicted column. It is
        modified in place: string copies of the two columns are stored as
        ``df['y_true']`` and ``df['y_pred']`` so later cells can reuse them.
    y_true : str
        Name of the ground-truth column in ``df``.
    y_pred : str
        Name of the predicted-label column in ``df``.
    average : str, default 'macro'
        Not used at present; kept so existing calls keep working.
    labels : list of str, optional
        Class labels to include in the report, in display order. When omitted,
        the sorted set of labels present in either column is used. The former
        default was a fixed list of segment names from another project, which
        never matched this notebook's target values.

    Returns
    -------
    None
        The report is printed, not returned.
    """
    # Compare as strings so differing dtypes between truth and prediction
    # still line up (same conversion as before).
    df['y_true'] = df[y_true].astype(str)
    df['y_pred'] = df[y_pred].astype(str)

    if labels is None:
        labels = sorted(set(df['y_true']) | set(df['y_pred']))

    print(classification_report(df['y_true'], df['y_pred'], labels=labels))

In [None]:
# Report hold-out metrics for every model in `models`.
for m in models:
    # predict_model without `data` scores PyCaret's internal hold-out set.
    holdout_predict = predict_model(m)
    # Ground truth is the setup() target column; PyCaret stores the predicted
    # class in 'Label' (the same column the later metric cells read).
    generate_metrics(holdout_predict, 'contem_vitima_fatal', 'Label')


In [None]:
# Finalise the tuned XGBoost model with PyCaret's finalize_model.
# NOTE(review): the later cells score and save `tuned_model_xgb`, not this
# finalised object — confirm which one should be persisted.
tuned_model_xgb_final = finalize_model(tuned_model_xgb)

In [None]:
# Score the bagged XGBoost model on frames that exist in this notebook:
# `new_df` (the data handed to setup) and the untouched `df_test` hold-out.
# `X_train` / `X_test` and a 'TARGET' column are never created before this
# cell, so the real frames and target name are used instead.
bagging_tuned_model_xgb_pred_train = predict_model(
    bagging_tuned_model_xgb, data=new_df.drop(drop_columns, axis=1))
bagging_tuned_model_xgb_pred_test = predict_model(
    bagging_tuned_model_xgb, data=df_test.drop(drop_columns, axis=1))

# Discard rows lacking either the ground truth or the predicted label.
bagging_tuned_model_xgb_pred_train.dropna(subset=['contem_vitima_fatal', 'Label'], inplace=True)
bagging_tuned_model_xgb_pred_test.dropna(subset=['contem_vitima_fatal', 'Label'], inplace=True)

print('------------------- Train Metrics -------------------')
generate_metrics(bagging_tuned_model_xgb_pred_train, 'contem_vitima_fatal', 'Label')

print('------------------- Test Metrics -------------------')
generate_metrics(bagging_tuned_model_xgb_pred_test, 'contem_vitima_fatal', 'Label')

In [None]:
# Score the tuned LightGBM model on frames that exist in this notebook:
# `new_df` (the data handed to setup) and the untouched `df_test` hold-out.
# `X_train` / `X_test` and a 'TARGET' column are never created before this
# cell, so the real frames and target name are used instead.
tuned_model_lgbm_pred_train = predict_model(
    tuned_model_lgbm, data=new_df.drop(drop_columns, axis=1))
tuned_model_lgbm_pred_test = predict_model(
    tuned_model_lgbm, data=df_test.drop(drop_columns, axis=1))

# Discard rows lacking either the ground truth or the predicted label.
tuned_model_lgbm_pred_train.dropna(subset=['contem_vitima_fatal', 'Label'], inplace=True)
tuned_model_lgbm_pred_test.dropna(subset=['contem_vitima_fatal', 'Label'], inplace=True)

print('------------------- Train Metrics ------------------')
generate_metrics(tuned_model_lgbm_pred_train, 'contem_vitima_fatal', 'Label')

print('------------------- Test Metrics ------------------')
generate_metrics(tuned_model_lgbm_pred_test, 'contem_vitima_fatal', 'Label')

In [None]:
# Score the tuned XGBoost model on frames that exist in this notebook:
# `new_df` (the data handed to setup) and the untouched `df_test` hold-out.
# `X_train` / `X_test` and a 'TARGET' column are never created before this
# cell, so the real frames and target name are used instead.
tuned_model_xgb_pred_train = predict_model(
    tuned_model_xgb, data=new_df.drop(drop_columns, axis=1))
tuned_model_xgb_pred_test = predict_model(
    tuned_model_xgb, data=df_test.drop(drop_columns, axis=1))

# Discard rows lacking either the ground truth or the predicted label.
tuned_model_xgb_pred_train.dropna(subset=['contem_vitima_fatal', 'Label'], inplace=True)
tuned_model_xgb_pred_test.dropna(subset=['contem_vitima_fatal', 'Label'], inplace=True)

print('------------------- Train Metrics ------------------')
generate_metrics(tuned_model_xgb_pred_train, 'contem_vitima_fatal', 'Label')

print('------------------- Test Metrics ------------------')
generate_metrics(tuned_model_xgb_pred_test, 'contem_vitima_fatal', 'Label')

In [None]:
# Histogram of the target column over the full dataset. `tx_data` and a
# 'TARGET' column are never defined in this notebook, so the loaded frame and
# its real target name are used.
# NOTE(review): confirm the full dataset, not a split, is the intended source.
plot_data = [
    go.Histogram(
        x=df_datatran['contem_vitima_fatal'],
        orientation='v',
        name='Segmentos'
    )
]

plot_layout = go.Layout(
    width=700,
    height=300
)
fig = go.Figure(data=plot_data, layout=plot_layout)
pyoff.iplot(fig)

In [None]:
# `lgbm_final_pred_test` is never created in this notebook. The tuned-LightGBM
# test predictions carry the 'y_true' / 'y_pred' columns once generate_metrics
# has run on them, so evaluate those.
# NOTE(review): confirm this is the frame that was meant.
r = evaluate_all(tuned_model_lgbm_pred_test['y_true'], tuned_model_lgbm_pred_test['y_pred'])

In [None]:
# Persist the tuned-LightGBM test predictions (`lgbm_final_pred_test` is never
# created in this notebook).
# NOTE(review): the file name still refers to a revenue/purchase project —
# confirm the intended output path before relying on it.
tuned_model_lgbm_pred_test.to_csv("../data/processed/Predict Revenue Purchase_LGBM_129.csv", index=False)

In [None]:
# Persist the tuned XGBoost model under ../models/Classification via PyCaret's save_model.
save_model(tuned_model_xgb, "../models/Classification/tuned_model_xgb_v1", verbose=True)

### H2O

In [None]:
import h2o
from sklearn.preprocessing import MinMaxScaler
# Start (or connect to) an H2O cluster for the cells below.
h2o.init()

In [None]:
# dtype names passed to select_dtypes when picking the numeric columns.
numerics = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']

In [None]:
# Min-max scale the numeric columns and one-hot encode the categorical ones.
# `tx_data` is never defined in this notebook; the loaded dataset is used.
# NOTE(review): `h2o_df` is not read by any later visible cell (the H2O frame
# is built from PyCaret's transformed data) — confirm it is still needed.
mms = MinMaxScaler()
h2o_df = df_datatran.drop(drop_columns, axis=1).copy()
numeric_columns = list(h2o_df.select_dtypes(include=numerics).columns)
h2o_df.loc[:, numeric_columns] = mms.fit_transform(h2o_df.loc[:, numeric_columns])
h2o_df = pd.get_dummies(h2o_df, columns=cat_columns)

In [None]:
# Work on a copy of PyCaret's transformed feature frame. Adding 'y_true' to the
# object returned by get_config("X") would push the target into PyCaret's own
# feature matrix.
h2o_data = get_config("X").copy()
h2o_data['y_true'] = get_config("y").values

In [None]:
# Convert the pandas frame into an H2OFrame (the name now refers to an H2O object).
h2o_data = h2o.H2OFrame(h2o_data)
h2o_data.head(5)  # show 5 rows; head() without an argument displays 10

In [None]:
# Seeded split of the H2O frame with a 0.8 ratio.
h2o_data_split = h2o_data.split_frame(ratios = [0.8], seed = 1234)
h2o_train = h2o_data_split[0]  # roughly 80% for training
h2o_test = h2o_data_split[1]  # remaining ~20% for testing

In [None]:
# Every column except the response is a predictor; listing 'y_true' among the
# predictors would rely on H2O silently dropping it.
predictors = [column for column in h2o_data.columns if column != 'y_true']
predictors

In [None]:
# Import the function for GLM
# Import the function for GLM
from h2o.estimators.glm import H2OGeneralizedLinearEstimator
# Set up the GLM.
# NOTE(review): family='gaussian' fits a regression, yet 'y_true' holds the
# classification target taken from PyCaret — confirm whether family='binomial'
# with a factor response was intended.
glm = H2OGeneralizedLinearEstimator(family = 'gaussian', model_id = 'glm_default')
# Use .train() to build the model
glm.train(x = predictors, 
                  y = 'y_true', 
                  training_frame = h2o_train)
print(glm)

In [None]:
# GLM metrics on the H2O test split.
glm.model_performance(h2o_test)

In [None]:
from h2o.automl import H2OAutoML
# AutoML run capped at 20 models and 100 seconds, with a fixed seed.
aml = H2OAutoML(max_models = 20, max_runtime_secs=100, seed = 1)

In [None]:
# NOTE(review): the test split doubles as the validation frame, so leaderboard
# scores on `h2o_test` are not an independent estimate — confirm intended.
aml.train(x=predictors, y = 'y_true', 
                  training_frame = h2o_train, validation_frame=h2o_test)

In [None]:
# Ranking of the models AutoML trained.
print(aml.leaderboard)

In [None]:
# NOTE(review): `y_test` is first assigned in the deep-learning section further
# down, so on a fresh kernel this cell raises NameError.
y_test

In [None]:
# Passes a constant prediction (the median of y_test repeated for every row) —
# presumably a naive baseline.
# NOTE(review): `regression_results` is neither defined nor imported in the
# visible notebook, and `y_test` is not assigned yet at this point.
regression_results(y_test, np.array([y_test.median()]*(len(y_test))).reshape(-1, 1))

In [None]:
# Fetch the AutoML leader's metalearner by name and plot its standardised
# coefficients.
# NOTE(review): assumes the leader is a stacked ensemble and that
# `metalearner()` returns a mapping with a 'name' key — confirm for the
# installed H2O version.
metalearner = h2o.get_model(aml.leader.metalearner()['name'])
metalearner.std_coef_plot()

In [None]:
# Compare the H2O test targets with `preds`.
# NOTE(review): neither `preds` nor `regression_results` is defined in the
# visible notebook; `preds` presumably should come from a predict call on
# `h2o_test` — confirm.
regression_results(h2o_test.as_data_frame()['y_true'], preds.as_data_frame())

## Modelling Deep Learning

In [None]:
import tensorflow as tf

In [None]:
from tensorflow.keras.layers import Dense,Dropout
from tensorflow.keras.models import Sequential
from tensorflow.keras.optimizers import SGD,RMSprop
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from tensorflow.keras.layers import BatchNormalization
from tensorflow.keras import backend as K
from tensorflow.keras import metrics as tf_metrics
from tensorflow.keras import losses

In [None]:
# Split the target off the training frame (the drop mutates X_train in place, so
# the cell cannot be re-run).
# NOTE(review): `X_train` is not created anywhere above and
# 'next_purchase_in_days' is not a column this notebook has used so far — this
# section looks carried over from another project; confirm the intended
# frame and target.
y_train = X_train['next_purchase_in_days']#.apply(np.log1p)
X_train.drop('next_purchase_in_days', axis=1, inplace=True)

In [None]:
# Same separation for the test frame (`X_test` is likewise undefined above).
y_test = X_test['next_purchase_in_days']
X_test.drop('next_purchase_in_days', axis=1, inplace=True)

In [None]:
# Remove the excluded columns from the training features.
# Rebinding X_train means a second run of this cell fails: the columns are
# already gone.
X_train = X_train.drop(drop_columns, axis=1)

In [None]:
# Test features get a separate name, leaving X_test itself untouched.
X_test_prep = X_test.drop(drop_columns, axis=1)

In [None]:
# Fit the min-max scaler on the non-categorical training columns only.
# This rebinds `mms`, which the H2O section also uses.
mms = MinMaxScaler().fit(X_train.drop(cat_columns, axis=1))

In [None]:
# Names of the columns the scaler was fitted on (everything outside cat_columns).
cols_numeric = list(X_train.columns[~X_train.columns.isin(cat_columns)])

In [None]:
# Scale the numeric training columns in place, keeping the categorical ones.
X_train.loc[:, cols_numeric] = mms.transform(X_train.loc[:, cols_numeric])

In [None]:
# Scale the numeric test columns in place, mirroring the training cell.
# Assigning the bare transform() result would replace the whole frame with a
# NumPy array and lose the categorical columns needed for one-hot encoding.
X_test_prep.loc[:, cols_numeric] = mms.transform(X_test_prep.loc[:, cols_numeric])

In [None]:
# One-hot encode the categorical training columns. The cell is not re-runnable:
# the original categorical columns no longer exist afterwards.
X_train = pd.get_dummies(X_train, columns=cat_columns)

In [None]:
# One-hot encode the *test* frame. Encoding X_train here would overwrite the
# test features with training data, and would fail anyway because X_train's
# categorical columns have already been expanded.
# NOTE(review): categories missing from the test set yield fewer dummy columns
# than in X_train — align the columns before predicting.
X_test_prep = pd.get_dummies(X_test_prep, columns=cat_columns)

In [None]:
# Drop one dummy column from the training features, in place.
# NOTE(review): "flag_covid_0" does not correspond to any entry in
# `cat_columns`, and the same column is not removed from X_test_prep — confirm
# the column exists and that the train/test columns still match.
X_train.drop("flag_covid_0", axis=1, inplace=True)

In [None]:
# Inspect the prepared training features.
X_train

In [None]:
# Carve a 20% validation set out of the training data (seeded).
# X_train / y_train are rebound, so re-running this cell shrinks them again.
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, random_state=33, test_size=0.20)

In [None]:
# Feed-forward regression network. Every normalisation and dropout layer is
# registered through model.add(); a bare `BatchNormalization()` or
# `Dropout(...)` statement only builds a layer object and discards it, which
# left the network as four plain Dense layers.
model = Sequential()
model.add(tf.keras.Input(shape=(X_train.shape[1],)))
model.add(BatchNormalization())
model.add(Dense(512, activation='relu'))
model.add(BatchNormalization())
model.add(Dropout(0.5))
model.add(Dense(128, activation='relu'))
model.add(BatchNormalization())
# NOTE(review): a 0.8 dropout rate is unusually aggressive — confirm intended.
model.add(Dropout(0.8))
model.add(Dense(64))
model.add(BatchNormalization())
model.add(Dense(1))
#sgd = SGD(lr=0.01)
mape_loss = tf.keras.losses.MeanAbsolutePercentageError()
msle_loss = tf.keras.losses.MeanSquaredLogarithmicError()
# Train on MSLE; also track MSE, MSLE, MAPE and MAE.
model.compile(
    optimizer='rmsprop',
    loss=msle_loss,
    metrics=[
        'mean_squared_error',
        tf_metrics.MeanSquaredLogarithmicError(),
        tf_metrics.MeanAbsolutePercentageError(),
        tf_metrics.MeanAbsoluteError(),
    ],
)

In [None]:
%%time
h = model.fit(X_train, y_train, validation_data=(X_val, y_val), use_multiprocessing=True, workers=-1, epochs=80, batch_size=128, verbose=1)