5 - É possível identificar casos inconsistentes utilizando classificação de NCM?

- CÓDIGO NCM/SH

- DESCRIÇÃO DO PRODUTO/SERVIÇO, NATUREZA DA OPERAÇÃO

- DATA EMISSÃO MES, DATA EMISSÃO DIA DA SEMANA, DATA EMISSÃO DIA, DATA EMISSÃO PERÍODO, VALOR DA NOTA FISCAL, VALOR UNITÁRIO, QUANTIDADE, CFOP, INDICADOR IE DESTINATÁRIO

### Importação de Bibliotecas

In [None]:
import pandas as pd
import numpy as np
import random
import seaborn as sns
import datetime
import os
from sklearn.metrics import classification_report

import tensorflow_addons as tfa
import keras_tuner as kt
from tensorflow import keras
import tensorflow as tf

from classes import Preprocessing, Lstm, Model

# Seed the Python, NumPy and TensorFlow random number generators with one
# shared value; the same SEED is also handed to the Keras Tuner search below.
SEED = 42
tf.random.set_seed(seed=SEED)
np.random.seed(seed=SEED)
random.seed(a=SEED)

### Leitura de Dados

In [2]:
# Load the processed NF-e CSV; the path is relative to the notebook's working directory.
raw_df = pd.read_csv(filepath_or_buffer='data/processed_nfe/nfe_100000.csv')

### Pré-processamento

Tipagem

In [3]:
# Two project helpers applied in sequence: first the column typing step, then
# the "event authorized" filter (semantics live in classes.Preprocessing —
# presumably it keeps only authorized invoices; verify against the helper).
df = Preprocessing.filter_event_authorized(
    Preprocessing.define_types(raw_df)
)

Definição de colunas

In [None]:
# The NCM chapter is the first two characters of the NCM/SH code.
# NOTE(review): if the code column was parsed as an integer, leading zeros are
# already lost and chapters 01-09 would be mislabelled — confirm that
# Preprocessing.define_types keeps the code as a zero-padded string.
ncm_chapter = df['CÓDIGO NCM/SH'].astype(str).str[:2]

# Build a fresh two-column frame instead of mutating the filtered frame in
# place: assigning a column / renaming with inplace=True on a slice is what
# triggers SettingWithCopyWarning and silently alters the upstream frame.
df = (
    df[['DESCRIÇÃO DO PRODUTO/SERVIÇO']]
    .rename(columns={'DESCRIÇÃO DO PRODUTO/SERVIÇO': 'DESCRICAO'})
    .assign(**{'CAPÍTULO NCM': ncm_chapter})
)

df.head()

Aplica pré-processamento no texto da 'DESCRICAO'

In [None]:
# Run the project's text pre-processing over the frame. The helper returns the
# updated frame together with a second object kept here as `corpus_desc`
# (presumably the description corpus — confirm in classes.Preprocessing).
(df, corpus_desc) = Preprocessing.apply_preprocessing(df)

df.head(n=5)

In [None]:
# Partition the pre-processed frame into train / validation / test frames;
# no ratios are passed here, so they are decided inside Preprocessing.split_dataset.
(df_train, df_val, df_test) = Preprocessing.split_dataset(df)

In [None]:
# Sequence-length statistics are taken from the training split only; the
# maximum is reused below as the padding length.
mean_sequence_length, max_sequence_length = Preprocessing.get_sequences_details(
    df_train
)

print(
    f'Mean sequence length: {mean_sequence_length}',
    f'Max sequence length: {max_sequence_length}',
    sep='\n',
)

In [None]:
# Pad/truncate every sequence to the longest sequence seen in the training split.
MAX_SEQUENCE_LENGTH = max_sequence_length
# NOTE(review): hard-coded class count — confirm it matches the distinct values of LABEL.
NUM_LABELS = 2

# The TEXT column of each split is cast to str and handed to the helper in
# train, validation, test order, followed by the padding length.
VOCAB_SIZE, X_train_padded, X_val_padded, X_test_padded = Preprocessing.adapt_X_for_input_layer(
    *(split['TEXT'].astype(str) for split in (df_train, df_val, df_test)),
    MAX_SEQUENCE_LENGTH,
)

print('Training features shape:', X_train_padded.shape)
print('Validation features shape:', X_val_padded.shape)
print('Test features shape:', X_test_padded.shape)

X_train_padded

In [None]:
# Class distribution of the training labels before any resampling.
df_train.loc[:, 'LABEL'].value_counts()

In [None]:
# Resample the training set with the project's SMOTE helper.
# NOTE(review): X_train_padded is overwritten with the resampled features, so
# this cell is not re-runnable on its own — a second run would pair the
# already-resampled features with the original labels. Re-run the padding cell first.
X_train_padded, y_train_smote = Preprocessing.smote(
    X_train_padded,
    df_train['LABEL'],
)

# Class distribution after resampling.
pd.DataFrame(y_train_smote).value_counts()

In [None]:
# Convert the three label vectors into the representation the network expects;
# the training labels are the resampled ones, validation/test labels are untouched.
y_train_cat, y_val_cat, y_test_cat = Preprocessing.adapt_y_for_input_layer(
    y_train_smote,
    df_val['LABEL'],
    df_test['LABEL'],
)

y_train_cat.shape

In [None]:
# Metrics tracked during training and tuning. The metric names matter: Keras
# logs them under these names (prefixed with "val_" for validation data), and
# the tuner objective refers to the macro F1 entry by name.
METRICS = [
    # Confusion-matrix counts.
    keras.metrics.TruePositives(name='tp'),
    keras.metrics.FalsePositives(name='fp'),
    keras.metrics.TrueNegatives(name='tn'),
    keras.metrics.FalseNegatives(name='fn'),
    # Threshold-based summary metrics.
    keras.metrics.BinaryAccuracy(name='accuracy'),
    keras.metrics.Precision(name='precision'),
    keras.metrics.Recall(name='recall'),
    # Area under the ROC curve and under the precision-recall curve.
    keras.metrics.AUC(name='auc'),
    keras.metrics.AUC(name='prc', curve='PR'),
    # Macro-averaged F1; the class count comes from NUM_LABELS (the same
    # constant handed to the model) instead of a duplicated literal 2.
    tfa.metrics.F1Score(num_classes=NUM_LABELS, average='macro', name='f1_score_macro'),
]

In [None]:
# Random hyperparameter search over the project's LSTM hypermodel.
tuner = kt.RandomSearch(
    hypermodel=Lstm(VOCAB_SIZE, MAX_SEQUENCE_LENGTH, NUM_LABELS, METRICS, None),
    # Select trials on the *validation* macro F1. The search is run with
    # validation_data, so Keras logs the metric as 'val_f1_score_macro';
    # optimising the training-set value would reward overfitting.
    objective=kt.Objective('val_f1_score_macro', direction='max'),
    max_trials=1,
    executions_per_trial=2,
    # `overwrite` is left at its default, so a search already stored under
    # directory/project_name is reloaded rather than restarted. Pass
    # overwrite=True to start fresh (e.g. after changing the objective).
    directory="hyperparameters_search",
    project_name="lstm",
    seed=SEED
)

# search_space_summary() prints the summary itself and returns None, so it is
# called directly — wrapping it in print() only appended a stray "None".
tuner.search_space_summary()

In [None]:
# Stop a run once the validation loss has not improved for 5 consecutive epochs.
earlystopping_callback = keras.callbacks.EarlyStopping(
    monitor='val_loss',
    mode='min',
    verbose=1,
    patience=5,
)

# Each execution of this cell logs to its own timestamped TensorBoard directory.
logdir = os.path.join(
    "logs/lstm/",
    datetime.datetime.now().strftime("%Y%m%d-%H%M%S"),
)
tensorboard_callback = keras.callbacks.TensorBoard(log_dir=logdir, histogram_freq=1)

callbacks_list = [earlystopping_callback, tensorboard_callback]

In [4]:
# Launch the hyperparameter search on the resampled training data, evaluating
# each trial against the untouched validation split.
tuner.search(
    X_train_padded,
    y_train_cat,
    callbacks=callbacks_list,
    validation_data=(X_val_padded, y_val_cat),
)

In [5]:
# Take the single best model found by the search and persist it under the
# name 'lstm' via the project's Model helper.
best_model = tuner.get_best_models(num_models=1)[0]
Model.save(best_model, 'lstm')

# Show the hyperparameter values of the best trial.
best_hps = tuner.get_best_hyperparameters(num_trials=1)[0]
print(best_hps.values)

NÃO CONTRIBUINTE       46623
CONTRIBUINTE ISENTO    14220
CONTRIBUINTE ICMS       1212
Name: INDICADOR IE DESTINATÁRIO, dtype: int64

In [None]:
# Training configuration for the final model.
# NOTE(review): these values look copied from a previous best_hps printout —
# confirm they still match the latest search before retraining.
EPOCHS = 4
BATCH_SIZE = 8
LEARNING_RATE = 0.0002496944527778083
LOSS = 'categorical_crossentropy'

# Reload the model saved under the name 'lstm' and compile it with the
# configuration above and the shared METRICS list.
hypermodel = Model.recover('lstm')
hypermodel.compile(
    loss=LOSS,
    metrics=METRICS,
    optimizer=keras.optimizers.Adam(learning_rate=LEARNING_RATE),
)