In [1]:
import pandas as pd
import os
import sys
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import dill
from typing import List
import scipy.stats as sts
from sklearn.pipeline import Pipeline
from sklearn.decomposition import PCA
from sklearn.preprocessing import RobustScaler
from sklearn.metrics import precision_score, recall_score, f1_score, accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.neural_network import MLPClassifier
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from tensorflow.keras.callbacks import ModelCheckpoint

import util
from pathlib import Path
from sk.replace_column_dataframe import ReplaceColumnDataFrame
from sk.norm_standard_scaler import NormStandardScaler
import constants.columns_dataframe as const
from class_manipulates_path import ManipulatePath
from class_preprocessing_refactor import Preprocessing
from class_format_data import FormatData
# Run the init() hook of the project's local util module (its effect is not visible in this notebook).
util.init()

# First entry of the module search path.
# NOTE(review): current_path is not referenced again in the visible cells.
current_path = sys.path[0]

In [None]:
import tensorflow as tf

# List the GPUs TensorFlow can see; an empty list means no GPU is available to it.
print("GPU disponível:", tf.config.list_physical_devices('GPU'))

# is_built_with_cuda() only reports whether this TensorFlow build was compiled with
# CUDA support, not whether a GPU is actually being used, so label it accordingly.
print("TensorFlow compilado com suporte a CUDA:", tf.test.is_built_with_cuda())

GPU disponível: []
TensorFlow está usando a GPU: False


In [2]:
# Query the visible GPUs once through the stable (non-experimental) API and reuse the result.
gpus = tf.config.list_physical_devices('GPU')

print("Configurações do TensorFlow relacionadas à GPU:")
print(gpus)
# is_built_with_cuda() describes how TensorFlow was compiled, not whether a GPU is in use.
print("TensorFlow compilado com suporte a CUDA:", tf.test.is_built_with_cuda())
print("Número de GPUs disponíveis:", len(gpus))

Configurações do TensorFlow relacionadas à GPU:
[]
TensorFlow está usando a GPU: False
Número de GPUs disponíveis: 0


In [3]:
def plot_variable(df, column_name, intervalos=None):
    """
    Draw a scatter plot of one DataFrame column against the DataFrame index.

    Parameters:
        - df: pandas DataFrame holding the data.
        - column_name: name of the column plotted on the y axis.
        - intervalos: optional sequence (list, NumPy array, pandas Index, ...)
          of x positions at which dashed red vertical lines are drawn.
    """
    plt.figure(figsize=(20, 10))
    plt.scatter(df.index.values, df[column_name].values, color='darkcyan', alpha=0.5)
    plt.xlabel('Índice')
    plt.ylabel(column_name)
    plt.title(f'Gráfico de dispersão para a variável "{column_name}"')

    # Compare against None instead of relying on truthiness: `if intervalos:` raises
    # ValueError for NumPy arrays and pandas objects holding more than one element.
    if intervalos is not None:
        for intervalo in intervalos:
            plt.axvline(x=intervalo, linestyle='--', color='red')

    plt.show()

In [5]:
# Instantiate helper objects from the project's local modules (imported in the first cell).
manipulate_path = ManipulatePath()

preprocessing = Preprocessing()

format_data = FormatData()

# Raw-data location as reported by ManipulatePath.
# NOTE(review): preprocessing, format_data and path_raw_data are not used in the visible cells.
path_raw_data = manipulate_path.get_path_raw_data()

In [6]:
# Load the pre-processed dataset from the parquet file whose path ManipulatePath provides.
df_preprocessing = pd.read_parquet(manipulate_path.get_path_preprocessing_real_data_all_classes())

In [7]:
# Number of rows per class label in the full pre-processed dataset.
contagem_classes = df_preprocessing['class'].value_counts()

# Share of each class, expressed as a percentage of all rows.
porcentagem_classes_real = contagem_classes.div(len(df_preprocessing)).mul(100)
porcentagem_classes_real

class
0.0    32.679781
5.0    30.983203
7.0    29.009351
1.0     4.797527
6.0     1.810310
2.0     0.719828
Name: count, dtype: float64

Aqui é possível ver que há pouquíssimas classes normais!

# Rede 1

In [8]:
# Show the distinct class labels present in the dataset.
df_preprocessing["class"].unique()

array([0., 1., 2., 5., 6., 7.])

In [9]:
# Separar as features (X) e os rótulos (y)
X = df_preprocessing.drop('class', axis=1)
y = df_preprocessing['class']

In [10]:
# Display the feature matrix (pandas truncates the rendering of large frames).
X

Unnamed: 0_level_0,P-TPT,T-TPT,P-MON-CKP,T-JUS-CKP
timestamp,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2014-01-24 09:33:03,18433410.0,116.8718,9397031.0,74.80031
2014-01-24 09:33:04,18433410.0,116.8718,9397031.0,74.80031
2014-01-24 09:33:05,18433410.0,116.8718,9397032.0,74.80031
2014-01-24 09:33:06,18433410.0,116.8718,9397033.0,74.80031
2014-01-24 09:33:07,18433410.0,116.8718,9397033.0,74.80031
...,...,...,...,...
2019-04-03 14:59:56,8489354.0,109.7213,1496222.0,73.38219
2019-04-03 14:59:57,8489349.0,109.7212,1495828.0,73.38310
2019-04-03 14:59:58,8489344.0,109.7210,1495433.0,73.38401
2019-04-03 14:59:59,8489338.0,109.7209,1495039.0,73.38493


In [11]:
# Hold out 30% of the samples as the test set, stratified on the class label.
# The remaining 70% (X_train_k / y_train_k) is what the cross-validation loops use.
X_train_k, X_test, y_train_k, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    stratify=y,
    random_state=42,
)

In [12]:
# Carve a stratified 20% validation split out of the 70% training portion.
X_train, X_val, y_train, y_val = train_test_split(
    X_train_k,
    y_train_k,
    test_size=0.2,
    stratify=y_train_k,
    random_state=42,
)

In [13]:
# Fit the project's NormStandardScaler on the inner training split only.
scaler = NormStandardScaler(X_train.columns)
X_train_norm = scaler.fit_transform(X_train)

# Reuse the fitted scaler on the cross-validation pool (the superset X_train was drawn from).
X_train_k_norm = scaler.transform(X_train_k)

# Reuse the fitted scaler on the validation split.
X_val_norm = scaler.transform(X_val)

# Reuse the fitted scaler on the held-out test split.
X_test_norm = scaler.transform(X_test)

In [14]:
# Display the training labels (original multi-class values).
y_train

timestamp
2017-08-01 16:45:09    1.0
2014-03-18 13:35:54    5.0
2017-02-26 16:16:24    0.0
2019-04-03 03:37:20    0.0
2017-07-31 19:05:08    0.0
                      ... 
2018-06-20 17:05:13    0.0
2017-02-27 08:52:43    7.0
2018-06-21 15:41:24    7.0
2014-03-18 17:10:15    5.0
2014-03-14 17:47:27    0.0
Name: class, Length: 547530, dtype: float64

In [15]:
def _to_two_classes(labels):
    """Apply the project's two-class mapping to a label Series and return the resulting "class" column."""
    replacer = ReplaceColumnDataFrame(const.TARGET, const.MAPPING_TWO_CLASSES)
    return replacer.transform(pd.DataFrame(labels))["class"]


# Binary targets for network 1, one per data split.
y_train_nn_1 = _to_two_classes(y_train)

y_train_k_nn_1 = _to_two_classes(y_train_k)

y_test_nn_1 = _to_two_classes(y_test)

y_val_nn_1 = _to_two_classes(y_val)

In [16]:
from sklearn.model_selection import KFold
# Number of cross-validation folds.
k = 5
# Shuffled K-fold splitter with a fixed seed; the cross-validation loops of both networks reuse it.
# NOTE(review): KFold does not stratify on the label; consider StratifiedKFold given the rare classes shown above.
kf = KFold(n_splits=k, shuffle=True, random_state=42)

In [17]:
# Display the scaled cross-validation pool.
X_train_k_norm

Unnamed: 0_level_0,P-TPT,T-TPT,P-MON-CKP,T-JUS-CKP
timestamp,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2014-03-20 01:17:06,-1.748677,-0.712923,-1.253200,-1.075840
2018-06-20 20:18:21,1.197046,0.628302,1.535948,0.359248
2014-03-19 14:13:39,-1.569386,-0.248792,-0.920018,-0.453377
2018-06-18 04:58:35,1.130444,0.642377,1.009372,0.624118
2014-03-18 23:21:09,-1.469840,-0.468626,-0.866594,-0.861681
...,...,...,...,...
2018-04-26 14:40:47,-0.619117,-3.688628,-1.426019,-0.402692
2014-03-18 07:09:17,-1.294946,-0.426604,-0.755625,-0.830392
2018-06-18 19:04:41,1.108242,0.658533,1.135461,0.604354
2014-03-17 14:58:10,-1.071926,-0.417184,-0.600213,-0.882121


In [18]:
from tensorflow.keras.optimizers import Adam

In [25]:
accuracies = []
histories = []

# Every fold checkpoints to the same file. Each fold builds a fresh ModelCheckpoint,
# so the file is rewritten with the current fold's weights from its first epoch on.
checkpoint_path = "weights.h5"
# Take the input width from the data instead of hard-coding it.
n_features = X_train_k_norm.shape[1]

for train_index, val_index in kf.split(X_train_k_norm):
    # Select this fold's training and validation rows.
    X_train_fold, X_val_fold = X_train_k_norm.iloc[train_index], X_train_k_norm.iloc[val_index]
    y_train_fold, y_val_fold = y_train_k_nn_1.iloc[train_index], y_train_k_nn_1.iloc[val_index]

    # Drop the previous fold's graph state so models do not accumulate in memory.
    tf.keras.backend.clear_session()

    # Binary classifier: two hidden ReLU layers followed by a single sigmoid output.
    model = Sequential()
    model.add(Dense(32, input_dim=n_features, activation='relu'))
    model.add(Dense(16, activation='relu'))
    model.add(Dense(1, activation='sigmoid'))

    model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

    # Save the model whenever the validation loss improves.
    checkpoint = ModelCheckpoint(checkpoint_path, monitor='val_loss', save_best_only=True)

    # Train while monitoring the fold's validation data.
    history = model.fit(
        X_train_fold,
        y_train_fold,
        epochs=100,
        batch_size=64,
        validation_data=(X_val_fold, y_val_fold),
        callbacks=[checkpoint],
        verbose=True,
    )
    histories.append(history.history)

    # Restore the best checkpoint before scoring. Without this the fold is evaluated
    # on the last-epoch weights and the saved checkpoint is never used.
    model.load_weights(checkpoint_path)

    # Score the fold on its validation rows.
    _, accuracy = model.evaluate(X_val_fold, y_val_fold, verbose=0)
    accuracies.append(accuracy)
    print(f'Acurácia do fold: {accuracy*100:.2f}%')

Epoch 1/100
Epoch 2/100
 174/8556 [..............................] - ETA: 7s - loss: 0.2040 - accuracy: 0.9167

  saving_api.save_model(




KeyboardInterrupt: 

In [19]:
# Summarise the per-fold accuracies: mean and (population) standard deviation.
fold_scores = np.asarray(accuracies)
mean_accuracy = fold_scores.mean()
std_accuracy = fold_scores.std()
print(f'\nAcurácia média: {mean_accuracy*100:.2f}%')
print(f'Desvio padrão da acurácia: {std_accuracy*100:.2f}%')


Acurácia média: 97.20%
Desvio padrão da acurácia: 0.71%


In [20]:
import json

# Persist network 1's cross-validation results: fold accuracies first, then training histories.
for file_name, payload in (
    ('accuracies_rede_1.json', accuracies),
    ('histories_rede_1.json', histories),
):
    with open(file_name, 'w') as f:
        json.dump(payload, f)

In [19]:
# # Defina o modelo
# model = Sequential()
# 
# # Adicione as camadas ocultas
# model.add(Dense(32, input_dim=4, activation='relu'))  # First hidden layer: 32 ReLU units fed by the 4 input features
# model.add(Dense(16, activation='relu'))  # Second hidden layer: 16 ReLU units
# #model.add(Dense(8, activation='relu'))
# #model.add(Dense(8, activation='relu'))
# 
# # Adicione a camada de saída
# model.add(Dense(1, activation='sigmoid'))  # Camada de saída com 1 neurônio e ativação Sigmoid para classificação binária
# 
# # Compile o modelo
# model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
# 
# # Defina o checkpoint para salvar os pesos
# checkpoint = ModelCheckpoint("weights.h5", monitor='val_loss', save_best_only=True)
# 
# # Treine o modelo com conjunto de validação e o checkpoint
# model.fit(X_train_norm, y_train_nn_1, epochs=100, batch_size=10, validation_data=(X_val_norm, y_val_nn_1), callbacks=[checkpoint])
# 
# # Avalie o modelo
# _, accuracy = model.evaluate(X_test_norm, y_test_nn_1)
# print('Acurácia: %.2f' % (accuracy*100))

Epoch 1/100
Epoch 2/100
  175/47909 [..............................] - ETA: 41s - loss: 0.1795 - accuracy: 0.9194

  saving_api.save_model(


Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100

KeyboardInterrupt: 

In [23]:
# Score the test set with network 1 and threshold the sigmoid output at 0.5.
probabilities = np.squeeze(model.predict(X_test_norm))
y_pred = (probabilities > 0.5).astype(int)

accuracy_score(y_test_nn_1.values, y_pred)



0.973837447156689

In [22]:
# Labels present in either the ground truth or the predictions, in sorted order.
# Passing them explicitly keeps every score aligned with the label printed next to it,
# even when a class appears only in the predictions or only in the ground truth.
labels = np.unique(np.concatenate([np.asarray(y_test_nn_1), np.asarray(y_pred)]))

precision = precision_score(y_test_nn_1.values, y_pred, labels=labels, average=None)
recall = recall_score(y_test_nn_1.values, y_pred, labels=labels, average=None)
f1 = f1_score(y_test_nn_1.values, y_pred, labels=labels, average=None)

# One report per class.
for label, class_precision, class_recall, class_f1 in zip(labels, precision, recall, f1):
    print(f'Classe {label}:')
    print(f'Precision: {class_precision}')
    print(f'Recall: {class_recall}')
    print(f'F1-score: {class_f1}\n')

Classe 0.0:
Precision: 0.9597026440904162
Recall: 0.9602633116341178
F1-score: 0.9599828959993325

Classe 1.0:
Precision: 0.9807049359701735
Recall: 0.9804268119758538
F1-score: 0.9805658542515042



# Segunda rede

In [24]:
def _features_with_nonzero_class(features, labels):
    """Return a copy of `features` with a "class" column added, keeping only rows whose class is not 0."""
    labelled = features.copy()
    labelled["class"] = labels
    return labelled[~(labelled["class"] == 0)].copy()


# Network 2 only sees the samples whose original label is not 0.
df_data_train = _features_with_nonzero_class(X_train_norm, y_train)

df_data_train_k = _features_with_nonzero_class(X_train_k_norm, y_train_k)

df_data_val = _features_with_nonzero_class(X_val_norm, y_val)

df_data_test = _features_with_nonzero_class(X_test_norm, y_test)

In [27]:
# Number of rows per class label in network 2's cross-validation pool.
contagem_classes = df_data_train_k['class'].value_counts()

# Share of each class, expressed as a percentage of the pool.
porcentagem_classes_real = contagem_classes.div(len(df_data_train_k)).mul(100)
porcentagem_classes_real

class
5.0    46.023640
7.0    43.091668
1.0     7.126455
6.0     2.689106
2.0     1.069131
Name: count, dtype: float64

In [28]:
def mapping_labels(df: pd.DataFrame, label_mapping: dict = None):
    """
    Encode the 'class' column of a DataFrame as integers.

    Parameters:
        - df: DataFrame with a 'class' column; it is not modified.
        - label_mapping: optional {original label: integer} dict to reuse, e.g. the
          mapping built from the training data, so that other splits receive the
          same encoding. When omitted, the mapping is built from the sorted unique
          labels found in `df` (0 for the smallest label, 1 for the next, ...).

    Returns:
        (encoded copy of `df`, mapping used).

    Raises:
        ValueError: if `label_mapping` is given and `df` contains a label that is
        not one of its keys.
    """
    df_data = df.copy()

    if label_mapping is None:
        unique_labels = df_data['class'].unique()
        unique_labels.sort()
        # Map the labels to integers in ascending order.
        label_mapping = {label: i for i, label in enumerate(unique_labels)}
    else:
        # Fail loudly instead of silently producing NaN codes for unknown labels.
        known = df_data['class'].isin(list(label_mapping))
        if not known.all():
            unknown = sorted(df_data.loc[~known, 'class'].unique().tolist(), key=str)
            raise ValueError(f"Labels missing from label_mapping: {unknown}")
        label_mapping = dict(label_mapping)

    df_data['class'] = df_data['class'].map(label_mapping)
    return df_data, label_mapping

def inverse_mapping_labels(df: pd.DataFrame, label_mapping: dict):
    """
    Decode an integer-encoded 'class' column back to the original labels.

    `label_mapping` is the {original label: integer} dict; it is inverted and
    applied to a copy of `df`, which is returned. Codes with no entry in the
    mapping become NaN.
    """
    # Swap keys and values: integer code -> original label.
    code_to_label = dict(zip(label_mapping.values(), label_mapping.keys()))

    restored = df.copy()
    restored['class'] = restored['class'].map(code_to_label)
    return restored

In [34]:
# Encode the class labels of the cross-validation pool as integers and keep the mapping that was used.
# NOTE(review): this cell overwrites its own input, so running it a second time rebuilds
# label_mapping from the already-encoded labels instead of the original ones.
df_data_train_k, label_mapping = mapping_labels(df_data_train_k)
#df_data_val, _ = mapping_labels(df_data_val)
#df_data_test, _ = mapping_labels(df_data_test)

In [35]:
# Display the {original label: integer code} mapping returned by mapping_labels.
label_mapping

{1.0: 0, 2.0: 1, 5.0: 2, 6.0: 3, 7.0: 4}

In [37]:
# Number of rows per class in the cross-validation pool (recomputed after the label encoding cell).
contagem_classes = df_data_train_k['class'].value_counts()

# Share of each class, expressed as a percentage of the pool.
porcentagem_classes_real = contagem_classes.div(len(df_data_train_k)).mul(100)
porcentagem_classes_real

class
2    46.023640
4    43.091668
0     7.126455
3     2.689106
1     1.069131
Name: count, dtype: float64

In [38]:
# Features and integer-encoded targets used to cross-validate network 2.
y_train_nn_2 = df_data_train_k['class']
X_train_norm_nn_2 = df_data_train_k.drop(columns='class')

In [41]:
accuracies = []
histories = []

# Every fold checkpoints to the same file. Each fold builds a fresh ModelCheckpoint,
# so the file is rewritten with the current fold's weights from its first epoch on.
checkpoint_path = "weights_2.h5"
# Take the layer sizes from the data instead of hard-coding 4 inputs and 5 classes.
n_features = X_train_norm_nn_2.shape[1]
n_classes = y_train_nn_2.nunique()

for train_index, val_index in kf.split(X_train_norm_nn_2):
    # Select this fold's training and validation rows.
    X_train_fold, X_val_fold = X_train_norm_nn_2.iloc[train_index], X_train_norm_nn_2.iloc[val_index]
    y_train_fold, y_val_fold = y_train_nn_2.iloc[train_index], y_train_nn_2.iloc[val_index]

    # Drop the previous fold's graph state so models do not accumulate in memory.
    tf.keras.backend.clear_session()

    # Multi-class classifier: one hidden ReLU layer with 8 units, then a softmax
    # output with one unit per class.
    model_2 = Sequential()
    model_2.add(Dense(8, input_dim=n_features, activation='relu'))
    model_2.add(Dense(n_classes, activation='softmax'))

    # Integer targets, hence the sparse categorical cross-entropy loss.
    model_2.compile(loss='sparse_categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

    # Save the model whenever the validation loss improves.
    checkpoint = ModelCheckpoint(checkpoint_path, monitor='val_loss', save_best_only=True)

    # Train while monitoring the fold's validation data.
    history = model_2.fit(
        X_train_fold,
        y_train_fold,
        epochs=5,
        batch_size=32,
        validation_data=(X_val_fold, y_val_fold),
        callbacks=[checkpoint],
        verbose=True,
    )
    histories.append(history.history)

    # Restore the best checkpoint before scoring. Without this the fold is evaluated
    # on the last-epoch weights and the saved checkpoint is never used.
    model_2.load_weights(checkpoint_path)

    # Score the fold on its validation rows.
    _, accuracy = model_2.evaluate(X_val_fold, y_val_fold, verbose=0)
    accuracies.append(accuracy)
    print(f'Acurácia do fold: {accuracy*100:.2f}%')

Epoch 1/5
Epoch 2/5
  194/11519 [..............................] - ETA: 8s - loss: 0.0012 - accuracy: 0.9998 

  saving_api.save_model(


Epoch 3/5
Epoch 4/5
Epoch 5/5
Acurácia do fold: 100.00%
Epoch 1/5
Epoch 2/5
  193/11519 [..............................] - ETA: 8s - loss: 0.0024 - accuracy: 0.9994

  saving_api.save_model(


Epoch 3/5
Epoch 4/5
Epoch 5/5
Acurácia do fold: 100.00%
Epoch 1/5
Epoch 2/5
  203/11519 [..............................] - ETA: 8s - loss: 0.0032 - accuracy: 0.9994 

  saving_api.save_model(


Epoch 3/5
Epoch 4/5
Epoch 5/5
Acurácia do fold: 100.00%
Epoch 1/5
Epoch 2/5
  280/11519 [..............................] - ETA: 8s - loss: 0.0011 - accuracy: 0.9999

  saving_api.save_model(


Epoch 3/5
Epoch 4/5
Epoch 5/5
Acurácia do fold: 100.00%
Epoch 1/5
Epoch 2/5
  192/11519 [..............................] - ETA: 9s - loss: 0.0045 - accuracy: 0.9993

  saving_api.save_model(


Epoch 3/5
Epoch 4/5
Epoch 5/5
Acurácia do fold: 100.00%


In [42]:
# Summarise network 2's per-fold accuracies: mean and (population) standard deviation.
fold_scores = np.asarray(accuracies)
mean_accuracy = fold_scores.mean()
std_accuracy = fold_scores.std()
print(f'\nAcurácia média: {mean_accuracy*100:.2f}%')
print(f'Desvio padrão da acurácia: {std_accuracy*100:.2f}%')


Acurácia média: 100.00%
Desvio padrão da acurácia: 0.00%


In [43]:
import json

# Persist network 2's cross-validation results: fold accuracies first, then training histories.
for file_name, payload in (
    ('accuracies_rede_2.json', accuracies),
    ('histories_rede_2.json', histories),
):
    with open(file_name, 'w') as f:
        json.dump(payload, f)

In [45]:
# # Separar as features (X) e os rótulos (y)
# X_train_norm_nn_2 = df_data_train.drop('class', axis=1)
# y_train_nn_2 = df_data_train['class']
# 
# X_test_norm_nn_2 = df_data_test.drop('class', axis=1)
# y_test_nn_2 = df_data_test['class']
# 
# X_val_norm_nn_2 = df_data_val.drop('class', axis=1)
# y_val_nn_2 = df_data_val['class']

In [46]:
# model_2 = Sequential()
# 
# # Adicione as camadas ocultas
# model_2.add(Dense(8, input_dim=4, activation='relu'))  # Hidden layer: 8 ReLU units fed by the 4 input features
# #model_2.add(Dense(3, activation='relu'))  # Segunda camada oculta com 3 neurônios e ativação ReLU (opcional)
# 
# # Adicione a camada de saída
# model_2.add(Dense(5, activation='softmax'))  # Camada de saída com 5 neurônios (um para cada classe) e ativação Softmax para classificação multiclasse
# 
# # Compile o model_2o
# model_2.compile(loss='sparse_categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
# 
# # Defina o checkpoint para salvar os pesos
# checkpoint = ModelCheckpoint("weights_2.h5", monitor='val_loss', save_best_only=True)
# 
# # Treine o model_2o com conjunto de validação e o checkpoint
# model_2.fit(X_train_norm_nn_2, y_train_nn_2, epochs=2, batch_size=10, validation_data=(X_val_norm_nn_2, y_val_nn_2), callbacks=[checkpoint])

Epoch 1/2
Epoch 2/2
  173/32253 [..............................] - ETA: 28s - loss: 1.3641e-04 - accuracy: 1.0000

  saving_api.save_model(




<keras.src.callbacks.History at 0x17eec439f10>

# Avaliação nos dados de teste

In [44]:
# Stage 1 of the cascade: network 1's binary prediction on the test set (sigmoid output thresholded at 0.5).
probabilities = np.squeeze(model.predict(X_test_norm))
y_pred = (probabilities > 0.5).astype(int)

accuracy_score(y_test_nn_1.values, y_pred)



0.973837447156689

In [45]:
# Number of test samples that received a binary prediction.
len(y_pred)

293320

In [46]:
# Start the cascade output from the scaled test features plus network 1's binary
# prediction in "class" (rows flagged 1 are the ones forwarded to network 2).
df_test_all = X_test_norm.copy()
df_test_all["class"] = y_pred
# Take an explicit copy of the forwarded rows: later cells write into this frame, and
# writing into a filtered slice of df_test_all triggers pandas' SettingWithCopyWarning.
df_test_all_one = df_test_all[df_test_all["class"] == 1].copy()

In [47]:
# Features of the rows forwarded to network 2 (binary flag column removed).
X_test_all = df_test_all_one.drop(columns='class')

In [48]:
# Stage 2 of the cascade: pick the most probable class index from network 2's softmax output.
y_pred_all = model_2.predict(X_test_all).argmax(axis=1)



In [49]:
# Replace the binary flag of the forwarded rows with the class index predicted by network 2.
df_test_all_one.loc[:, "class"] = y_pred_all

In [50]:
# Translate network 2's class indices back to the original labels using the training-time mapping.
df_test_all_one = inverse_mapping_labels(df_test_all_one, label_mapping)

In [51]:
# Write network 2's decoded labels into the rows that network 1 flagged with 1.
# NOTE(review): not safe to re-run. After this assignment "class" holds decoded labels, and
# label_mapping contains the original label 1.0, so the `== 1` mask would select a different set of rows.
df_test_all.loc[df_test_all["class"] == 1, "class"] = df_test_all_one["class"].values

In [52]:
# Display the test features together with the cascade's final class prediction.
df_test_all

Unnamed: 0_level_0,P-TPT,T-TPT,P-MON-CKP,T-JUS-CKP,class
timestamp,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2018-06-21 16:48:38,1.256248,0.591633,1.701213,0.123913,7
2017-02-27 04:46:13,-0.236830,0.645243,-0.190925,0.737439,7
2014-03-18 07:15:53,-1.297435,-0.427036,-0.805715,-0.805688,5
2014-03-16 03:04:51,-0.547333,-1.136632,-0.333371,-1.797729,5
2014-03-15 12:49:09,-0.349048,-1.117220,-0.139313,-1.647358,5
...,...,...,...,...,...
2018-06-17 18:38:21,1.137084,0.638832,1.021910,0.623915,0
2014-03-19 04:30:13,-1.487571,-0.372468,-0.970286,-0.762578,0
2014-03-17 22:16:32,-1.195370,-0.434045,-0.686572,-0.870438,5
2017-08-01 19:32:26,0.612422,0.714572,-1.005371,1.205920,1


In [53]:
# Overall accuracy of the two-stage cascade against the original multi-class test labels.
accuracy_score(y_test, df_test_all["class"].values)

0.973837447156689

In [54]:
y_pred_cascade = df_test_all["class"].values

# Labels present in either the ground truth or the cascade's predictions, in sorted order.
# Passing them explicitly keeps every score aligned with the label printed next to it,
# even when a class appears only in the predictions or only in the ground truth.
labels = np.unique(np.concatenate([np.asarray(y_test), np.asarray(y_pred_cascade)]))

precision = precision_score(y_test, y_pred_cascade, labels=labels, average=None)
recall = recall_score(y_test, y_pred_cascade, labels=labels, average=None)
f1 = f1_score(y_test, y_pred_cascade, labels=labels, average=None)

# One report per class.
for label, class_precision, class_recall, class_f1 in zip(labels, precision, recall, f1):
    print(f'Classe {label}:')
    print(f'Precision: {class_precision}')
    print(f'Recall: {class_recall}')
    print(f'F1-score: {class_f1}\n')

Classe 0.0:
Precision: 0.9597026440904162
Recall: 0.9602633116341178
F1-score: 0.9599828959993325

Classe 1.0:
Precision: 1.0
Recall: 0.9995736213757817
F1-score: 0.9997867652285165

Classe 2.0:
Precision: 1.0
Recall: 0.9952651515151515
F1-score: 0.9976269577598482

Classe 5.0:
Precision: 0.9955813249296265
Recall: 0.9768155809859155
F1-score: 0.9861091826023205

Classe 6.0:
Precision: 0.9992242048099301
Recall: 0.9702448210922787
F1-score: 0.9845213070896235

Classe 7.0:
Precision: 0.9607556634490376
Recall: 0.9813844165001763
F1-score: 0.9709604841663422

