In [1]:
import pandas as pd
import os
import sys
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import dill
from typing import List
import scipy.stats as sts
from sklearn.pipeline import Pipeline
from sklearn.decomposition import PCA
from sklearn.preprocessing import RobustScaler
from sklearn.metrics import precision_score, recall_score, f1_score, accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.neural_network import MLPClassifier
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense

import util
from pathlib import Path
from class_manipulates_path import ManipulatePath
from class_preprocessing_refactor import Preprocessing
from class_format_data import FormatData
util.init()

current_path = sys.path[0]

In [2]:
def plot_variable(df, column_name, intervalos=None):
    """
    Plota um gráfico de dispersão para uma variável de um DataFrame.

    Parâmetros:
        - df: DataFrame pandas contendo os dados.
        - column_name: Nome da coluna que você deseja plotar.
        - intervalos: Lista de intervalos para plotar linhas verticais.
    """
    plt.figure(figsize=(20, 10))
    plt.scatter(df.index.values, df[column_name].values, color='darkcyan', alpha=0.5)
    plt.xlabel('Índice')
    plt.ylabel(column_name)
    plt.title(f'Gráfico de dispersão para a variável "{column_name}"')


    # Plotar linhas verticais para cada intervalo
    if intervalos:
        for intervalo in intervalos:
            plt.axvline(x=intervalo, linestyle='--', color='red')

    plt.show()

In [3]:
def class_and_file_generator(data_path, real=False, simulated=False, drawn=False):
    for class_path in data_path.iterdir():
        if class_path.is_dir():
            class_code = int(class_path.stem)
            for instance_path in class_path.iterdir():
                if (instance_path.suffix == '.csv'):
                    if (simulated and instance_path.stem.startswith('SIMULATED')) or \
                       (drawn and instance_path.stem.startswith('DRAWN')) or \
                       (real and (not instance_path.stem.startswith('SIMULATED')) and \
                       (not instance_path.stem.startswith('DRAWN'))):
                        yield class_code, instance_path

In [4]:
manipulate_path = ManipulatePath()

preprocessing = Preprocessing()

format_data = FormatData()

path_raw_data = manipulate_path.get_path_raw_data()

abnormal_classes_codes = [1, 2, 5, 6, 7, 8]

In [5]:
# Gets all real instances but maintains only those with any type of undesirable event
real_instances = pd.DataFrame(class_and_file_generator(path_raw_data, 
                                                       real=True,
                                                       simulated=False, 
                                                       drawn=False),
                              columns=['class_code', 'instance_path'])
real_instances = real_instances.loc[real_instances.iloc[:,0].isin(abnormal_classes_codes)].reset_index(drop=True)

In [6]:
# Gets all real instances but maintains only those with any type of undesirable event
draw_instances = pd.DataFrame(class_and_file_generator(path_raw_data, 
                                                       real=False,
                                                       simulated=True, 
                                                       drawn=False),
                              columns=['class_code', 'instance_path'])
draw_instances = draw_instances.loc[draw_instances.iloc[:,0].isin(abnormal_classes_codes)].reset_index(drop=True)

In [7]:
0 % 10

0

In [8]:
#file_paths = real_instances[real_instances["class_code"] == falha]["instance_path"].values
df_output = pd.DataFrame()
qtde_segundos = 60 * 20
falhas = [1, 2, 5, 6, 7, 8, 101, 102, 105, 106, 107, 108]
for file_path in real_instances["instance_path"].values:
    df_data = pd.read_csv(file_path)

    df_data = FormatData.set_index_dataframe(df_data=df_data,
                                         column_name="timestamp")
    
    for falha in falhas:
        df_data['class'] = df_data['class'].replace(falha, 1)
    
    segundos = (df_data['class'] == 0).sum()

    quantidade = df_data['class'].isin(falhas).sum()
    if quantidade != 0:
        print(f"1 se repete {quantidade}")
        print(f"Index min {df_data.index.min()} index max {df_data.index.max()}")

        df_output = pd.concat([df_output, df_data], axis=0)

1 se repete 14303
Index min 2014-01-24 09:33:03 index max 2014-01-25 01:47:30
1 se repete 9499
Index min 2014-01-26 16:19:44 index max 2014-01-26 23:00:00
1 se repete 40042
Index min 2017-07-31 18:09:30 index max 2017-08-01 20:45:00
1 se repete 31658
Index min 2017-07-31 22:04:32 index max 2017-08-01 20:19:27
1 se repete 10573
Index min 2018-06-17 20:02:57 index max 2018-06-18 11:00:00
1 se repete 5306
Index min 2013-11-04 01:41:01 index max 2013-11-04 04:13:01
1 se repete 3175
Index min 2017-03-13 16:08:04 index max 2017-03-13 18:00:21
1 se repete 3863
Index min 2017-12-18 19:01:31 index max 2017-12-18 21:44:59
1 se repete 20186
Index min 2014-05-15 10:46:09 index max 2014-05-15 16:59:37
1 se repete 1490
Index min 2014-05-30 09:32:03 index max 2014-05-30 10:33:14
1 se repete 11299
Index min 2014-06-06 22:38:10 index max 2014-06-07 02:41:46
1 se repete 4901
Index min 2014-07-20 12:01:02 index max 2014-07-20 15:20:12
1 se repete 2536
Index min 2014-07-26 17:45:23 index max 2014-07-26 18

In [9]:
#file_paths = draw_instances[draw_instances["class_code"] == falha]["instance_path"].values
df_draw = pd.DataFrame()
qtde_segundos = 60 * 20
falhas = [1, 2, 5, 6, 7, 8, 101, 102, 105, 106, 107, 108]
for file_path in draw_instances["instance_path"].values:
    df_data = pd.read_csv(file_path)

    df_data = FormatData.set_index_dataframe(df_data=df_data,
                                         column_name="timestamp")
    
    for falha in falhas:
        df_data['class'] = df_data['class'].replace(falha, 1)
    
    segundos = (df_data['class'] == 0).sum()

    quantidade = df_data['class'].isin(falhas).sum()
    if quantidade != 0:
        print(f"1 se repete {quantidade}")
        print(f"Index min {df_data.index.min()} index max {df_data.index.max()}")

        df_draw = pd.concat([df_draw, df_data], axis=0)

1 se repete 159021
Index min 2018-09-05 20:44:36 index max 2018-09-07 20:44:35
1 se repete 79511
Index min 2018-09-05 20:44:36 index max 2018-09-06 20:44:35
1 se repete 19439
Index min 2018-09-05 20:44:36 index max 2018-09-06 02:44:35
1 se repete 28508
Index min 2018-09-05 20:44:36 index max 2018-09-06 08:44:35
1 se repete 12959
Index min 2018-09-05 20:44:36 index max 2018-09-06 00:44:35
1 se repete 53570
Index min 2018-09-05 20:44:36 index max 2018-09-06 20:44:35
1 se repete 41478
Index min 2018-09-05 20:44:36 index max 2018-09-06 08:44:35
1 se repete 12959
Index min 2018-09-05 20:44:36 index max 2018-09-06 00:44:35
1 se repete 20317
Index min 2018-09-05 20:44:36 index max 2018-09-06 02:44:35
1 se repete 33710
Index min 2018-09-05 20:44:36 index max 2018-09-06 08:44:35
1 se repete 304263
Index min 2018-08-20 11:25:44 index max 2018-08-24 11:25:44
1 se repete 311012
Index min 2018-08-20 11:25:44 index max 2018-08-24 11:25:44
1 se repete 318042
Index min 2018-08-20 11:25:44 index max 20

In [10]:
df_output

Unnamed: 0_level_0,P-PDG,P-TPT,T-TPT,P-MON-CKP,T-JUS-CKP,P-JUS-CKGL,T-JUS-CKGL,QGL,class
timestamp,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
2014-01-24 09:33:03,0.0,18433410.0,116.8718,9397031.0,74.80031,-300917.8,,0.0,0.0
2014-01-24 09:33:04,0.0,18433410.0,116.8718,9397031.0,74.80031,-300917.8,,0.0,0.0
2014-01-24 09:33:05,0.0,18433410.0,116.8718,9397032.0,74.80031,-300917.8,,0.0,0.0
2014-01-24 09:33:06,0.0,18433410.0,116.8718,9397033.0,74.80031,-300917.8,,0.0,0.0
2014-01-24 09:33:07,0.0,18433410.0,116.8718,9397033.0,74.80031,-300917.8,,0.0,0.0
...,...,...,...,...,...,...,...,...,...
2019-04-03 14:59:56,0.0,8489354.0,109.7213,1496222.0,73.38219,9706965.0,,0.0,1.0
2019-04-03 14:59:57,0.0,8489349.0,109.7212,1495828.0,73.38310,9706899.0,,0.0,1.0
2019-04-03 14:59:58,0.0,8489344.0,109.7210,1495433.0,73.38401,9706834.0,,0.0,1.0
2019-04-03 14:59:59,0.0,8489338.0,109.7209,1495039.0,73.38493,9706768.0,,0.0,1.0


In [11]:
df_draw

Unnamed: 0_level_0,P-PDG,P-TPT,T-TPT,P-MON-CKP,T-JUS-CKP,P-JUS-CKGL,T-JUS-CKGL,QGL,class
timestamp,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
2018-09-05 20:44:36,420.2314,175.3406,114.8907,96.64524,66.75450,,,,0
2018-09-05 20:44:37,420.2314,175.3406,114.8907,96.64524,66.75450,,,,0
2018-09-05 20:44:38,420.2314,175.3406,114.8907,96.64524,66.75450,,,,0
2018-09-05 20:44:39,420.2314,175.3406,114.8907,96.64524,66.75450,,,,0
2018-09-05 20:44:40,420.2314,175.3406,114.8907,96.64524,66.75450,,,,0
...,...,...,...,...,...,...,...,...,...
2018-08-21 11:25:40,260.9383,211.8869,117.8766,122.91520,65.02185,,,,1
2018-08-21 11:25:41,260.9383,211.8869,117.8766,122.91520,65.02185,,,,1
2018-08-21 11:25:42,260.9383,211.8869,117.8766,122.91520,65.02185,,,,1
2018-08-21 11:25:43,260.9383,211.8869,117.8766,122.91520,65.02185,,,,1


In [12]:
df_output = df_output[~df_output.index.duplicated()]

In [13]:
df_draw = df_draw[~df_draw.index.duplicated()]

In [14]:
df_output.shape

(1334120, 9)

In [15]:
df_draw.shape

(518401, 9)

In [16]:
df_aux = df_output.drop(columns=["QGL", "T-JUS-CKGL", "P-JUS-CKGL"])
df_aux

Unnamed: 0_level_0,P-PDG,P-TPT,T-TPT,P-MON-CKP,T-JUS-CKP,class
timestamp,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2014-01-24 09:33:03,0.0,18433410.0,116.8718,9397031.0,74.80031,0.0
2014-01-24 09:33:04,0.0,18433410.0,116.8718,9397031.0,74.80031,0.0
2014-01-24 09:33:05,0.0,18433410.0,116.8718,9397032.0,74.80031,0.0
2014-01-24 09:33:06,0.0,18433410.0,116.8718,9397033.0,74.80031,0.0
2014-01-24 09:33:07,0.0,18433410.0,116.8718,9397033.0,74.80031,0.0
...,...,...,...,...,...,...
2019-04-03 14:59:56,0.0,8489354.0,109.7213,1496222.0,73.38219,1.0
2019-04-03 14:59:57,0.0,8489349.0,109.7212,1495828.0,73.38310,1.0
2019-04-03 14:59:58,0.0,8489344.0,109.7210,1495433.0,73.38401,1.0
2019-04-03 14:59:59,0.0,8489338.0,109.7209,1495039.0,73.38493,1.0


In [17]:
df_aux_draw = df_draw.drop(columns=["QGL", "T-JUS-CKGL", "P-JUS-CKGL"])
df_aux_draw

Unnamed: 0_level_0,P-PDG,P-TPT,T-TPT,P-MON-CKP,T-JUS-CKP,class
timestamp,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2018-09-05 20:44:36,420.2314,175.3406,114.89070,96.64524,66.75450,0
2018-09-05 20:44:37,420.2314,175.3406,114.89070,96.64524,66.75450,0
2018-09-05 20:44:38,420.2314,175.3406,114.89070,96.64524,66.75450,0
2018-09-05 20:44:39,420.2314,175.3406,114.89070,96.64524,66.75450,0
2018-09-05 20:44:40,420.2314,175.3406,114.89070,96.64524,66.75450,0
...,...,...,...,...,...,...
2018-08-24 11:25:40,251.8535,139.0000,59.08355,19.22864,54.13496,1
2018-08-24 11:25:41,251.8535,139.0000,59.08355,19.22868,54.13496,1
2018-08-24 11:25:42,251.8535,139.0000,59.08355,19.22872,54.13496,1
2018-08-24 11:25:43,251.8535,139.0000,59.08355,19.22875,54.13496,1


In [18]:
for column_name in df_aux.columns:
    percent = Preprocessing.check_null_dataframe(df_data=df_aux, column_name=column_name)
    print(f"{column_name} null data is {percent}")

P-PDG null data is 0.26414415494858035
P-TPT null data is 8.322489731058676
T-TPT null data is 8.322189907954307
P-MON-CKP null data is 8.322789554163043
T-JUS-CKP null data is 8.322789554163043
class null data is 6.438401343207508


In [19]:
for column_name in df_aux_draw.columns:
    percent = Preprocessing.check_null_dataframe(df_data=df_aux_draw, column_name=column_name)
    print(f"{column_name} null data is {percent}")

P-PDG null data is 0.0
P-TPT null data is 0.0
T-TPT null data is 0.0
P-MON-CKP null data is 0.0
T-JUS-CKP null data is 0.0
class null data is 0.0


In [20]:
df_aux = df_aux.dropna()
df_aux.shape

(1139181, 6)

In [21]:
df_aux_draw = df_aux_draw.dropna()
df_aux_draw.shape

(518401, 6)

In [22]:
df_aux = df_aux.drop_duplicates()
df_aux_draw = df_aux_draw.drop_duplicates()

In [23]:
contagem_classes = df_aux['class'].value_counts()

# Calcular a porcentagem de cada classe
porcentagem_classes_real = contagem_classes / len(df_aux) * 100
porcentagem_classes_real

class
1.0    65.880605
0.0    34.119395
Name: count, dtype: float64

In [24]:
contagem_classes = df_aux_draw['class'].value_counts()

# Calcular a porcentagem de cada classe
porcentagem_classes_draw = contagem_classes / len(df_aux_draw) * 100
porcentagem_classes_draw

class
1    96.830891
0     3.169109
Name: count, dtype: float64

Aqui é possível ver que tem poquíssimas classes normais!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!

# MLP

In [25]:
df_aux["class"].unique()

array([0., 1.])

In [26]:
# Separar as features (X) e os rótulos (y)
X = df_aux.drop('class', axis=1)
y = df_aux['class']

In [27]:
X

Unnamed: 0_level_0,P-PDG,P-TPT,T-TPT,P-MON-CKP,T-JUS-CKP
timestamp,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2014-01-24 09:33:03,0.0,18433410.0,116.8718,9397031.0,74.80031
2014-01-24 09:33:05,0.0,18433410.0,116.8718,9397032.0,74.80031
2014-01-24 09:33:06,0.0,18433410.0,116.8718,9397033.0,74.80031
2014-01-24 09:33:08,0.0,18433410.0,116.8718,9397034.0,74.80031
2014-01-24 09:33:10,0.0,18433410.0,116.8718,9397035.0,74.80031
...,...,...,...,...,...
2019-04-03 14:59:56,0.0,8489354.0,109.7213,1496222.0,73.38219
2019-04-03 14:59:57,0.0,8489349.0,109.7212,1495828.0,73.38310
2019-04-03 14:59:58,0.0,8489344.0,109.7210,1495433.0,73.38401
2019-04-03 14:59:59,0.0,8489338.0,109.7209,1495039.0,73.38493


In [33]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, stratify=y, random_state=42)

In [34]:
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.3, stratify=y_train, random_state=42)

In [36]:
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)

X_val = scaler.transform(X_val)

X_test = scaler.transform(X_test)

In [38]:
# Defina o modelo
model = Sequential()

# Adicione as camadas ocultas
model.add(Dense(8, input_dim=5, activation='relu'))  # Camada de entrada com 4 neurônios e ativação ReLU
model.add(Dense(4, activation='relu'))  # Segunda camada oculta com 4 neurônios e ativação ReLU

# Adicione a camada de saída
model.add(Dense(1, activation='sigmoid'))  # Camada de saída com 1 neurônio e ativação Sigmoid para classificação binária

# Compile o modelo
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

# Treine o modelo
model.fit(X_train, y_train, epochs=100, batch_size=10, validation_data=(X_val, y_val))

# Avalie o modelo
_, accuracy = model.evaluate(X_test, y_test)
print('Acurácia: %.2f' % (accuracy*100))

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100

KeyboardInterrupt: 

In [43]:
y_pred = model.predict(X_test)



In [61]:
y_pred = np.squeeze(y_pred)
y_pred = np.where(y_pred > 0.5, 1, 0)

In [62]:
accuracy_score(y_test.values, y_pred)

0.9135817772386661

In [63]:
precision = precision_score(y_test.values, y_pred, average=None)
recall = recall_score(y_test.values, y_pred, average=None)
f1 = f1_score(y_test.values, y_pred, average=None)

In [64]:
for i in range(len(precision)):
    print(f'Classe {porcentagem_classes_real.index.values[i]}:')
    print(f'Precision: {precision[i]}')
    print(f'Recall: {recall[i]}')
    print(f'F1-score: {f1[i]}\n')

Classe 1.0:
Precision: 0.8734652557258183
Recall: 0.8732159230489767
F1-score: 0.8733405715917102

Classe 0.0:
Precision: 0.9343489088759093
Recall: 0.9344870376260511
F1-score: 0.9344179681463182



In [65]:
# Separar as features (X) e os rótulos (y)
X = df_aux_draw.drop('class', axis=1)
y_test = df_aux_draw['class']

X_norm = scaler.transform(X.values)

y_pred = model.predict(X_norm)

y_pred = np.squeeze(y_pred)
y_pred = np.where(y_pred > 0.5, 1, 0)

print(accuracy_score(y_test, y_pred))

precision = precision_score(y_test, y_pred, average=None)
recall = recall_score(y_test, y_pred, average=None)
f1 = f1_score(y_test, y_pred, average=None)

unique_y_pred = np.unique(y_pred)
unique_y_test = np.unique(y_test)

result_array = np.sort(np.unique(np.concatenate((unique_y_pred, unique_y_test))))

for i in range(len(precision)):
    print(f'Classe {result_array[i]}:')
    print(f'Precision: {precision[i]}')
    print(f'Recall: {recall[i]}')
    print(f'F1-score: {f1[i]}\n')


  150/12297 [..............................] - ETA: 8s



0.8808801329653406
Classe 0:
Precision: 0.09200664136622391
Recall: 0.3110665597433841
F1-score: 0.14200988467874795

Classe 1:
Precision: 0.9755469279244918
Recall: 0.8995291536313436
F1-score: 0.9359971161014935



In [109]:
X = df_aux_draw[df_aux_draw["class"] == 7].drop('class', axis=1)
df_aux_draw[df_aux_draw["class"] == 7].drop('class', axis=1)

y_test = df_aux_draw[df_aux_draw["class"] == 7]["class"]

X_norm = scaler.transform(X.values)

y_pred = modelo.predict(X_norm)



In [111]:
y_pred

array([5., 5., 5., ..., 5., 5., 5.])

está prevendo todo mundo da classe 7 como a classe 5

In [112]:
print(accuracy_score(y_test, y_pred))

0.0
