In [1]:
import pandas as pd
import os
import sys
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import dill
from typing import List
import scipy.stats as sts
from sklearn.pipeline import Pipeline
from sklearn.decomposition import PCA
from sklearn.preprocessing import RobustScaler
from sklearn.metrics import precision_score, recall_score, f1_score, accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import confusion_matrix

import util
from pathlib import Path
from class_manipulates_path import ManipulatePath
from class_preprocessing_refactor import Preprocessing
from class_format_data import FormatData
util.init()

current_path = sys.path[0]

In [2]:
def plot_variable(df, column_name, intervalos=None):
    """
    Plota um gráfico de dispersão para uma variável de um DataFrame.

    Parâmetros:
        - df: DataFrame pandas contendo os dados.
        - column_name: Nome da coluna que você deseja plotar.
        - intervalos: Lista de intervalos para plotar linhas verticais.
    """
    plt.figure(figsize=(20, 10))
    plt.scatter(df.index.values, df[column_name].values, color='darkcyan', alpha=0.5)
    plt.xlabel('Índice')
    plt.ylabel(column_name)
    plt.title(f'Gráfico de dispersão para a variável "{column_name}"')


    # Plotar linhas verticais para cada intervalo
    if intervalos:
        for intervalo in intervalos:
            plt.axvline(x=intervalo, linestyle='--', color='red')

    plt.show()

In [3]:
def class_and_file_generator(data_path, real=False, simulated=False, drawn=False):
    for class_path in data_path.iterdir():
        if class_path.is_dir():
            class_code = int(class_path.stem)
            for instance_path in class_path.iterdir():
                if (instance_path.suffix == '.csv'):
                    if (simulated and instance_path.stem.startswith('SIMULATED')) or \
                       (drawn and instance_path.stem.startswith('DRAWN')) or \
                       (real and (not instance_path.stem.startswith('SIMULATED')) and \
                       (not instance_path.stem.startswith('DRAWN'))):
                        yield class_code, instance_path

In [4]:
manipulate_path = ManipulatePath()

preprocessing = Preprocessing()

format_data = FormatData()

path_raw_data = manipulate_path.get_path_raw_data()

abnormal_classes_codes = [1, 2, 5, 6, 7, 8]

In [5]:
# Gets all real instances but maintains only those with any type of undesirable event
real_instances = pd.DataFrame(class_and_file_generator(path_raw_data, 
                                                       real=True,
                                                       simulated=True, 
                                                       drawn=True),
                              columns=['class_code', 'instance_path'])
real_instances = real_instances.loc[real_instances.iloc[:,0].isin(abnormal_classes_codes)].reset_index(drop=True)

In [6]:
# Gets all real instances but maintains only those with any type of undesirable event
draw_instances = pd.DataFrame(class_and_file_generator(path_raw_data, 
                                                       real=False,
                                                       simulated=False, 
                                                       drawn=True),
                              columns=['class_code', 'instance_path'])
draw_instances = draw_instances.loc[draw_instances.iloc[:,0].isin(abnormal_classes_codes)].reset_index(drop=True)

In [15]:
falha = 7
falha_2 = 107

In [16]:
file_paths = real_instances[real_instances["class_code"] == falha]["instance_path"].values
df_output = pd.DataFrame()

for file_path in file_paths:
    df_data = pd.read_csv(file_path)

    df_data = FormatData.set_index_dataframe(df_data=df_data,
                                         column_name="timestamp")
    quantidade = ((df_data['class'] == falha) | (df_data['class'] == falha_2)).sum()
    if quantidade != 0:
        print(f"1 se repete {quantidade}")
        print(f"Index min {df_data.index.min()} index max {df_data.index.max()}")

    df_output = pd.concat([df_output, df_data], axis=0)

1 se repete 304263
Index min 2018-08-20 11:25:44 index max 2018-08-24 11:25:44
1 se repete 311012
Index min 2018-08-20 11:25:44 index max 2018-08-24 11:25:44
1 se repete 318042
Index min 2018-08-20 11:25:44 index max 2018-08-24 11:25:44
1 se repete 36311
Index min 2018-08-20 11:25:44 index max 2018-08-20 23:25:44
1 se repete 311012
Index min 2018-08-20 11:25:44 index max 2018-08-24 11:25:44
1 se repete 283454
Index min 2018-08-20 11:25:44 index max 2018-08-24 11:25:44
1 se repete 311012
Index min 2018-08-20 11:25:44 index max 2018-08-24 11:25:44
1 se repete 162537
Index min 2018-08-20 11:25:44 index max 2018-08-22 11:25:44
1 se repete 36311
Index min 2018-08-20 11:25:44 index max 2018-08-20 23:25:44
1 se repete 77753
Index min 2018-08-20 11:25:44 index max 2018-08-21 11:25:44
1 se repete 38817
Index min 2017-02-26 14:01:46 index max 2017-02-27 10:18:58
1 se repete 127179
Index min 2018-06-17 18:13:15 index max 2018-06-19 23:06:08
1 se repete 100242
Index min 2018-06-20 15:57:28 index m

In [17]:
file_paths = draw_instances[draw_instances["class_code"] == falha]["instance_path"].values
df_draw = pd.DataFrame()

for file_path in file_paths:
    df_data = pd.read_csv(file_path)

    df_data = FormatData.set_index_dataframe(df_data=df_data,
                                         column_name="timestamp")
    quantidade = ((df_data['class'] == falha) | (df_data['class'] == falha_2)).sum()
    if quantidade != 0:
        print(f"1 se repete {quantidade}")
        print(f"Index min {df_data.index.min()} index max {df_data.index.max()}")

    df_draw = pd.concat([df_draw, df_data], axis=0)

1 se repete 304263
Index min 2018-08-20 11:25:44 index max 2018-08-24 11:25:44
1 se repete 311012
Index min 2018-08-20 11:25:44 index max 2018-08-24 11:25:44
1 se repete 318042
Index min 2018-08-20 11:25:44 index max 2018-08-24 11:25:44
1 se repete 36311
Index min 2018-08-20 11:25:44 index max 2018-08-20 23:25:44
1 se repete 311012
Index min 2018-08-20 11:25:44 index max 2018-08-24 11:25:44
1 se repete 283454
Index min 2018-08-20 11:25:44 index max 2018-08-24 11:25:44
1 se repete 311012
Index min 2018-08-20 11:25:44 index max 2018-08-24 11:25:44
1 se repete 162537
Index min 2018-08-20 11:25:44 index max 2018-08-22 11:25:44
1 se repete 36311
Index min 2018-08-20 11:25:44 index max 2018-08-20 23:25:44
1 se repete 77753
Index min 2018-08-20 11:25:44 index max 2018-08-21 11:25:44


In [18]:
df_output

Unnamed: 0_level_0,P-PDG,P-TPT,T-TPT,P-MON-CKP,T-JUS-CKP,P-JUS-CKGL,T-JUS-CKGL,QGL,class
timestamp,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
2018-08-20 11:25:44,250.0977,1.351902e+02,59.97301,1.239075e+01,54.89203,,,,0.0
2018-08-20 11:25:45,250.0977,1.351902e+02,59.97301,1.239075e+01,54.89203,,,,0.0
2018-08-20 11:25:46,250.0977,1.351902e+02,59.97301,1.239075e+01,54.89203,,,,0.0
2018-08-20 11:25:47,250.0977,1.351902e+02,59.97301,1.239075e+01,54.89203,,,,0.0
2018-08-20 11:25:48,250.0977,1.351902e+02,59.97301,1.239075e+01,54.89203,,,,0.0
...,...,...,...,...,...,...,...,...,...
2019-04-03 14:59:56,0.0000,8.489354e+06,109.72130,1.496222e+06,73.38219,9706965.0,,0.0,107.0
2019-04-03 14:59:57,0.0000,8.489349e+06,109.72120,1.495828e+06,73.38310,9706899.0,,0.0,107.0
2019-04-03 14:59:58,0.0000,8.489344e+06,109.72100,1.495433e+06,73.38401,9706834.0,,0.0,107.0
2019-04-03 14:59:59,0.0000,8.489338e+06,109.72090,1.495039e+06,73.38493,9706768.0,,0.0,107.0


In [19]:
df_draw

Unnamed: 0_level_0,P-PDG,P-TPT,T-TPT,P-MON-CKP,T-JUS-CKP,P-JUS-CKGL,T-JUS-CKGL,QGL,class
timestamp,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
2018-08-20 11:25:44,250.0977,135.1902,59.97301,12.39075,54.89203,,,,0
2018-08-20 11:25:45,250.0977,135.1902,59.97301,12.39075,54.89203,,,,0
2018-08-20 11:25:46,250.0977,135.1902,59.97301,12.39075,54.89203,,,,0
2018-08-20 11:25:47,250.0977,135.1902,59.97301,12.39075,54.89203,,,,0
2018-08-20 11:25:48,250.0977,135.1902,59.97301,12.39075,54.89203,,,,0
...,...,...,...,...,...,...,...,...,...
2018-08-21 11:25:40,260.9383,211.8869,117.87660,122.91520,65.02185,,,,7
2018-08-21 11:25:41,260.9383,211.8869,117.87660,122.91520,65.02185,,,,7
2018-08-21 11:25:42,260.9383,211.8869,117.87660,122.91520,65.02185,,,,7
2018-08-21 11:25:43,260.9383,211.8869,117.87660,122.91520,65.02185,,,,7


In [20]:
df_output = df_output[~df_output.index.duplicated()]

In [21]:
df_draw = df_draw[~df_draw.index.duplicated()]

In [22]:
df_output.shape

(811939, 9)

In [28]:
df_aux = df_output.drop(columns=["QGL", "T-JUS-CKGL", "P-JUS-CKGL"])
df_aux

Unnamed: 0_level_0,P-PDG,P-TPT,T-TPT,P-MON-CKP,T-JUS-CKP,class
timestamp,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2018-08-20 11:25:44,250.0977,1.351902e+02,59.97301,1.239075e+01,54.89203,0.0
2018-08-20 11:25:45,250.0977,1.351902e+02,59.97301,1.239075e+01,54.89203,0.0
2018-08-20 11:25:46,250.0977,1.351902e+02,59.97301,1.239075e+01,54.89203,0.0
2018-08-20 11:25:47,250.0977,1.351902e+02,59.97301,1.239075e+01,54.89203,0.0
2018-08-20 11:25:48,250.0977,1.351902e+02,59.97301,1.239075e+01,54.89203,0.0
...,...,...,...,...,...,...
2019-04-03 14:59:56,0.0000,8.489354e+06,109.72130,1.496222e+06,73.38219,107.0
2019-04-03 14:59:57,0.0000,8.489349e+06,109.72120,1.495828e+06,73.38310,107.0
2019-04-03 14:59:58,0.0000,8.489344e+06,109.72100,1.495433e+06,73.38401,107.0
2019-04-03 14:59:59,0.0000,8.489338e+06,109.72090,1.495039e+06,73.38493,107.0


In [29]:
df_aux_draw = df_draw.drop(columns=["QGL", "T-JUS-CKGL", "P-JUS-CKGL"])
df_aux_draw

Unnamed: 0_level_0,P-PDG,P-TPT,T-TPT,P-MON-CKP,T-JUS-CKP,class
timestamp,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2018-08-20 11:25:44,250.0977,135.1902,59.97301,12.39075,54.89203,0
2018-08-20 11:25:45,250.0977,135.1902,59.97301,12.39075,54.89203,0
2018-08-20 11:25:46,250.0977,135.1902,59.97301,12.39075,54.89203,0
2018-08-20 11:25:47,250.0977,135.1902,59.97301,12.39075,54.89203,0
2018-08-20 11:25:48,250.0977,135.1902,59.97301,12.39075,54.89203,0
...,...,...,...,...,...,...
2018-08-24 11:25:40,251.8535,139.0000,59.08355,19.22864,54.13496,107
2018-08-24 11:25:41,251.8535,139.0000,59.08355,19.22868,54.13496,107
2018-08-24 11:25:42,251.8535,139.0000,59.08355,19.22872,54.13496,107
2018-08-24 11:25:43,251.8535,139.0000,59.08355,19.22875,54.13496,107


In [30]:
for column_name in df_aux.columns:
    percent = Preprocessing.check_null_dataframe(df_data=df_aux, column_name=column_name)
    print(f"{column_name} null data is {percent}")

P-PDG null data is 0.044584630126154795
P-TPT null data is 0.044584630126154795
T-TPT null data is 0.044584630126154795
P-MON-CKP null data is 0.044584630126154795
T-JUS-CKP null data is 0.044584630126154795
class null data is 0.6363778559719387


In [31]:
for column_name in df_aux.columns:
    percent = Preprocessing.check_null_dataframe(df_data=df_aux_draw, column_name=column_name)
    print(f"{column_name} null data is {percent}")

P-PDG null data is 0.0
P-TPT null data is 0.0
T-TPT null data is 0.0
P-MON-CKP null data is 0.0
T-JUS-CKP null data is 0.0
class null data is 0.0


In [32]:
df_aux = df_aux.dropna()
df_aux.shape

(806410, 6)

In [33]:
df_aux_draw = df_aux_draw.dropna()
df_aux_draw.shape

(345601, 6)

# KNN

In [34]:
contagem_classes = df_aux_draw['class'].value_counts()

# Calcular a porcentagem de cada classe
porcentagem_classes = contagem_classes / len(df_aux_draw) * 100
porcentagem_classes

class
107    88.038808
0      11.961192
Name: count, dtype: float64

In [35]:
contagem_classes = df_aux['class'].value_counts()

# Calcular a porcentagem de cada classe
porcentagem_classes = contagem_classes / len(df_aux) * 100
porcentagem_classes

class
107.0    72.811969
0.0      23.979985
7.0       3.208046
Name: count, dtype: float64

In [36]:
# Separar as features (X) e os rótulos (y)
X = df_aux.drop('class', axis=1)
y = df_aux['class']

In [37]:
X

Unnamed: 0_level_0,P-PDG,P-TPT,T-TPT,P-MON-CKP,T-JUS-CKP
timestamp,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2018-08-20 11:25:44,250.0977,1.351902e+02,59.97301,1.239075e+01,54.89203
2018-08-20 11:25:45,250.0977,1.351902e+02,59.97301,1.239075e+01,54.89203
2018-08-20 11:25:46,250.0977,1.351902e+02,59.97301,1.239075e+01,54.89203
2018-08-20 11:25:47,250.0977,1.351902e+02,59.97301,1.239075e+01,54.89203
2018-08-20 11:25:48,250.0977,1.351902e+02,59.97301,1.239075e+01,54.89203
...,...,...,...,...,...
2019-04-03 14:59:56,0.0000,8.489354e+06,109.72130,1.496222e+06,73.38219
2019-04-03 14:59:57,0.0000,8.489349e+06,109.72120,1.495828e+06,73.38310
2019-04-03 14:59:58,0.0000,8.489344e+06,109.72100,1.495433e+06,73.38401
2019-04-03 14:59:59,0.0000,8.489338e+06,109.72090,1.495039e+06,73.38493


In [38]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, stratify=y, random_state=42)

In [39]:
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)

X_test = scaler.transform(X_test)

In [40]:
modelo = KNeighborsClassifier(n_neighbors=5)

In [41]:
modelo.fit(X_train, y_train)

In [42]:
y_pred = modelo.predict(X_test)

In [43]:
accuracy_score(y_test, y_pred)

0.9957424469769307

In [44]:
precision = precision_score(y_test,  y_pred, average=None)
recall = recall_score(y_test,  y_pred, average=None)
f1 = f1_score(y_test,  y_pred, average=None)

for i in range(len(precision)):
    print(f'Classe {porcentagem_classes.index.values[i]}:')
    print(f'Precision: {precision[i]:.3f}')
    print(f'Recall: {recall[i]:.3f}')
    print(f'F1-score: {f1[i]:.3f}\n')

Classe 107.0:
Precision: 0.993
Recall: 0.994
F1-score: 0.994

Classe 0.0:
Precision: 0.979
Recall: 0.983
F1-score: 0.981

Classe 7.0:
Precision: 0.997
Recall: 0.997
F1-score: 0.997



In [45]:
confusion_matrix(y_test.values, y_pred)

array([[ 57667,      0,    346],
       [     0,   7629,    132],
       [   392,    160, 175597]], dtype=int64)

# Teste com os dados desenhados

In [46]:
# Separar as features (X) e os rótulos (y)
X_draw = df_aux_draw.drop('class', axis=1)
y_draw = df_aux_draw['class']

In [47]:
X_draw = scaler.transform(X_draw)

In [48]:
y_draw_pred = modelo.predict(X_draw)

In [53]:
accuracy_score(y_draw.values, y_draw_pred)

1.0

In [54]:
precision = precision_score(y_draw.values,  y_draw_pred, average=None)
recall = recall_score(y_draw.values,  y_draw_pred, average=None)
f1 = f1_score(y_draw.values,  y_draw_pred, average=None)

for i in range(len(precision)):
    print(f'Classe {porcentagem_classes.index.values[i]}:')
    print(f'Precision: {precision[i]:.3f}')
    print(f'Recall: {recall[i]:.3f}')
    print(f'F1-score: {f1[i]:.3f}\n')

Classe 107.0:
Precision: 1.000
Recall: 1.000
F1-score: 1.000

Classe 0.0:
Precision: 1.000
Recall: 1.000
F1-score: 1.000



# MLP

In [27]:
modelo = MLPClassifier(hidden_layer_sizes=(100, 50), max_iter=500, random_state=42)

In [28]:
modelo.fit(X_train, y_train)

In [29]:
y_pred = modelo.predict(X_test)

In [30]:
accuracy_score(y_test, y_pred)

0.9812504068922115