In [1]:
import pandas as pd
import os
import sys
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import dill
from typing import List
import scipy.stats as sts
from sklearn.pipeline import Pipeline
from sklearn.decomposition import PCA
from sklearn.preprocessing import RobustScaler
from sklearn.metrics import precision_score, recall_score, f1_score, accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.neural_network import MLPClassifier

import util
from pathlib import Path
from class_manipulates_path import ManipulatePath
from class_preprocessing_refactor import Preprocessing
from class_format_data import FormatData
# Project-specific initialisation; what it configures lives in `util` (not shown here).
util.init()

# First entry of the module search path, stored for later use.
# NOTE(review): presumably meant to be the notebook's directory — confirm, since
# sys.path[0] is not guaranteed to point there in every kernel.
current_path = sys.path[0]

In [2]:
def plot_variable(df, column_name, intervalos=None):
    """
    Draw a scatter plot of one DataFrame column against the DataFrame index.

    Parameters:
        - df: pandas DataFrame holding the data.
        - column_name: Name of the column plotted on the y-axis.
        - intervalos: Optional iterable of x positions (in index units) at
          which dashed red vertical lines are drawn. ``None`` or an empty
          iterable draws no lines. Lists, tuples, NumPy arrays and pandas
          Index objects are all accepted.
    """
    plt.figure(figsize=(20, 10))
    plt.scatter(df.index.values, df[column_name].values, color='darkcyan', alpha=0.5)
    plt.xlabel('Índice')
    plt.ylabel(column_name)
    plt.title(f'Gráfico de dispersão para a variável "{column_name}"')

    # Draw one vertical line per requested position.
    # Compare against None explicitly: a plain truthiness test raises
    # ValueError for NumPy arrays / pandas Index objects with several elements.
    if intervalos is not None:
        for intervalo in intervalos:
            plt.axvline(x=intervalo, linestyle='--', color='red')

    plt.show()

In [3]:
def class_and_file_generator(data_path, real=False, simulated=False, drawn=False):
    """
    Yield ``(class_code, instance_path)`` for every selected CSV instance.

    ``data_path`` is expected to contain one sub-directory per class, named
    with the integer class code, each holding ``.csv`` instance files.

    Parameters:
        - data_path: ``pathlib.Path`` of the dataset root.
        - real: include files whose name starts with neither ``SIMULATED``
          nor ``DRAWN``.
        - simulated: include files whose name starts with ``SIMULATED``.
        - drawn: include files whose name starts with ``DRAWN``.

    Yields:
        Tuples ``(class_code, instance_path)`` ordered by class directory and
        then by file name, so repeated runs visit the files in the same order.
    """
    # sorted(): Path.iterdir() returns entries in arbitrary order, which would
    # make any order-sensitive step downstream irreproducible.
    for class_path in sorted(data_path.iterdir()):
        if not class_path.is_dir():
            continue
        try:
            class_code = int(class_path.stem)
        except ValueError:
            # Not a class folder (e.g. ".ipynb_checkpoints"): skip it instead
            # of aborting the whole scan.
            continue

        for instance_path in sorted(class_path.iterdir()):
            if instance_path.suffix != '.csv':
                continue

            stem = instance_path.stem
            is_simulated = stem.startswith('SIMULATED')
            is_drawn = stem.startswith('DRAWN')
            is_real = not (is_simulated or is_drawn)

            if (simulated and is_simulated) or (drawn and is_drawn) or (real and is_real):
                yield class_code, instance_path

In [4]:
# Project helper objects. Only `manipulate_path` is used through its instance in
# the visible cells; they call Preprocessing.check_null_dataframe and
# FormatData.set_index_dataframe on the classes themselves.
manipulate_path = ManipulatePath()

preprocessing = Preprocessing()

format_data = FormatData()

# Root directory of the raw dataset, as resolved by the project helper.
path_raw_data = manipulate_path.get_path_raw_data()

# Class-folder codes kept when the instance lists are filtered (0 is not listed).
abnormal_classes_codes = [1, 2, 5, 6, 7, 8]

In [5]:
# List every real instance (file name starting with neither SIMULATED nor DRAWN)
# and keep only those stored under a class folder in `abnormal_classes_codes`.
real_instances = (
    pd.DataFrame(
        class_and_file_generator(path_raw_data, real=True, simulated=False, drawn=False),
        columns=['class_code', 'instance_path'],
    )
    .loc[lambda frame: frame['class_code'].isin(abnormal_classes_codes)]
    .reset_index(drop=True)
)

In [6]:
# List every SIMULATED instance (simulated=True, drawn=False) and keep only those
# stored under a class folder in `abnormal_classes_codes`.
# NOTE(review): despite its name, `draw_instances` holds simulated files, not
# hand-drawn ones — confirm that drawn=False is intended.
draw_instances = (
    pd.DataFrame(
        class_and_file_generator(path_raw_data, real=False, simulated=True, drawn=False),
        columns=['class_code', 'instance_path'],
    )
    .loc[lambda frame: frame['class_code'].isin(abnormal_classes_codes)]
    .reset_index(drop=True)
)

In [36]:
# Load every real instance, map the three-digit class codes (101, 102, ...) onto
# their last digit (1, 2, ...) and stack the instances that contain at least one
# faulty observation into `df_output`.
qtde_segundos = 60 * 20  # not read in this cell; kept for cells that may use it
falhas = [1, 2, 5, 6, 7, 8, 101, 102, 105, 106, 107, 108]

# Only the codes that `code % 10` actually changes need replacing; the mapping
# is loop-invariant, so it is built once and applied in a single call per file.
transient_to_steady = {falha: falha % 10 for falha in falhas if falha != falha % 10}

real_frames = []
for file_path in real_instances["instance_path"].values:
    df_data = pd.read_csv(file_path)

    df_data = FormatData.set_index_dataframe(df_data=df_data,
                                             column_name="timestamp")

    df_data['class'] = df_data['class'].replace(transient_to_steady)

    # Number of class-0 rows in this file; not used further in this cell.
    segundos = (df_data['class'] == 0).sum()

    quantidade = df_data['class'].isin(falhas).sum()
    if quantidade != 0:
        print(f"1 se repete {quantidade}")
        print(f"Index min {df_data.index.min()} index max {df_data.index.max()}")

        real_frames.append(df_data)

# Concatenate once: growing a DataFrame with pd.concat inside the loop copies
# all accumulated rows on every iteration and starts from an empty frame.
df_output = pd.concat(real_frames, axis=0) if real_frames else pd.DataFrame()

1 se repete 14303
Index min 2014-01-24 09:33:03 index max 2014-01-25 01:47:30
1 se repete 9499
Index min 2014-01-26 16:19:44 index max 2014-01-26 23:00:00
1 se repete 40042
Index min 2017-07-31 18:09:30 index max 2017-08-01 20:45:00
1 se repete 31658
Index min 2017-07-31 22:04:32 index max 2017-08-01 20:19:27
1 se repete 10573
Index min 2018-06-17 20:02:57 index max 2018-06-18 11:00:00
1 se repete 5306
Index min 2013-11-04 01:41:01 index max 2013-11-04 04:13:01
1 se repete 3175
Index min 2017-03-13 16:08:04 index max 2017-03-13 18:00:21
1 se repete 3863
Index min 2017-12-18 19:01:31 index max 2017-12-18 21:44:59
1 se repete 20186
Index min 2014-05-15 10:46:09 index max 2014-05-15 16:59:37
1 se repete 1490
Index min 2014-05-30 09:32:03 index max 2014-05-30 10:33:14
1 se repete 11299
Index min 2014-06-06 22:38:10 index max 2014-06-07 02:41:46
1 se repete 4901
Index min 2014-07-20 12:01:02 index max 2014-07-20 15:20:12
1 se repete 2536
Index min 2014-07-26 17:45:23 index max 2014-07-26 18

In [38]:
# Load every instance listed in `draw_instances` (simulated files), map the
# three-digit class codes (101, 102, ...) onto their last digit (1, 2, ...) and
# stack the instances that contain at least one faulty observation into `df_draw`.
qtde_segundos = 60 * 20  # not read in this cell; kept for cells that may use it
falhas = [1, 2, 5, 6, 7, 8, 101, 102, 105, 106, 107, 108]

# Only the codes that `code % 10` actually changes need replacing; the mapping
# is loop-invariant, so it is built once and applied in a single call per file.
transient_to_steady = {falha: falha % 10 for falha in falhas if falha != falha % 10}

simulated_frames = []
for file_path in draw_instances["instance_path"].values:
    df_data = pd.read_csv(file_path)

    df_data = FormatData.set_index_dataframe(df_data=df_data,
                                             column_name="timestamp")

    df_data['class'] = df_data['class'].replace(transient_to_steady)

    # Number of class-0 rows in this file; not used further in this cell.
    segundos = (df_data['class'] == 0).sum()

    quantidade = df_data['class'].isin(falhas).sum()
    if quantidade != 0:
        print(f"1 se repete {quantidade}")
        print(f"Index min {df_data.index.min()} index max {df_data.index.max()}")

        simulated_frames.append(df_data)

# Concatenate once: growing a DataFrame with pd.concat inside the loop copies
# all accumulated rows on every iteration and starts from an empty frame.
df_draw = pd.concat(simulated_frames, axis=0) if simulated_frames else pd.DataFrame()

1 se repete 39599
Index min 2018-10-06 03:57:02 index max 2018-10-06 16:37:00
1 se repete 39599
Index min 2018-10-06 03:59:46 index max 2018-10-06 16:39:44
1 se repete 53999
Index min 2018-10-06 04:44:59 index max 2018-10-06 21:24:57
1 se repete 53999
Index min 2018-10-06 04:51:24 index max 2018-10-06 21:31:22
1 se repete 68399
Index min 2018-10-06 05:52:12 index max 2018-10-07 02:32:10
1 se repete 68400
Index min 2018-10-06 05:55:58 index max 2018-10-07 02:35:57
1 se repete 82799
Index min 2018-10-06 06:30:28 index max 2018-10-07 07:10:26
1 se repete 82800
Index min 2018-10-06 06:42:03 index max 2018-10-07 07:22:02
1 se repete 82800
Index min 2018-10-06 06:45:16 index max 2018-10-07 07:25:15
1 se repete 82799
Index min 2018-10-06 06:48:24 index max 2018-10-07 07:28:22
1 se repete 82800
Index min 2018-10-06 06:52:48 index max 2018-10-07 07:32:47
1 se repete 82800
Index min 2018-10-06 07:08:19 index max 2018-10-07 07:48:18
1 se repete 82799
Index min 2018-10-06 07:11:55 index max 2018-1

  df_data.index = pd.to_datetime(df_data.index)


1 se repete 25199
Index min 2018-05-18 20:18:55 index max 2018-05-19 03:48:53
1 se repete 25199
Index min 2018-05-18 21:18:05 index max 2018-05-19 04:48:03
1 se repete 25199
Index min 2018-05-18 21:45:04 index max 2018-05-19 05:15:02
1 se repete 25199
Index min 2018-05-19 00:44:59 index max 2018-05-19 08:14:57
1 se repete 25199
Index min 2018-05-19 01:19:06 index max 2018-05-19 08:49:04
1 se repete 25199
Index min 2018-05-19 01:53:53 index max 2018-05-19 09:23:51
1 se repete 25199
Index min 2018-05-19 02:51:48 index max 2018-05-19 10:21:46
1 se repete 25199
Index min 2018-05-19 04:40:51 index max 2018-05-19 12:10:49
1 se repete 25199
Index min 2018-05-19 04:59:10 index max 2018-05-19 12:29:08
1 se repete 25199
Index min 2018-05-19 05:29:51 index max 2018-05-19 12:59:49
1 se repete 25199
Index min 2018-05-19 06:08:10 index max 2018-05-19 13:38:08
1 se repete 25199
Index min 2018-05-19 06:40:37 index max 2018-05-19 14:10:35
1 se repete 25199
Index min 2018-05-19 07:12:31 index max 2018-0

In [39]:
# Preview the stacked real-instance frame (the notebook renders a truncated view).
df_output

Unnamed: 0_level_0,P-PDG,P-TPT,T-TPT,P-MON-CKP,T-JUS-CKP,P-JUS-CKGL,T-JUS-CKGL,QGL,class
timestamp,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
2014-01-24 09:33:03,0.0,18433410.0,116.8718,9397031.0,74.80031,-300917.8,,0.0,0.0
2014-01-24 09:33:04,0.0,18433410.0,116.8718,9397031.0,74.80031,-300917.8,,0.0,0.0
2014-01-24 09:33:05,0.0,18433410.0,116.8718,9397032.0,74.80031,-300917.8,,0.0,0.0
2014-01-24 09:33:06,0.0,18433410.0,116.8718,9397033.0,74.80031,-300917.8,,0.0,0.0
2014-01-24 09:33:07,0.0,18433410.0,116.8718,9397033.0,74.80031,-300917.8,,0.0,0.0
...,...,...,...,...,...,...,...,...,...
2019-04-03 14:59:56,0.0,8489354.0,109.7213,1496222.0,73.38219,9706965.0,,0.0,7.0
2019-04-03 14:59:57,0.0,8489349.0,109.7212,1495828.0,73.38310,9706899.0,,0.0,7.0
2019-04-03 14:59:58,0.0,8489344.0,109.7210,1495433.0,73.38401,9706834.0,,0.0,7.0
2019-04-03 14:59:59,0.0,8489338.0,109.7209,1495039.0,73.38493,9706768.0,,0.0,7.0


In [40]:
# Preview the stacked simulated-instance frame (the notebook renders a truncated view).
df_draw

Unnamed: 0_level_0,P-PDG,P-TPT,T-TPT,P-MON-CKP,T-JUS-CKP,P-JUS-CKGL,T-JUS-CKGL,QGL,class
timestamp,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
2018-10-06 03:57:02,22520410.0,13068630.0,96.932750,1049626.0,70.33402,,,,0
2018-10-06 03:57:03,22520430.0,13068650.0,96.932790,1049626.0,70.33425,,,,0
2018-10-06 03:57:04,22520440.0,13068640.0,96.932830,1049626.0,70.33449,,,,0
2018-10-06 03:57:05,22520420.0,13068600.0,96.932870,1049626.0,70.33473,,,,0
2018-10-06 03:57:06,22520390.0,13068560.0,96.932900,1049626.0,70.33496,,,,0
...,...,...,...,...,...,...,...,...,...
2018-05-19 19:01:05,33586320.0,28076270.0,3.253794,4002285.0,37.74498,,,,8
2018-05-19 19:01:06,33586440.0,28076420.0,3.253723,4002285.0,38.21421,,,,8
2018-05-19 19:01:07,33586900.0,28076490.0,3.253835,4002278.0,38.75061,,,,8
2018-05-19 19:01:08,33586480.0,28076560.0,3.253942,4002255.0,39.31084,,,,8


In [41]:
# Keep only the first row seen for each timestamp (index-based de-duplication).
# NOTE(review): the index is the timestamp alone, so rows from different
# instances that share a timestamp are dropped as well (the loader printed
# overlapping time ranges for some files) — confirm this is intended.
df_output = df_output.loc[~df_output.index.duplicated(keep="first")]

In [42]:
# Keep only the first row seen for each timestamp (index-based de-duplication).
# NOTE(review): the simulated files printed heavily overlapping time ranges, so
# this step discards every row whose timestamp already appeared in an earlier
# file — confirm this is intended.
df_draw = df_draw.loc[~df_draw.index.duplicated(keep="first")]

In [43]:
# (rows, columns) of the real-instance frame after index de-duplication.
df_output.shape

(1334120, 9)

In [44]:
# (rows, columns) of the simulated-instance frame after index de-duplication.
df_draw.shape

(988133, 9)

In [45]:
# Work on a copy of the real data without the QGL, T-JUS-CKGL and P-JUS-CKGL
# columns, then preview it.
df_aux = df_output.drop(["QGL", "T-JUS-CKGL", "P-JUS-CKGL"], axis="columns")
df_aux

Unnamed: 0_level_0,P-PDG,P-TPT,T-TPT,P-MON-CKP,T-JUS-CKP,class
timestamp,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2014-01-24 09:33:03,0.0,18433410.0,116.8718,9397031.0,74.80031,0.0
2014-01-24 09:33:04,0.0,18433410.0,116.8718,9397031.0,74.80031,0.0
2014-01-24 09:33:05,0.0,18433410.0,116.8718,9397032.0,74.80031,0.0
2014-01-24 09:33:06,0.0,18433410.0,116.8718,9397033.0,74.80031,0.0
2014-01-24 09:33:07,0.0,18433410.0,116.8718,9397033.0,74.80031,0.0
...,...,...,...,...,...,...
2019-04-03 14:59:56,0.0,8489354.0,109.7213,1496222.0,73.38219,7.0
2019-04-03 14:59:57,0.0,8489349.0,109.7212,1495828.0,73.38310,7.0
2019-04-03 14:59:58,0.0,8489344.0,109.7210,1495433.0,73.38401,7.0
2019-04-03 14:59:59,0.0,8489338.0,109.7209,1495039.0,73.38493,7.0


In [46]:
# Work on a copy of the simulated data without the QGL, T-JUS-CKGL and
# P-JUS-CKGL columns, then preview it.
df_aux_draw = df_draw.drop(["QGL", "T-JUS-CKGL", "P-JUS-CKGL"], axis="columns")
df_aux_draw

Unnamed: 0_level_0,P-PDG,P-TPT,T-TPT,P-MON-CKP,T-JUS-CKP,class
timestamp,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2018-10-06 03:57:02,22520410.0,13068630.0,96.932750,1049626.0,70.33402,0
2018-10-06 03:57:03,22520430.0,13068650.0,96.932790,1049626.0,70.33425,0
2018-10-06 03:57:04,22520440.0,13068640.0,96.932830,1049626.0,70.33449,0
2018-10-06 03:57:05,22520420.0,13068600.0,96.932870,1049626.0,70.33473,0
2018-10-06 03:57:06,22520390.0,13068560.0,96.932900,1049626.0,70.33496,0
...,...,...,...,...,...,...
2018-05-19 19:01:05,33586320.0,28076270.0,3.253794,4002285.0,37.74498,8
2018-05-19 19:01:06,33586440.0,28076420.0,3.253723,4002285.0,38.21421,8
2018-05-19 19:01:07,33586900.0,28076490.0,3.253835,4002278.0,38.75061,8
2018-05-19 19:01:08,33586480.0,28076560.0,3.253942,4002255.0,39.31084,8


In [47]:
# Print, for every column of the real-data frame, the value returned by
# Preprocessing.check_null_dataframe (presumably the percentage of null
# entries in that column — TODO confirm against the helper).
for column_name in df_aux.columns:
    percent = Preprocessing.check_null_dataframe(df_data=df_aux, column_name=column_name)
    print(f"{column_name} null data is {percent}")

P-PDG null data is 0.26414415494858035
P-TPT null data is 8.322489731058676
T-TPT null data is 8.322189907954307
P-MON-CKP null data is 8.322789554163043
T-JUS-CKP null data is 8.322789554163043
class null data is 6.438401343207508


In [48]:
# Print, for every column of the simulated-data frame, the value returned by
# Preprocessing.check_null_dataframe (presumably the percentage of null
# entries in that column — TODO confirm against the helper).
for column_name in df_aux_draw.columns:
    percent = Preprocessing.check_null_dataframe(df_data=df_aux_draw, column_name=column_name)
    print(f"{column_name} null data is {percent}")

P-PDG null data is 0.0
P-TPT null data is 0.0
T-TPT null data is 31.694518855255318
P-MON-CKP null data is 0.0
T-JUS-CKP null data is 0.0
class null data is 0.0


In [49]:
# Discard every row that has a missing value in any remaining column, then show
# the resulting shape.
df_aux = df_aux.dropna(axis=0, how="any")
df_aux.shape

(1139181, 6)

In [50]:
# Discard every row that has a missing value in any remaining column, then show
# the resulting shape.
df_aux_draw = df_aux_draw.dropna(axis=0, how="any")
df_aux_draw.shape

(674949, 6)

In [51]:
# Remove rows whose column values repeat an earlier row (the timestamp index is
# not part of the comparison), keeping the first occurrence.
df_aux = df_aux.drop_duplicates(keep="first")
df_aux_draw = df_aux_draw.drop_duplicates(keep="first")

In [52]:
# Number of rows per class in the cleaned real data.
contagem_classes = df_aux['class'].value_counts()

# Share of each class, in percent of all rows.
porcentagem_classes = contagem_classes.div(len(df_aux)).mul(100)
porcentagem_classes

class
0.0    34.119395
5.0    29.185127
7.0    27.332065
1.0     6.570228
6.0     1.700241
2.0     1.092943
Name: count, dtype: float64

In [54]:
# Number of rows per class in the cleaned simulated data.
contagem_classes = df_aux_draw['class'].value_counts()

# Share of each class, in percent of all rows.
porcentagem_classes = contagem_classes.div(len(df_aux_draw)).mul(100)
porcentagem_classes

class
5    47.368295
8    25.091305
1    17.121951
2     8.405596
0     2.012854
Name: count, dtype: float64

Aqui é possível ver que há pouquíssimas amostras da classe normal (classe 0: cerca de 2% dos dados simulados)!

# KNN

In [55]:
# Distinct class labels present in the cleaned real data (these become the
# labels the classifier is trained on).
df_aux["class"].unique()

array([0., 1., 2., 5., 6., 7.])

In [56]:
# Split the real-data frame into the feature matrix (X) and the label vector (y)
X = df_aux.drop(columns='class')
y = df_aux.loc[:, 'class']

In [57]:
# Preview the feature matrix (all columns of df_aux except 'class').
X

Unnamed: 0_level_0,P-PDG,P-TPT,T-TPT,P-MON-CKP,T-JUS-CKP
timestamp,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2014-01-24 09:33:03,0.0,18433410.0,116.8718,9397031.0,74.80031
2014-01-24 09:33:05,0.0,18433410.0,116.8718,9397032.0,74.80031
2014-01-24 09:33:06,0.0,18433410.0,116.8718,9397033.0,74.80031
2014-01-24 09:33:08,0.0,18433410.0,116.8718,9397034.0,74.80031
2014-01-24 09:33:10,0.0,18433410.0,116.8718,9397035.0,74.80031
...,...,...,...,...,...
2019-04-03 14:59:56,0.0,8489354.0,109.7213,1496222.0,73.38219
2019-04-03 14:59:57,0.0,8489349.0,109.7212,1495828.0,73.38310
2019-04-03 14:59:58,0.0,8489344.0,109.7210,1495433.0,73.38401
2019-04-03 14:59:59,0.0,8489338.0,109.7209,1495039.0,73.38493


In [58]:
# Stratified 70/30 hold-out split with a fixed seed.
# NOTE(review): the rows are consecutive per-second readings from the same
# instances, so a random row-level split places near-identical neighbours on
# both sides; the validation scores may be optimistic — consider splitting by
# instance instead.
X_train, X_val, y_train, y_val = train_test_split(
    X,
    y,
    test_size=0.3,
    stratify=y,
    random_state=42,
)

In [59]:
# Standardise the features: the scaler is fitted on the training split only and
# then applied unchanged to the validation split.
# NOTE(review): X_train / X_val are overwritten in place, so re-running this
# cell without re-running the split re-fits the scaler on already-scaled data.
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)

X_val = scaler.transform(X_val)

In [60]:
# k-nearest-neighbours classifier voting over the 5 closest training samples.
modelo = KNeighborsClassifier(n_neighbors=5)

In [61]:
# Fit the classifier on the scaled training split.
modelo.fit(X_train, y_train)

In [62]:
# Predict the class of every row in the scaled validation split.
y_pred = modelo.predict(X_val)

In [63]:
# Overall fraction of validation rows whose predicted class matches the label.
accuracy_score(y_val, y_pred)

0.9877371771601604

In [64]:
# Per-class scores on the validation split (average=None returns one value per
# label instead of a single aggregate).
precision, recall, f1 = (
    score(y_val, y_pred, average=None)
    for score in (precision_score, recall_score, f1_score)
)

In [66]:
# Per-class report for the validation split.
# With average=None the metric arrays hold one score per label, ordered as the
# sorted union of the labels found in y_val and y_pred; that union is rebuilt
# here so each score can be printed next to its class.
unique_y_pred = np.unique(y_pred)
unique_y_test = np.unique(y_val)

result_array = np.union1d(unique_y_pred, unique_y_test)

# Guard against stale notebook state (metrics computed from other predictions).
assert len(result_array) == len(precision) == len(recall) == len(f1), \
    "metric arrays do not match the labels in y_val / y_pred; re-run the metrics cell"

# Only the real class label is printed: the positional index (0..n-1) used to be
# printed as a second "Classe" line and read as a wrong class id (e.g. 3 for 5).
for class_label, class_precision, class_recall, class_f1 in zip(result_array, precision, recall, f1):
    print(f'Classe {class_label}:')
    print(f'Precision: {class_precision}')
    print(f'Recall: {class_recall}')
    print(f'F1-score: {class_f1}\n')

Classe 0.0:
Classe 0:
Precision: 0.9811346819661374
Recall: 0.982959361970832
F1-score: 0.9820461743876039

Classe 1.0:
Classe 1:
Precision: 0.9999550783882125
Recall: 0.9999101608121462
F1-score: 0.9999326190957482

Classe 2.0:
Classe 2:
Precision: 0.9997298757428417
Recall: 0.9994598973805023
F1-score: 0.999594868332208

Classe 5.0:
Classe 3:
Precision: 0.9860255994815295
Recall: 0.9846695251193269
F1-score: 0.9853470957296094

Classe 6.0:
Classe 4:
Precision: 0.9954616861581428
Recall: 0.9899323034195452
F1-score: 0.9926892950391645

Classe 7.0:
Classe 5:
Precision: 0.9939285251285597
Recall: 0.9934455614465116
F1-score: 0.9936869846035868



In [67]:
# Evaluate the KNN model (trained on real instances) on the simulated instances.
# NOTE(review): this cell overwrites X, y_pred, precision, recall and f1 from the
# validation cells above; the names are kept because other cells may read them.
# NOTE(review): the training labels shown earlier contain no class 8, while
# about a quarter of the simulated rows are class 8, so those rows can never be
# predicted correctly by this model.

# Separate the features (X) and the labels (y)
X = df_aux_draw.drop('class', axis=1)
y_test = df_aux_draw['class']

# Pass the DataFrame itself rather than X.values: the scaler was fitted on a
# DataFrame, so scikit-learn can verify the column names and order instead of
# warning about missing feature names.
X_norm = scaler.transform(X)

y_pred = modelo.predict(X_norm)

print(accuracy_score(y_test, y_pred))

unique_y_pred = np.unique(y_pred)
unique_y_test = np.unique(y_test)

# Sorted union of every label that occurs as truth or as prediction. Passing it
# explicitly fixes the order of the per-class score arrays.
result_array = np.union1d(unique_y_pred, unique_y_test)

# zero_division=0 yields the same 0.0 scores as the default for labels that are
# never predicted / never present, without emitting UndefinedMetricWarning.
precision = precision_score(y_test, y_pred, labels=result_array, average=None, zero_division=0)
recall = recall_score(y_test, y_pred, labels=result_array, average=None, zero_division=0)
f1 = f1_score(y_test, y_pred, labels=result_array, average=None, zero_division=0)

for class_label, class_precision, class_recall, class_f1 in zip(result_array, precision, recall, f1):
    print(f'Classe {class_label}:')
    print(f'Precision: {class_precision}')
    print(f'Recall: {class_recall}')
    print(f'F1-score: {class_f1}\n')




0.07825290198654296


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Classe 0.0:
Precision: 0.0068845513693448325
Recall: 0.09149908592321755
F1-score: 0.012805587892898719

Classe 1.0:
Precision: 0.0
Recall: 0.0
F1-score: 0.0

Classe 2.0:
Precision: 0.24953259075656353
Recall: 0.8968808142716428
F1-score: 0.3904368043909132

Classe 5.0:
Precision: 0.007202072538860104
Recall: 0.0021596426490580697
F1-score: 0.003322874645151651

Classe 6.0:
Precision: 0.0
Recall: 0.0
F1-score: 0.0

Classe 7.0:
Precision: 0.0
Recall: 0.0
F1-score: 0.0

Classe 8.0:
Precision: 0.0
Recall: 0.0
F1-score: 0.0

