In [2]:
import pandas as pd
import os
import sys
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import dill
from typing import List
import scipy.stats as sts
from sklearn.pipeline import Pipeline
from sklearn.decomposition import PCA
from sklearn.preprocessing import RobustScaler
from sklearn.metrics import f1_score
from sklearn.metrics import precision_score, recall_score, f1_score, accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.neural_network import MLPClassifier

import util
from pathlib import Path
from class_manipulates_path import ManipulatePath
from class_preprocessing_refactor import Preprocessing
from class_format_data import FormatData
util.init()

current_path = sys.path[0]

In [3]:
def plot_variable(df, column_name, intervalos=None):
    """
    Plota um gráfico de dispersão para uma variável de um DataFrame.

    Parâmetros:
        - df: DataFrame pandas contendo os dados.
        - column_name: Nome da coluna que você deseja plotar.
        - intervalos: Lista de intervalos para plotar linhas verticais.
    """
    plt.figure(figsize=(20, 10))
    plt.scatter(df.index.values, df[column_name].values, color='darkcyan', alpha=0.5)
    plt.xlabel('Índice')
    plt.ylabel(column_name)
    plt.title(f'Gráfico de dispersão para a variável "{column_name}"')


    # Plotar linhas verticais para cada intervalo
    if intervalos:
        for intervalo in intervalos:
            plt.axvline(x=intervalo, linestyle='--', color='red')

    plt.show()

In [4]:
def class_and_file_generator(data_path, real=False, simulated=False, drawn=False):
    for class_path in data_path.iterdir():
        if class_path.is_dir():
            class_code = int(class_path.stem)
            for instance_path in class_path.iterdir():
                if (instance_path.suffix == '.csv'):
                    if (simulated and instance_path.stem.startswith('SIMULATED')) or \
                       (drawn and instance_path.stem.startswith('DRAWN')) or \
                       (real and (not instance_path.stem.startswith('SIMULATED')) and \
                       (not instance_path.stem.startswith('DRAWN'))):
                        yield class_code, instance_path

In [5]:
manipulate_path = ManipulatePath()

preprocessing = Preprocessing()

format_data = FormatData()

path_raw_data = manipulate_path.get_path_raw_data()

abnormal_classes_codes = [1, 2, 5, 6, 7, 8]

In [6]:
# Gets all real instances but maintains only those with any type of undesirable event
real_instances = pd.DataFrame(class_and_file_generator(path_raw_data, 
                                                       real=True,
                                                       simulated=False, 
                                                       drawn=False),
                              columns=['class_code', 'instance_path'])
real_instances = real_instances.loc[real_instances.iloc[:,0].isin(abnormal_classes_codes)].reset_index(drop=True)

In [7]:
real_instances[real_instances["class_code"] == 1]

Unnamed: 0,class_code,instance_path
0,1,c:\Users\marce\Documents\MEGA\0_PUC-Rio\2_Peri...
1,1,c:\Users\marce\Documents\MEGA\0_PUC-Rio\2_Peri...
2,1,c:\Users\marce\Documents\MEGA\0_PUC-Rio\2_Peri...
3,1,c:\Users\marce\Documents\MEGA\0_PUC-Rio\2_Peri...
4,1,c:\Users\marce\Documents\MEGA\0_PUC-Rio\2_Peri...


In [8]:
real_instances[real_instances["class_code"] == 1]["instance_path"].values

array([WindowsPath('c:/Users/marce/Documents/MEGA/0_PUC-Rio/2_Periodo/2_Data Science/2_Trabalho_Final/3_3W/1_data/1_raw_data/1/WELL-00001_20140124093303.csv'),
       WindowsPath('c:/Users/marce/Documents/MEGA/0_PUC-Rio/2_Periodo/2_Data Science/2_Trabalho_Final/3_3W/1_data/1_raw_data/1/WELL-00002_20140126161944.csv'),
       WindowsPath('c:/Users/marce/Documents/MEGA/0_PUC-Rio/2_Periodo/2_Data Science/2_Trabalho_Final/3_3W/1_data/1_raw_data/1/WELL-00006_20170731180930.csv'),
       WindowsPath('c:/Users/marce/Documents/MEGA/0_PUC-Rio/2_Periodo/2_Data Science/2_Trabalho_Final/3_3W/1_data/1_raw_data/1/WELL-00006_20170731220432.csv'),
       WindowsPath('c:/Users/marce/Documents/MEGA/0_PUC-Rio/2_Periodo/2_Data Science/2_Trabalho_Final/3_3W/1_data/1_raw_data/1/WELL-00006_20180617200257.csv')],
      dtype=object)

In [9]:
falha = 1

file_paths = real_instances[real_instances["class_code"] == falha]["instance_path"].values
df_output = pd.DataFrame()

for file_path in file_paths:
    df_data = pd.read_csv(file_path)

    df_data = FormatData.set_index_dataframe(df_data=df_data,
                                         column_name="timestamp")
    quantidade = (df_data['class'] == falha).sum()
    print(f"1 se repete {quantidade}")
    print(f"Index min {df_data.index.min()} index max {df_data.index.max()}")
    
    df_output = pd.concat([df_output, df_data], axis=0)

1 se repete 4603
Index min 2014-01-24 09:33:03 index max 2014-01-25 01:47:30
1 se repete 2929
Index min 2014-01-26 16:19:44 index max 2014-01-26 23:00:00
1 se repete 1205
Index min 2017-07-31 18:09:30 index max 2017-08-01 20:45:00
1 se repete 861
Index min 2017-07-31 22:04:32 index max 2017-08-01 20:19:27
1 se repete 819
Index min 2018-06-17 20:02:57 index max 2018-06-18 11:00:00


In [10]:
df_output = df_output[~df_output.index.duplicated()]

In [11]:
df_output.head()

Unnamed: 0_level_0,P-PDG,P-TPT,T-TPT,P-MON-CKP,T-JUS-CKP,P-JUS-CKGL,T-JUS-CKGL,QGL,class
timestamp,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
2014-01-24 09:33:03,0.0,18433410.0,116.8718,9397031.0,74.80031,-300917.8,,0.0,0.0
2014-01-24 09:33:04,0.0,18433410.0,116.8718,9397031.0,74.80031,-300917.8,,0.0,0.0
2014-01-24 09:33:05,0.0,18433410.0,116.8718,9397032.0,74.80031,-300917.8,,0.0,0.0
2014-01-24 09:33:06,0.0,18433410.0,116.8718,9397033.0,74.80031,-300917.8,,0.0,0.0
2014-01-24 09:33:07,0.0,18433410.0,116.8718,9397033.0,74.80031,-300917.8,,0.0,0.0


In [12]:
df_aux = df_output.drop(columns=["QGL", "T-JUS-CKGL"])
df_aux

Unnamed: 0_level_0,P-PDG,P-TPT,T-TPT,P-MON-CKP,T-JUS-CKP,P-JUS-CKGL,class
timestamp,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
2014-01-24 09:33:03,0.000000e+00,18433410.0,116.8718,9397031.0,74.80031,-300917.8,0.0
2014-01-24 09:33:04,0.000000e+00,18433410.0,116.8718,9397031.0,74.80031,-300917.8,0.0
2014-01-24 09:33:05,0.000000e+00,18433410.0,116.8718,9397032.0,74.80031,-300917.8,0.0
2014-01-24 09:33:06,0.000000e+00,18433410.0,116.8718,9397033.0,74.80031,-300917.8,0.0
2014-01-24 09:33:07,0.000000e+00,18433410.0,116.8718,9397033.0,74.80031,-300917.8,0.0
...,...,...,...,...,...,...,...
2018-06-18 10:59:56,-1.180116e+42,20647810.0,118.2188,10136700.0,71.27952,4108865.0,1.0
2018-06-18 10:59:57,-1.180116e+42,20647810.0,118.2194,10137410.0,71.27949,4108866.0,1.0
2018-06-18 10:59:58,-1.180116e+42,20647810.0,118.2201,10138120.0,71.27946,4108866.0,1.0
2018-06-18 10:59:59,-1.180116e+42,20647810.0,118.2207,10138830.0,71.27944,4108867.0,1.0


In [13]:
for column_name in df_aux.columns:
    percent = Preprocessing.check_null_dataframe(df_data=df_aux, column_name=column_name)
    print(f"{column_name} null data is {percent}")

P-PDG null data is 0.0
P-TPT null data is 0.0
T-TPT null data is 0.0
P-MON-CKP null data is 0.0
T-JUS-CKP null data is 0.0
P-JUS-CKGL null data is 0.0
class null data is 4.459575935183589


In [14]:
df_aux = df_aux.dropna()
df_aux.shape

(221692, 7)

In [15]:
for column_name in df_aux.columns:
    percent = Preprocessing.check_column_low_variance(df_data=df_output, column_name=column_name)
    print(f"{column_name} low variance is {percent}")

P-PDG low variance is 0
P-TPT low variance is 0
T-TPT low variance is 0
P-MON-CKP low variance is 0
T-JUS-CKP low variance is 0
P-JUS-CKGL low variance is 0
class low variance is 0


In [16]:
df_aux["class"] = df_aux["class"].replace(101, 1)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_aux["class"] = df_aux["class"].replace(101, 1)


In [17]:
contagem_classes = df_aux['class'].value_counts()

# Calcular a porcentagem de cada classe
porcentagem_classes = contagem_classes / len(df_aux) * 100
porcentagem_classes

class
0.0    66.432257
1.0    33.567743
Name: count, dtype: float64

In [18]:
import numpy as np
from tensorflow.keras.layers import Input, Dense
from tensorflow.keras.models import Model
from sklearn.model_selection import train_test_split

In [19]:
# Separar as features (X) e os rótulos (y)
X = df_aux.drop('class', axis=1)
y = df_aux['class']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, stratify=y, random_state=42)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, stratify=y_train, random_state=42)

In [20]:
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)

X_val = scaler.transform(X_val)
X_test = scaler.transform(X_test)

In [21]:
# Definir a arquitetura do Autoencoder
input_dim = X_train.shape[1]
encoding_dim = 4  # Dimensão da camada de codificação

input_layer = Input(shape=(input_dim,))
encoder = Dense(encoding_dim, activation='relu')(input_layer)
decoder = Dense(input_dim, activation='sigmoid')(encoder)

autoencoder = Model(inputs=input_layer, outputs=decoder)

# Compilar o modelo
autoencoder.compile(optimizer='adam', loss='mse')

# Treinar o Autoencoder
autoencoder.fit(X_train, X_train, epochs=50, batch_size=32, shuffle=True, validation_data=(X_val, X_val))

# Usar o Autoencoder para reconstruir os dados de teste
reconstructions = autoencoder.predict(X_test)


Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50


In [55]:
# Calcular a diferença entre os dados originais e as reconstruções
mse = np.sqrt(np.mean(np.power(X_test - reconstructions, 2), axis=1))
#mse = np.mean(np.power(X_test - reconstructions, 2), axis=1)

# Definir um limiar para identificar anomalias (por exemplo, o percentil 95 das pontuações MSE)
limiar = np.percentile(mse, 95)

# Identificar anomalias com base no limiar
anomalias_indices = np.where(mse >= limiar)[0]

# Imprimir os índices das anomalias
print("Índices das anomalias:", anomalias_indices)

Índices das anomalias: [   16    29    69 ... 66491 66492 66505]


In [56]:
df_teste = pd.DataFrame(X_test)

In [57]:
df_teste["PRED"] = 0
df_teste.loc[anomalias_indices, 'PRED'] = 1
df_teste["class"] = y_test.values

In [58]:
accuracy_score(df_teste['class'].values,  df_teste["PRED"].values)

0.6996301196848499

In [59]:
precision = precision_score(df_teste['class'].values,  df_teste["PRED"].values, average=None)
recall = recall_score(df_teste['class'].values,  df_teste["PRED"].values, average=None)
f1 = f1_score(df_teste['class'].values,  df_teste["PRED"].values, average=None)

In [60]:
for i in range(len(precision)):
    print(f'Classe {i}:')
    print(f'Precision: {precision[i]}')
    print(f'Recall: {recall[i]}')
    print(f'F1-score: {f1[i]}\n')

Classe 0:
Precision: 0.6915577221360514
Recall: 0.9889323948124844
F1-score: 0.8139337773017278

Classe 1:
Precision: 0.8529765484064943
Recall: 0.12707726763717805
F1-score: 0.22119995321819813

