In [1]:
import pandas as pd
import os
import sys
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import dill
from typing import List
import scipy.stats as sts
import pickle
from sklearn.pipeline import Pipeline
from sklearn.decomposition import PCA
from sklearn.preprocessing import RobustScaler
from sklearn.metrics import precision_score, recall_score, f1_score, accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.neural_network import MLPClassifier
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from tensorflow.keras.callbacks import ModelCheckpoint
from tensorflow.keras.models import load_model

import util
from pathlib import Path
from sk.replace_column_dataframe import ReplaceColumnDataFrame
from sk.norm_standard_scaler import NormStandardScaler
import constants.columns_dataframe as const
from class_manipulates_path import ManipulatePath
from class_preprocessing_refactor import Preprocessing
from class_format_data import FormatData
util.init()

current_path = sys.path[0]

In [2]:
import tensorflow as tf

# Verifique se a GPU está disponível
print("GPU disponível:", tf.config.list_physical_devices('GPU'))

# Verifique se o TensorFlow está usando a GPU
print("TensorFlow está usando a GPU:", tf.test.is_built_with_cuda())

GPU disponível: []
TensorFlow está usando a GPU: False


In [3]:
print("Configurações do TensorFlow relacionadas à GPU:")
print(tf.config.list_physical_devices('GPU'))
print("TensorFlow está usando a GPU:", tf.test.is_built_with_cuda())
print("Número de GPUs disponíveis:", len(tf.config.experimental.list_physical_devices('GPU')))

Configurações do TensorFlow relacionadas à GPU:
[]
TensorFlow está usando a GPU: False
Número de GPUs disponíveis: 0


In [4]:
def plot_variable(df, column_name, intervalos=None):
    """
    Plota um gráfico de dispersão para uma variável de um DataFrame.

    Parâmetros:
        - df: DataFrame pandas contendo os dados.
        - column_name: Nome da coluna que você deseja plotar.
        - intervalos: Lista de intervalos para plotar linhas verticais.
    """
    plt.figure(figsize=(20, 10))
    plt.scatter(df.index.values, df[column_name].values, color='darkcyan', alpha=0.5)
    plt.xlabel('Índice')
    plt.ylabel(column_name)
    plt.title(f'Gráfico de dispersão para a variável "{column_name}"')


    # Plotar linhas verticais para cada intervalo
    if intervalos:
        for intervalo in intervalos:
            plt.axvline(x=intervalo, linestyle='--', color='red')

    plt.show()

In [5]:
def mapping_labels(df: pd.DataFrame):
    df_data = df.copy()
    unique_labels = df_data['class'].unique()
    unique_labels.sort()

    # Mapear os rótulos para inteiros em ordem crescente
    label_mapping = {label: i for i, label in enumerate(unique_labels)}

    # Aplicar o mapeamento aos rótulos verdadeiros
    labels_int = df_data['class'].map(label_mapping)

    # Criar DataFrame com rótulos inteiros
    df_mapped = pd.DataFrame({'class': df_data['class'], 'mapped_class': labels_int})
    df_data['class'] = df_mapped["mapped_class"]
    return df_data, label_mapping

def inverse_mapping_labels(df: pd.DataFrame, label_mapping: dict):
    df_data = df.copy()
    # Inverter o mapeamento original
    inverse_label_mapping = {v: k for k, v in label_mapping.items()}

    # Aplicar o mapeamento inverso aos rótulos
    labels_original = df_data['class'].map(inverse_label_mapping)

    # Atualizar o DataFrame com os rótulos originais
    df_data['class'] = labels_original
    return df_data

In [6]:
manipulate_path = ManipulatePath()

preprocessing = Preprocessing()

format_data = FormatData()

path_raw_data = manipulate_path.get_path_raw_data()

# Carregando Scaler

In [7]:
with open('scaler.pkl', 'rb') as file:
    loaded_scaler: StandardScaler = pickle.load(file)

# Avaliação com os dados desenhados

In [8]:
df_preprocessing = pd.read_parquet(manipulate_path.get_path_preprocessing_draw_data())

In [9]:
contagem_classes = df_preprocessing['class'].value_counts()

# Calcular a porcentagem de cada classe
porcentagem_classes_real = contagem_classes / len(df_preprocessing) * 100
porcentagem_classes_real

class
7    76.090745
1    19.606371
0     4.302884
Name: count, dtype: float64

In [10]:
X = df_preprocessing.drop('class', axis=1)
y = df_preprocessing['class']

X_norm = loaded_scaler.transform(X)

In [11]:
len(X)

1995127

In [12]:
model_1_load = load_model(os.path.join(Path(os.getcwd()), "peso_rede_1.h5"))

In [13]:
model_2_load = load_model(os.path.join(Path(os.getcwd()), "peso_rede_2.h5"))

In [14]:
y_test_nn_1 = ReplaceColumnDataFrame(const.TARGET, const.MAPPING_TWO_CLASSES).transform(pd.DataFrame(y))["class"]

In [15]:
np.unique(y_test_nn_1)

array([0, 1], dtype=int64)

In [16]:
y_pred = model_1_load.predict(X_norm)

y_pred = np.squeeze(y_pred)
y_pred = np.where(y_pred > 0.5, 1, 0)

accuracy_score(y_test_nn_1, y_pred)



0.2924024385415064

In [17]:
f1_score(y_test_nn_1, y_pred, average="macro")

0.24993209454582074

In [18]:
df_test_all = X_norm.copy()
df_test_all["class"] = y_pred
df_test_all_one = df_test_all[df_test_all["class"] == 1]

In [19]:
X_test_all = df_test_all_one.drop('class', axis=1)

In [20]:
y_pred_all = model_2_load.predict(X_test_all)
y_pred_all = np.argmax(y_pred_all, axis=1)



In [21]:
np.unique(y_pred_all)

array([1, 4], dtype=int64)

In [22]:
df_test_all_one.loc[:, "class"] = y_pred_all

In [23]:
df_test_all_one["class"].unique()

array([4, 1], dtype=int64)

In [24]:
df_test_all_one["class"] = df_test_all_one["class"].replace(4, 7)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_test_all_one["class"] = df_test_all_one["class"].replace(4, 7)


In [28]:
df_test_all.loc[df_test_all["class"] == 1, "class"] = df_test_all_one["class"].values

In [29]:
df_test_all

Unnamed: 0_level_0,P-TPT,T-TPT,P-MON-CKP,T-JUS-CKP,class
timestamp,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2018-09-05 20:44:36,-2.725227,0.475421,-1.651004,0.392071,7
2018-09-05 20:49:17,-2.725227,0.475421,-1.651004,0.392071,7
2018-09-05 20:49:18,-2.725227,0.475421,-1.651004,0.392071,7
2018-09-05 20:49:19,-2.725227,0.475421,-1.651004,0.392071,7
2018-09-05 20:49:20,-2.725227,0.475421,-1.651004,0.392071,7
...,...,...,...,...,...
2018-08-21 11:11:33,-2.725220,0.640268,-1.650997,0.294921,7
2018-08-21 11:11:35,-2.725220,0.640268,-1.650997,0.294921,7
2018-08-21 11:11:36,-2.725220,0.640268,-1.650997,0.294921,7
2018-08-21 11:11:38,-2.725220,0.640268,-1.650997,0.294921,7


In [30]:
df_test_all["class"].unique()

array([7, 0, 1], dtype=int64)

In [31]:
np.unique(y)

array([0, 1, 7], dtype=int64)

In [32]:
accuracy_score(y, df_test_all["class"].values)

0.11067766613353436

In [33]:
f1_score(y, df_test_all["class"].values, average="macro")

0.08113612727585373

In [34]:
precision = precision_score(y, df_test_all["class"].values, average=None)
recall = recall_score(y, df_test_all["class"].values, average=None)
f1 = f1_score(y, df_test_all["class"].values, average=None)

for i in range(len(precision)):
    print(f'Classe {np.unique(y)[i]}:')
    print(f'Precision: {precision[i]}')
    print(f'Recall: {recall[i]}')
    print(f'F1-score: {f1[i]}\n')

Classe 0:
Precision: 0.037863245897091106
Recall: 0.6326996552045475
F1-score: 0.07145060540825676

Classe 1:
Precision: 1.0
Recall: 0.007175871483644024
F1-score: 0.014249490455075016

Classe 7:
Precision: 0.2934676015482557
Recall: 0.1078270503989508
F1-score: 0.15770828596422942



# Avaliação com os dados simulados

In [35]:
df_preprocessing = pd.read_parquet(manipulate_path.get_path_preprocessing_simulated_data())

In [36]:
#df_preprocessing, label_mapping = mapping_labels(df_preprocessing)

In [37]:
label_mapping = {1.0: 0, 2.0: 1, 5.0: 2, 6.0: 3, 7.0: 4}

In [38]:
contagem_classes = df_preprocessing['class'].value_counts()

# Calcular a porcentagem de cada classe
porcentagem_classes_real = contagem_classes / len(df_preprocessing) * 100
porcentagem_classes_real

class
5    55.980035
1    40.844868
2     2.312167
0     0.862930
Name: count, dtype: float64

In [39]:
X = df_preprocessing.drop('class', axis=1)
y = df_preprocessing['class']

X_norm = loaded_scaler.transform(X)

In [49]:
len(X)

17437449

In [40]:
y_test_nn_1 = ReplaceColumnDataFrame(const.TARGET, const.MAPPING_TWO_CLASSES).transform(pd.DataFrame(y))["class"]

In [41]:
np.unique(
    y_test_nn_1
)

array([0, 1], dtype=int64)

In [42]:
y_pred = model_1_load.predict(X_norm)



In [43]:
y_pred

array([[8.1351134e-11],
       [8.1313900e-11],
       [8.1285992e-11],
       ...,
       [9.9999994e-01],
       [9.9999994e-01],
       [9.9999994e-01]], dtype=float32)

In [44]:
y_pred = np.squeeze(y_pred)
y_pred = np.where(y_pred > 0.5, 1, 0)

accuracy_score(y_test_nn_1, y_pred)

0.32644213038271824

In [47]:
np.unique(y_pred)

array([0, 1])

In [45]:
df_test_all = X_norm.copy()
df_test_all["class"] = y_pred
df_test_all_one = df_test_all[df_test_all["class"] == 1]

In [46]:
X_test_all = df_test_all_one.drop('class', axis=1)

In [48]:
y_pred_all = model_2_load.predict(X_test_all)
y_pred_all = np.argmax(y_pred_all, axis=1)



In [49]:
np.unique(y_pred_all)

array([0, 1, 2, 3, 4], dtype=int64)

In [57]:
unique, counts = np.unique(y_pred_all, return_counts=True)

# Calcular a proporção de cada valor
proportions = counts / len(y_pred_all)
proportions

array([0.46505533, 0.04955884, 0.01609157, 0.35750314, 0.11179111])

In [53]:
label_mapping

{1.0: 0, 2.0: 1, 5.0: 2, 6.0: 3, 7.0: 4}

In [51]:
temp = y_pred_all.copy()

In [54]:
temp = np.where(temp == 4, 7, temp)
temp = np.where(temp == 3, 6, temp)
temp = np.where(temp == 2, 5, temp)
temp = np.where(temp == 1, 2, temp)
temp = np.where(temp == 0, 1, temp)

In [55]:
np.unique(temp)

array([1, 2, 5, 6, 7], dtype=int64)

In [56]:
unique, counts = np.unique(temp, return_counts=True)

# Calcular a proporção de cada valor
proportions = counts / len(temp)
proportions

array([0.46505533, 0.04955884, 0.01609157, 0.35750314, 0.11179111])

In [58]:
df_test_all_one.loc[:, "class"] = temp

In [59]:
df_test_all_one["class"].unique()

array([6, 1, 5, 2, 7], dtype=int64)

In [62]:
df_test_all.loc[df_test_all["class"] == 1, "class"] = df_test_all_one["class"].values

In [64]:
np.unique(y)

array([0, 1, 2, 5], dtype=int64)

In [65]:
y_test_nn_2 = df_test_all["class"].values
np.unique(y_test_nn_2)

array([0, 1, 2, 5, 6, 7], dtype=int64)

In [66]:
accuracy_score(y, y_test_nn_2)

0.02053649017124007

In [67]:
f1_score(y, y_test_nn_2, average="macro")

0.13563843288342614

In [68]:
precision = precision_score(y, y_test_nn_2, average=None)
recall = recall_score(y, y_test_nn_2, average=None)
f1 = f1_score(y, y_test_nn_2, average=None)

  _warn_prf(average, modifier, msg_start, len(result))


In [69]:

for i in range(len(precision)):
    print(f'Classe {df_test_all["class"].unique()[i]}:')
    print(f'Precision: {precision[i]}')
    print(f'Recall: {recall[i]}')
    print(f'F1-score: {f1[i]}\n')

Classe 0:
Precision: 0.00718104513513201
Recall: 0.5613963966957527
F1-score: 0.014180699291871527

Classe 6:
Precision: 0.0
Recall: 0.0
F1-score: 0.0

Classe 1:
Precision: 0.9731141687619359
Recall: 0.6786719678161034
F1-score: 0.7996498980086854

Classe 5:
Precision: 0.0
Recall: 0.0
F1-score: 0.0

Classe 2:
Precision: 0.0
Recall: 0.0
F1-score: 0.0

Classe 7:
Precision: 0.0
Recall: 0.0
F1-score: 0.0

