# Evaluation of CatVAE discretization performance with preprocessed tank Dataset
Visual evaluation of the precision of the discretization and of how meaningful the learned categories are. <br>
Within the first subplot we plot a selection of measurement values from the dataset. <br>
The second subplot includes the learned discretizations from the CatVAE indicated by the index for the x-axis. <br>
The third subplot plots the log-likelihood of the CatVAE for the input data. <br>

To make a statement about the change in states and likelihood, we plot one codeblock with nominal system behavior and the following with anomalous system behavior. 

In [None]:
# IPython automagic: move the working directory one level up.
# NOTE(review): presumably the notebook lives in a subfolder of the repository
# root, so that the relative paths used in later cells
# ('VAE_training_hparams/...', 'preprocessed_data/...') resolve — confirm.
cd ..

In [None]:
import os
import pandas as pd
import torch
import plotly.graph_objects as go
import yaml
from plotly.subplots import make_subplots
import numpy as np
from ipywidgets import interact
from sklearn.metrics import precision_score

from utils import standardize_data
from seq_vae import seq_vae
from seq_datamodule import Dataset as seq_dataset
from sklearn.preprocessing import StandardScaler


In [None]:
def plot_like(anom):
    """Plot scaled signals, per-window model residuals and threshold flags for one run.

    Loads the sequence VAE checkpoint, standardizes a 5000-row slice of the
    selected recording with statistics fitted on the nominal recording, runs the
    model window by window and assembles a five-row plotly figure:

    * row 1: the first (up to) three standardized signal columns,
    * row 4: the model's ``'recon_loss'`` output per window,
    * row 5: a step signal that is 0.5 where the residual is below the
      threshold and 0 elsewhere.

    Rows 2 and 3 are left empty.

    Args:
        anom: ``'norm'`` to evaluate ``preprocessed_data/SmA/id1_norm.csv``
            (rows 20000-25000); any other value is used as the prefix of
            ``preprocessed_data/SmA/{anom}_anomaly.csv`` (rows 40000-45000).

    Returns:
        The assembled ``plotly`` figure, titled with ``anom``.
    """
    MODEL_VERSION_SEQ = 'VAE_training_hparams/SmA/seq_vae'
    NORM_CSV = 'preprocessed_data/SmA/id1_norm.csv'
    LABEL_COLUMN = 'CuStepNo ValueY'
    # Residuals below this value are flagged as anomalous.
    threshold = -550
    # Offset of the residual x-axis relative to the signal x-axis.
    # NOTE(review): presumably the window length of the dataset — confirm against hparams.
    x_offset = 10

    # os.listdir() returns entries in arbitrary order; sorting makes the choice
    # of checkpoint deterministic (lexicographically last file name).
    ckpt_dir = f'./{MODEL_VERSION_SEQ}/checkpoints/'
    ckpt_file_name_seq = sorted(os.listdir(ckpt_dir))[-1]
    ckpt_file_path_seq = f'{ckpt_dir}{ckpt_file_name_seq}'
    with open(f'./{MODEL_VERSION_SEQ}/hparams.yaml') as f:
        hparam_seq = yaml.safe_load(f)
    model_seq = seq_vae.load_from_checkpoint(ckpt_file_path_seq, hparams=hparam_seq).to('cuda')

    # The scaler is always fitted on the same slice of the nominal recording,
    # regardless of which recording is evaluated.
    norm_features = pd.read_csv(NORM_CSV).drop(columns=[LABEL_COLUMN])
    scaler = StandardScaler().fit(norm_features.iloc[8000:30000, :])

    if anom == 'norm':
        df_csv = norm_features
        rows = slice(20000, 25000)
    else:
        df_csv = pd.read_csv(f'preprocessed_data/SmA/{anom}_anomaly.csv').drop(columns=[LABEL_COLUMN])
        rows = slice(40000, 45000)

    # Standardization is row-independent, so only the evaluated slice is transformed.
    raw_slice = df_csv.iloc[rows, :].reset_index(drop=True)
    df_csv_sc = pd.DataFrame(scaler.transform(raw_slice), columns=raw_slice.columns, index=raw_slice.index)
    dataset_seq = seq_dataset(dataframe=df_csv_sc, **hparam_seq)

    all_residuals = []
    # Pure inference: no autograd graph is needed for the forward passes.
    with torch.no_grad():
        for window in dataset_seq:
            # 'window' is the input sequence; a batch dimension is added for the model.
            _, x = model_seq(window.unsqueeze(0).to('cuda'))
            all_residuals.extend(x['recon_loss'].flatten().detach().cpu().numpy())

    all_residuals = np.array(all_residuals)
    anom_labels = np.where(all_residuals < threshold, .5, 0)

    fig = make_subplots(rows=5, cols=1, shared_xaxes=True)
    for column in df_csv_sc.columns[:3]:
        fig.add_trace(go.Scatter(x=df_csv_sc.index, y=df_csv_sc[column], mode='markers'),
                      row=1, col=1)
    # The x-axis must have one entry per residual (not one per row of the full CSV).
    new_x = list(range(x_offset, x_offset + len(all_residuals)))
    fig.add_trace(go.Scatter(x=new_x, y=all_residuals, name='residual', mode='markers'), row=4, col=1)
    fig.add_trace(go.Scatter(x=new_x, y=anom_labels, mode='lines', name='anomaly'), row=5, col=1)
    fig.update_layout(title_text=anom)
    return fig

def composite_f1score(anomaly_data):
    """Return the mean composite F-score of the sequence VAE over several anomaly runs.

    For every name in ``anomaly_data`` the rows 40000-45000 of
    ``preprocessed_data/SmA/{name}_anomaly.csv`` are standardized with statistics
    fitted on the nominal recording, passed window by window through the model,
    and every window whose ``'recon_loss'`` output is below the threshold is
    flagged. The whole evaluated slice is treated as one anomalous event
    (all ground-truth labels are 1), so:

    * event recall is 1 if at least one window is flagged, else 0,
    * precision is computed point-wise with ``precision_score``,
    * the composite score is the harmonic mean of both (0 if both are 0).

    Args:
        anomaly_data: Iterable of anomaly file prefixes, e.g. ``['id2', 'id3']``.

    Returns:
        The arithmetic mean of the per-run composite F-scores.

    Raises:
        ValueError: If ``anomaly_data`` is empty (there is nothing to average).
    """
    anomaly_names = list(anomaly_data)
    if not anomaly_names:
        # Fail fast, before loading the model, instead of hitting an undefined result.
        raise ValueError('anomaly_data must contain at least one anomaly name')

    MODEL_VERSION_SEQ = 'VAE_training_hparams/SmA/seq_vae'
    LABEL_COLUMN = 'CuStepNo ValueY'
    # Residuals below this value are flagged as anomalous.
    threshold = -550
    # Index from which the anomalous event is considered to start within the slice.
    start_event = 0

    # os.listdir() returns entries in arbitrary order; sorting makes the choice
    # of checkpoint deterministic (lexicographically last file name).
    ckpt_dir = f'./{MODEL_VERSION_SEQ}/checkpoints/'
    ckpt_file_name_seq = sorted(os.listdir(ckpt_dir))[-1]
    ckpt_file_path_seq = f'{ckpt_dir}{ckpt_file_name_seq}'
    with open(f'./{MODEL_VERSION_SEQ}/hparams.yaml') as f:
        hparam_seq = yaml.safe_load(f)
    model_seq = seq_vae.load_from_checkpoint(ckpt_file_path_seq, hparams=hparam_seq).to('cuda')

    # The scaler depends only on the nominal recording, so it is fitted once
    # instead of once per anomaly run.
    norm_features = pd.read_csv('preprocessed_data/SmA/id1_norm.csv').drop(columns=[LABEL_COLUMN])
    scaler = StandardScaler().fit(norm_features.iloc[8000:30000, :])

    fscore_arr = []
    for anom_name in anomaly_names:
        df_csv = pd.read_csv(f'preprocessed_data/SmA/{anom_name}_anomaly.csv').drop(columns=[LABEL_COLUMN])
        # Standardization is row-independent, so only the evaluated slice is transformed.
        raw_slice = df_csv.iloc[40000:45000, :].reset_index(drop=True)
        df_csv_sc = pd.DataFrame(scaler.transform(raw_slice), columns=raw_slice.columns, index=raw_slice.index)
        dataset_seq = seq_dataset(dataframe=df_csv_sc, **hparam_seq)

        all_residuals = []
        # Pure inference: no autograd graph is needed for the forward passes.
        with torch.no_grad():
            for window in dataset_seq:
                # A batch dimension is added before the window is passed to the model.
                _, x = model_seq(window.unsqueeze(0).to('cuda'))
                all_residuals.extend(x['recon_loss'].flatten().detach().cpu().numpy())

        all_residuals = np.array(all_residuals)

        # Every point of the evaluated slice counts as a true anomaly.
        true_anomalies = np.ones(np.shape(all_residuals))
        pred_anomalies = all_residuals < threshold
        # True positives (event level): 1 if the event was detected at least once.
        tp = int(pred_anomalies[start_event:].any())
        # False negatives (event level): the single event was missed entirely.
        fn = 1 - tp
        # Event recall: share of true events that were detected.
        rec_e = tp / (tp + fn)
        # Point-wise precision over the whole evaluated time series.
        prec_t = precision_score(true_anomalies, pred_anomalies)
        # Composite F-score: harmonic mean of event recall and point-wise precision.
        if prec_t == 0 and rec_e == 0:
            fscore_c = 0
        else:
            fscore_c = 2 * rec_e * prec_t / (rec_e + prec_t)

        fscore_arr.append(fscore_c)

    return np.mean(fscore_arr)



In [None]:
# Dropdown widget that re-renders plot_like for the selected recording;
# 'id2' stays first in the option list, 'norm' selects the nominal run.
anom_choices = ['id2', 'norm'] + [f'id{run}' for run in range(3, 11)]
interact(plot_like, anom=anom_choices)

In [None]:
# Anomaly recordings id2 ... id10 that are averaged into one composite F-score.
anomaly = [f'id{run}' for run in range(2, 11)]

composite_f1score(anomaly)
