# Evaluation of CatVAE discretization performance with preprocessed tank Dataset
Visual evaluation of the preciseness of discretization and meaningful categories. <br>
Within the first subplot we plot a selection of measurement values from the dataset. <br>
The second subplot includes the learned discretizations from the CatVAE indicated by the index for the x-axis. <br>
The third subplot plots the log-likelihood of the CatVAE for the input data. <br>

To make a statement about the change in states and likelihood, we plot one codeblock with nominal system behavior and the following with anomalous system behavior. 

In [None]:
# Change to the parent directory; the cells below open their model and data files via relative paths.
cd ..

In [None]:
import os
import pandas as pd
import torch
import plotly.graph_objects as go
import yaml
from plotly.subplots import make_subplots
import numpy as np
from ipywidgets import interact
from sklearn.metrics import precision_score

from seq_vae import seq_vae
from seq_datamodule import Dataset as seq_dataset
from sklearn.preprocessing import StandardScaler

In [None]:
# Locations shared by the evaluation helpers below (relative to the working directory).
_MODEL_VERSION_SEQ = 'VAE_training_hparams/tank/seq_vae'
_DATA_DIR = 'preprocessed_data/tank_simulation'
# Model output i is paired with data row i + 10 when predictions are compared
# with the labels (presumably the window length of the dataset - TODO confirm).
_WINDOW_OFFSET = 10


def _load_seq_model():
    """Load the seq_vae checkpoint and return ``(model, hparams)``."""
    ckpt_dir = f'./{_MODEL_VERSION_SEQ}/checkpoints/'
    # os.listdir returns entries in arbitrary order; sort so that the selected
    # checkpoint is the same on every run and every file system.
    ckpt_names = sorted(os.listdir(ckpt_dir))
    if not ckpt_names:
        raise FileNotFoundError(f'no checkpoint file found in {ckpt_dir}')
    with open(f'./{_MODEL_VERSION_SEQ}/hparams.yaml') as f:
        hparams = yaml.safe_load(f)
    # NOTE(review): the model is used in whatever mode load_from_checkpoint
    # returns it; call .eval() here if the network contains dropout/batch-norm.
    model = seq_vae.load_from_checkpoint(f'{ckpt_dir}{ckpt_names[-1]}', hparams=hparams)
    return model, hparams


def _fit_nominal_scaler():
    """Fit a StandardScaler on the first three columns of the nominal run (rows 1000 onwards)."""
    nominal = pd.read_csv(f'{_DATA_DIR}/norm_long.csv').iloc[1000:, :3]
    return StandardScaler().fit(nominal.reset_index(drop=True))


def _load_scaled_segment(csv_path, scaler):
    """Read rows 1000-1999 of ``csv_path``; return ``(scaled first three columns, fourth column)``."""
    segment = pd.read_csv(csv_path).iloc[1000:2000].reset_index(drop=True)
    features = segment.iloc[:, :3]
    labels = segment.iloc[:, 3]
    scaled = pd.DataFrame(scaler.transform(features), columns=features.columns, index=features.index)
    return scaled, labels


def _fault_labels(labels):
    """Return ``(flags, start)``: a 0/1 series that stays 1 from the first 'faulty' label on, and that row index.

    ``start`` is ``None`` when no label contains 'faulty'; the flags then stay all zero
    (``idxmax`` alone would report row 0 and mark the whole segment as faulty).
    """
    flags = labels.str.contains('faulty').astype(int).reset_index(drop=True)
    if not flags.any():
        return flags, None
    start = int(flags.idxmax())
    flags.iloc[start:] = 1
    return flags, start


def _model_device(model):
    """Return the device that holds the model parameters, so inputs can be moved next to them."""
    try:
        return next(model.parameters()).device
    except (AttributeError, StopIteration):
        return torch.device('cuda' if torch.cuda.is_available() else 'cpu')


def _reconstruction_losses(model, dataset):
    """Run every window of ``dataset`` through ``model`` and collect the flattened 'recon_loss' values."""
    device = _model_device(model)
    losses = []
    # Inference only: no autograd graph is needed for the collected values.
    with torch.no_grad():
        for window in dataset:
            _, outputs = model(window.unsqueeze(0).to(device))
            losses.extend(outputs['recon_loss'].flatten().detach().cpu().numpy())
    return np.array(losses)


def _composite_fscore(fault_flags, fault_start, pred_anomalies):
    """Combine event-wise recall and point-wise precision into one F-score.

    Recall is 1 if any prediction from the fault onset onwards is flagged, else 0.
    Precision is computed over all predictions against ``fault_flags`` shifted by
    ``_WINDOW_OFFSET``. Returns 0 when the segment contains no fault event.
    """
    if fault_start is None:
        return 0
    true_anomalies = np.array(fault_flags.iloc[_WINDOW_OFFSET:]) != 0
    # Predictions start at data row _WINDOW_OFFSET, so the onset has to be
    # shifted by the same amount before slicing the prediction array.
    event_begin = max(fault_start - _WINDOW_OFFSET, 0)
    rec_e = float(pred_anomalies[event_begin:].any())
    prec_t = precision_score(true_anomalies, pred_anomalies)
    if rec_e + prec_t == 0:
        return 0
    return 2 * rec_e * prec_t / (rec_e + prec_t)


def plot_like(anomaly):
    """Plot signals, labels, model residuals and anomaly flags for one tank scenario.

    The single-parameter signature is kept because the notebook hands this
    function to ``ipywidgets.interact``.

    Args:
        anomaly: 'norm' for the nominal run (``norm_long.csv``), otherwise the
            scenario name used in ``{anomaly}_long_faulty.csv``.

    Returns:
        A plotly figure with five stacked rows sharing the x axis: row 1 the
        scaled signals, row 3 the label column, row 4 the 'recon_loss' values,
        row 5 the fault flag and the thresholded anomaly flag. Row 2 receives
        no trace.
    """
    # Values below this threshold are flagged as anomalous.
    # NOTE(review): composite_f1score defaults to +20 - confirm which sign is intended.
    threshold = -20

    model_seq, hparam_seq = _load_seq_model()
    scaler = _fit_nominal_scaler()
    if anomaly == 'norm':
        csv_path = f'{_DATA_DIR}/norm_long.csv'
    else:
        csv_path = f'{_DATA_DIR}/{anomaly}_long_faulty.csv'
    df_csv_sc, df_csv_realcat = _load_scaled_segment(csv_path, scaler)
    faulty_idx, _ = _fault_labels(df_csv_realcat)

    dataset_seq = seq_dataset(dataframe=df_csv_sc, **hparam_seq)
    all_residuals = _reconstruction_losses(model_seq, dataset_seq)
    # 0.5 instead of 1 keeps the prediction line distinguishable from the fault line.
    anom_labels = np.where(all_residuals < threshold, .5, 0)
    # One x value per residual, shifted so that it lines up with the data rows.
    residual_x = list(range(_WINDOW_OFFSET, _WINDOW_OFFSET + len(all_residuals)))

    fig = make_subplots(rows=5, cols=1, shared_xaxes=True)
    for position, column in enumerate(df_csv_sc.columns):
        fig.add_trace(
            go.Scatter(x=df_csv_sc.index, y=df_csv_sc.iloc[:, position], name=column, mode='markers'),
            row=1, col=1)
    fig.add_trace(go.Scatter(x=df_csv_realcat.index, y=df_csv_realcat.values, mode='markers'), row=3, col=1)
    fig.add_trace(go.Scatter(x=residual_x, y=all_residuals, name='residual', mode='markers'), row=4, col=1)
    fig.add_trace(go.Scatter(x=faulty_idx.index, y=faulty_idx, mode='lines', name='fault'), row=5, col=1)
    fig.add_trace(go.Scatter(x=residual_x, y=anom_labels, mode='lines', name='anomaly'), row=5, col=1)

    fig.update_layout(title_text=anomaly)
    return fig


def composite_f1score(anomaly_data, threshold=20):
    """Return the mean composite F-score over several fault scenarios.

    Args:
        anomaly_data: iterable of scenario names; features and labels of each
            scenario are read from ``{name}_long_faulty.csv``.
        threshold: 'recon_loss' values below this are predicted anomalous.
            NOTE(review): plot_like uses -20 - confirm which sign is intended.

    Returns:
        The mean of the per-scenario composite F-scores.

    Raises:
        ValueError: if ``anomaly_data`` is empty.
    """
    names = list(anomaly_data)
    if not names:
        # Fail before the model is loaded instead of after the loop.
        raise ValueError('anomaly_data must contain at least one scenario name')

    model_seq, hparam_seq = _load_seq_model()
    # The scaler only depends on the nominal run, so fit it once for all scenarios.
    scaler = _fit_nominal_scaler()

    fscore_arr = []
    for anom_name in names:
        df_csv_sc, df_csv_realcat = _load_scaled_segment(
            f'{_DATA_DIR}/{anom_name}_long_faulty.csv', scaler)
        faulty_idx, start_event = _fault_labels(df_csv_realcat)
        dataset_seq = seq_dataset(dataframe=df_csv_sc, **hparam_seq)
        all_residuals = _reconstruction_losses(model_seq, dataset_seq)
        pred_anomalies = all_residuals < threshold
        fscore_arr.append(_composite_fscore(faulty_idx, start_event, pred_anomalies))
    return np.mean(fscore_arr)


In [None]:
# Scenario names offered in the dropdown; 'norm' is the nominal run, every
# other entry is listed exactly once.
plot_scenarios = [
    'norm', 'v12_50s', 'q1not', 'v12not', 'v23not', 'v3not',
    'q1short1s', 'v12short1s', 'v23short1s', 'v3short1s',
    'q1_50s', 'v23_50s', 'v3_50s',
    'q1_100s', 'v12_100s', 'v23_100s', 'v3_100s',
    'rest_100s', 'q1v3_100s', 'v12v23_100s',
    'v12v3_100s', 'q1v23_100s',
]
interact(plot_like, anomaly=plot_scenarios)


In [None]:
# Fault scenarios that enter the mean composite F-score. Each scenario is
# listed once so that none of them is counted twice in the mean.
anomaly_data = [
    'q1short1s', 'v12short1s', 'v23short1s', 'v3short1s',
    'q1_50s', 'v12_50s', 'v23_50s', 'v3_50s',
    'q1_100s', 'v12_100s', 'v23_100s', 'v3_100s',
    'rest_100s', 'q1v3_100s', 'v12v23_100s',
    'v12v3_100s', 'q1v23_100s',
]


composite_f1score(anomaly_data)
