# Evaluation of CatVAE discretization performance with preprocessed tank Dataset
Visual evaluation of how precise the discretization is and how meaningful the learned categories are. <br>
Within the first subplot we plot a selection of measurement values from the dataset. <br>
The second subplot includes the learned discretizations from the CatVAE indicated by the index for the x-axis. <br>
The third subplot plots the log-likelihood that the CatVAE assigns to the input data. <br>

To make a statement about the change in states and likelihood, we plot one codeblock with nominal system behavior and the following with anomalous system behavior. 

In [None]:
# IPython `cd` magic: move the working directory one level up. The relative
# model/data paths and the local `datamodule` / `catvae` imports used further
# down presumably expect that parent directory -- TODO confirm.
# NOTE: re-running this cell moves up one more level each time.
cd ..

In [None]:
import os
import pandas as pd
import torch
import yaml
import numpy as np
from sklearn.metrics import precision_score
from sklearn.preprocessing import StandardScaler
import plotly.graph_objects as go
from plotly.subplots import make_subplots
from ipywidgets import interact

from datamodule import Dataset
from catvae import CategoricalVAE

# Seed the global NumPy and PyTorch random number generators with a fixed value
# so that random draws made later in the notebook are repeatable between runs.
np.random.seed(123)
torch.manual_seed(123)

In [None]:
# --- Shared configuration for the evaluation helpers -------------------------
_MODEL_VERSION = 'VAE_training_hparams/tank/catvae'
_DATA_DIR = 'preprocessed_data/tank_simulation'
_NOMINAL_CSV = 'norm_long.csv'
_FIRST_ROW, _LAST_ROW = 1000, 4000   # evaluated window: rows 1000..3999
_N_SIGNALS = 3                       # leading CSV columns fed to the model
_LABEL_COLUMN = 3                    # CSV column holding the textual label
_LIKELIHOOD_THRESHOLD = -65          # scores below this are flagged anomalous
_MEDIAN_WINDOW = 10                  # rolling-median length applied to scores


def _inference_device():
    """Return 'cuda' when a GPU is available, otherwise 'cpu'."""
    return 'cuda' if torch.cuda.is_available() else 'cpu'


def _load_model(device):
    """Load the CategoricalVAE checkpoint of `_MODEL_VERSION` onto `device`."""
    ckpt_dir = f'./{_MODEL_VERSION}/checkpoints/'
    # os.listdir returns entries in arbitrary order; sort so that the same
    # file is selected on every machine and every run.
    ckpt_files = sorted(os.listdir(ckpt_dir))
    if not ckpt_files:
        raise FileNotFoundError(f'no checkpoint file found in {ckpt_dir}')
    with open(f'./{_MODEL_VERSION}/hparams.yaml') as f:
        hparam = yaml.safe_load(f)
    model = CategoricalVAE.load_from_checkpoint(
        os.path.join(ckpt_dir, ckpt_files[-1]), hparams=hparam["hparams"])
    return model.to(device)


def _fit_nominal_scaler():
    """Fit a StandardScaler on the nominal run (row 1000 to the end)."""
    nominal = pd.read_csv(f'{_DATA_DIR}/{_NOMINAL_CSV}')
    reference = nominal.iloc[_FIRST_ROW:, :_N_SIGNALS].reset_index(drop=True)
    return StandardScaler().fit(reference)


def _load_window(csv_name):
    """Read `csv_name` once and return (signals, labels) of the evaluated window."""
    window = pd.read_csv(f'{_DATA_DIR}/{csv_name}').iloc[_FIRST_ROW:_LAST_ROW]
    signals = window.iloc[:, :_N_SIGNALS].reset_index(drop=True)
    labels = window.iloc[:, _LABEL_COLUMN].reset_index(drop=True)
    return signals, labels


def _prepare_batch(scaler, signals):
    """Scale `signals` with `scaler` and pass them through the project Dataset."""
    scaled = pd.DataFrame(scaler.transform(signals), columns=signals.columns)
    return Dataset(dataframe=scaled)[:][0:]


def _fault_mask(labels):
    """Return a 0/1 Series that is 1 from the first 'faulty' label onwards.

    When no label contains 'faulty' the mask stays all zero. (`idxmax` of an
    all-zero series is 0, so filling unconditionally would mark the whole
    window as faulty.)
    """
    mask = labels.str.contains('faulty', na=False).astype(int).reset_index(drop=True)
    if mask.any():
        mask.iloc[mask.idxmax():] = 1
    return mask


def _rolling_likelihood(model, inputs):
    """Return the model's likelihood scores, smoothed by a rolling median."""
    with torch.no_grad():
        scores = model.function_likelihood(inputs)
    frame = pd.DataFrame(scores.detach().cpu().numpy())
    # The first window-1 rows of a rolling statistic are NaN; back-fill them.
    return frame.rolling(_MEDIAN_WINDOW).median().bfill()


def _most_likely_categories(model, inputs):
    """Return, per sample, the index of the category with the largest logit."""
    with torch.no_grad():
        pzx_logits, *_ = model.get_states(inputs)
    return torch.argmax(pzx_logits, dim=1).cpu().numpy()


def _composite_fscore(true_anomalies, pred_anomalies):
    """Combine event-wise recall and point-wise precision into one F-score.

    Args:
        true_anomalies: boolean array, True from the fault onset onwards.
        pred_anomalies: boolean array of the same length, True where the
            detector raised an alarm.

    Returns:
        2 * rec_e * prec_t / (rec_e + prec_t), or 0 when both terms are 0.
    """
    has_event = bool(true_anomalies.any())
    start_event = int(np.argmax(true_anomalies))
    # Event-wise recall: the single fault event counts as detected as soon as
    # one alarm is raised at or after its onset.
    tp = int(has_event and bool(pred_anomalies[start_event:].any()))
    fn = int(has_event) - tp
    rec_e = tp / (tp + fn) if (tp + fn) > 0 else 0
    # Point-wise precision over the whole time series.
    prec_t = precision_score(true_anomalies, pred_anomalies)
    denominator = rec_e + prec_t
    return 2 * rec_e * prec_t / denominator if denominator > 0 else 0


def plot_like(anomaly):
    """Build the diagnostic figure for one scenario.

    NOTE: keep the signature to the single `anomaly` parameter. The function
    is passed to `ipywidgets.interact`, which creates one widget per parameter.

    Args:
        anomaly: scenario name. 'norm' loads `norm_long.csv`; any other value
            loads `<anomaly>_long_faulty.csv`.

    Returns:
        A plotly figure with four stacked subplots sharing the x axis:
        row 1 the scaled input signals, row 2 the smoothed likelihood,
        row 3 the thresholded anomaly flag (0 / 0.5) together with the
        ground-truth fault flag (0 / 1), row 4 the selected category index.
    """
    device = _inference_device()
    model = _load_model(device)

    csv_name = _NOMINAL_CSV if anomaly == 'norm' else f'{anomaly}_long_faulty.csv'
    signals, labels = _load_window(csv_name)
    batch = _prepare_batch(_fit_nominal_scaler(), signals)
    inputs = torch.tensor(batch).to(device=device)

    fault = _fault_mask(labels)
    likelihood = _rolling_likelihood(model, inputs)
    categories = _most_likely_categories(model, inputs)

    smoothed = likelihood.iloc[:, 0]
    flagged = np.where(smoothed.to_numpy() < _LIKELIHOOD_THRESHOLD, .5, 0)

    fig = make_subplots(rows=4, cols=1, shared_xaxes=True)
    batch_frame = pd.DataFrame(batch)
    for i in range(_N_SIGNALS):
        fig.add_trace(
            go.Scatter(x=signals.index, y=batch_frame.iloc[:, i],
                       name=signals.columns[i], mode='markers'),
            row=1, col=1)
    fig.add_trace(
        go.Scatter(x=signals.index, y=smoothed, mode='markers', name='likelihood'),
        row=2, col=1)
    fig.add_trace(
        go.Scatter(x=likelihood.index, y=flagged, mode='lines', name='anomaly'),
        row=3, col=1)
    fig.add_trace(
        go.Scatter(x=fault.index, y=fault, mode='lines', name='fault'),
        row=3, col=1)
    fig.add_trace(
        go.Scatter(x=signals.index, y=categories, mode='markers', name='category'),
        row=4, col=1)
    return fig


def composite_f1score(anomaly_data, threshold=_LIKELIHOOD_THRESHOLD):
    """Return the mean composite F-score over several faulty scenarios.

    For every scenario the smoothed likelihood is thresholded into an alarm
    signal, which is scored against the ground-truth fault flag.

    Args:
        anomaly_data: iterable of scenario names; each one is loaded from
            `<name>_long_faulty.csv`.
        threshold: likelihood value below which a sample counts as anomalous.

    Returns:
        The arithmetic mean of the per-scenario composite F-scores.

    Raises:
        ValueError: if `anomaly_data` is empty.
    """
    scenario_names = list(anomaly_data)
    if not scenario_names:
        raise ValueError('anomaly_data must contain at least one scenario name')

    device = _inference_device()
    model = _load_model(device)
    # The scaler depends only on the nominal run, so fit it once for all scenarios.
    scaler = _fit_nominal_scaler()

    fscore_arr = []
    for anom_name in scenario_names:
        signals, labels = _load_window(f'{anom_name}_long_faulty.csv')
        batch = _prepare_batch(scaler, signals)
        inputs = torch.tensor(batch).to(device=device)

        likelihood = _rolling_likelihood(model, inputs)

        true_anomalies = _fault_mask(labels).to_numpy() != 0
        pred_anomalies = likelihood.iloc[:, 0].to_numpy() < threshold
        fscore_arr.append(_composite_fscore(true_anomalies, pred_anomalies))

    return np.mean(fscore_arr)


In [None]:
# Interactive view: pick a scenario ('norm' is the nominal run, every other
# entry is a fault scenario) and the figure from plot_like is rebuilt for it.
anomalies = [
    'norm', 'v12_50s', 'q1not', 'v12not', 'v23not', 'v3not',
    'q1short1s', 'v12short1s', 'v23short1s', 'v3short1s',
    'q1_50s', 'v12_50s', 'v23_50s', 'v3_50s',
    'q1_100s', 'v12_100s', 'v23_100s', 'v3_100s',
    'rest_100s', 'q1v3_100s', 'v12v23_100s',
    'v12v3_100s', 'q1v23_100s',
]
interact(plot_like, anomaly=anomalies)

In [None]:
# Fault scenarios that are averaged into the composite F-score.
# NOTE(review): 'v12v23_100s' and 'v12_50s' are listed twice, so both count
# double in the mean -- confirm whether this weighting is intended.
anomaly_data = [
    'v12v23_100s', 'v12_50s',
    'q1short1s', 'v12short1s', 'v23short1s', 'v3short1s',
    'q1_50s', 'v12_50s', 'v23_50s', 'v3_50s',
    'q1_100s', 'v12_100s', 'v23_100s', 'v3_100s',
    'rest_100s', 'q1v3_100s', 'v12v23_100s',
    'v12v3_100s', 'q1v23_100s',
]

composite_f1score(anomaly_data)
