# Evaluation of CatVAE discretization performance with preprocessed SmA Dataset
Visual evaluation of the precision of the discretization and of how meaningful the learned categories are compared to the original states. <br>
Within the first subplot we plot a selection of measurement values from the dataset. <br>
The second subplot includes the learned discretizations from the CatVAE indicated by the index for the x-axis. <br>
The third subplot plots the log-likelihood of the CatVAE for the input data. <br>

To make a statement about the change in states and likelihood, we plot one codeblock with nominal system behavior and the following with anomalous system (Deviation ID2) behavior. 

In [None]:
# IPython automagic: switch the working directory to the parent folder.
# Presumably the relative paths used below ('VAE_training_hparams/...',
# 'preprocessed_data/...') are resolved from there - TODO confirm.
cd ..

In [None]:
import os
import pandas as pd
import torch
import yaml
import plotly.graph_objects as go
from plotly.subplots import make_subplots
import numpy as np
from ipywidgets import interact

from utils import standardize_data
from datamodule import Dataset
from catvae import CategoricalVAE

# Seed the NumPy and PyTorch random number generators with a fixed value
# so that repeated runs of the notebook draw the same random numbers.
np.random.seed(123)
torch.manual_seed(123)


In [None]:
def plot_like(idx):
    """Plot signals, original labels and predicted categories for one model version.

    Loads the CategoricalVAE checkpoint stored under
    ``VAE_training_hparams/SmA/catvae/version_{idx}``, evaluates it on rows
    10000-29999 of ``preprocessed_data/SmA/id1_norm.csv`` and prints the
    cluster purity of the predicted categories with respect to the
    ``CuStepNo ValueY`` column.

    The signature intentionally has a single parameter: the function is
    passed to ``ipywidgets.interact``, which builds one widget per parameter.

    Args:
        idx: Version number of the trained model to load.

    Returns:
        A plotly figure with three rows sharing the x-axis: (1) up to the
        first three standardized signal columns, (2) the original
        ``CuStepNo ValueY`` labels, (3) the category index predicted by the
        model.
    """
    model_version = f'VAE_training_hparams/SmA/catvae/version_{idx}'
    # os.listdir returns entries in arbitrary order; sort so the selected
    # checkpoint is deterministic when the folder holds several files.
    ckpt_file_name = sorted(os.listdir(f'./{model_version}/checkpoints/'))[-1]
    ckpt_file_path = f'./{model_version}/checkpoints/{ckpt_file_name}'
    with open(f'./{model_version}/hparams.yaml') as f:
        hparam = yaml.safe_load(f)
    # Use the GPU when one is present, otherwise fall back to the CPU.
    device = 'cuda' if torch.cuda.is_available() else 'cpu'
    model = CategoricalVAE.load_from_checkpoint(ckpt_file_path, hparams=hparam["hparams"]).to(device)

    # Read the CSV once and split it into the label column and the features.
    raw = pd.read_csv('preprocessed_data/SmA/id1_norm.csv')
    window = raw.iloc[10000:30000].reset_index(drop=True)
    labels_orig = window['CuStepNo ValueY']
    df_csv = window.drop(columns=['CuStepNo ValueY'])
    df_sc = standardize_data(df_csv, 'scaler_SmA.pkl')
    inputs = torch.tensor(Dataset(dataframe=df_sc)[:][0:]).to(device=device)

    # Pure inference: no autograd graph is needed.
    with torch.no_grad():
        # NOTE(review): the smoothed likelihood is computed but never added
        # to the figure. The call is kept so the sequence of model
        # invocations stays as before; plot it or drop it once confirmed.
        likelihood = pd.DataFrame(model.function_likelihood(inputs).cpu().detach()).rolling(10).median().bfill()
        pzx_logits, _pzx, _mu, _sigma, _pxz, _z = model.get_states(inputs)

    # The predicted category of a sample is the index of its largest logit.
    cluster_assignments = torch.argmax(pzx_logits, dim=1).cpu().numpy()
    class_assignments = labels_orig[:len(cluster_assignments)]
    assert len(cluster_assignments) == len(class_assignments)

    num_samples = len(cluster_assignments)

    # Count, per predicted cluster, how often each original class occurs.
    cluster_class_counts = {cluster_: {class_: 0 for class_ in np.unique(class_assignments)}
                            for cluster_ in np.unique(cluster_assignments)}
    for cluster_, class_ in zip(cluster_assignments, class_assignments):
        cluster_class_counts[cluster_][class_] += 1

    # Purity: share of samples that belong to the majority class of their cluster.
    total_intersection = sum(max(class_dict.values()) for class_dict in cluster_class_counts.values())
    purity = total_intersection/num_samples
    print(purity)

    signals = pd.DataFrame(df_sc)
    fig = make_subplots(rows=3, cols=1, shared_xaxes=True)
    # Slicing the column index avoids an IndexError for fewer than three columns.
    for column in signals.columns[:3]:
        fig.add_trace(go.Scatter(x=signals.index, y=signals[column], name=column, mode='markers'),
                      row=1, col=1)
    fig.add_trace(go.Scatter(x=signals.index, y=labels_orig, mode='markers', name='original categories'), row=2, col=1)
    fig.add_trace(go.Scatter(x=signals.index, y=cluster_assignments, name='category', mode='lines'), row=3, col=1)
    return fig

# computation of total purity
def calc_purity(versions=range(10)):
    """Compute mean and standard deviation of the cluster purity over model versions.

    For every version the CategoricalVAE checkpoint stored under
    ``VAE_training_hparams/SmA/catvae/version_{idx}`` is loaded and evaluated
    on rows 10000-29999 of ``preprocessed_data/SmA/id1_norm.csv``. The purity
    of the predicted categories is measured against the ``CuStepNo ValueY``
    column.

    Args:
        versions: Iterable of model version numbers to evaluate. Defaults to
            versions 0-9.

    Returns:
        A dict with the keys ``'mean '`` and ``'std '`` (both with a trailing
        space) holding the mean and the population standard deviation of the
        per-version purity values.

    Raises:
        ValueError: If ``versions`` is empty.
    """
    # Use the GPU when one is present, otherwise fall back to the CPU.
    device = 'cuda' if torch.cuda.is_available() else 'cpu'

    # The evaluation data does not depend on the model version, so it is
    # loaded and scaled once instead of once per loop iteration.
    raw = pd.read_csv('preprocessed_data/SmA/id1_norm.csv')
    df_label = raw['CuStepNo ValueY'].iloc[10000:30000].reset_index(drop=True)
    features = raw.drop(columns=['CuStepNo ValueY'])
    # NOTE(review): plot_like standardizes the same CSV with 'scaler_SmA.pkl';
    # confirm that 'scaler_siemens_train.pkl' is the intended scaler here.
    df_scaled = standardize_data(features, 'scaler_siemens_train.pkl')
    inputs = torch.tensor(Dataset(dataframe=df_scaled)[:][10000:30000]).to(device=device)

    total_purity = []
    for idx in versions:
        model_version = f'VAE_training_hparams/SmA/catvae/version_{idx}'
        # os.listdir returns entries in arbitrary order; sort so the selected
        # checkpoint is deterministic when the folder holds several files.
        ckpt_file_name = sorted(os.listdir(f'./{model_version}/checkpoints/'))[-1]
        ckpt_file_path = f'./{model_version}/checkpoints/{ckpt_file_name}'
        with open(f'./{model_version}/hparams.yaml') as f:
            hparam = yaml.safe_load(f)
        model = CategoricalVAE.load_from_checkpoint(ckpt_file_path, hparams=hparam["hparams"]).to(device)

        # Pure inference: no autograd graph is needed.
        with torch.no_grad():
            # NOTE(review): the likelihood is not used for the purity. The
            # call is kept so the sequence of model invocations stays as
            # before; drop it once confirmed that it has no side effects.
            likelihood = pd.DataFrame(model.function_likelihood(inputs).cpu().detach()).rolling(10).median().bfill()
            pzx_logits, _pzx, _mu, _sigma, _pxz, _z = model.get_states(inputs)

        # The predicted category of a sample is the index of its largest logit.
        cluster_assignments = torch.argmax(pzx_logits, dim=1).cpu().numpy()
        class_assignments = df_label[:len(cluster_assignments)]
        assert len(cluster_assignments) == len(class_assignments)

        num_samples = len(cluster_assignments)

        # Count, per predicted cluster, how often each original class occurs.
        cluster_class_counts = {cluster_: {class_: 0 for class_ in np.unique(class_assignments)}
                                for cluster_ in np.unique(cluster_assignments)}
        for cluster_, class_ in zip(cluster_assignments, class_assignments):
            cluster_class_counts[cluster_][class_] += 1

        # Purity: share of samples that belong to the majority class of their cluster.
        total_intersection = sum(max(class_dict.values()) for class_dict in cluster_class_counts.values())
        total_purity.append(total_intersection/num_samples)

    if not total_purity:
        raise ValueError('versions must contain at least one model version')

    mean = sum(total_purity)/len(total_purity)
    # Population variance (divides by N, not N - 1).
    variance = sum((x - mean) ** 2 for x in total_purity) / len(total_purity)
    std = variance ** 0.5
    return {'mean ':mean, 'std ':std}


In [None]:
# plot of nominal data
# interact builds a selection widget for idx (values 0-9) and calls
# plot_like with the chosen model version.
interact(plot_like, idx=range(10))

In [None]:
# Mean and standard deviation of the purity over model versions 0-9.
calc_purity()