# Evaluation of CatVAE discretization performance on the preprocessed tank dataset
Visual evaluation of how precise the discretization is and whether the learned categories are meaningful. <br>


In [None]:
# Change to the parent directory: the relative paths used in the cells below
# ('./VAE_training_hparams/...', 'preprocessed_data/...') are resolved against
# the current working directory.
cd ..

In [None]:
import os
import pandas as pd
import torch
import plotly.graph_objects as go
import yaml
from plotly.subplots import make_subplots
import numpy as np
from ipywidgets import interact

from seqcat_datamodule import Dataset
from seqcat_catvae import seqcat_vae
from sklearn.preprocessing import StandardScaler
# Seed the NumPy and PyTorch random number generators with a fixed value.
np.random.seed(123)
torch.manual_seed(123)


In [None]:
# Locations and slicing constants shared by plot_like() and calc_purity().
MODEL_ROOT = 'VAE_training_hparams/tank/rnn_catvae'
DATA_FILE = 'preprocessed_data/tank_simulation/norm_long.csv'
NUM_FEATURES = 3        # the first three CSV columns are fed to the model
LABEL_COLUMN = 3        # the fourth CSV column holds the labelled category
EVAL_START_ROW = 1500   # evaluation uses the rows from this position onwards
PLOT_ROWS = 2000        # number of evaluation rows shown by plot_like()
DEVICE = 'cuda'         # device the input windows are moved to


def _load_model(idx):
    """Load the trained model of version ``idx`` and its hyperparameters.

    Args:
        idx: Version number, i.e. the ``version_{idx}`` directory below
            ``MODEL_ROOT``.

    Returns:
        Tuple ``(model, hparams)`` where ``hparams`` is the ``"hparams"``
        entry of the version's ``hparams.yaml``.

    Raises:
        FileNotFoundError: If the checkpoint directory contains no entries.
    """
    version_dir = os.path.join(MODEL_ROOT, f'version_{idx}')
    ckpt_dir = os.path.join(version_dir, 'checkpoints')
    # os.listdir() returns entries in arbitrary order, so sort them to make
    # the choice deterministic (the lexicographically last entry is used).
    ckpt_files = sorted(os.listdir(ckpt_dir))
    if not ckpt_files:
        raise FileNotFoundError(f'no checkpoint found in {ckpt_dir}')
    with open(os.path.join(version_dir, 'hparams.yaml')) as f:
        hparams = yaml.safe_load(f)["hparams"]
    model = seqcat_vae.load_from_checkpoint(
        os.path.join(ckpt_dir, ckpt_files[-1]), hparams=hparams)
    return model, hparams


def _load_eval_data(max_rows=None):
    """Read the tank CSV once and return scaled features plus labels.

    The ``StandardScaler`` is fitted on the feature columns of *all* rows;
    only the rows from ``EVAL_START_ROW`` onwards are returned.

    Args:
        max_rows: If given, keep only the first ``max_rows`` evaluation rows.

    Returns:
        Tuple ``(features, labels)``: a DataFrame with the scaled feature
        columns and a Series with the label column, both re-indexed from 0.
    """
    raw = pd.read_csv(DATA_FILE)
    raw_features = raw.iloc[:, :NUM_FEATURES]
    scaler = StandardScaler().fit(raw_features)
    features = pd.DataFrame(scaler.transform(raw_features),
                            columns=raw_features.columns,
                            index=raw_features.index)
    features = features.iloc[EVAL_START_ROW:].reset_index(drop=True)
    labels = raw.iloc[EVAL_START_ROW:, LABEL_COLUMN].reset_index(drop=True)
    if max_rows is not None:
        features = features.iloc[:max_rows]
        labels = labels.iloc[:max_rows]
    return features, labels


def _predict_categories(model, features, number_timesteps):
    """Return the category index the model assigns to every window.

    Args:
        model: Model exposing ``get_states``.
        features: Scaled feature DataFrame passed to ``Dataset``.
        number_timesteps: Window length passed to ``Dataset``.

    Returns:
        Series with one integer category index per window of the dataset.

    Raises:
        ValueError: If the dataset yields no windows.
    """
    dataset = Dataset(dataframe=features, number_timesteps=number_timesteps)
    states = []
    # Pure inference: no autograd graph is needed.
    with torch.no_grad():
        for window in dataset:
            # get_states returns several tensors; only the last one (z) is
            # needed to derive the category.
            *_, z = model.get_states(window.unsqueeze(0).to(DEVICE))
            states.append(z.detach().cpu().numpy().astype(int))
    if not states:
        raise ValueError('dataset produced no windows')
    # The category is the position of the (first) maximum of each z row.
    return pd.Series(np.vstack(states).argmax(axis=1))


def cluster_purity(cluster_assignments, class_assignments):
    """Compute the purity of a clustering with respect to reference classes.

    For every cluster the size of its most frequent class is taken; purity is
    the sum of these sizes divided by the number of samples.

    Args:
        cluster_assignments: Iterable with the cluster id of every sample.
        class_assignments: Iterable with the reference class of every sample,
            in the same order.

    Returns:
        Purity as a float in ``(0, 1]``.

    Raises:
        ValueError: If the inputs differ in length or are empty.
    """
    clusters = list(cluster_assignments)
    classes = list(class_assignments)
    if len(clusters) != len(classes):
        raise ValueError(
            f'got {len(clusters)} cluster assignments but {len(classes)} class assignments')
    if not clusters:
        raise ValueError('cannot compute purity of an empty assignment')

    # counts[cluster][class] -> number of samples with that combination
    counts = {}
    for cluster_, class_ in zip(clusters, classes):
        per_cluster = counts.setdefault(cluster_, {})
        per_cluster[class_] = per_cluster.get(class_, 0) + 1

    total_intersection = sum(max(per_cluster.values()) for per_cluster in counts.values())
    return total_intersection / len(clusters)


def plot_like(idx):
    """Plot signals, predicted categories and labelled categories for one model.

    Loads model version ``idx``, discretizes the first ``PLOT_ROWS`` evaluation
    rows, prints the resulting purity and builds a three-row figure:
    scaled input signals, categories predicted by the model, and the
    categories from the label column of the CSV.

    Args:
        idx: Version number of the trained model to load.

    Returns:
        The plotly figure created by ``make_subplots``.
    """
    model, hparams = _load_model(idx)
    features, labels = _load_eval_data(max_rows=PLOT_ROWS)
    cats = _predict_categories(model, features, hparams["NUMBER_TIMESTEPS"])

    # There are fewer windows than rows; compare each window with the label at
    # the same position.
    # NOTE(review): assumes window i starts at row i - confirm against Dataset.
    purity = cluster_purity(cats, labels.iloc[:len(cats)])
    print(purity)

    fig = make_subplots(rows=3, cols=1, shared_xaxes=True)
    for column in features.columns:
        fig.add_trace(go.Scatter(x=features.index, y=features[column], name=column, mode='markers'),
                      row=1, col=1)
    fig.add_trace(go.Scatter(x=cats.index, y=cats, name='vae cats', mode='lines'), row=2, col=1)
    fig.add_trace(go.Scatter(x=labels.index, y=labels.values, name='real cats', mode='markers'), row=3, col=1)
    return fig


# computation of total purity
def calc_purity(num_versions=10):
    """Compute the purity of several trained models and summarize it.

    Model versions ``0 .. num_versions - 1`` are evaluated on all evaluation
    rows (from ``EVAL_START_ROW`` to the end of the CSV). The purity of every
    model is printed.

    Args:
        num_versions: Number of consecutive model versions to evaluate.

    Returns:
        Dict with the keys ``'mean '`` and ``'std '`` (both with a trailing
        space, kept for backward compatibility) holding the mean and the
        population standard deviation of the per-model purities.

    Raises:
        ValueError: If ``num_versions`` is smaller than 1.
    """
    if num_versions < 1:
        raise ValueError('num_versions must be at least 1')

    # The data does not depend on the model, so load and scale it only once.
    features, labels = _load_eval_data()

    total_purity = []
    for idx in range(num_versions):
        model, hparams = _load_model(idx)
        cats = _predict_categories(model, features, hparams["NUMBER_TIMESTEPS"])
        # NOTE(review): assumes window i starts at row i - confirm against Dataset.
        purity = cluster_purity(cats, labels.iloc[:len(cats)])
        print(purity)
        total_purity.append(purity)

    mean = sum(total_purity) / len(total_purity)
    variance = sum((p - mean) ** 2 for p in total_purity) / len(total_purity)
    std = variance ** 0.5
    return {'mean ': mean, 'std ': std}



In [None]:
# Interactive view: choose a model version (idx from range(10)) and display
# the figure returned by plot_like for that version.
interact(plot_like, idx = range(10))


In [None]:
# Compute the purity of every trained model version and report the mean and
# standard deviation across versions.
calc_purity()