# Evaluation of CatVAE discretization performance with preprocessed BeRfiPl Dataset
Visual evaluation of how precise the discretization is and how meaningful the learned categories are compared to the original states. <br>


In [None]:
# Switch the working directory to the parent folder (presumably the project
# root - confirm): the project-local imports and the relative paths used in the
# cells below are resolved against the current working directory.
cd ..

In [None]:
import os
import pandas as pd
import torch
import plotly.graph_objects as go
import yaml
from plotly.subplots import make_subplots
import numpy as np
from ipywidgets import interact

from utils import standardize_data
from datamodule import Dataset
from catvae import CategoricalVAE
from utils import preprocess_data

# Seed the NumPy and PyTorch global random number generators with one shared
# value so repeated runs of the notebook start from the same RNG state.
_SEED = 123
np.random.seed(_SEED)
torch.manual_seed(_SEED)

In [None]:
def plot_like(anom=False, idx=0):
    """Plot inputs, predicted categories, reference categories and likelihood.

    Loads the CatVAE checkpoint found under
    ``./VAE_training_hparams/BeRfiPl/catvae/version_{idx}/checkpoints/``
    (together with the ``hparams`` entry of the sibling ``hparams.yaml``),
    runs it on the rows from position 3000 onwards of the selected data and
    returns a four-row Plotly figure with a shared x-axis:

    1. the first (at most five) columns of the model input,
    2. the category with the largest ``pzx_logits`` value per sample,
    3. the reference categories derived from the label states
       (only when ``anom`` is falsy),
    4. the likelihood, smoothed with a rolling median over 10 samples.

    When reference categories are available, the clustering purity of the
    predicted categories is shown in the figure title.

    Args:
        anom: If falsy, use the first 5000 rows returned by
            ``preprocess_data('BeRfiPl_labels')``. Otherwise read
            ``preprocessed_data/BeRfiPl_label/ds1c.csv``, for which no
            reference categories are loaded (row 3 stays empty and no purity
            is computed).
        idx: Version number of the trained model to load.

    Returns:
        The ``plotly`` figure built by ``make_subplots``.

    Raises:
        ValueError: If the number of predicted categories differs from the
            number of reference categories.
    """
    # load trained model
    model_version = f'VAE_training_hparams/BeRfiPl/catvae/version_{idx}'
    # os.listdir returns entries in arbitrary order; sorting makes the chosen
    # checkpoint deterministic when more than one file is present.
    ckpt_file_name = sorted(os.listdir(f'./{model_version}/checkpoints/'))[-1]
    ckpt_file_path = f'./{model_version}/checkpoints/{ckpt_file_name}'
    with open(f'./{model_version}/hparams.yaml') as f:
        hparam = yaml.safe_load(f)
    # NOTE(review): the model is used as returned by load_from_checkpoint and
    # requires a CUDA device; confirm whether model.eval() should be called.
    model = CategoricalVAE.load_from_checkpoint(ckpt_file_path, hparams=hparam["hparams"]).to('cuda')

    new_categories = None  # reference labels exist only for the normal data
    if not anom:
        # read normal data
        _, data, states, _, _ = preprocess_data('BeRfiPl_labels')
        # Collapse each row of `states` into one scalar code by weighting its
        # columns with descending powers of two, then relabel the distinct
        # codes as consecutive integers via the inverse index of torch.unique.
        powers_of_two = 2**torch.arange(states.size(1) - 1, -1, -1).float()
        categories = torch.matmul(states, powers_of_two)
        _, new_categories = torch.unique(categories, return_inverse=True)
        new_categories = pd.DataFrame(new_categories).iloc[3000:5000].reset_index(drop=True)
        df = pd.DataFrame(data[:5000])
    else:
        # read anomalous data
        df = pd.read_csv('preprocessed_data/BeRfiPl_label/ds1c.csv', index_col=0).reset_index(drop=True)
    df_sc = standardize_data(df, 'scaler_BeRfiPl.pkl')
    df = Dataset(dataframe=df_sc)[:][3000:]

    # compute discretized categories and likelihoods
    inputs = torch.tensor(df).to(device='cuda')
    frame = pd.DataFrame(df)
    likelihood = (
        pd.DataFrame(model.function_likelihood(inputs).cpu().detach())
        .rolling(10)
        .median()
        .bfill()  # the first 9 rolling values are NaN; fill them backwards
    )
    pzx_logits, *_ = model.get_states(inputs)
    # The arg-max over the logits is the discrete category of each sample
    # (same result as building a one-hot matrix and taking idxmax per row).
    cats = pd.DataFrame(torch.argmax(pzx_logits, dim=1).cpu().detach().numpy(), index=frame.index)

    # compute purity measure (needs reference labels, so normal data only)
    purity = None
    if new_categories is not None:
        cluster_assignments = np.asarray(cats[cats.columns[0]])
        class_assignments = np.asarray(new_categories[0])
        if len(cluster_assignments) != len(class_assignments):
            raise ValueError(
                f'got {len(cluster_assignments)} predicted categories but '
                f'{len(class_assignments)} reference categories'
            )
        # Rows are clusters, columns are classes; purity is the share of
        # samples that fall into the dominant class of their cluster.
        contingency = pd.crosstab(cluster_assignments, class_assignments)
        purity = int(contingency.max(axis=1).sum()) / len(cluster_assignments)

    fig = make_subplots(rows=4, cols=1, shared_xaxes=True)
    # plot at most the first five input columns
    for column in frame.columns[:5]:
        fig.add_trace(go.Scatter(x=frame.index, y=frame[column], mode='markers'),
                      row=1, col=1)

    fig.add_trace(go.Scatter(x=frame.index, y=cats[cats.columns[0]], name='category', mode='lines'), row=2, col=1)
    if new_categories is not None:
        fig.add_trace(go.Scatter(x=new_categories.index, y=new_categories[new_categories.columns[0]], mode='markers', name='real_cat'), row=3, col=1)
        fig.update_layout(title_text=f'purity: {purity:.3f}')
    fig.add_trace(go.Scatter(x=frame.index, y=likelihood[likelihood.columns[0]], name='likelihood', mode='markers'), row=4, col=1)
    return fig


def calc_purity():
    """Return mean and standard deviation of the clustering purity over models.

    For each model version 0..8 stored under
    ``./VAE_training_hparams/BeRfiPl/catvae/version_{idx}`` the rows
    3000..4999 of the data returned by ``preprocess_data('BeRfiPl_labels')``
    are discretized (arg-max over ``pzx_logits``) and compared with the
    reference categories derived from the label states. Purity is the share
    of samples that belong to the dominant reference class of their
    predicted category.

    Returns:
        dict: ``{'mean ': <mean purity>, 'std ': <population standard
        deviation>}``. The keys carry a trailing space.

    Raises:
        ValueError: If the number of predicted categories differs from the
            number of reference categories.
    """
    # read normal data
    # The evaluation data does not depend on the model, so it is prepared once
    # instead of once per model version (assumes preprocess_data and
    # standardize_data return the same result on every call - confirm).
    _, data, states, _, _ = preprocess_data('BeRfiPl_labels')
    # Collapse each row of `states` into one scalar code by weighting its
    # columns with descending powers of two, then relabel the distinct codes
    # as consecutive integers via the inverse index of torch.unique.
    powers_of_two = 2**torch.arange(states.size(1) - 1, -1, -1).float()
    categories = torch.matmul(states, powers_of_two)
    _, new_categories = torch.unique(categories, return_inverse=True)
    new_categories = pd.DataFrame(new_categories).iloc[3000:5000].reset_index(drop=True)
    class_assignments = np.asarray(new_categories[0])

    df = pd.DataFrame(data[3000:5000])
    df_sc = standardize_data(df, 'scaler_BeRfiPl.pkl')
    df = Dataset(dataframe=df_sc)[:][:]
    inputs = torch.tensor(df).to(device='cuda')

    total_purity = []
    # compute mean and var of trained models
    # NOTE(review): only versions 0..8 are evaluated here, while the interact
    # call in this notebook offers versions 0..9 - confirm the intended range.
    for idx in range(0, 9):
        # load trained model
        model_version = f'VAE_training_hparams/BeRfiPl/catvae/version_{idx}'
        # os.listdir returns entries in arbitrary order; sorting makes the
        # chosen checkpoint deterministic when more than one file is present.
        ckpt_file_name = sorted(os.listdir(f'./{model_version}/checkpoints/'))[-1]
        ckpt_file_path = f'./{model_version}/checkpoints/{ckpt_file_name}'
        with open(f'./{model_version}/hparams.yaml') as f:
            hparam = yaml.safe_load(f)
        model = CategoricalVAE.load_from_checkpoint(ckpt_file_path, hparams=hparam["hparams"]).to('cuda')

        # compute discretized categories: arg-max over the logits (same result
        # as building a one-hot matrix and taking idxmax per row)
        pzx_logits, *_ = model.get_states(inputs)
        cluster_assignments = torch.argmax(pzx_logits, dim=1).cpu().detach().numpy()
        if len(cluster_assignments) != len(class_assignments):
            raise ValueError(
                f'got {len(cluster_assignments)} predicted categories but '
                f'{len(class_assignments)} reference categories'
            )

        # compute purity measure: rows are clusters, columns are classes; sum
        # the size of the dominant class of every cluster
        contingency = pd.crosstab(cluster_assignments, class_assignments)
        total_intersection = int(contingency.max(axis=1).sum())
        total_purity.append(total_intersection / len(cluster_assignments))

    mean = sum(total_purity) / len(total_purity)
    # population variance (divides by N, not N - 1)
    variance = sum((x - mean) ** 2 for x in total_purity) / len(total_purity)
    std = variance ** 0.5
    return {'mean ': mean, 'std ': std}

In [None]:
# plot function to see likelihoods and discretizations
# `anom` is passed explicitly so that interact renders a checkbox for it
# (False: labelled normal data, True: anomalous data); `idx` becomes a dropdown
# over the model versions 0..9.
interact(plot_like, anom=False, idx=range(10))

In [None]:
# Aggregate the purity over the trained model versions; the resulting dict
# (mean and standard deviation) is shown as the cell output.
purity_summary = calc_purity()
purity_summary