In [1]:
import streamlit as st
import pickle as pkl
import pandas as pd
import torch
from sklearn.model_selection import train_test_split

from denoisers.ConditionalUnetDenoiser import ConditionalUnetDenoiser
from denoisers.ConditionalUnetMatrixDenoiser import ConditionalUnetMatrixDenoiser
from utils.graph_utils import get_process_model_reachability_graph_transition_matrix, get_process_model_petri_net_transition_matrix, get_process_model_reachability_graph_transition_multimatrix
from utils.pm_utils import discover_dk_process, remove_duplicates_dataset, pad_to_multiple_of_n
from utils.Config import Config
import plotly.express as px
import plotly.graph_objects as go
from dataset.dataset import SaladsDataset
from ddpm.ddpm_multinomial import Diffusion
import os
import json
from torch.utils.data import DataLoader
from tqdm.notebook import tqdm
from utils.pm_utils import conformance_measure

In [2]:
def load_experiment_config(target_dir):
    config_path = os.path.join(target_dir, "cfg.json")
    if os.path.exists(config_path):
        with open(config_path, "r") as f:
            return Config(**json.load(f))
    else:
        st.warning("Configuration file not found.")
        return None


def load_experiment_data_and_model(target_dir, cfg):
    with open(cfg.data_path, "rb") as f:
        base_dataset = pkl.load(f)
    dataset = SaladsDataset(base_dataset['target'], base_dataset['stochastic'])
    train_dataset, test_dataset = train_test_split(dataset, train_size=cfg.train_percent, shuffle=True,
                                                   random_state=cfg.seed)
    dk_process_model, dk_init_marking, dk_final_marking = discover_dk_process(train_dataset, cfg,
                                                                              preprocess=remove_duplicates_dataset)
    diffuser = Diffusion(noise_steps=cfg.num_timesteps, device=cfg.device)
    if cfg.enable_matrix:
        rg_nx, rg_transition_matrix = get_process_model_reachability_graph_transition_multimatrix(dk_process_model,
                                                                                    dk_init_marking)
        rg_transition_matrix = torch.tensor(rg_transition_matrix, device=cfg.device).float()
        rg_transition_matrix = pad_to_multiple_of_n(rg_transition_matrix)
        denoiser = ConditionalUnetMatrixDenoiser(in_ch=cfg.num_classes, out_ch=cfg.num_classes,
                                                 max_input_dim=dataset.sequence_length,
                                                 transition_dim=rg_transition_matrix.shape[-1],
                                                 matrix_out_channels=rg_transition_matrix.shape[0],
                                                 device=cfg.device).to(cfg.device).float()
    else:
        rg_transition_matrix = torch.randn((cfg.num_classes, 2, 2)).to(cfg.device)
        denoiser = ConditionalUnetDenoiser(in_ch=cfg.num_classes, out_ch=cfg.num_classes,
                                           max_input_dim=dataset.sequence_length,
                                           device=cfg.device).to(cfg.device).float()
    ckpt_path = os.path.join(target_dir, "best.ckpt")
    denoiser.load_state_dict(torch.load(ckpt_path, map_location=cfg.device)['model_state'])
    final_res_path = os.path.join(target_dir, "final_results.json")
    final_res = None
    if os.path.exists(final_res_path):
        with open(final_res_path, "r") as f:
            final_res = json.load(f)
    else:
        st.warning("Final results not found.")

    return (train_dataset, test_dataset, dk_process_model, dk_init_marking, dk_final_marking, rg_transition_matrix,
            diffuser, denoiser, final_res)


target_dir = r"D:\Projects\trace-denoise\final_runs\50_salads_aug_stoch"
cfg = load_experiment_config(target_dir)
cfg.device = "cuda:0"
train_dataset, test_dataset, dk_process_model, dk_init_marking, dk_final_marking, rg_transition_matrix, diffuser, denoiser, final_res = load_experiment_data_and_model(target_dir, cfg)

In [3]:
# from itertools import groupby
# from sktr.sktr import convert_dataset_to_df
#
#
# def dataset_to_list(dataset: SaladsDataset):
#     deterministics = torch.argmax(torch.stack([x[0] for x in dataset], axis=0).permute(0, 2, 1), dim=1)
#     stochastics = torch.stack([x[1] for x in dataset], axis=0).permute(0, 2, 1)
#     return deterministics, stochastics
#
#
# def remove_duplicates_trace(trace):
#     return torch.tensor([x.item() for x, _ in groupby(trace)])
#
#
# def remove_duplicates_dataset(dataset: SaladsDataset):
#     stochastics = [x[1] for x in dataset]
#     one_hot = torch.argmax(torch.stack([x[0] for x in dataset], axis=0).permute(0, 2, 1), dim=1)
#     deterministics = [remove_duplicates_trace(x) for x in one_hot]
#     return deterministics, stochastics
#
#
# def convert_dataset_to_train_process_df(deterministic, stochastic, cfg: Config):
#     dk_process_df, _ = convert_dataset_to_df(deterministic, stochastic, cfg.activity_names)
#     return prepare_df_cols_for_discovery(dk_process_df)
#
#
# def prepare_df_cols_for_discovery(df):
#     df_copy = df.copy()
#     df_copy.loc[:, 'order'] = df_copy.groupby('case:concept:name').cumcount()
#     df_copy.loc[:, 'time:timestamp'] = pd.to_datetime(df_copy['order'])
#     return df_copy
#
# with open(cfg.data_path, "rb") as f:
#     base_dataset = pkl.load(f)
# dataset = SaladsDataset(base_dataset['target'], base_dataset['stochastic'])
# train_dataset, test_dataset = train_test_split(dataset, train_size=cfg.train_percent, shuffle=True,
#                                                random_state=cfg.seed)
# deterministic, stochastic = remove_duplicates_dataset(dataset)
# df_train = convert_dataset_to_train_process_df(deterministic, stochastic, cfg)
# dk_process_model, dk_init_marking, dk_final_marking = discover_dk_process(train_dataset, cfg,
#                                                                            preprocess=remove_duplicates_dataset)
# rg_nx, rg_transition_matrix = get_process_model_reachability_graph_transition_multimatrix(dk_process_model,
#                                                                                           dk_init_marking)

In [3]:
import numpy as np

noise_levels = ["{:.3f}".format(x) for x in np.arange(0.5, 0.84, 0.01)]
noise_levels_det = ["{:.3f}".format(x) for x in np.arange(0.67, 0.84, 0.01)]
noisy_datasets = {}
noisy_loaders = {}
noisy_datasets_from_det = {}
noisy_loaders_from_det = {}

for noise_level in noise_levels:
    with open(fr"../data/synthetic/50_salads_synth_from_sto_{noise_level}.pkl", "rb") as f:
        noisy_data = pkl.load(f)

    noisy_datasets[noise_level] = SaladsDataset(noisy_data['target'], noisy_data['stochastic'])

for noise_level in noise_levels_det:
    with open(fr"../data/synthetic/50_salads_synth_from_det_{noise_level}.pkl", "rb") as f:
        noisy_data = pkl.load(f)

    noisy_datasets_from_det[noise_level] = SaladsDataset(noisy_data['target'], noisy_data['stochastic'])
    
for noise_level, noisy_dataset in noisy_datasets.items():
    train_noisy_dataset, test_noisy_dataset = train_test_split(noisy_dataset, train_size=cfg.train_percent, shuffle=True, random_state=cfg.seed)
    train_loader = DataLoader(train_noisy_dataset, batch_size=cfg.batch_size, shuffle=False)
    test_loader = DataLoader(test_noisy_dataset, batch_size=cfg.batch_size, shuffle=False)
    noisy_loaders[noise_level] = (train_loader, test_loader)

for noise_level, noisy_dataset in noisy_datasets_from_det.items():
    train_noisy_dataset, test_noisy_dataset = train_test_split(noisy_dataset, train_size=cfg.train_percent, shuffle=True, random_state=cfg.seed)
    train_loader = DataLoader(train_noisy_dataset, batch_size=cfg.batch_size, shuffle=False)
    test_loader = DataLoader(test_noisy_dataset, batch_size=cfg.batch_size, shuffle=False)
    noisy_loaders_from_det[noise_level] = (train_loader, test_loader)

In [4]:
import numpy as np
from scipy.stats import wasserstein_distance
from sklearn.metrics import roc_auc_score, accuracy_score, precision_score, recall_score, f1_score

def evaluate_dataset(denoiser, diffuser, cfg, loader, process_model, init_mark, final_mark):
    results_accumulator = {'x': [], 'y': [], 'x_hat': []}
    l = len(loader)
    with torch.no_grad():
        for i, (x, y) in tqdm(enumerate(loader)):
            x = x.permute(0, 2, 1).to(cfg.device).float()
            y = y.permute(0, 2, 1).to(cfg.device).float()
            x_hat, matrix_hat, loss, seq_loss, mat_loss = \
                diffuser.sample_with_matrix(denoiser, y.shape[0], cfg.num_classes, denoiser.max_input_dim,
                                            rg_transition_matrix.shape[-1], rg_transition_matrix, x, y,
                                            cfg.predict_on)
            results_accumulator['x'].append(x)
            results_accumulator['y'].append(y)
            results_accumulator['x_hat'].append(x_hat.permute(0, 2, 1))

            x_argmax = torch.argmax(torch.cat(results_accumulator['x'], dim=0), dim=1).to('cpu')
            y_cat = torch.cat(results_accumulator['y'], dim=0)
            x_hat_logit = torch.cat(results_accumulator['x_hat'], dim=0)
    
            x_argmax_flat = x_argmax.reshape(-1).to('cpu')
            x_hat_flat = x_hat_logit.reshape(-1, cfg.num_classes).to('cpu')
            x_hat_prob_flat = torch.softmax(x_hat_flat, dim=1).to('cpu')
            x_hat_argmax_flat = torch.argmax(x_hat_prob_flat, dim=1).to('cpu')
            x_hat_prob = torch.softmax(x_hat_logit, dim=2).to('cpu')
            x_hat_argmax = torch.argmax(x_hat_prob, dim=2)
    
            w2 = np.mean([wasserstein_distance(xi, xhi) for xi, xhi in zip(x_argmax, x_hat_argmax)])
            accuracy = accuracy_score(x_argmax_flat, x_hat_argmax_flat)
            precision = precision_score(x_argmax_flat, x_hat_argmax_flat, average='macro', zero_division=0)
            recall = recall_score(x_argmax_flat, x_hat_argmax_flat, average='macro', zero_division=0)
            f1 = f1_score(x_argmax_flat, x_hat_argmax_flat, average='macro', zero_division=0)
            alignments = 0
            # alignments = np.mean(
            #     conformance_measure(x_hat_argmax, process_model, init_mark, final_mark, cfg.activity_names,
            #                         limit=1000, remove_duplicates=True, approximate=False)
            # )
            
            return results_accumulator, (accuracy, precision, recall, f1, w2, alignments)

In [5]:
denoiser_no_mat = ConditionalUnetDenoiser(in_ch=cfg.num_classes, out_ch=cfg.num_classes,
                                          max_input_dim=denoiser.max_input_dim,
                                          device=cfg.device).to(cfg.device).float()
denoiser_no_mat.load_state_dict(torch.load("../final_runs/50_salads_unified_gamma_0_5_without_process/best.ckpt", map_location=cfg.device)['model_state'])

<All keys matched successfully>

In [6]:
noisy_results_from_det = {}
noisy_results_no_mat_from_det = {}
results_from_det = [noisy_results_from_det, noisy_results_no_mat_from_det]

for i, denoiser in tqdm(enumerate([denoiser, denoiser_no_mat])):
    for noise_level, (train_loader, test_loader) in tqdm(noisy_loaders_from_det.items()):
        accumulator, metrics = evaluate_dataset(denoiser, diffuser, cfg, test_loader, dk_process_model, dk_init_marking, dk_final_marking)
        results_from_det[i][noise_level] = (accumulator, metrics)

0it [00:00, ?it/s]

  0%|          | 0/17 [00:00<?, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

  0%|          | 0/17 [00:00<?, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

In [7]:
noisy_results = {}
noisy_results_no_mat = {}
results = [noisy_results, noisy_results_no_mat]

for i, denoiser in tqdm(enumerate([denoiser, denoiser_no_mat])):
    for noise_level, (train_loader, test_loader) in tqdm(noisy_loaders.items()):
        accumulator, metrics = evaluate_dataset(denoiser, diffuser, cfg, test_loader, dk_process_model, dk_init_marking, dk_final_marking)
        results[i][noise_level] = (accumulator, metrics)

0it [00:00, ?it/s]

  0%|          | 0/34 [00:00<?, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

  0%|          | 0/34 [00:00<?, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

In [8]:
results_measures = [{k: v[1] for k, v in results_dict.items()} for results_dict in results]
results_accumulators = [{k: v[0] for k, v in results_dict.items()} for results_dict in results]

results_measures

[{'0.500': (0.7269643580396922,
   0.7707339017277799,
   0.7376322582306932,
   0.7164322579553358,
   0.5606520858647226,
   0),
  '0.510': (0.7009416767922235,
   0.7486489160645717,
   0.7223776355495517,
   0.6947357738743244,
   0.5200992304576753,
   0),
  '0.520': (0.6887910085054678,
   0.6863115063584545,
   0.7081236062757954,
   0.6709421646511841,
   0.611735520453625,
   0),
  '0.530': (0.6961320372620494,
   0.6919510568516708,
   0.7092745168803085,
   0.673407589350868,
   0.5725496152288376,
   0),
  '0.540': (0.6874240583232077,
   0.6844725565565583,
   0.7047500321494125,
   0.6695374379422449,
   0.5969522073714054,
   0),
  '0.550': (0.6956763872012961,
   0.6905161190059537,
   0.6974277885042325,
   0.6671774072658666,
   0.5803462940461724,
   0),
  '0.560': (0.6795261239368166,
   0.68255529551984,
   0.6791988632799001,
   0.650831852270771,
   0.573005265289591,
   0),
  '0.570': (0.6744633454840017,
   0.6842615396054259,
   0.6840127358682174,
   0.649983

In [9]:
results_measures_from_det = [{k: v[1] for k, v in results_dict.items()} for results_dict in results_from_det]
results_accumulators_from_det = [{k: v[0] for k, v in results_dict.items()} for results_dict in results_from_det]

results_measures_from_det

[{'0.670': (0.9970635884973673,
   0.992701548833662,
   0.9946120706505017,
   0.9935850573327192,
   0.005265289590927519,
   0),
  '0.680': (0.996607938436614,
   0.9929424262783382,
   0.9945755544573085,
   0.9937250371487119,
   0.00622721749696227,
   0),
  '0.690': (0.996607938436614,
   0.9917470659052532,
   0.9938377988560028,
   0.9927224224517092,
   0.007189145402997113,
   0),
  '0.700': (0.9968104495747266,
   0.9917897556769267,
   0.9947128832000839,
   0.9931913783652583,
   0.004961522883758673,
   0),
  '0.710': (0.996607938436614,
   0.9897204090385145,
   0.9941214969098496,
   0.99181212651515,
   0.004860267314702391,
   0),
  '0.720': (0.9967091940056703,
   0.9920561692887514,
   0.993973436653091,
   0.9929512736280227,
   0.005265289590927508,
   0),
  '0.730': (0.9972154718509518,
   0.9911044486714158,
   0.9952436172128181,
   0.9930724045148546,
   0.004505872823005265,
   0),
  '0.740': (0.9964054272985015,
   0.9904302680372397,
   0.9935764961587165,

In [12]:
accuracies_from_det = [v[0] for v in results_measures_from_det[0].values()]
accuracies_not_mat_from_det = [v[0] for v in results_measures_from_det[1].values()]
noise_levels_det_bar = [float(x) for x in noise_levels_det]

fig = go.Figure()

fig.add_trace(go.Scatter(x=noise_levels_det_bar, y=accuracies_from_det, mode='lines+markers', name='With RG',
                         line=dict(color='blue', dash='solid')))
fig.add_trace(go.Scatter(x=noise_levels_det_bar, y=accuracies_not_mat_from_det, mode='lines+markers', name='Without RG',
                         line=dict(color='red', dash='dash')))

fig.update_layout(title='Synth from Det',
                  xaxis_title='Noise Level',
                  yaxis_title='Accuracy',
                  legend_title='Model Type')

fig.show()

In [9]:
fig.write_html("PM vs. no PM deterministic synthetic data.html")

In [13]:
accuracies = [v[0] for v in results_measures[0].values()]
accuracies_not_mat = [v[0] for v in results_measures[1].values()]
noise_levels_bar = [float(x) for x in noise_levels]

fig = go.Figure()

fig.add_trace(go.Scatter(x=noise_levels_bar, y=accuracies, mode='lines+markers', name='With RG',
                         line=dict(color='blue', dash='solid')))
fig.add_trace(go.Scatter(x=noise_levels_bar, y=accuracies_not_mat, mode='lines+markers', name='Without RG',
                         line=dict(color='red', dash='dash')))

fig.update_layout(title='Synth from Stoch',
                  xaxis_title='Noise Level',
                  yaxis_title='Accuracy',
                  legend_title='Model Type')

fig.show()

In [25]:
fig = go.Figure()
fig.add_trace(go.Scatter(x=list(range(len(x))), y=x, mode='lines', name='Original',
                         line=dict(color='blue', dash='solid')))
fig.add_trace(go.Scatter(x=list(range(len(y))), y=y, mode='lines', name='Argmax',
                         line=dict(color='red', dash='dash')))
fig.add_trace(go.Scatter(x=list(range(len(x_hat))), y=x_hat, mode='lines', name='Reconstructed - With PM',
                         line=dict(color='green', dash='dot')))
fig.add_trace(go.Scatter(x=list(range(len(x_hat_no_mat))), y=x_hat_no_mat, mode='lines', name='Reconstructed - No PM',
                         line=dict(color='green', dash='dot')))
fig.show()

In [27]:
# Create a histogram for conformances
fig_conformances = px.histogram(conformances, nbins=50, title='Conformances Distribution')
fig_conformances.update_layout(xaxis_title='Conformance Score', yaxis_title='Count')
fig_conformances.show()

# Create a histogram for conformances_no_mat
fig_conformances_no_mat = px.histogram(conformances_no_mat, nbins=50, title='Conformances No Mat Distribution')
fig_conformances_no_mat.update_layout(xaxis_title='Conformance Score', yaxis_title='Count')
fig_conformances_no_mat.show()