# Imports

In [11]:
import numpy as np
import pandas as pd
import pickle
from generate_data import DataGenerator
import os

# Test Case Summary

In [8]:
# read test cases from file using pickle
with open('simulated_data/test_cases.pkl', 'rb') as f:
    test_cases = pickle.load(f)

In [3]:
# create an overview over all test cases using a dataframe
# we need the cluster ids, the mean positions and stds of each cluster in the test
# also the number of samples in each cluster and the mean velocities and stds

# create a dataframe for the test cases
data_generator = DataGenerator()
df_test_cases = pd.DataFrame()
for i, test_case in enumerate(test_cases):
    for labels, mu_position, mu_std in zip(test_case['clusters'], test_case['mu_position'], test_case['mu_std']):
        mu_velocity = data_generator.df_infos[['U', 'V', 'W']].iloc[labels].values
        std = np.concatenate((mu_std, data_generator.df_infos[['U_std', 'V_std', 'W_std']].iloc[labels].values))
        N = max(int(data_generator.df_infos[['n_cluster']].iloc[labels].values), 200)
        df_test_cases = pd.concat([df_test_cases, pd.DataFrame({
            'test_case': i,
            'cluster': labels,
            'mu_position': [mu_position],
            'mu_std': [mu_std],
            'N': [N],
            'mu_velocity': [mu_velocity],
            'std': [std]
        })])

In [4]:
# calculate difference between the positions, velocities and stds
for i in df_test_cases['test_case'].unique():
    for col in ['mu_position', 'mu_velocity', 'std']:
        positions = df_test_cases.loc[df_test_cases['test_case'] == i, col].apply(np.array)
        diff = np.diff(positions.values, axis=0)
        df_test_cases.loc[df_test_cases['test_case'] == i, f'diff_{col}'] = np.linalg.norm(diff[0])

In [9]:
df_test_cases.head()

Unnamed: 0,test_case,cluster,mu_position,mu_std,N,mu_velocity,std,diff_mu_position,diff_mu_velocity,diff_std
0,0,0,"[-500, -500, 20]","[3, 5, 2]",305,"[-9.778308, -9.736123, -5.0372443]","[3, 5, 2, 2, 2, 2]",20.615528,2.630699,3.162278
0,0,1,"[-490, -490, 35]","[4, 8, 2]",438,"[-11.969043, -8.369388, -4.5339603]","[4, 8, 2, 2, 2, 2]",20.615528,2.630699,3.162278
0,1,0,"[20, 40, 150]","[7, 3, 4]",305,"[-9.778308, -9.736123, -5.0372443]","[7, 3, 4, 2, 2, 2]",0.0,12.713389,2.44949
0,1,2,"[20, 40, 150]","[6, 5, 5]",200,"[-20.984257, -14.273282, -8.970582]","[6, 5, 5, 2, 2, 2]",0.0,12.713389,2.44949
0,2,0,"[-20, 40, 160]","[5, 7, 3]",305,"[-9.778308, -9.736123, -5.0372443]","[5, 7, 3, 2, 2, 2]",20.0,6.661291,2.236068


# Test Case Evaluation

In [26]:
dir = '/home/nico/Desktop/test_case/test_case1/'
data = pd.read_csv(dir + 'data.csv')
nmi = pd.read_csv(dir + 'nmi.csv', index_col=0)

# read all files from the directory starting in split_ using pickle
split_info = {}
for file in os.listdir(dir):
    if file.startswith('split_'):
        key = file[11:-4]
        split_info[key] = pickle.load(open(dir + file, 'rb'))

In [102]:
test_case_overview = pd.DataFrame()
noise_split_overview = pd.DataFrame()

for test_strat in split_info:

    noise_splits = {
        'count': 0,
        'same_velocity': 0,
        'deviation_overwrite': 0,
        'remerge': 0,
    }
    for split in split_info[test_strat]:

        true_labels_new = split['labels_nc_id']
        true_labels_old = split['labels_part_of_old_cluster']

        split_type = ''
        velocity_test_output = ' '.join(split['test_output'].split(" ")[:2])
        same_velocity = False if velocity_test_output == 'different velocity' else True
        deviation_overwrite = True if split['test_output'].split(" ")[-1] == 'overwrite' else False
        if np.all(true_labels_new == -1) and np.all(true_labels_old == -1):
            continue
        elif np.all(true_labels_new == -1) or np.all(true_labels_old == -1):
            noise_splits['count'] += 1
            if same_velocity:
                noise_splits['same_velocity'] += 1
            if deviation_overwrite:
                noise_splits['deviation_overwrite'] += 1
            if split['same_velocity']:
                noise_splits['remerge'] += 1
            split_type = 'noise'
        else:
            split_type = 'cluster'

        test_case_overview = pd.concat([test_case_overview, pd.DataFrame({
            'test_strat': test_strat,
            'remerge': split['same_velocity'],
            'same_velocity': same_velocity,
            'deviation_overwrite': deviation_overwrite,
            'labels_nc_id': [split['labels_nc_id']],
            'count_nc_id': [split['count_nc_id']],
            'labels_part_of_old_cluster': [split['labels_part_of_old_cluster']],
            'count_part_of_old_cluster': [split['count_part_of_old_cluster']],
            'split_type': [split_type]
        })])

    noise_split_overview = pd.concat([noise_split_overview, pd.DataFrame({
        'test_strat': [test_strat],
        'count': noise_splits['count'],
        'same_velocity': noise_splits['same_velocity'],
        'deviation_overwrite': noise_splits['deviation_overwrite'],
        'remerge': noise_splits['remerge']
    })], ignore_index=True)

In [108]:
test_case_overview[(test_case_overview['split_type'] == 'cluster')]

Unnamed: 0,test_strat,remerge,same_velocity,deviation_overwrite,labels_nc_id,count_nc_id,labels_part_of_old_cluster,count_part_of_old_cluster,split_type
0,ttest,False,False,False,"[-1, 0, 1]","[30705, 286, 26]","[-1, 0, 1]","[8142, 19, 412]",cluster
0,xd_sample_bootstrap_range_test,True,False,True,"[-1, 0, 1]","[30705, 286, 26]","[-1, 0, 1]","[8142, 19, 412]",cluster
0,xd_mean_distance,True,False,True,"[-1, 0, 1]","[30705, 286, 26]","[-1, 0, 1]","[8142, 19, 412]",cluster
0,bootstrap_difference_test,True,True,False,"[-1, 0, 1]","[30705, 286, 26]","[-1, 0, 1]","[8142, 19, 412]",cluster
0,xd_sample_ttest,True,False,True,"[-1, 0, 1]","[30705, 286, 26]","[-1, 0, 1]","[8142, 19, 412]",cluster
0,xd_mean_distance_sample_distance,True,False,True,"[-1, 0, 1]","[30705, 286, 26]","[-1, 0, 1]","[8142, 19, 412]",cluster
0,bootstrap_range_test,False,False,False,"[-1, 0, 1]","[30705, 286, 26]","[-1, 0, 1]","[8142, 19, 412]",cluster


In [104]:
noise_split_overview

Unnamed: 0,test_strat,count,same_velocity,deviation_overwrite,remerge
0,ttest,13,0,12,12
1,xd_sample_bootstrap_range_test,13,0,13,13
2,xd_mean_distance,13,0,13,13
3,bootstrap_difference_test,13,13,12,13
4,xd_sample_ttest,13,0,13,13
5,xd_mean_distance_sample_distance,13,0,13,13
6,bootstrap_range_test,13,2,12,12


In [113]:
nmi

Unnamed: 0_level_0,nmi
strategy,Unnamed: 1_level_1
sigma,0.301562
ttest,0.321662
bootstrap_range_test,0.322979
bootstrap_difference_test,0.303381
xd_mean_distance,0.303381
xd_sample_ttest,0.303381
xd_mean_distance_sample_distance,0.303157
xd_sample_bootstrap_range_test,0.303157
