In [1]:
import os
import matplotlib.pyplot as plt
import datetime
import json
import matplotlib.gridspec as gridspec

import numpy as np
import pandas as pd
import seaborn as sns

from itertools import compress

from helpers.expr_data import ExprData
from helpers.scale_data import ScaleData
from helpers.similarity import Similarity
from helpers.feature_selection import FeatureSelection

from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score, classification_report, confusion_matrix

import warnings
warnings.filterwarnings("ignore")

In [2]:
k=1 # k for kNN: passed as n to pred_stats (and on to simi_pred) by the evaluation cells below

In [3]:
import random
# Seed NumPy's global RNG and the stdlib RNG (with different seeds) before any
# computation below, so reruns from a fresh kernel start from the same RNG state.
np.random.seed(47907)
random.seed(15213)

In [4]:
def pred_stats(simi_calc, n=k, dependent=True, print_label=False):
    """Score the labels returned by ``simi_calc.simi_pred`` against the true workload names.

    Args:
        simi_calc: object exposing ``data.wl_names`` (used as the true labels)
            and ``simi_pred(n, dependent)`` (used as the predicted labels).
        n: forwarded as the first argument of ``simi_pred``. The default is the
            value the notebook-level ``k`` had when this function was defined.
        dependent: forwarded as the second argument of ``simi_pred``.
        print_label: when True, print the true labels and then the predicted
            labels before scoring.

    Returns:
        A 4-tuple ``(accuracy, precision, f1, recall)``; precision, F1 and
        recall are macro-averaged.
    """
    expected = simi_calc.data.wl_names
    predicted = simi_calc.simi_pred(n, dependent)

    if print_label:
        print(expected)
        print(predicted)

    macro = {'average': 'macro'}
    return (
        accuracy_score(expected, predicted),
        precision_score(expected, predicted, **macro),
        f1_score(expected, predicted, **macro),
        recall_score(expected, predicted, **macro),
    )

### Comparing Feature Sets

In [5]:
# Load the pickled experiment data, hand the 'ycsb' and 'xml' workload names to
# remove_by_wlname, run fix_tpch on the result, then split what is left by SKU.
data = ExprData()
data.load_pickle()
data = data.remove_by_wlname(['ycsb', 'xml']).fix_tpch()
data_by_sku = data.split_by_sku()

# Column-name lists, kept so later cells can tell plan metrics from system (perf) metrics.
all_features = data.feature_cols
plan_features = data.plan_feature_cols
perf_features = data.perf_feature_cols

# Display names for the feature sets; their order must match the features_* lists
# built after feature selection, because the two are zipped together.
feature_name_plan = ['plan_3', 'plan_7', 'plan']
feature_name_perf = ['resource_3', 'resource_5', 'resource']
feature_name_all = feature_name_plan + feature_name_perf + ['all_3', 'all_7', 'all']

In [6]:
# simi_dict maps each retained SKU name to the Similarity object built from
# that SKU's scaled plan and perf matrices.
simi_dict = {}

for sku in data_by_sku.keys():
    # SKUs whose name contains 'ter' or 'xml' are left out.
    if any(marker in sku for marker in ('ter', 'xml')):
        continue

    curr_data = data_by_sku[sku]
    scaler = ScaleData()
    # Plan matrices are scaled before perf matrices, using the same scaler object.
    plan_mtxs, plan_col_ranges = scaler.scale(curr_data.plan_mtxs)
    perf_mtxs, perf_col_ranges = scaler.scale(curr_data.perf_mtxs)

    simi_calc = Similarity(
        curr_data,
        plan_mtxs,
        plan_col_ranges,
        perf_mtxs,
        perf_col_ranges,
    )
    simi_calc.calc_bined_mtx() # all features
    simi_dict[sku] = simi_calc

The history saving thread hit an unexpected error (DatabaseError('database disk image is malformed')).History will not be written to the database.


In [7]:
# Pick one cpu setting (cpu=16 for example)
simi_calc = simi_dict['cpu16']

In [None]:
# For every workload run, report how many entries of its binned perf matrix are
# positive and the mean of those positive entries.
print('for all 7 resource features')
for c_list, wl_name in zip(simi_calc.perf_bined, simi_calc.data.wl_names):
    bins = np.asarray(c_list)
    non_empty = bins > 0
    count = int(non_empty.sum())
    value = float(bins[non_empty].sum())
    # A run with no positive bin would otherwise raise ZeroDivisionError.
    average = value / count if count else float('nan')
    print(f'{wl_name}: # non empty bins = {count}, average perc = {average}')
    print('---')

for all 7 resource features
tpcc: # non empty bins = 17, average perc = 4.11764705882353
---
tpcc: # non empty bins = 16, average perc = 4.375
---
tpcc: # non empty bins = 17, average perc = 4.117647058823529
---
tpcc: # non empty bins = 12, average perc = 5.833333333333333
---
tpcc: # non empty bins = 12, average perc = 5.833333333333333
---
tpcc: # non empty bins = 13, average perc = 5.384615384615385
---
tpcc: # non empty bins = 12, average perc = 5.833333333333333
---
tpcc: # non empty bins = 13, average perc = 5.384615384615385
---
tpcc: # non empty bins = 11, average perc = 6.363636363636363
---
tpch: # non empty bins = 24, average perc = 2.9166666666666665
---
tpch: # non empty bins = 25, average perc = 2.8
---
tpch: # non empty bins = 23, average perc = 3.0434782608695654
---
tpch: # non empty bins = 24, average perc = 2.9166666666666665
---
tpch: # non empty bins = 25, average perc = 2.8
---
tpch: # non empty bins = 25, average perc = 2.8
---
tpch: # non empty bins = 26, avera

In [9]:
# Rank features with fANOVA and keep the top-k, first over all features and
# then within the plan-only and perf-only subsets.
fs = FeatureSelection(simi_calc)

# The calls run in the same order as before: all (3, 7), then plan/perf top 3,
# then plan top 7 and perf top 5.
top_3, top_7 = (
    fs.select_features(n_top, 'fANOVA', est_name=None, direction=None, feature_type=None)
    for n_top in (3, 7)
)
plan_top_3, perf_top_3 = (
    fs.select_features(3, 'fANOVA', est_name=None, direction=None, feature_type=kind)
    for kind in ('plan', 'perf')
)
plan_top_7, perf_top_5 = (
    fs.select_features(n_top, 'fANOVA', est_name=None, direction=None, feature_type=kind)
    for n_top, kind in ((7, 'plan'), (5, 'perf'))
)

# Feature sets in the same order as feature_name_plan / feature_name_perf / feature_name_all.
features_plan = [plan_top_3, plan_top_7, plan_features]
features_perf = [perf_top_3, perf_top_5, perf_features]
features_all = features_plan + features_perf + [top_3, top_7, all_features]

(27, 27, 29)
(27, 27, 29)
(27, 27, 22)
(27, 27, 7)
(27, 27, 22)
(27, 27, 7)


In [10]:
perf_features

['CPU_UTILIZATION',
 'CPU_EFFECTIVE',
 'MEM_UTILIZATION',
 'IOPS_TOTAL',
 'READ_WRITE_RATIO',
 'LOCK_REQ_ABS',
 'LOCK_WAIT_ABS']

In [11]:
top_3

['AvgRowSize', 'StatementSubTreeCost', 'LOCK_WAIT_ABS']

In [12]:
top_7

['AvgRowSize',
 'StatementSubTreeCost',
 'LOCK_WAIT_ABS',
 'CachedPlanSize',
 'MEM_UTILIZATION',
 'MaxCompileMemory',
 'TableCardinality']

In [13]:
plan_top_3

['StatementSubTreeCost', 'AvgRowSize', 'CachedPlanSize']

In [14]:
plan_top_7

['StatementSubTreeCost',
 'AvgRowSize',
 'CachedPlanSize',
 'MaxCompileMemory',
 'CompileMemory',
 'TableCardinality',
 'EstimateIO']

In [15]:
perf_top_3

['LOCK_WAIT_ABS', 'MEM_UTILIZATION', 'LOCK_REQ_ABS']

In [16]:
perf_top_5

['LOCK_WAIT_ABS',
 'MEM_UTILIZATION',
 'LOCK_REQ_ABS',
 'CPU_UTILIZATION',
 'CPU_EFFECTIVE']

In [17]:
# Same bin statistics as for the perf features, restricted to the binned plan
# columns that filter_by_features reports for the top-7 plan features.
keep_cols = simi_calc.filter_by_features(plan_top_7)
print('for top 7 plan features')

for c_list, wl_name in zip(simi_calc.plan_bined, simi_calc.data.wl_names):
    bins = np.asarray(c_list)
    # Keep only the column positions that appear in keep_cols.
    kept = [i for i in range(bins.shape[1]) if i in keep_cols]
    bins = bins[:, kept]
    non_empty = bins > 0
    count = int(non_empty.sum())
    value = float(bins[non_empty].sum())
    # A run with no positive bin would otherwise raise ZeroDivisionError.
    average = value / count if count else float('nan')
    print(f'{wl_name}: # non empty bins = {count}, average perc = {average}')
    print('---')

for top 7 plan features
tpcc: # non empty bins = 14, average perc = 5.000000000000001
---
tpcc: # non empty bins = 14, average perc = 4.999999999999999
---
tpcc: # non empty bins = 14, average perc = 5.0
---
tpcc: # non empty bins = 14, average perc = 5.000000000000001
---
tpcc: # non empty bins = 14, average perc = 5.0
---
tpcc: # non empty bins = 14, average perc = 5.0
---
tpcc: # non empty bins = 14, average perc = 5.0
---
tpcc: # non empty bins = 14, average perc = 5.0
---
tpcc: # non empty bins = 14, average perc = 5.000000000000001
---
tpch: # non empty bins = 32, average perc = 2.1875
---
tpch: # non empty bins = 32, average perc = 2.1875000000000004
---
tpch: # non empty bins = 32, average perc = 2.1875000000000004
---
tpch: # non empty bins = 32, average perc = 2.1875
---
tpch: # non empty bins = 32, average perc = 2.1875
---
tpch: # non empty bins = 32, average perc = 2.1875
---
tpch: # non empty bins = 32, average perc = 2.1875
---
tpch: # non empty bins = 32, average perc =

In [18]:
# For every workload run, report how many entries of its binned plan matrix are
# positive and the mean of those positive entries (all plan columns).
print('for all plan features')

for c_list, wl_name in zip(simi_calc.plan_bined, simi_calc.data.wl_names):
    bins = np.asarray(c_list)
    non_empty = bins > 0
    count = int(non_empty.sum())
    value = float(bins[non_empty].sum())
    # A run with no positive bin would otherwise raise ZeroDivisionError.
    average = value / count if count else float('nan')
    print(f'{wl_name}: # non empty bins = {count}, average perc = {average}')
    print('---')

for all plan features
tpcc: # non empty bins = 29, average perc = 7.5862068965517215
---
tpcc: # non empty bins = 29, average perc = 7.5862068965517215
---
tpcc: # non empty bins = 29, average perc = 7.586206896551722
---
tpcc: # non empty bins = 29, average perc = 7.586206896551723
---
tpcc: # non empty bins = 29, average perc = 7.586206896551723
---
tpcc: # non empty bins = 29, average perc = 7.586206896551722
---
tpcc: # non empty bins = 29, average perc = 7.5862068965517215
---
tpcc: # non empty bins = 29, average perc = 7.586206896551722
---
tpcc: # non empty bins = 29, average perc = 7.586206896551721
---
tpch: # non empty bins = 73, average perc = 3.0136986301369846
---
tpch: # non empty bins = 73, average perc = 3.013698630136985
---
tpch: # non empty bins = 75, average perc = 2.9333333333333322
---
tpch: # non empty bins = 72, average perc = 3.0555555555555545
---
tpch: # non empty bins = 73, average perc = 3.0136986301369846
---
tpch: # non empty bins = 73, average perc = 3.0

In [19]:
# Results table: the evaluation cells below append one row per (method, feature_set)
# holding the four scores returned by pred_stats.
df = pd.DataFrame(columns=['method','feature_set','accuracy','precision','f1','recall'])

### TS Norm

In [20]:
# Score calc_simi_matrix with norm_type='l21' on each resource feature set,
# printing true/predicted labels, and log the scores under 'L21-Norm'.
for feature_group, feature_group_name in zip(features_perf, feature_name_perf):
    simi_calc.calc_simi_matrix(norm_type='l21', perf_feature_names=feature_group)
    result_stats = pred_stats(simi_calc, n=k, dependent=True, print_label=True)
    # Append as a new row labelled with the current row count.
    df.loc[len(df)] = ['L21-Norm', feature_group_name] + list(result_stats)

['tpcc', 'tpcc', 'tpcc', 'tpcc', 'tpcc', 'tpcc', 'tpcc', 'tpcc', 'tpcc', 'tpch', 'tpch', 'tpch', 'tpch', 'tpch', 'tpch', 'tpch', 'tpch', 'tpch', 'twitter', 'twitter', 'twitter', 'twitter', 'twitter', 'twitter', 'twitter', 'twitter', 'twitter']
['tpcc', 'tpcc', 'tpcc', 'tpcc', 'tpcc', 'tpcc', 'tpcc', 'tpcc', 'tpcc', 'tpch', 'tpch', 'tpch', 'tpch', 'tpch', 'tpch', 'tpch', 'tpch', 'tpch', 'twitter', 'twitter', 'twitter', 'twitter', 'twitter', 'twitter', 'twitter', 'twitter', 'twitter']
['tpcc', 'tpcc', 'tpcc', 'tpcc', 'tpcc', 'tpcc', 'tpcc', 'tpcc', 'tpcc', 'tpch', 'tpch', 'tpch', 'tpch', 'tpch', 'tpch', 'tpch', 'tpch', 'tpch', 'twitter', 'twitter', 'twitter', 'twitter', 'twitter', 'twitter', 'twitter', 'twitter', 'twitter']
['tpcc', 'tpcc', 'tpcc', 'tpcc', 'tpcc', 'tpcc', 'tpcc', 'tpcc', 'tpcc', 'tpch', 'tpch', 'tpch', 'tpch', 'tpch', 'tpch', 'tpch', 'tpch', 'tpch', 'twitter', 'twitter', 'twitter', 'twitter', 'twitter', 'twitter', 'twitter', 'twitter', 'twitter']
['tpcc', 'tpcc', 'tpcc',

In [21]:
df

Unnamed: 0,method,feature_set,accuracy,precision,f1,recall
0,L21-Norm,resource_3,1.0,1.0,1.0,1.0
1,L21-Norm,resource_5,1.0,1.0,1.0,1.0
2,L21-Norm,resource,1.0,1.0,1.0,1.0


In [22]:
# Score calc_simi_matrix with norm_type='l11' on each resource feature set
# and log the scores under 'L11-Norm'.
for feature_group, feature_group_name in zip(features_perf, feature_name_perf):
    simi_calc.calc_simi_matrix(norm_type='l11', perf_feature_names=feature_group)
    result_stats = pred_stats(simi_calc, n=k, dependent=True)
    # Append as a new row labelled with the current row count.
    df.loc[len(df)] = ['L11-Norm', feature_group_name] + list(result_stats)

In [23]:
df[df['method'] == 'L11-Norm']

Unnamed: 0,method,feature_set,accuracy,precision,f1,recall
3,L11-Norm,resource_3,1.0,1.0,1.0,1.0
4,L11-Norm,resource_5,1.0,1.0,1.0,1.0
5,L11-Norm,resource,1.0,1.0,1.0,1.0


In [24]:
# Score calc_simi_matrix with norm_type='fro' on each resource feature set
# and log the scores under 'Fro-Norm'.
for feature_group, feature_group_name in zip(features_perf, feature_name_perf):
    simi_calc.calc_simi_matrix(norm_type='fro', perf_feature_names=feature_group)
    result_stats = pred_stats(simi_calc, n=k, dependent=True)
    # Append as a new row labelled with the current row count.
    df.loc[len(df)] = ['Fro-Norm', feature_group_name] + list(result_stats)

In [25]:
df[df['method'] == 'Fro-Norm']

Unnamed: 0,method,feature_set,accuracy,precision,f1,recall
6,Fro-Norm,resource_3,1.0,1.0,1.0,1.0
7,Fro-Norm,resource_5,1.0,1.0,1.0,1.0
8,Fro-Norm,resource,1.0,1.0,1.0,1.0


In [26]:
# Score calc_simi_matrix with norm_type='canb' on each resource feature set
# and log the scores under 'Canb-Norm'.
for feature_group, feature_group_name in zip(features_perf, feature_name_perf):
    simi_calc.calc_simi_matrix(norm_type='canb', perf_feature_names=feature_group)
    result_stats = pred_stats(simi_calc, n=k, dependent=True)
    # Append as a new row labelled with the current row count.
    df.loc[len(df)] = ['Canb-Norm', feature_group_name] + list(result_stats)

In [27]:
df[df['method'] == 'Canb-Norm']

Unnamed: 0,method,feature_set,accuracy,precision,f1,recall
9,Canb-Norm,resource_3,1.0,1.0,1.0,1.0
10,Canb-Norm,resource_5,1.0,1.0,1.0,1.0
11,Canb-Norm,resource,1.0,1.0,1.0,1.0


In [28]:
# Score calc_simi_matrix with norm_type='corr' on each resource feature set
# and log the scores under 'Corr-Norm'.
for feature_group, feature_group_name in zip(features_perf, feature_name_perf):
    simi_calc.calc_simi_matrix(norm_type='corr', perf_feature_names=feature_group)
    result_stats = pred_stats(simi_calc, n=k, dependent=True)
    # Append as a new row labelled with the current row count.
    df.loc[len(df)] = ['Corr-Norm', feature_group_name] + list(result_stats)

In [29]:
df[df['method'] == 'Corr-Norm']

Unnamed: 0,method,feature_set,accuracy,precision,f1,recall
12,Corr-Norm,resource_3,0.62963,0.462963,0.518519,0.62963
13,Corr-Norm,resource_5,0.62963,0.462963,0.518519,0.62963
14,Corr-Norm,resource,0.333333,0.222222,0.217172,0.333333


In [30]:
# Score calc_simi_matrix with norm_type='chi2' on each resource feature set
# and log the scores under 'Chi2-Norm'.
for feature_group, feature_group_name in zip(features_perf, feature_name_perf):
    simi_calc.calc_simi_matrix(norm_type='chi2', perf_feature_names=feature_group)
    result_stats = pred_stats(simi_calc, n=k, dependent=True)
    # Append as a new row labelled with the current row count.
    df.loc[len(df)] = ['Chi2-Norm', feature_group_name] + list(result_stats)

In [31]:
df[df['method'] == 'Chi2-Norm']

Unnamed: 0,method,feature_set,accuracy,precision,f1,recall
15,Chi2-Norm,resource_3,0.333333,0.222222,0.217172,0.333333
16,Chi2-Norm,resource_5,0.333333,0.222222,0.217172,0.333333
17,Chi2-Norm,resource,0.333333,0.222222,0.217172,0.333333


### Hist-FP

In [32]:
# Score calc_dist_simi_matrix (cumulative, norm_type='l21') on every feature set
# and log the scores under 'Hist-FP-L21'. Labels are printed only for the
# feature sets whose name contains 'resource'.
for feature_group, feature_group_name in zip(features_all, feature_name_all):
    simi_calc.calc_dist_simi_matrix(norm_type='l21', feature_names=feature_group, cumulative=True)
    tag = 'resource' in feature_group_name
    if tag:
        print(feature_group_name)
    result_stats = pred_stats(simi_calc, print_label=tag, n=k, dependent=True)
    # Append as a new row labelled with the current row count.
    df.loc[len(df)] = ['Hist-FP-L21', feature_group_name] + list(result_stats)

resource_3
['tpcc', 'tpcc', 'tpcc', 'tpcc', 'tpcc', 'tpcc', 'tpcc', 'tpcc', 'tpcc', 'tpch', 'tpch', 'tpch', 'tpch', 'tpch', 'tpch', 'tpch', 'tpch', 'tpch', 'twitter', 'twitter', 'twitter', 'twitter', 'twitter', 'twitter', 'twitter', 'twitter', 'twitter']
['tpcc', 'tpcc', 'tpcc', 'tpcc', 'tpcc', 'tpcc', 'tpcc', 'tpcc', 'tpcc', 'tpch', 'tpch', 'tpch', 'tpch', 'tpch', 'tpch', 'tpch', 'tpch', 'tpch', 'twitter', 'twitter', 'twitter', 'twitter', 'twitter', 'twitter', 'twitter', 'twitter', 'twitter']
resource_5
['tpcc', 'tpcc', 'tpcc', 'tpcc', 'tpcc', 'tpcc', 'tpcc', 'tpcc', 'tpcc', 'tpch', 'tpch', 'tpch', 'tpch', 'tpch', 'tpch', 'tpch', 'tpch', 'tpch', 'twitter', 'twitter', 'twitter', 'twitter', 'twitter', 'twitter', 'twitter', 'twitter', 'twitter']
['tpcc', 'tpcc', 'tpcc', 'tpcc', 'tpcc', 'tpcc', 'tpcc', 'tpcc', 'tpcc', 'tpch', 'tpch', 'tpch', 'tpch', 'tpch', 'tpch', 'tpch', 'tpch', 'tpch', 'twitter', 'twitter', 'twitter', 'twitter', 'twitter', 'twitter', 'twitter', 'twitter', 'twitter']
re

In [33]:
# Score calc_dist_simi_matrix (cumulative, norm_type='l11') on every feature set
# and log the scores under 'Hist-FP-L11'.
for feature_group, feature_group_name in zip(features_all, feature_name_all):
    simi_calc.calc_dist_simi_matrix(norm_type='l11', feature_names=feature_group, cumulative=True)
    result_stats = pred_stats(simi_calc, n=k, dependent=True)
    # Append as a new row labelled with the current row count.
    df.loc[len(df)] = ['Hist-FP-L11', feature_group_name] + list(result_stats)

In [34]:
# Score calc_dist_simi_matrix (cumulative, norm_type='fro') on every feature set
# and log the scores under 'Hist-FP-Fro'.
for feature_group, feature_group_name in zip(features_all, feature_name_all):
    simi_calc.calc_dist_simi_matrix(norm_type='fro', feature_names=feature_group, cumulative=True)
    result_stats = pred_stats(simi_calc, n=k, dependent=True)
    # Append as a new row labelled with the current row count.
    df.loc[len(df)] = ['Hist-FP-Fro', feature_group_name] + list(result_stats)

In [35]:
# Score calc_dist_simi_matrix (cumulative, norm_type='canb') on every feature set
# and log the scores under 'Hist-FP-Canb'.
for feature_group, feature_group_name in zip(features_all, feature_name_all):
    simi_calc.calc_dist_simi_matrix(norm_type='canb', feature_names=feature_group, cumulative=True)
    result_stats = pred_stats(simi_calc, n=k, dependent=True)
    # Append as a new row labelled with the current row count.
    df.loc[len(df)] = ['Hist-FP-Canb', feature_group_name] + list(result_stats)

In [36]:
# Score calc_dist_simi_matrix (cumulative, norm_type='chi2') on every feature set
# and log the scores under 'Hist-FP-Chi2'.
for feature_group, feature_group_name in zip(features_all, feature_name_all):
    simi_calc.calc_dist_simi_matrix(norm_type='chi2', feature_names=feature_group, cumulative=True)
    result_stats = pred_stats(simi_calc, n=k, dependent=True)
    # Append as a new row labelled with the current row count.
    df.loc[len(df)] = ['Hist-FP-Chi2', feature_group_name] + list(result_stats)

#### Phase Stats FP Distance Measure

In [37]:
# Score calc_phase_simi_matrix (cpd='Kernel', penalty=15, norm_type='l21') on
# every feature set and log the scores under 'Phase-FP-L21'.
for feature_group, feature_group_name in zip(features_all, feature_name_all):
    simi_calc.calc_phase_simi_matrix(
        feature_names=feature_group,
        cpd='Kernel',
        penalty=15,
        norm_type='l21',
    )
    result_stats = pred_stats(simi_calc, n=k, dependent=True)
    # Append as a new row labelled with the current row count.
    df.loc[len(df)] = ['Phase-FP-L21', feature_group_name] + list(result_stats)

In [38]:
# Score calc_phase_simi_matrix (cpd='Kernel', penalty=15, norm_type='l11') on
# every feature set and log the scores under 'Phase-FP-L11'.
for feature_group, feature_group_name in zip(features_all, feature_name_all):
    simi_calc.calc_phase_simi_matrix(
        feature_names=feature_group,
        cpd='Kernel',
        penalty=15,
        norm_type='l11',
    )
    result_stats = pred_stats(simi_calc, n=k, dependent=True)
    # Append as a new row labelled with the current row count.
    df.loc[len(df)] = ['Phase-FP-L11', feature_group_name] + list(result_stats)

In [39]:
# Score calc_phase_simi_matrix (cpd='Kernel', penalty=15, norm_type='fro') on
# every feature set and log the scores under 'Phase-FP-Fro'.
for feature_group, feature_group_name in zip(features_all, feature_name_all):
    simi_calc.calc_phase_simi_matrix(
        feature_names=feature_group,
        cpd='Kernel',
        penalty=15,
        norm_type='fro',
    )
    result_stats = pred_stats(simi_calc, n=k, dependent=True)
    # Append as a new row labelled with the current row count.
    df.loc[len(df)] = ['Phase-FP-Fro', feature_group_name] + list(result_stats)

In [40]:
# Score calc_phase_simi_matrix (cpd='Kernel', penalty=15) on every feature set
# and log the scores under 'Phase-FP-Canb'.
for feature_group, feature_group_name in zip(features_all, feature_name_all):
    # norm_type is spelled lowercase 'canb', matching the spelling used by the
    # calc_simi_matrix and calc_dist_simi_matrix cells of this notebook.
    # NOTE(review): the Similarity implementation is not visible here; confirm
    # that its norm_type lookup is case-sensitive.
    simi_calc.calc_phase_simi_matrix(feature_names=feature_group, cpd='Kernel', penalty=15, norm_type='canb')
    result_stats = pred_stats(simi_calc, n=k, dependent=True)
    # Append as a new row labelled with the current row count.
    df.loc[len(df)] = ['Phase-FP-Canb', feature_group_name, *result_stats]

#### Dependent DTW

In [41]:
# Score calc_dtw_simi_matrix on each resource feature set (dependent=True)
# and log the scores under 'Dependent-DTW'.
for feature_group, feature_group_name in zip(features_perf, feature_name_perf):
    simi_calc.calc_dtw_simi_matrix(perf_feature_names=feature_group)
    result_stats = pred_stats(simi_calc, dependent=True, n=k)
    # Append as a new row labelled with the current row count.
    df.loc[len(df)] = ['Dependent-DTW', feature_group_name] + list(result_stats)

#### Independent DTW

In [42]:
# Score calc_ind_dtw_simi_matrix on each resource feature set (dependent=False)
# and log the scores under 'Independent-DTW'.
for feature_group, feature_group_name in zip(features_perf, feature_name_perf):
    simi_calc.calc_ind_dtw_simi_matrix(perf_feature_names=feature_group)
    result_stats = pred_stats(simi_calc, dependent=False, n=k)
    # Append as a new row labelled with the current row count.
    df.loc[len(df)] = ['Independent-DTW', feature_group_name] + list(result_stats)

#### Dependent LCSS

In [43]:
# Score calc_lcss_simi_matrix on each resource feature set (dependent=True)
# and log the scores under 'Dependent-LCSS'.
for feature_group, feature_group_name in zip(features_perf, feature_name_perf):
    simi_calc.calc_lcss_simi_matrix(perf_feature_names=feature_group)
    result_stats = pred_stats(simi_calc, dependent=True, n=k)
    # Append as a new row labelled with the current row count.
    df.loc[len(df)] = ['Dependent-LCSS', feature_group_name] + list(result_stats)

#### Independent LCSS

In [None]:
# Score calc_ind_lcss_simi_matrix on each resource feature set (dependent=False)
# and log the scores under 'Independent-LCSS'.
for feature_group, feature_group_name in zip(features_perf, feature_name_perf):
    simi_calc.calc_ind_lcss_simi_matrix(perf_feature_names=feature_group)
    result_stats = pred_stats(simi_calc, dependent=False, n=k)
    # Append as a new row labelled with the current row count.
    df.loc[len(df)] = ['Independent-LCSS', feature_group_name] + list(result_stats)

### Comparing methods

In [None]:
df[df['method'] == 'Hist-FP-L21']

Unnamed: 0,method,feature_set,accuracy,precision,f1,recall
18,Hist-FP-L21,plan_3,1.0,1.0,1.0,1.0
19,Hist-FP-L21,plan_7,1.0,1.0,1.0,1.0
20,Hist-FP-L21,plan,1.0,1.0,1.0,1.0
21,Hist-FP-L21,resource_3,1.0,1.0,1.0,1.0
22,Hist-FP-L21,resource_5,1.0,1.0,1.0,1.0
23,Hist-FP-L21,resource,1.0,1.0,1.0,1.0
24,Hist-FP-L21,all_3,1.0,1.0,1.0,1.0
25,Hist-FP-L21,all_7,1.0,1.0,1.0,1.0
26,Hist-FP-L21,all,1.0,1.0,1.0,1.0


In [None]:
df[df['method'] == 'Hist-FP-Fro']

Unnamed: 0,method,feature_set,accuracy,precision,f1,recall
36,Hist-FP-Fro,plan_3,1.0,1.0,1.0,1.0
37,Hist-FP-Fro,plan_7,1.0,1.0,1.0,1.0
38,Hist-FP-Fro,plan,1.0,1.0,1.0,1.0
39,Hist-FP-Fro,resource_3,1.0,1.0,1.0,1.0
40,Hist-FP-Fro,resource_5,1.0,1.0,1.0,1.0
41,Hist-FP-Fro,resource,1.0,1.0,1.0,1.0
42,Hist-FP-Fro,all_3,1.0,1.0,1.0,1.0
43,Hist-FP-Fro,all_7,1.0,1.0,1.0,1.0
44,Hist-FP-Fro,all,1.0,1.0,1.0,1.0


In [None]:
df[df['method'] == 'Phase-FP-L21']

Unnamed: 0,method,feature_set,accuracy,precision,f1,recall
63,Phase-FP-L21,plan_3,1.0,1.0,1.0,1.0
64,Phase-FP-L21,plan_7,1.0,1.0,1.0,1.0
65,Phase-FP-L21,plan,1.0,1.0,1.0,1.0
66,Phase-FP-L21,resource_3,1.0,1.0,1.0,1.0
67,Phase-FP-L21,resource_5,0.962963,0.966667,0.962848,0.962963
68,Phase-FP-L21,resource,1.0,1.0,1.0,1.0
69,Phase-FP-L21,all_3,0.962963,0.966667,0.962848,0.962963
70,Phase-FP-L21,all_7,1.0,1.0,1.0,1.0
71,Phase-FP-L21,all,1.0,1.0,1.0,1.0


In [None]:
df[df['method'] == 'Dependent-DTW']

Unnamed: 0,method,feature_set,accuracy,precision,f1,recall
99,Dependent-DTW,resource_3,1.0,1.0,1.0,1.0
100,Dependent-DTW,resource_5,1.0,1.0,1.0,1.0
101,Dependent-DTW,resource,1.0,1.0,1.0,1.0


In [None]:
df[df['method'] == 'Independent-DTW']

Unnamed: 0,method,feature_set,accuracy,precision,f1,recall
102,Independent-DTW,resource_3,1.0,1.0,1.0,1.0
103,Independent-DTW,resource_5,1.0,1.0,1.0,1.0
104,Independent-DTW,resource,1.0,1.0,1.0,1.0


In [None]:
df[df['method'] == 'Dependent-LCSS']

Unnamed: 0,method,feature_set,accuracy,precision,f1,recall
105,Dependent-LCSS,resource_3,0.888889,0.916667,0.885714,0.888889
106,Dependent-LCSS,resource_5,0.888889,0.916667,0.885714,0.888889
107,Dependent-LCSS,resource,0.888889,0.916667,0.885714,0.888889


In [None]:
df[df['method'] == 'Independent-LCSS']

Unnamed: 0,method,feature_set,accuracy,precision,f1,recall
108,Independent-LCSS,resource_3,1.0,1.0,1.0,1.0
109,Independent-LCSS,resource_5,1.0,1.0,1.0,1.0
110,Independent-LCSS,resource,1.0,1.0,1.0,1.0


In [None]:
df[df['method'] == 'L21-Norm']

Unnamed: 0,method,feature_set,accuracy,precision,f1,recall
0,L21-Norm,resource_3,1.0,1.0,1.0,1.0
1,L21-Norm,resource_5,1.0,1.0,1.0,1.0
2,L21-Norm,resource,1.0,1.0,1.0,1.0


In [None]:
df[df['method'] == 'L11-Norm']

Unnamed: 0,method,feature_set,accuracy,precision,f1,recall
3,L11-Norm,resource_3,1.0,1.0,1.0,1.0
4,L11-Norm,resource_5,1.0,1.0,1.0,1.0
5,L11-Norm,resource,1.0,1.0,1.0,1.0


In [None]:
all_correct = df[df['accuracy'] == 1]

In [None]:
# Print the perfect-accuracy rows one method at a time, methods in order of
# first appearance (sort=False keeps that order).
for name, method_rows in all_correct.groupby('method', sort=False):
    print(method_rows)

     method feature_set  accuracy  precision   f1  recall
0  L21-Norm  resource_3       1.0        1.0  1.0     1.0
1  L21-Norm  resource_5       1.0        1.0  1.0     1.0
2  L21-Norm    resource       1.0        1.0  1.0     1.0
     method feature_set  accuracy  precision   f1  recall
3  L11-Norm  resource_3       1.0        1.0  1.0     1.0
4  L11-Norm  resource_5       1.0        1.0  1.0     1.0
5  L11-Norm    resource       1.0        1.0  1.0     1.0
     method feature_set  accuracy  precision   f1  recall
6  Fro-Norm  resource_3       1.0        1.0  1.0     1.0
7  Fro-Norm  resource_5       1.0        1.0  1.0     1.0
8  Fro-Norm    resource       1.0        1.0  1.0     1.0
       method feature_set  accuracy  precision   f1  recall
9   Canb-Norm  resource_3       1.0        1.0  1.0     1.0
10  Canb-Norm  resource_5       1.0        1.0  1.0     1.0
11  Canb-Norm    resource       1.0        1.0  1.0     1.0
         method feature_set  accuracy  precision   f1  recall
18

In [None]:
print(all_correct['method'].unique())

['L21-Norm' 'L11-Norm' 'Fro-Norm' 'Canb-Norm' 'Hist-FP-L21' 'Hist-FP-L11'
 'Hist-FP-Fro' 'Hist-FP-Canb' 'Hist-FP-Chi2' 'Phase-FP-L21' 'Phase-FP-L11'
 'Phase-FP-Fro' 'Dependent-DTW' 'Independent-DTW' 'Independent-LCSS']


In [None]:
print(all_correct['feature_set'].unique())

['resource_3' 'resource_5' 'resource' 'plan_3' 'plan_7' 'plan' 'all_3'
 'all_7' 'all']


In [None]:
temp = all_correct.drop_duplicates()

In [None]:
all_correct.shape

(78, 6)

In [None]:
df['accuracy'].min()

0.0

In [None]:
df['accuracy'].mean()

0.8595261928595261

In [None]:
df['accuracy'].median()

1.0

In [None]:
df['accuracy'].std()

0.26203978497885105

In [None]:
print(df.shape)

(111, 6)


In [None]:
# Rows whose accuracy is at or below the first quartile of all recorded accuracies.
accuracy_q1 = df['accuracy'].quantile(0.25)
lower = df[df['accuracy'] <= accuracy_q1]

In [None]:
lower

Unnamed: 0,method,feature_set,accuracy,precision,f1,recall
12,Corr-Norm,resource_3,0.62963,0.462963,0.518519,0.62963
13,Corr-Norm,resource_5,0.62963,0.462963,0.518519,0.62963
14,Corr-Norm,resource,0.333333,0.222222,0.217172,0.333333
15,Chi2-Norm,resource_3,0.333333,0.222222,0.217172,0.333333
16,Chi2-Norm,resource_5,0.333333,0.222222,0.217172,0.333333
17,Chi2-Norm,resource,0.333333,0.222222,0.217172,0.333333
55,Hist-FP-Chi2,plan_7,0.333333,0.111111,0.166667,0.333333
56,Hist-FP-Chi2,plan,0.333333,0.222222,0.217172,0.333333
57,Hist-FP-Chi2,resource_3,0.666667,0.5,0.555556,0.666667
58,Hist-FP-Chi2,resource_5,0.333333,0.111111,0.166667,0.333333


In [None]:
print(lower.shape)

(28, 6)
