In [1]:
%load_ext autoreload
%autoreload 2
%matplotlib notebook

In [2]:
import os

# Move from the notebook directory up to the project root so that `src.*`
# imports and the relative `results/...` paths resolve.  Guarded so that
# re-running this cell does not climb one directory higher each time.
if not os.path.isdir('src'):
    os.chdir('../')

In [4]:
from src.dataset import *
from src.concept_vectors import *
from src.util import *
from src.hierarchy import *
from src.metrics import *
import numpy as np
import matplotlib.pyplot as plt
import glob

### Plot and evaluate current results

In [5]:
# glob returns paths in arbitrary, filesystem-dependent order; sort them so
# that the order of `results` (and of ties in the rankings printed below) is
# the same on every run and machine.
all_result_files = sorted(glob.glob("results/evaluation/*.txt"))

In [9]:
# Parse every results file into {method_name: {metric_name: score}}.
# Each non-empty line of a file has the form "<metric name>:<python literal>";
# only element [0] of the parsed literal is kept as the score.
results = {}

for result_path in all_result_files:
    # File name without directory or extension is the method name.  os.path
    # handles either path separator and strips only the trailing extension.
    name = os.path.splitext(os.path.basename(result_path))[0]

    # Context manager so the file handle is closed deterministically.
    with open(result_path) as result_file:
        rows = result_file.read().strip().split("\n")

    data = {}
    for row in rows:
        if not row.strip():
            # Empty file or stray blank line: nothing to parse.
            continue
        # Split on the first colon only, so a value containing ':' stays intact.
        key, value = row.split(":", 1)
        # NOTE(review): eval() executes whatever is in the file.  Acceptable
        # only because these files are produced locally by this project;
        # consider ast.literal_eval if the values are always plain literals.
        data[key] = eval(value)[0]

    results[name] = data

In [19]:
# Sign applied to each metric's score when ranking methods in the cell below
# (which sorts by score * multiplier, descending): +1 lists the largest score
# first, -1 lists the smallest score first.  Insertion order is the order in
# which the metrics are printed.
multiplier_by_metric = dict([
    ('Image Responsiveness', 1),
    ('Model Responsiveness', 1),
    ('Image Robustness', -1),
    ('Model Robustness', -1),
    ('Stability', -1),
    ('Truthfulness', 1),
])

In [31]:
# (combined robustness, method) pairs in ascending order of the combined score.
# Note: despite the name, the score is the SUM of model and image robustness,
# not their mean; the resulting ordering is the same either way.
avg_robustness = sorted(
    (
        (scores['Model Robustness'] + scores['Image Robustness'], method)
        for method, scores in results.items()
    ),
    key=lambda pair: pair[0],
)

In [40]:
# For every metric, print the methods ranked by score * multiplier, descending.
for key, sign in multiplier_by_metric.items():
    avg_score = [(method, scores[key]) for method, scores in results.items()]
    ranking = sorted(avg_score, key=lambda pair: sign * pair[1], reverse=True)
    print(key, ranking)

Image Responsiveness [('average', 49.0), ('concept2vec', 47.666666666666664), ('concatenate', 47.666666666666664), ('labels', 47.0), ('tcav', 46.666666666666664), ('cem', 44.0)]
Model Responsiveness [('cem', 45.666666666666664), ('average', 35.666666666666664), ('concatenate', 33.666666666666664), ('tcav', 31.333333333333332), ('labels', 0.0), ('concept2vec', 0.0)]
Image Robustness [('labels', 0.0), ('concatenate', 29.333333333333332), ('average', 32.0), ('concept2vec', 36.333333333333336), ('tcav', 44.0), ('cem', 44.333333333333336)]
Model Robustness [('labels', 0.0), ('concept2vec', 0.0), ('tcav', 9.666666666666666), ('concatenate', 17.333333333333332), ('average', 21.333333333333332), ('cem', 44.0)]
Stability [('labels', 0.0), ('tcav', 16.0), ('concatenate', 16.333333333333332), ('average', 16.666666666666668), ('concept2vec', 37.333333333333336), ('cem', 44.0)]
Truthfulness [('concatenate', 0.7333333333333333), ('tcav', 0.6444444444444445), ('labels', 0.5555555555555555), ('concept

In [41]:
# Embedding function handed to stability_metric in the next cell; change this
# alias to time or evaluate a different embedding method.
embedding_method = create_model_representation_vectors_simple

In [54]:
# Time a single stability_metric run for `embedding_method` on MNIST.
start = time.time()

dataset = MNIST_Dataset()
hierarchy_method = create_linkage_hierarchy
random_seeds = [43,44,45]
attributes = dataset.get_attributes()
stability = stability_metric(hierarchy_method,embedding_method,dataset,attributes,random_seeds,baseline_hierarchies=None,bulk_attributes=True)
# Fixed: was `.foramt`, which raised AttributeError only after the expensive
# metric computation had already finished.
print("Took {} time".format(time.time()-start))
stability

NameError: name 'attribute' is not defined

In [49]:
# Time loading the activation model and collecting activations for every
# dataset attribute inside a single TF1-style session.
start = time.time()

max_images = 25
model = "VGG16"

with tf.compat.v1.Session() as sess:
    activation_generator = load_activations_model(
        dataset.experiment_name, max_images, model, sess
    )
    activations = get_activations_dictionary(
        dataset.get_attributes(),
        sess,
        max_examples=max_images,
        model_name=model,
        experiment_name=dataset.experiment_name,
    )

print("Took {} time".format(time.time() - start))

Took 161.72387886047363 time


### Evaluate new results

In [4]:
# Dataset instance passed to every compute_all_metrics call in this section.
dataset = MNIST_Dataset()

In [5]:
# Inputs shared by every compute_all_metrics call below.
attributes = dataset.get_attributes()
hierarchy_creation_method = create_linkage_hierarchy
seeds = [43, 44, 45]

In [6]:
# Parallel lists: metric_names[k] is the display name of metrics[k].
metrics = [
    truthfulness_metric,
    stability_metric,
    robustness_image_metric,
    responsiveness_image_metric,
    robustness_model_metric,
    responsiveness_model_metric,
]
metric_names = [
    'Truthfulness',
    'Stability',
    'Image Robustness',
    'Image Responsiveness',
    'Model Robustness',
    'Model Responsiveness',
]

### Evaluate Labels-Only vectors

In [20]:
# Score the labels-only vectors on every metric and time the run.
# compute_all_metrics prints each metric and returns them as a dict keyed by
# metric name (both visible in the output below).
start = time.time()
results_label = compute_all_metrics(
    hierarchy_creation_method,
    load_label_vectors_simple,
    dataset, attributes, seeds,
)
print("Took {} time to compute".format(time.time() - start))
results_label

Stability: (0.0, 0.0)
Image Robustness: (0.0, 0.0)
Image Responsiveness: (47.0, 0.0)
Model Robustness: (0.0, 0.0)
Model Responsiveness: (0.0, 0.0)
Truthfulness: (0.8222222222222223, 0.12570787221094182)
Took 794.776941537857 time to compute


{'Stability': (0.0, 0.0),
 'Image Robustness': (0.0, 0.0),
 'Image Responsiveness': (47.0, 0.0),
 'Model Robustness': (0.0, 0.0),
 'Model Responsiveness': (0.0, 0.0),
 'Truthfulness': (0.8222222222222223, 0.12570787221094182)}

### Evaluate concept2vec

In [23]:
import gc
# Force a full garbage-collection pass before the next evaluation run
# (presumably to release memory held by the previous run; confirm intent).
# The number displayed is gc.collect()'s return value: the count of
# unreachable objects it found.
gc.collect()

432

In [24]:
# Score the concept2vec vectors on every metric and time the run.
start = time.time()
results_concept2vec = compute_all_metrics(
    hierarchy_creation_method,
    load_concept2vec_vectors_simple,
    dataset, attributes, seeds,
)
print("Took {} time to compute".format(time.time() - start))
results_concept2vec

Stability: (37.333333333333336, 7.3181661333667165)
Image Robustness: (36.333333333333336, 3.299831645537222)
Image Responsiveness: (47.666666666666664, 0.9428090415820634)
Model Robustness: (0.0, 0.0)
Model Responsiveness: (0.0, 0.0)


RuntimeError: can't start new thread

### Evaluate TCAV Vectors

In [None]:
# Score the TCAV vectors on every metric and time the run.
start = time.time()
results_tcav = compute_all_metrics(
    hierarchy_creation_method,
    load_tcav_vectors_simple,
    dataset, attributes, seeds,
)
print("Took {} time to compute".format(time.time() - start))
results_tcav

### Evaluate CEM Vectors

In [None]:
# Score the CEM vectors on every metric and time the run.
start = time.time()
results_cem = compute_all_metrics(
    hierarchy_creation_method,
    load_cem_vectors_simple,
    dataset, attributes, seeds,
)
print("Took {} time to compute".format(time.time() - start))
results_cem

### Evaluate aggregated functions

In [None]:
# Embedding method built from the label and TCAV loaders; presumably it
# averages the two embeddings (inferred from the helper's name; confirm in src).
# NOTE: `method` is reassigned further down for the concatenate variant, so
# run this cell immediately before the results_avg cell.
method = combine_embeddings_average(load_label_vectors_simple,load_tcav_vectors_simple)

In [None]:
# Score whatever embedding `method` currently holds on every metric and time
# the run; the result is stored as the "average" combination.
start = time.time()
results_avg = compute_all_metrics(
    hierarchy_creation_method,
    method,
    dataset, attributes, seeds,
)
print("Took {} time to compute".format(time.time() - start))
results_avg

In [None]:
# Embedding method built from the label and TCAV loaders; presumably it
# concatenates the two embeddings (inferred from the helper's name; confirm in src).
# NOTE: this overwrites the `method` used for results_avg above, so run this
# cell immediately before the results_concatenate cell.
method = combine_embeddings_concatenate(load_label_vectors_simple,load_tcav_vectors_simple)

In [None]:
# Score whatever embedding `method` currently holds on every metric and time
# the run; the result is stored as the "concatenate" combination.
start = time.time()
results_concatenate = compute_all_metrics(
    hierarchy_creation_method,
    method,
    dataset, attributes, seeds,
)
print("Took {} time to compute".format(time.time() - start))
results_concatenate

### Evaluate Model-based Vectors

In [None]:
# Score the model-representation vectors on every metric and time the run.
start = time.time()
results_model = compute_all_metrics(
    hierarchy_creation_method,
    create_model_representation_vectors_simple,
    dataset, attributes, seeds,
)
print("Took {} time to compute".format(time.time() - start))
results_model