# Evaluation of Sphere Experiments

This notebook will evaluate entropy, variance, iterations (AIPS) and coverage (ADTS) on the sphere experiements.  

It has to be run from within the folder that holds all experiment folders that shall be evaluated

In [1]:
import pandas as pd
import numpy as np
import glob
import os
from scipy.spatial import distance_matrix
from itertools import combinations, permutations, product
import numba
from  numba import jit,njit
from concurrent.futures import ThreadPoolExecutor, ProcessPoolExecutor

In [2]:
# Auxilliary functions to load, transform data and assign names appropriately

# getting the data 
def get_folders():
    folders = glob.glob("*/")
    folders = [fold for fold in folders]# if "testng" in fold.split("_")]
    return folders

def get_experiments(folders):
    return [f.split("___")[-1][:-4]for f in folders if "sphere" in f and "-1" in f]
    
def split_by_experiment(folders,experiments):
    experiment_map = {experiment: [] for experiment in experiments}
    for f in folders:
        for e in experiments:
            if e in f:
                experiment_map[e].append(f)
                break
    return experiment_map

def to_df(folder, name):
    files = glob.glob(os.path.join(folder,f"*{name}*"))
    data = [np.loadtxt(f) for f in files]
    try:
        data = np.vstack(data)
    except ValueError:
        if "reference" in folder:
            return pd.DataFrame() 
        else:
            data = np.hstack(data)
    if "probs" in name:
        data = pd.DataFrame(data.T, columns=["probs"])
    elif "iterations" in name:
        #try:        
        data = pd.DataFrame(data.flatten(), columns=["iterations"])
        #except ValueError:
        #    data = pd.DataFrame(data.flatten(),columns=["iterations"])
    else:        
        data = pd.DataFrame(data,columns=["x","y","z"])
    experiment = folder.split("_",1)[1][:-1].rsplit("_",1)
    data["experiment"] =  experiment[0]
    data["num"] = experiment[1]
    return data

def get_data(folders,name):
    return pd.concat([to_df(fold,name) for fold in folders])

def get_ref_exp(data):
    experiments = []
    for exp in data["experiment"].unique():
        if "reference" in exp:
            reference = exp
        else:
            experiments.append(exp)
    sample_runs = data["num"].unique().tolist()
    return experiments, reference, sample_runs

In [3]:
# implementation of the metrics


def experiment_as_arr(data,experiment, sample_run):
    return data[(data["experiment"] == experiment) & (data["num"] == sample_run)][["x","y","z"]].values


def get_mean_nn_dist(data,data2,same=False):
    dist = distance_matrix(data, data2)
    if same:
        dist = dist[dist != 0].reshape(len(data),len(data)-1)
    try:
        min_dist = dist.min(0)
    except:
        pdb.set_trace()
        print(2)
    return min_dist.mean()

def resub_entropy(data):
    # entropy for single dataset
    return -np.log(data).sum() / len(data)

def entropy(data):
    # entropy evaluated over a dataset of multiple experiments
    # will group data and evaluated entropy per group
    data = data.groupby(["experiment","num"]).agg(resub_entropy).reset_index()
    data.rename(columns={"probs":"entropy"}, inplace=True)
    return data
    
def coverage(data):
    # old implementation of coverage
    experiments, reference, sample_runs = get_ref_exp(data)
    coverages = []
    for i in sample_runs:
        print("Evaluating coverage for sample run ",i)
        data_ref = experiment_as_arr(data, reference,i)
        coverage2 = lambda x: get_mean_nn_dist(x, data_ref)
        cover = []
        for exp in experiments:
            arr = experiment_as_arr(data,exp,i)
            if len(arr) == 0:
                pdb.set_trace()
            cover.append({"coverage":coverage2(arr),"num":i,"experiment":exp})
        coverages.extend(cover)
    return pd.DataFrame(coverages)
    
    
def variance(data):
    # evaluates variance of KDE estimates for a multiple experiments
    data = data.groupby(["experiment","num"]).agg("var").reset_index()
    data.rename(columns={"probs":"variance"}, inplace=True)
    return data
    

def average_iterations(data):
    data = data.groupby(["experiment","num"]).agg("mean").reset_index()
    data.rename(columns={"probs":"mean"}, inplace=True)
    return data
    

In [4]:
@njit(nogil=True)
def dist(ref, samples):
    # new coverage calculation
    # faster with jit compilation numba
    n,m = ref.shape
    lowest = np.Inf
    current = 0
    min_vals = np.zeros(n)
    for i in range(n):
        row = ref[i]
        for k in range(n):
            for j in range(m):
                current += (row[j] - samples[k,j])**2
            if current < lowest:
                lowest = current
            current = 0
        min_vals[i] = lowest
        lowest = np.Inf
    return np.mean(min_vals)


def get_data_for_dist(data, inputs,reference):
    # helper that arranges data for parallel evaluation of coverage
    experiment, sample_run = inputs
    data_exp = experiment_as_arr(data, experiment, sample_run)
    data_ref = experiment_as_arr(data, reference, sample_run)
    cover = dist(data_ref, data_exp)
    #print(cover)
    return {"experiment": experiment, "num": sample_run, "coverage": cover}
    
def coverage_new(data):
    # parallel evaluation of coverage for multiple experiments
    experiments, reference, sample_runs = get_ref_exp(data)
    coverages = []
    cores = os.cpu_count()
    #print("num cores: ",cores)
    combs = product(experiments, sample_runs)
    with ThreadPoolExecutor() as ex:
        out = ex.map(lambda x: get_data_for_dist(data=data, reference=reference, inputs=x), combs)
        df = pd.DataFrame(out)
        return df


In [5]:
folders = get_folders()
experiments = set(get_experiments(folders))
experiment_map = split_by_experiment(folders,experiments)
experiments = list(experiments)

In [6]:
experiments

['off_center_connected', 'center_connected', 'off_center_disconnected']

In [7]:
center = experiment_map[experiments[0]]
experiment_map[experiments[0]] = []
# adjust off-center to own list entry
for e in center:
    if "off" in e:
        experiment_map["off_center_connected"].append(e)
    else:
        experiment_map["center_connected"].append(e)

In [8]:
# acutual evaluation of metrics happens here
# data is loaded and then evaluated


variances = {}
entropies = {}
iterations = {}
coverages = {}
for exp, folders in experiment_map.items():
    print(exp)
    probs = get_data(folders,"probs")
    variance_df = variance(probs)
    entropy_df = entropy(probs)
    print("finished loading variance and entropy")
    del probs

    samples = get_data(folders,"local*samples")
    coverage_df = coverage_new(samples)
    print("finished loading coverages")
    del samples

    num_iterations = get_data(folders,"iterations")
    avg_iterations_df = average_iterations(num_iterations)
    print("finished loading iterations")
    del num_iterations
    
    variances[exp] = variance_df
    entropies[exp] = entropy_df
    iterations[exp] = avg_iterations_df
    coverages[exp] = coverage_df

off_center_connected
finished loading variance and entropy
finished loading coverages
finished loading iterations
center_connected
finished loading variance and entropy
finished loading coverages
finished loading iterations
off_center_disconnected
finished loading variance and entropy
finished loading coverages
finished loading iterations


In [10]:
# saving data

for exp in experiment_map.keys():
    suffix = exp
    entropies[exp].to_csv(f"entropy_{suffix}.csv")
    coverages[exp].to_csv(f"coverage_{suffix}.csv")
    iterations[exp].to_csv(f"iterations_{suffix}.csv")
    variances[exp].to_csv(f"variance_{suffix}.csv")

In [13]:
def add_word(word,exp,name): 
    if word in exp:
        name = name + "_" + word
    return name

def rename(k):
    name = ""
    name = add_word("RRT",k,name)
    name = add_word("grid-walk",k,name)
    name = add_word("filter",k,name)
    name = add_word("multiple",k,name)
    name = add_word("single",k,name)
    name = add_word("ch",k,name)
    name = add_word("bandit",k,name)
    name = add_word("biased__",k,name)
    name = add_word("reference",k,name)
    
    return name.strip("_")

In [14]:
experiment_to_data = {}
metric_to_experiment_map = {}
metric_to_experiment_map_rev = {}
for exp in experiment_map.keys():
    experiment_to_data[exp] = {"variance": variances[exp],
    "iterations": iterations[exp],
    "entropy": entropies[exp],
    "coverage": coverages[exp]}
    metric_to_experiment_map[exp] = {}
    metric_to_experiment_map_rev[exp] = {}
    for metric, data in experiment_to_data[exp].items():
        experiment_map = {e:str(i) for i,e in enumerate(data.experiment.unique())} 
        metric_to_experiment_map[exp][metric] = experiment_map
        metric_to_experiment_map_rev[exp][metric] = {v:k for k,v in experiment_map.items()}