In [2]:
# Enable IPython's autoreload extension; mode 2 reloads imported modules
# before each cell runs, so edits to local .py files are picked up live.
%load_ext autoreload
%autoreload 2

In [3]:
import os
import json

import pandas as pd

In [4]:
def _distribution_label(dist_info):
    """Return the column label for one distribution description dict.

    The label is the distribution's ``name`` with optional prefixes:
    ``F_`` when the dict has a truthy ``flipped`` entry, and ``S_`` for a
    ``TruncExponential`` with ``lmbda == 15`` or a ``TruncPareto`` with
    ``alpha == 25.3``.
    NOTE(review): what ``S_`` stands for is not visible in this notebook;
    confirm against the experiment generator.
    """
    prefix = ""
    if dist_info.get('flipped'):
        prefix += "F_"
    if dist_info['name'] == 'TruncExponential' and dist_info['lmbda'] == 15:
        prefix += "S_"
    if dist_info['name'] == 'TruncPareto' and dist_info['alpha'] == 25.3:
        prefix += "S_"
    return prefix + dist_info['name']


def results_to_dataframe(exp_name, in_parts, postfix_list=None, folder=None, return_optional=False):
    """Flatten raw JSON result files into a DataFrame with one row per experiment.

    Files are read from ``results/raw/[folder/]<name><postfix>.json`` where
    ``<name>`` is ``exp_name`` or, when ``in_parts`` is true,
    ``f"{exp_name}-part{i}"`` for ``i`` in ``0..4``.

    Parameters
    ----------
    exp_name : str
        Base name of the result file(s).
    in_parts : bool
        If true, the results are split over 5 ``-part{i}`` files; otherwise a
        single file is read.
    postfix_list : list of str, optional
        One extra postfix per part. For each part both the un-postfixed file
        and the postfixed file are read.
    folder : str, optional
        Sub-folder of ``results/raw`` that holds the files.
    return_optional : bool
        If true, also return the ``algorithms_info`` entries of the last file
        read, keyed by algorithm name.

    Returns
    -------
    pandas.DataFrame, or (pandas.DataFrame, dict) when ``return_optional``.
        Rows are indexed 0..n-1 in read order.

    Raises
    ------
    ValueError
        If ``postfix_list`` does not have one entry per part, or if the files
        contain no experiments at all.
    """
    n_parts = 5 if in_parts else 1
    if postfix_list is not None and len(postfix_list) != n_parts:
        raise ValueError("Length of postfix list must match number of parts")

    if folder is None:
        base_dir = os.path.join("results", "raw")
    else:
        base_dir = os.path.join("results", "raw", folder)

    records = []
    algorithms = {}

    for i in range(n_parts):
        name = f"{exp_name}-part{i}" if in_parts else exp_name
        postfixes = [''] if postfix_list is None else ['', postfix_list[i]]

        for postfix in postfixes:
            path = os.path.join(base_dir, name + postfix + ".json")
            with open(path, "r") as f:
                results = json.load(f)

            # Fields shared by every experiment stored in this file.
            file_fields = {
                'n_sources': results['n_sources'],
                'n_dataitems': results['n_dataitems'],
                'n_distinct': results['n_distinct'],
            }

            # Rebuilt per file: the caller receives the info of the last file read.
            algorithms = {algo['name']: algo for algo in results['algorithms_info']}

            for experiment in results['experiments']:
                # Start from a fresh dict for every experiment so that values of
                # a previous experiment (e.g. an algorithm that was not run this
                # time) can never leak into this row; missing entries become NaN.
                record = dict(file_fields)

                for dist in ['coverage_dist', 'truth_dist', 'distinct_dist', 'spread_dist']:
                    record[dist] = _distribution_label(experiment[dist])

                record['optimal_perc_score'] = experiment['optimal'][0]
                record['optimal_score'] = experiment['optimal'][1]
                record['n_claims'] = experiment['n_claims']
                record['iteration_index'] = experiment.get('iteration_index', -1)

                for key, value in experiment['results'].items():
                    record[f'{key}_time'] = value['time']
                    record[f'{key}_score_perc'] = value['scores'][0]
                    record[f'{key}_score'] = value['scores'][1]

                records.append(record)

    if not records:
        # Same exception type pd.concat([]) used to raise, with a clearer message.
        raise ValueError(f"No experiments found in the result files of {exp_name!r}")

    # Build the frame once instead of concatenating one single-row frame per experiment.
    frame = pd.DataFrame(records)

    if return_optional:
        return frame, algorithms

    return frame

In [9]:
from itertools import product

runs = [1, 2, 3]
combs = [(1, 5), (2, 2), (5, 1)]
dist = [5, 20]
# One flag per (run, comb, dist) configuration, in product() order:
# True when that experiment's results are stored as "-part{i}" files.
parts_list = [True, True, True, False, False, False,
              True, True, True, False, False, False,
              True, True, False, False, False, False]

configs = list(product(runs, combs, dist))
# zip() would silently drop configurations if the flag list were too short.
if len(configs) != len(parts_list):
    raise ValueError("parts_list must contain exactly one flag per configuration")

# The loop variable must not be called `dist`: that would overwrite the list
# above and make this cell fail when it is executed a second time.
for (run, (s, di), dist_value), parts in zip(configs, parts_list):
    exp_name = f"run{run}-{s}-{di}-{dist_value}"
    print(exp_name)
    df = results_to_dataframe(exp_name, in_parts=parts, folder=exp_name)
    df.to_csv(os.path.join("results", "raw", f"{exp_name}.csv"), index=False)

run1-1-5-5
run1-1-5-20
run1-2-2-5
run1-2-2-20
run1-5-1-5
run1-5-1-20
run2-1-5-5
run2-1-5-20
run2-2-2-5
run2-2-2-20
run2-5-1-5
run2-5-1-20
run3-1-5-5
run3-1-5-20
run3-2-2-5
run3-2-2-20
run3-5-1-5
run3-5-1-20


In [5]:
from itertools import product

runs = [1, 2, 3]
combs = [(1, 5), (2, 2), (5, 1)]
dist = [5, 20]
parts = False

# Convert every truth-extension JSON result file into a CSV next to the raw data.
# The loop variable must not be called `dist`: that would overwrite the list
# above and make this cell fail when it is executed a second time.
for run, (s, di), dist_value in product(runs, combs, dist):
    exp_name = f"truth{run}-{s}-{di}-{dist_value}-exttruth"
    print(exp_name)
    df = results_to_dataframe(exp_name, in_parts=parts, folder="truth_extension")
    print(df.shape)
    df.to_csv(os.path.join("results", "raw", f"{exp_name}.csv"), index=False)

truth1-1-5-5-exttruth
(500, 14)
truth1-1-5-20-exttruth
(500, 14)
truth1-2-2-5-exttruth
(500, 14)
truth1-2-2-20-exttruth
(500, 14)
truth1-5-1-5-exttruth
(500, 14)
truth1-5-1-20-exttruth
(500, 14)
truth2-1-5-5-exttruth
(500, 14)
truth2-1-5-20-exttruth
(500, 14)
truth2-2-2-5-exttruth
(500, 14)
truth2-2-2-20-exttruth
(500, 14)
truth2-5-1-5-exttruth
(500, 14)
truth2-5-1-20-exttruth
(500, 14)
truth3-1-5-5-exttruth
(500, 14)
truth3-1-5-20-exttruth
(500, 14)
truth3-2-2-5-exttruth
(500, 14)
truth3-2-2-20-exttruth
(500, 14)
truth3-5-1-5-exttruth
(500, 14)
truth3-5-1-20-exttruth
(500, 14)


In [6]:
from itertools import product

runs = [1, 2, 3]
combinations = [(1, 5), (2, 2), (5, 1)]
distinct = [5, 20]

# Switch between the truth-extension CSVs and the regular run CSVs.
truth_extension = True

# Load one CSV per (run, combination, distinct) configuration, keyed by that tuple.
df_dict = {}
for r, c, d in product(runs, combinations, distinct):
    if truth_extension:
        csv_name = f"truth{r}-{c[0]}-{c[1]}-{d}-exttruth.csv"
    else:
        csv_name = f"run{r}-{c[0]}-{c[1]}-{d}.csv"
    df_dict[(r, c, d)] = pd.read_csv(os.path.join("results", "raw", csv_name))

In [7]:
if truth_extension:
    algos = ["TwoEstimates"]
else:
    algos = ["Majority", "TruthFinder", "TwoEstimates", "ThreeEstimates"]
options = ["time", "score_perc", "score"]
extras = ["optimal_perc_score", "optimal_score", "n_claims"]

df_final = {}
for r in runs:
    # Suffix every per-run measurement column with the run number so the
    # frames of different runs can be merged side by side.
    renamer = {f"{al}_{op}": f"{al}_{op}_{r}" for al, op in product(algos, options)}
    renamer.update({ex: f"{ex}_{r}" for ex in extras})

    for c, d in product(combinations, distinct):
        frame = df_dict[(r, c, d)]

        # df_dict is updated in place, so executing this cell a second time
        # would otherwise raise (columns already renamed / dropped). Skip a
        # frame only when it is verifiably in its final form.
        already_processed = (
            "iteration_index" not in frame.columns
            and set(renamer.values()).issubset(frame.columns)
        )
        if already_processed:
            continue

        # errors="raise" makes a missing expected column fail loudly.
        df_dict[(r, c, d)] = frame.rename(columns=renamer, errors="raise").drop("iteration_index", axis=1)


In [8]:
# Columns that identify one experiment configuration; they are shared by all runs.
merge_on = ['n_sources', 'n_dataitems', 'n_distinct', 'coverage_dist', 'truth_dist', 'distinct_dist', 'spread_dist']

df_final = {}
for c, d in product(combinations, distinct):
    # Join the frames of all runs on the configuration columns, in the order
    # given by `runs`. validate="one_to_one" raises if a configuration occurs
    # more than once in either side of a merge.
    merged = df_dict[(runs[0], c, d)]
    for r in runs[1:]:
        merged = merged.merge(df_dict[(r, c, d)], on=merge_on, validate="one_to_one")
    df_final[(c, d)] = merged

print(df_final.keys())

dict_keys([((1, 5), 5), ((1, 5), 20), ((2, 2), 5), ((2, 2), 20), ((5, 1), 5), ((5, 1), 20)])


In [9]:
# Add, for every measurement, a column holding its mean over all runs.
n_runs = float(len(runs))
for c, d in product(combinations, distinct):
    frame = df_final[(c, d)]

    # Algorithm measurements first, then the extra columns (same order as before).
    averaged_columns = [f"{al}_{op}" for al, op in product(algos, options)] + list(extras)
    for col in averaged_columns:
        # Plain sum / count: a NaN in any run keeps the average NaN.
        frame[col] = sum(frame[f"{col}_{r}"] for r in runs) / n_runs


In [10]:
# Report the shape of every merged frame and write it to results/ as CSV.
# k is ((first, second), distinct); the pair is scaled by 100 for the file name.
for k, frame in df_final.items():
    print(frame.shape)
    if truth_extension:
        out_name = f"truth_final_{k[0][0] * 100}_{k[0][1] * 100}_{k[1]}.csv"
    else:
        out_name = f"final_{k[0][0] * 100}_{k[0][1] * 100}_{k[1]}.csv"
    frame.to_csv(os.path.join("results", out_name), index=False)

(500, 31)
(500, 31)
(500, 31)
(500, 31)
(500, 31)
(500, 31)


In [8]:
# Show the column layout of the first merged frame only (nothing if there is none).
for df in list(df_final.values())[:1]:
    print(df.columns)

Index(['n_sources', 'n_dataitems', 'n_distinct', 'coverage_dist', 'truth_dist',
       'distinct_dist', 'spread_dist', 'optimal_perc_score_1',
       'optimal_score_1', 'n_claims_1', 'Majority_time_1',
       'Majority_score_perc_1', 'Majority_score_1', 'TruthFinder_time_1',
       'TruthFinder_score_perc_1', 'TruthFinder_score_1',
       'TwoEstimates_time_1', 'TwoEstimates_score_perc_1',
       'TwoEstimates_score_1', 'ThreeEstimates_time_1',
       'ThreeEstimates_score_perc_1', 'ThreeEstimates_score_1',
       'optimal_perc_score_2', 'optimal_score_2', 'n_claims_2',
       'Majority_time_2', 'Majority_score_perc_2', 'Majority_score_2',
       'TruthFinder_time_2', 'TruthFinder_score_perc_2', 'TruthFinder_score_2',
       'TwoEstimates_time_2', 'TwoEstimates_score_perc_2',
       'TwoEstimates_score_2', 'ThreeEstimates_time_2',
       'ThreeEstimates_score_perc_2', 'ThreeEstimates_score_2',
       'optimal_perc_score_3', 'optimal_score_3', 'n_claims_3',
       'Majority_time_3', 'Ma