# Dependencies

In [None]:
# Imports
import re
import os
import random
from os.path import join
import sys
base_path = os.path.abspath(os.path.join('..'))
if base_path not in sys.path:
    sys.path.append(base_path)
from pprint import pprint
    
import json
import math
import numpy as np
import scipy as sc
from scipy.stats import entropy, normaltest, mode
import pandas as pd
import sklearn as sk
from sklearn import svm
from sklearn.utils import resample
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.externals import joblib
from sklearn.feature_selection import RFE
from sklearn.metrics import log_loss
from sklearn.model_selection import KFold, StratifiedKFold, train_test_split, StratifiedShuffleSplit
from sklearn.utils import shuffle
from sklearn.externals import joblib

import seaborn

from IPython.display import display

import pickle

import seaborn as sns
import matplotlib as mpl
mpl.rcParams['figure.figsize']
import matplotlib.patches as mpatches

from collections import Counter

import itertools
from itertools import compress
import matplotlib.pyplot as plt
from matplotlib import pylab

from time import time, strftime

from pprint import pprint

import warnings
warnings.filterwarnings("ignore", category=UserWarning, module="matplotlib")

from config import *

## Helpers

In [None]:
from helpers.analysis import *
from helpers.processing import *

In [None]:
def get_plotly_prediction(qualtrics_id):
    """Return the numeric chart-type code encoded in a Qualtrics id.

    The id is searched for the substrings 'bar', 'line' and 'scatter', in
    that order, and the first one found decides the result (1, 2 and 3
    respectively). ``None`` is returned when none of them occurs.
    """
    label_codes = (('bar', 1), ('line', 2), ('scatter', 3))
    for label, code in label_codes:
        if label in qualtrics_id:
            return code
    return None

## Constants

In [None]:
# Notebook-wide settings read by the cells below.
config = {
    # Base directory for figures; a dated sub-directory is created inside it.
    'results_dir': '../results',
    # Directory the pickled intermediate results are read from / written to.
    'intermediate_dir': '../intermediate_results',
    # Experiment variant: 'two' or 'three'. Selects the raw data file and
    # the ground-truth fid file that get loaded.
    'mode': 'two',
    # 'dict' reads precomputed predictions from a JSON file; 'model' loads a
    # pickled classifier and predicts with it.
    'model': 'dict', #  'model',
    # Interpolated into the names of the pickled intermediate files.
    'num_simulated_votes': None,
    'num_bootstraps': 100000,
    # Passed as num_splits to get_mturk_scores where that call is enabled.
    'num_mturk_splits': 100000,
    'load_existing_simulations': True,
    # When true, the normalized scores are re-loaded from their pickle file.
    'load_accuracy': True,
    'mturk_accuracy_method': 'real'
}

In [None]:
# Make sure the output directories exist. os.makedirs(..., exist_ok=True)
# also creates missing parent directories and does not fail when the
# directory already exists (no check-then-create race).
base_results_dir = config['results_dir']
os.makedirs(base_results_dir, exist_ok=True)

base_intermediate_dir = config['intermediate_dir']
os.makedirs(base_intermediate_dir, exist_ok=True)

# Figures go into a per-day sub-directory of the results directory.
results_dir = join(base_results_dir, strftime('%Y-%m-%d'))
os.makedirs(results_dir, exist_ok=True)

## Load Data

In [None]:
experiment_data_directory = '../experiment_data'

# Input files per experiment mode: (raw survey export, ground-truth fid list).
mode_to_input_files = {
    'two': ('v3_two_type_raw_data.csv', 'ground_truth_fids_66.csv'),
    'three': ('v3_three_type_raw_data.csv', 'ground_truth_fids_99.csv'),
}
if config['mode'] not in mode_to_input_files:
    # Fail here with a clear message instead of a NameError further down.
    raise ValueError(
        "Unsupported config['mode']: {!r} (expected 'two' or 'three')".format(config['mode']))
experiment_raw_data, ground_truth_fids_file_name = mode_to_input_files[config['mode']]
ground_truth_fids_df = pd.read_csv(join(experiment_data_directory, ground_truth_fids_file_name))

outcome_label_to_value = { 'bar': 1, 'line': 2, 'scatter': 3 }

# Qualtrics ids have the form '<label>_<n>', where n counts from 1 within the
# ground-truth rows of each chart type (in file order).
qualtrics_id_to_fid = {}
for outcome_label, outcome_value in outcome_label_to_value.items():
    rows_for_outcome = ground_truth_fids_df[ground_truth_fids_df.plotly == outcome_value].reset_index()
    for i, row in rows_for_outcome.iterrows():
        qualtrics_id_to_fid['{}_{}'.format(outcome_label, i + 1)] = row.fid

df = pd.read_csv(join(experiment_data_directory, experiment_raw_data))
# Skip the first two data rows (presumably extra header/metadata rows of the
# survey export - TODO confirm).
df = df.iloc[2:, :]
# Keep only rows that have a value in the 'how_easy' column.
df = df[df['how_easy'].notna()]
# Restrict the frame to the '*_type*' vote columns, in natural sort order.
type_columns = natural_sort([c for c in df.columns if '_type' in c])
df = df[type_columns]

In [None]:
# Maps fid -> predicted chart-type code (values of outcome_label_to_value).
ground_truth_predictions_dict = {}
if config['model'] == 'model':
    # Predict with a previously pickled classifier.
    top_performing_model_dir = ''
    # top_performing_model_file_name = 'clf__model-rf__dataset-dataset__featureset-names__outcome-all_one_trace_type__task-two__perclass-296203__acc-0.938055.pkl'
    top_performing_model_file_name = 'clf__model-rf__dataset-dataset__featureset-names__outcome-all_one_trace_type__task-three__perclass-296202__acc-0.878786.pkl'
    top_performing_model = joblib.load(
        join(base_results_dir, top_performing_model_dir, top_performing_model_file_name))
    clf = top_performing_model

    ground_truth_fids = ground_truth_fids_df.fid
    ground_truth_features_file_name = '{}_ground_truth.csv'.format(features_df_file_name.split('.csv')[0])

    ground_truth_features_df = pd.read_csv(join(features_directory, ground_truth_features_file_name))
    ground_truth_features_df = ground_truth_features_df[ground_truth_features_df.fid.isin(ground_truth_fids)]

    # Only keep the features that are actually present in the loaded frame.
    feature_set_names = [c for c in feature_set_names if c in ground_truth_features_df.columns]
    feature_set_indices = [ground_truth_features_df.columns.get_loc(c) for c in feature_set_names]

    # sort_values returns a new frame; the result has to be assigned for the
    # rows (and therefore sorted_fids) to really be ordered by fid.
    ground_truth_features_df = ground_truth_features_df.sort_values(by=['fid'])
    sorted_fids = ground_truth_features_df.fid
    X_ground_truth = ground_truth_features_df.drop(['fid'], axis=1, inplace=False, errors='ignore')
    X_ground_truth = X_ground_truth[feature_set_names]

    # With loaded model
    ground_truth_predictions = [outcome_label_to_value[x] for x in clf.predict(X_ground_truth)]
    ground_truth_predictions_dict = dict(zip(sorted_fids, ground_truth_predictions))
elif config['model'] == 'dict':
    # Read precomputed predictions (fid -> chart-type label) from JSON.
    if config['mode'] == 'two':
        ground_truth_predictions_dict_name = 'task_1_nn_ground_truth_2018-05-22_one-per-user.json'
    elif config['mode'] == 'three':
        ground_truth_predictions_dict_name = 'task_2_nn_ground_truth_2018-05-22_one-per-user.json'
    # The context manager closes the file handle once the JSON is parsed.
    with open(join(experiment_data_directory, ground_truth_predictions_dict_name)) as predictions_file:
        ground_truth_predictions_dict = {
            k: outcome_label_to_value[v] for k, v in json.load(predictions_file).items()
        }

In [None]:
from experiment_data.data2vis_predictions import data2vis_predictions_raw
from experiment_data.deepeye_predictions import deepeye_predictions_raw
from experiment_data.showme_predictions import showme_predictions_raw
from experiment_data.compassql_predictions import compassql_predictions_raw


def _predictions_by_fid(raw_predictions):
    """Re-key a raw prediction dict by fid.

    Each key of ``raw_predictions`` is cut at '_type' to obtain the Qualtrics
    id, which is then translated through ``qualtrics_id_to_fid``. Entries
    whose Qualtrics id is not in that mapping are dropped.
    """
    predictions = {}
    for raw_key, prediction in raw_predictions.items():
        qualtrics_id = raw_key.rsplit('_type')[0]
        if qualtrics_id in qualtrics_id_to_fid:
            predictions[qualtrics_id_to_fid[qualtrics_id]] = prediction
    return predictions


# One fid-keyed prediction dict per benchmarked system.
data2vis_predictions = _predictions_by_fid(data2vis_predictions_raw)
deepeye_predictions = _predictions_by_fid(deepeye_predictions_raw)
showme_predictions = _predictions_by_fid(showme_predictions_raw)
compassql_predictions = _predictions_by_fid(compassql_predictions_raw)

## Descriptive

In [None]:
# Per vote column: the non-missing votes, cast to int.
vote_results_without_na = {c: df[c].dropna().astype(int) for c in df}

In [None]:
# Number of non-missing votes per column, followed by their mean
# (shown as the cell output).
num_votes_per_chart = list(map(len, vote_results_without_na.values()))
np.mean(num_votes_per_chart)

## Consensus-Adjusted Recommendation Score

In [None]:
# Optionally load previously simulated (bootstrapped) votes from disk.
load_votes = False
if load_votes:
    vote_results_without_na = {c: df[c].dropna().astype(int) for c in df}
    bootstrapped_votes_file_name = 'simulated_votes_{}_{}-type.pkl'.format(config['num_simulated_votes'], config['mode'])
    all_sample_modes_file_name = 'simulated_modes_{}_{}-type.pkl'.format(config['num_simulated_votes'], config['mode'])

    # NOTE(review): the file name is resolved against the current working
    # directory, not base_intermediate_dir - confirm this is intended.
    # NOTE: pickle.load can execute arbitrary code; only load trusted files.
    with open(bootstrapped_votes_file_name, 'rb') as votes_file:
        bootstrapped_votes = pickle.load(votes_file)

In [None]:
def get_probas(votes):
    """Return the relative frequency of every distinct value in ``votes``.

    Defined at module level (not only inside the recompute branch below) so
    that it exists regardless of the value of ``load_probas``.

    Returns:
        dict mapping each distinct vote to count / len(votes); empty when
        ``votes`` is empty.
    """
    # iter() makes Counter count the elements exactly like a plain
    # ``for v in votes`` loop would, whatever container type is passed.
    counts = Counter(iter(votes))
    num_votes = len(votes)
    return {value: count / num_votes for value, count in counts.items()}


load_probas = True
if load_probas:
    # NOTE: pickle.load can execute arbitrary code; only load trusted files.
    real_probas_file_name = 'real_probas_{}_{}-type.pkl'.format(config['num_simulated_votes'], config['mode'])
    with open(join(base_intermediate_dir, real_probas_file_name), 'rb') as probas_file:
        real_probas = pickle.load(probas_file)

    all_samples_probas_file_name = 'all_samples_probas_{}_{}-type.pkl'.format(config['num_simulated_votes'], config['mode'])
    with open(join(base_intermediate_dir, all_samples_probas_file_name), 'rb') as probas_file:
        all_sample_probas = pickle.load(probas_file)
else:
    # Vote shares of the real votes, per column.
    real_probas = {c: get_probas(votes) for c, votes in vote_results_without_na.items()}

    # Vote shares of every bootstrapped vote set (requires bootstrapped_votes
    # to have been loaded). The count comes from the shared config instead of
    # a second hard-coded 100000.
    num_bootstraps = config['num_bootstraps']
    all_sample_probas = [
        {c: get_probas(votes) for c, votes in bootstrapped_voteset.items()}
        for bootstrapped_voteset in bootstrapped_votes[:num_bootstraps]
    ]

In [None]:
# Optionally persist the vote shares so later runs can load them.
save_probas = False
if save_probas:
    # Format the file name first and join afterwards, so braces in the
    # directory path can never be mistaken for format fields.
    real_probas_file_name = 'real_probas_{}_{}-type.pkl'.format(config['num_simulated_votes'], config['mode'])
    with open(join(base_intermediate_dir, real_probas_file_name), 'wb') as probas_file:
        pickle.dump(real_probas, probas_file)

    all_samples_probas_file_name = 'all_samples_probas_{}_{}-type.pkl'.format(config['num_simulated_votes'], config['mode'])
    with open(join(base_intermediate_dir, all_samples_probas_file_name), 'wb') as probas_file:
        pickle.dump(all_sample_probas, probas_file)

In [None]:
def get_score(prediction, probas, classifier='deepeye'):
    """Score one prediction against a vote-share distribution.

    Every share in ``probas`` is divided by the largest share, so the most
    popular option scores 1. The result is the normalized share of
    ``prediction``; a prediction that is not a key of ``probas`` gets the
    smallest normalized share. The special predictions 'error', 'tiacle'
    and 'none' get a randomly chosen normalized share instead.

    When ``classifier`` is 'deepeye', a prediction of 3 is scored as 2.
    NOTE(review): 'deepeye' is also the default, so callers that omit
    ``classifier`` get this remapping too - confirm that is intended.

    Raises:
        ValueError: if ``probas`` is empty.
    """
    top_share = max(probas.values())
    normalized = {option: share / top_share for option, share in probas.items()}

    lowest = min(normalized.values())
    # Drawn on every call, whether or not it ends up being returned.
    fallback = random.choice(list(normalized.values()))

    if prediction in ('error', 'tiacle', 'none'):
        return fallback

    if classifier == 'deepeye' and prediction == 3:
        prediction = 2

    # Normalized by the max
    return normalized.get(prediction, lowest)

In [None]:
def get_random_score(all_sample_probas):
    """Return, per sample, the total score of a random guesser.

    For every entry of a sample one of its vote shares is picked at random
    and divided by the entry's largest share; these ratios are summed.
    """
    totals = []
    for sample_probas in all_sample_probas:
        totals.append(sum(
            random.choice(list(probas.values())) / max(probas.values())
            for probas in sample_probas.values()
        ))
    return totals

In [None]:
def get_min_score(all_sample_probas):
    """Return, per sample, the lowest total score that can be obtained.

    For every entry of a sample the smallest vote share is divided by the
    largest one; these ratios are summed.
    """
    return [
        sum(min(probas.values()) / max(probas.values())
            for probas in sample_probas.values())
        for sample_probas in all_sample_probas
    ]

In [None]:
def get_data2vis_scores(all_sample_probas):
    """Return, per sample, the summed score of the Data2Vis predictions.

    Each key of a sample is cut at '_type' to get the Qualtrics id, which is
    mapped to a fid; the Data2Vis prediction for that fid is then scored with
    ``get_score`` against the entry's vote shares.
    """
    print('Calculating Data2Vis Scores')

    def sample_total(sample_probas):
        running = 0
        for column, probas in sample_probas.items():
            fid = qualtrics_id_to_fid[column.rsplit('_type')[0]]
            running += get_score(data2vis_predictions[fid], probas)
        return running

    return [sample_total(sample_probas) for sample_probas in all_sample_probas]

In [None]:
def get_deepeye_scores(all_sample_probas):
    """Return, per sample, the summed score of the DeepEye predictions.

    Each key of a sample is cut at '_type' to get the Qualtrics id, which is
    mapped to a fid; the DeepEye prediction for that fid is scored with
    ``get_score(..., classifier='deepeye')`` against the entry's vote shares.
    """
    print('Calculating DeepEye Scores')
    totals = []
    for sample_probas in all_sample_probas:
        sample_total = 0
        for column, probas in sample_probas.items():
            fid = qualtrics_id_to_fid[column.rsplit('_type')[0]]
            sample_total += get_score(deepeye_predictions[fid], probas, classifier='deepeye')
        totals.append(sample_total)
    return totals

In [None]:
def get_showme_scores(all_sample_probas):
    """Return, per sample, the summed score of the ShowMe predictions.

    Each key of a sample is cut at '_type' to get the Qualtrics id, which is
    mapped to a fid; the ShowMe prediction for that fid is then scored with
    ``get_score`` against the entry's vote shares.
    """
    print('Calculating ShowMe Scores')

    def score_entry(column, probas):
        fid = qualtrics_id_to_fid[column.rsplit('_type')[0]]
        return get_score(showme_predictions[fid], probas)

    return [
        sum(score_entry(column, probas) for column, probas in sample_probas.items())
        for sample_probas in all_sample_probas
    ]

In [None]:
def get_compassql_scores(all_sample_probas):
    """Return, per sample, the summed score of the CompassQL predictions.

    Each key of a sample is cut at '_type' to get the Qualtrics id, which is
    mapped to a fid; the CompassQL prediction for that fid is then scored
    with ``get_score`` against the entry's vote shares.
    """
    print('Calculating CompassQL Scores')
    totals = []
    for sample_probas in all_sample_probas:
        entry_scores = (
            get_score(compassql_predictions[qualtrics_id_to_fid[column.rsplit('_type')[0]]], probas)
            for column, probas in sample_probas.items()
        )
        totals.append(sum(entry_scores))
    return totals

In [None]:
def get_vizml_scores(all_sample_probas):
    """Return, per sample, the summed score of the VizML predictions.

    Each key of a sample is cut at '_type' to get the Qualtrics id, which is
    mapped to a fid; the entry of ``ground_truth_predictions_dict`` for that
    fid is then scored with ``get_score`` against the entry's vote shares.
    """
    print('Calculating VizML Scores')

    def sample_total(sample_probas):
        running = 0
        for column, probas in sample_probas.items():
            qualtrics_id = column.rsplit('_type')[0]
            predicted = ground_truth_predictions_dict[qualtrics_id_to_fid[qualtrics_id]]
            running += get_score(predicted, probas)
        return running

    return list(map(sample_total, all_sample_probas))

In [None]:
def get_plotly_scores(all_sample_probas):
    """Return, per sample, the summed score of the Plotly predictions.

    The prediction for an entry is derived from its key with
    ``get_plotly_prediction`` and scored with ``get_score`` against the
    entry's vote shares.
    """
    print('Calculating Plot.ly Accuracies')
    return [
        sum(get_score(get_plotly_prediction(column), probas)
            for column, probas in sample_probas.items())
        for sample_probas in all_sample_probas
    ]

In [None]:
def get_mturk_scores(vote_results_without_na, real_probas, num_splits=1000, split_size=1):
    """Score crowd votes against each other over repeated random splits.

    In each of ``num_splits`` rounds, the votes of every chart are shuffled
    and cut into a "consensus" slice and a "test" slice. The mode of the
    consensus slice is used as the prediction and scored with ``get_score``
    against the vote shares of the test slice; the scores of all charts are
    summed into one total per round.

    Args:
        vote_results_without_na: mapping of column name -> votes.
        real_probas: not used; kept so existing calls keep working.
        num_splits: number of rounds (length of the returned list).
        split_size: 1 puts exactly one vote into the consensus slice; any
            other value is the fraction of the votes (rounded up) that goes
            into it.

    Returns:
        List with one total score per round.
    """
    def vote_shares(votes):
        # Relative frequency of each vote. Computed locally so the function
        # does not depend on a helper that may not have been defined.
        num_votes = len(votes)
        return {value: count / num_votes for value, count in Counter(votes).items()}

    scores = []
    print('Split size:', split_size)

    for i in range(num_splits):
        if i % 1000 == 0:
            print(i)

        total_score = 0

        for c, votes in vote_results_without_na.items():
            num_votes = len(votes)
            if split_size == 1:
                real_split_size = 1
            else:
                real_split_size = math.ceil(split_size * num_votes)

            votes = list(votes)
            random.shuffle(votes)

            consensus_slice = votes[:real_split_size]
            test_slice = votes[real_split_size:]
            # Older SciPy returns the mode as a length-1 array, SciPy >= 1.11
            # as a scalar; atleast_1d makes the [0] lookup work for both.
            consensus_mode = np.atleast_1d(mode(consensus_slice).mode)[0]

            probas = vote_shares(test_slice)
            total_score += get_score(consensus_mode, probas)
        scores.append(total_score)
    return scores

In [None]:
# Total score per sample for each predictor. Entries are switched on and off
# by (un)commenting them; only the enabled ones are computed.
# NOTE(review): the plotting cells further down also read 'random', 'plotly',
# 'vizml', 'data2viz', 'deepeye', 'mturk_1' and 'minimum' - presumably those
# come from a previously saved pickle; confirm before re-saving.
accuracies = {
    # 'plotly': get_plotly_scores(all_sample_probas),
    # 'vizml': get_vizml_scores(all_sample_probas),
    'compassql': get_compassql_scores(all_sample_probas),
    # 'data2viz': get_data2vis_scores(all_sample_probas),
    # 'deepeye': get_deepeye_scores(all_sample_probas),
    # 'minimum': get_min_score(all_sample_probas),
    # 'random': get_random_score(all_sample_probas),
    'showme': get_showme_scores(all_sample_probas),
    # 'mturk_1': get_mturk_scores(vote_results_without_na, real_probas, num_splits=config['num_mturk_splits'], split_size=1),
    # 'mturk_50': get_mturk_scores(vote_results_without_na, real_probas, num_splits=config['num_mturk_splits'], split_size=0.5),
}

# Divisor used to normalize the totals (presumably the number of charts per
# mode - TODO confirm). `total` stays undefined for any other mode.
if config['mode'] == 'two': total = 66
if config['mode'] == 'three': total = 99
# Same keys as `accuracies`, with every total divided by `total`.
normalized_accuracies = {}
for k, v in accuracies.items():
    normalized_accuracies[k] = np.array(v) / total

Save Accuracies

In [None]:
# Persist the normalized scores. The file name is formatted before joining
# so braces in the directory path are never treated as format fields, and
# the context manager closes (and flushes) the file.
cars_benchmark_file_name = 'cars_benchmark_{}_{}_both_random-type.pkl'.format(config['num_simulated_votes'], config['mode'])
with open(join(base_intermediate_dir, cars_benchmark_file_name), 'wb') as benchmark_file:
    pickle.dump(normalized_accuracies, benchmark_file)

In [None]:
# Optionally replace the in-memory scores with the ones stored on disk.
if config['load_accuracy']:
    cars_benchmark_file_name = 'cars_benchmark_{}_{}_both_random-type.pkl'.format(config['num_simulated_votes'], config['mode'])
    # NOTE: pickle.load can execute arbitrary code; only load trusted files.
    with open(join(base_intermediate_dir, cars_benchmark_file_name), 'rb') as benchmark_file:
        normalized_accuracies = pickle.load(benchmark_file)

## Bar Chart of Accuracies with CI

In [None]:
# 95% confidence-interval statistics per predictor, as returned by
# get_conf_int, with every value multiplied by 100.
conf_int = {}
# The loop variable is deliberately not called `accuracies`, so the
# module-level `accuracies` dict is not overwritten by this loop.
for classifier, classifier_scores in normalized_accuracies.items():
    raw_scores = get_conf_int(classifier_scores, 0.95)
    conf_int[classifier] = {k: v * 100 for k, v in raw_scores.items()}

In [None]:
# Per predictor: name, mean, and distance from the mean to the upper bound.
for predictor_name, stats in conf_int.items():
    upper_margin = stats['upper'] - stats['mean']
    print(predictor_name, stats['mean'], upper_margin)

In [None]:
# Vertical bar chart of the mean score per predictor with error bars.
# NOTE(review): DataFrame.plot below is called without `ax`, so these two
# figures look unused - confirm and remove.
fig = plt.figure(figsize=(width, height))
plt.figure(figsize=(10, 7.5))
# sns.set_style("whitegrid")

# The legend is only drawn in 'three' mode.
legend = (config['mode'] == 'three')

colors_dict = {
    'green': '#009E73',
    'grey': '#A8A496',
    'blue': '#0072B2',
    'orange': '#D55E00',
    'pink': '#CC79A7'
}

# Keys into conf_int, in the same order as the 'estimators' labels below.
plotted_predictors = ['random', 'plotly', 'vizml', 'data2viz', 'deepeye', 'mturk_1', 'showme', 'compassql']

accuracies_df = pd.DataFrame({
    'estimators': ['Random', 'Plotly', 'VizML', 'Data2Viz', 'DeepEye', 'MTurk', 'Show Me', 'CompassQL'],
    'accuracies': [conf_int[predictor]['mean'] for predictor in plotted_predictors],
    'errors': [conf_int[predictor]['error'] for predictor in plotted_predictors],
    # One colour per estimator; the closing bracket of this list was missing.
    'colors': [
        colors_dict['grey'],
        colors_dict['blue'],
        colors_dict['green'],
        colors_dict['green'],
        colors_dict['green'],
        colors_dict['blue'],
        colors_dict['orange'],
        colors_dict['orange'],
    ],
})

plt.rcParams['font.family'] = 'Helvetica Neue LT Com'

accuracies_df.sort_values(['accuracies'], ascending=True, inplace=True)

plt.rcParams['errorbar.capsize']=10
plt.rcParams['lines.markeredgewidth']=1.5

ax = accuracies_df.plot(
    kind='bar',
    x='estimators',
    y='accuracies',
    yerr='errors',
    color=accuracies_df['colors'],
    figsize=(width, height),
    width=0.8,
    linewidth=1,
    legend=legend,
    grid=False,
    alpha=1,
    rot=0,
)

# Write each bar's value (one decimal) on a white box near its base.
for p in ax.patches:
    ax.annotate(
        np.round(p.get_height(),decimals=1),
        (p.get_x()+p.get_width()/2., 0.005),
        ha='center',
        va='center',
        xytext=(0, 10),
        textcoords='offset points',
        bbox=dict(boxstyle='Square', fc='white', lw=0, alpha=0.8)
    )

ax.yaxis.grid(which='major')
ax.yaxis.set_ticks(np.arange(0, 101, 10))
ax.yaxis.set_label_coords(-0.08, 0.5)
ax.set_ylim([0, 100])

# Reference lines: dashed at the mean of the 'minimum' entry, dotted at the
# highest plotted mean.
print(conf_int['minimum']['mean'])
ax.axhline(
    y=conf_int['minimum']['mean'],
    color='gray',
    linestyle='dashed'
)

ax.axhline(
    y=np.max(accuracies_df['accuracies']),
    color='gray',
    linestyle='dotted'
)

# Legend handles. The variable names do not match the labels (`machine` is
# labelled 'Human', `single_human` 'ML-based'); the labels are the ones that
# agree with the colours assigned to the estimators above.
baseline = mpatches.Patch(color=colors_dict['grey'], label='Baseline')
machine = mpatches.Patch(color=colors_dict['blue'], label='Human')
single_human = mpatches.Patch(color=colors_dict['green'], label='ML-based')
rule = mpatches.Patch(color=colors_dict['orange'], label='Rule-based')

if legend:
    pylab.legend(handles=[baseline, machine, rule, single_human], ncol=4, loc=9, bbox_to_anchor=(0.5, -0.1))

rotate_x_labels = False
offset_y_x_labels = False

# Push every second x tick label down so neighbouring labels do not collide.
for tick in ax.xaxis.get_major_ticks()[1::2]:
    tick.set_pad(15)

plt.xlabel('')
plt.ylabel('CARS')
plt.tight_layout()
plt.savefig(join(results_dir, "ground_truth_accuracy_bar_charts_{}_random_for_error_random_for_other_with_rules.png".format(config['mode'])), format="png", bbox_inches='tight')
plt.savefig(join(results_dir, "ground_truth_accuracy_bar_charts_{}_random_for_error_random_for_other_with_rules.svg".format(config['mode'])), format="svg", bbox_inches='tight')
plt.savefig(join(results_dir, "ground_truth_accuracy_bar_charts_{}_random_for_error_random_for_other_with_rules.pdf".format(config['mode'])), format="pdf", bbox_inches='tight')
plt.show()

In [None]:
# Horizontal bar chart of the mean score per predictor with error bars.
# width, height and the *_SIZE constants are not defined in this part of the
# notebook (presumably they come from `from config import *` - TODO confirm).
fig = plt.figure(figsize=(width, height))

sns.set_style("whitegrid")
plt.rcParams['font.family'] = 'Helvetica Neue LT Com'
plt.rcParams['font.weight'] = 'light'
plt.rcParams['figure.autolayout'] = True

plt.rc('font', size=SMALL_SIZE)          # controls default text sizes
plt.rc('axes', titlesize=SMALL_SIZE)     # fontsize of the axes title
plt.rc('axes', labelsize=MEDIUM_SIZE)    # fontsize of the x and y labels
plt.rc('xtick', labelsize=SMALL_SIZE)    # fontsize of the tick labels
plt.rc('ytick', labelsize=SMALL_SIZE)    # fontsize of the tick labels
plt.rc('legend', fontsize=SMALL_SIZE)    # legend fontsize
plt.rc('figure', titlesize=BIGGER_SIZE)  # fontsize of the figure title

# The legend is only drawn in 'three' mode.
legend = (config['mode'] == 'three')

colors_dict = {
    'green': '#009E73',
    'grey': '#A8A496',
    'blue': '#0072B2',
    'orange': '#D55E00',
    'pink': '#CC79A7'
}

# One row per predictor; the 'estimators' labels, the conf_int keys and the
# 'colors' entries are listed in the same order.
accuracies_df = pd.DataFrame({
    'estimators': ['Random', 'Plotly', 'VizML', 'Data2Viz', 'DeepEye', 'MTurk', 'Show Me', 'CompassQL'],  # 'Minimum', , 'Group\nMTurk\n(50%)'],
    'accuracies': [ conf_int[predictor]['mean'] for predictor in [ 'random', 'plotly', 'vizml', 'data2viz', 'deepeye', 'mturk_1', 'showme', 'compassql']],
    'errors': [ conf_int[predictor]['error'] for predictor in [ 'random', 'plotly', 'vizml', 'data2viz', 'deepeye', 'mturk_1', 'showme', 'compassql']],
    'colors': [ 
        # colors_dict['grey'],
        colors_dict['grey'],
        colors_dict['blue'],
        colors_dict['green'],
        colors_dict['green'],
        colors_dict['green'],
        colors_dict['blue'],
        colors_dict['orange'],
        colors_dict['orange']
    ]
})

# These two self-assignments leave the columns unchanged.
accuracies_df['accuracies'] = accuracies_df['accuracies']
accuracies_df['errors'] = accuracies_df['errors']

# Sort ascending by mean score.
accuracies_df.sort_values(['accuracies'], ascending=True, inplace=True)

plt.rcParams['errorbar.capsize']=5
plt.rcParams['lines.markeredgewidth']=1

ax = accuracies_df.plot(
    kind='barh',
    x='estimators',
    y='accuracies',
    xerr='errors',
    color=accuracies_df['colors'],
    # edgecolor=accuracies_df['edgecolors'],
    figsize=(width, height + 0.5),
    width=0.85,
    linewidth=1,
    legend=legend,
    grid=False,
    alpha=1,
    rot=0,
)

# Write each bar's value (one decimal) on a white box at the fixed x
# position 8, i.e. near the start of the bar.
for p in ax.patches:
    ax.annotate(
        np.round(p.get_width(),decimals=1),
        (8, p.get_y() - p.get_height() / 3.5),
        ha='center',
        va='center',
        xytext=(0, 12),
        textcoords='offset points',
        bbox=dict(boxstyle='Square', fc='white', lw=0, alpha=0.8)
    )

ax.xaxis.grid(which='major')
ax.xaxis.set_ticks(np.arange(0, 101, 10))
ax.set_xlim([0, 100])

# Baselines
# Dashed line: mean of the 'minimum' entry of conf_int.
ax.axvline(
    x=conf_int['minimum']['mean'],
    color='gray',
    linestyle='dashed'
) 

# Dotted line: highest mean among the plotted predictors.
ax.axvline(
    x=np.max(accuracies_df['accuracies']),
    color='gray',
    linestyle='dotted'
)

# Legend handles. The variable names do not match the labels (`machine` is
# labelled 'Human', `single_human` 'ML-based'); the labels are the ones that
# agree with the colours assigned to the estimators above.
baseline = mpatches.Patch(color=colors_dict['grey'], label='Baseline')
machine = mpatches.Patch(color=colors_dict['blue'], label='Human')
single_human = mpatches.Patch(color=colors_dict['green'], label='ML-based')
rule = mpatches.Patch(color=colors_dict['orange'], label='Rule-based')

if legend:
    predictor_legend = pylab.legend(handles=[baseline, machine, rule, single_human], ncol=2, loc=9, bbox_to_anchor=(0.5, -0.1))
    predictor_legend.set_title('Predictor Type')

plt.xlabel('Consensus-Adjusted Recommendation Score') 
plt.ylabel('Predictor')

# Save the figure as PNG, SVG and PDF into the dated results directory.
plt.tight_layout()
plt.savefig(join(results_dir, "ground_truth_accuracy_bar_charts_{}_random_for_error_random_for_other_with_rules_horizontal.png".format(config['mode'])), format="png", bbox_inches='tight')
plt.savefig(join(results_dir, "ground_truth_accuracy_bar_charts_{}_random_for_error_random_for_other_with_rules_horizontal.svg".format(config['mode'])), format="svg", bbox_inches='tight')
plt.savefig(join(results_dir, "ground_truth_accuracy_bar_charts_{}_random_for_error_random_for_other_with_rules_horizontal.pdf".format(config['mode'])), format="pdf", bbox_inches='tight')
plt.show()

In [None]:
# Show the confidence-interval dictionary as the cell output.
conf_int

## Measures of Agreement

In [None]:
def get_gini(x):
    """Return the Gini coefficient of the values in ``x``.

    It is computed as half the relative mean absolute difference. All
    pairwise differences are built at once, so time and memory grow
    quadratically with ``len(x)`` - do not pass in huge samples.
    """
    pairwise_gaps = np.abs(np.subtract.outer(x, x))
    mean_abs_diff = pairwise_gaps.mean()
    # Relative to the mean of the values themselves.
    relative_mad = mean_abs_diff / np.mean(x)
    return 0.5 * relative_mad

In [None]:
# Gini coefficient of the vote shares of every entry in real_probas, keyed
# by entry name, plus the same values as a flat list for the histogram.
name_to_gini = {
    k: get_gini(list(probas_dict.values()))
    for k, probas_dict in real_probas.items()
}
gini_coeffs = list(name_to_gini.values())

fig = plt.figure(figsize=(width, height))
sns.set_style("whitegrid")

# Right edge of the histogram: 2/3 in 'three' mode, 1/2 otherwise.
upper_limit = 2/3 if config['mode'] == 'three' else 1/2

# Ten equally wide bins between 0 and the upper limit.
bin_edges = np.linspace(0, upper_limit, 11)
ax = pd.Series(gini_coeffs).hist(
    color=colors_dict['blue'],
    alpha=1,
    edgecolor='white',
    bins=bin_edges,
    figsize=(width, height)
)

ax.set_xlim([0, upper_limit])

plt.tight_layout()
fig.tight_layout()
plt.xlabel('Gini Coefficient')
plt.ylabel('Frequency')

# Save as SVG and PDF into the dated results directory.
for extension in ("svg", "pdf"):
    plt.savefig(join(results_dir, "gini_coeff_{}.{}".format(config['mode'], extension)), format=extension, bbox_inches='tight')
plt.show()