In [1]:
import os
import numpy as np
import pandas as pd
from tqdm import tqdm, trange
import torch
import matplotlib.pyplot as plt
import pickle5 as pickle
import plotly.express as px
import argparse
import scipy.stats
import scipy.special as special
from typing import Dict, List, Any, Tuple

In [2]:
def seed_everything(seed: int):
    import random, os
    import numpy as np
    
    random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
    
seed_everything(42)

In [3]:
def read_pickle(file_path: str) -> Any:
	with open(file_path, "rb") as handle:
		return pickle.load(handle)

In [4]:
def write_pickle(file: Any, file_path: str) -> None:
    with open(file_path, 'wb') as handle:
        pickle.dump(file, handle, protocol=pickle.HIGHEST_PROTOCOL)

In [5]:
STRING_TRUNCATE = 50

def get_scores(dir_path: str, converge_epoch: int, string_truncate: int) -> Tuple[Dict[int, Dict[str, List[float]]], Dict[str, List[Any]]]:
	file_list = os.listdir(dir_path)
	idx_to_sentences: Dict[int, Dict[str, str]] = read_pickle(os.path.join(dir_path, "idx_to_sentences.pickle"))

	file_list = [f for f in file_list if f[:5] == "epoch"]
	file_list = [f for f in file_list if int(f.split("_")[0].replace("epoch", "")) > 3 and int(f.split("_")[0].replace("epoch", "")) < converge_epoch]
	file_list = sorted(file_list, key= lambda s: int(s.split("_")[1].replace("stepidx", "")))

	print("Loading files in:", dir_path)
	idxs, ppls, chias, bleus = [], [], [], []
	for file_name in file_list:
		file_path = f"{dir_path}/{file_name}"
		# print(file_name)
		if "ppl" in file_path:
			ppls.extend(read_pickle(file_path).tolist())
		elif "chia" in file_path:
			chias.extend(read_pickle(file_path).tolist())
		elif "bleu" in file_path:
			bleus.extend(read_pickle(file_path))
		elif "idx" in file_path:
			idxs.extend(read_pickle(file_path).tolist())
		else:
			output_csv_name = file_path

	items = list(zip(idxs, ppls, chias, bleus))
	items = sorted(items, key=lambda i: i[0])
	idx_dict: Dict[int, Dict[str, List[float]]] = {}
	for item in items:
		if item[0] not in idx_dict:
			idx_dict[item[0]] = {"inv_ppl": [1 / item[1]], "chia": [item[2]], "bleu": [item[3]]}
		else:
			idx_dict[item[0]]["inv_ppl"].append(1 / item[1])
			idx_dict[item[0]]["chia"].append(item[2])
			idx_dict[item[0]]["bleu"].append(item[3])

	i2s = {"Index": [], "In": [], "Out": [], "In Len": [], "Out Len": []}
	for k, v in idx_to_sentences.items():
		i2s["Index"].append(k)
		i2s["In"].append(v["in"])
		i2s["Out"].append(v["out"])
		i2s["In Len"].append(len(v["in"].split()))
		i2s["Out Len"].append(len(v["out"].split()))

	return idx_dict, i2s

In [6]:
from collections import Counter

def create_vocab(df):
	in_v = Counter()
	out_v = Counter()
    
	for idx, txt in df["In"].items():
		tokens = txt.split()
		in_v.update(tokens)
         
	for idx, txt in df["Out"].items():
		tokens = txt.split()
		out_v.update(tokens)

	return set(in_v.keys()), set(out_v.keys()), in_v, out_v

In [7]:
def calculate_statistics(epoch: int, idx_dict: Dict[int, Dict[str, List[float]]], i2s: Dict[str, List[Any]]) -> pd.DataFrame:
	idx_mean_var_dict: Dict[int, Dict[str, Tuple[float, float]]] = {}
	idx_mean_var_list: List[Tuple[int, float, float, float, float, float, float, float, float]] = []
	score_names = ["inv_ppl", "chia", "bleu"]
	for idx, scores in idx_dict.items():
		scores_list = []
		for score_name in score_names:
			score_arr = np.array(scores[score_name][:epoch])
			mean = score_arr.mean()
			var = score_arr.var()
			scores_list.extend([mean, var])
		
		idx_mean_var_list.append(tuple((idx, *scores_list)))

	i2s_df = pd.DataFrame.from_dict(i2s)


	df = pd.DataFrame(idx_mean_var_list, columns =['Index', 'Confidence - Inverse PPL', 'Variability - Inverse PPL', \
													'Confidence - CHIA', 'Variability - CHIA', \
													'Confidence - BLEU', 'Variability - BLEU'])

	cartography = pd.merge(df, i2s_df, on="Index")

	return cartography

In [8]:
def load_scores(dir_path: str, plot_path: str, converge_epoch: int) -> None:
	idx_dict = get_scores(dir_path, plot_path, converge_epoch)
	
	for epoch in trange(3, converge_epoch, 2):
		df = calculate_statistics(epoch, idx_dict)

		plot_types = ["inv_ppl", "chia", "bleu"]

		for plot_type in tqdm(plot_types, "Plots"):
			plot(df, plot_path, str(epoch), plot_type)

In [9]:
from pprint import pprint

def choose_subset(df: pd.DataFrame, metric: str, criteria: str, ds_name: str, subset_fname:str, ratio:float = 0.33) -> pd.DataFrame:
    assert metric in ["Inverse PPL", "Neg PPL", "CHIA", "BLEU"]
    assert criteria in ["Easy to Learn", "Ambiguous", "Hard to Learn", "Random"]
    
    if criteria == "Easy to Learn":
        sort_by = f"Confidence - {metric}"
        ascending = False
    elif criteria == "Ambiguous":
        sort_by = f"Variability - {metric}"
        ascending = False
    elif criteria == "Hard to Learn":
        sort_by = f"Confidence - {metric}"
        ascending = True
        
    if criteria == "Random":
        sorted_df = df.sample(frac=1)
    else:
        sorted_df = df.sort_values(by=[sort_by], ascending=ascending)

    sorted_df = sorted_df.reset_index(drop=True)
    subset_df = sorted_df.iloc[:int(len(df)*ratio),:]
    
    all_in_v, all_out_v, _, _ = create_vocab(df)
    subset_in_v, subset_out_v, subset_in_v_counts, subset_out_v_counts = create_vocab(subset_df)

    add_ex_i = []
    remove_ex_i = []
    
    for i in trange(int(len(df)*ratio), len(df)):
        new_in, new_out = sorted_df.iloc[i, 7], sorted_df.iloc[i, 8]
        new_in_tokens, new_out_tokens = set(new_in.split()), set(new_out.split())
        
        if (new_in_tokens - subset_in_v) or (new_out_tokens - subset_out_v):
            print(f"In vocab dif: {(new_in_tokens - subset_in_v)}")
            print(f"Out vocab dif: {(new_out_tokens - subset_out_v)}")
            add_ex_i.append(i)
            subset_in_v = subset_in_v.union(new_in_tokens)
            subset_out_v = subset_out_v.union(new_out_tokens)
            subset_in_v_counts.update(new_in.split())
            subset_out_v_counts.update(new_out.split())
            
    in_counter = subset_in_v_counts
    out_counter = subset_out_v_counts
    
    removed_amount = 0
    for i in trange(int(len(df)*ratio)-1, -1, -1):
        if len(remove_ex_i) == len(add_ex_i):
            break
            
        ex_in, ex_out = sorted_df.iloc[i, 7], sorted_df.iloc[i, 8]
        ex_in_counter, ex_out_counter = Counter(ex_in.split()), Counter(ex_out.split())
        
        upd_in_counter = in_counter - ex_in_counter
        upd_out_counter = out_counter - ex_out_counter
        
        ex_in_words, ex_out_words = list(set(ex_in.split())), list(set(ex_out.split()))
        
        remove = True
        for word in ex_in_words:
            if upd_in_counter[word] <= 1:
                remove = False
        
        for word in ex_out_words:
            if upd_out_counter[word] <= 1:
                remove = False
                
        if remove:
            in_counter = upd_in_counter
            out_counter = upd_out_counter
            remove_ex_i.append(i)
    
    subset_df = pd.concat([subset_df, df.iloc[add_ex_i]])
    subset_df = subset_df.drop(remove_ex_i, axis=0)
    subset_df = subset_df.reset_index(drop=True)
    
    assert all_in_v == set(in_counter.keys()), "The process is wrong"
    assert all_out_v == set(out_counter.keys()), "The process is wrong 2"
    
    subset_idx = subset_df["Index"].tolist()
    subset_idx = [int(i) for i in subset_idx]
    subset_idx = set(subset_idx)
    
    os.makedirs(os.path.join("subsets", ds_name), exist_ok=True)
    write_pickle(subset_idx, os.path.join("subsets", ds_name, subset_fname))
    
    print(len(remove_ex_i), len(add_ex_i))
    
    return subset_df

In [10]:
def plot(df, path_name, extra_path_info, plot_type="inv_ppl"):
	if plot_type == "inv_ppl":
		fig = px.scatter(df, x="Variability - Inverse PPL", y="Confidence - Inverse PPL", custom_data=['In abbv.', 'Out abbv.', 'In Len', 'Out Len'], color='Confidence - BLEU', range_color=[0,1])
		fig.update_layout(yaxis_range=[0, 1])
		fig.update_traces(
			hovertemplate="<br>".join([
				"Variability - Inverse PPL: %{x}",
				"Confidence - Inverse PPL: %{y}",
				"In: %{customdata[0]}",
				"Out: %{customdata[1]}",
                "In Len: %{customdata[2]}",
                "Out Len: %{customdata[3]}", 
			])
		)
	elif plot_type == "chia":
		fig = px.scatter(df, x="Variability - CHIA", y="Confidence - CHIA", custom_data=['In abbv.', 'Out abbv.', 'In Len', 'Out Len'], color='Confidence - BLEU', range_color=[0,1])
		fig.update_layout(yaxis_range=[0, 1])
		fig.update_traces(
			hovertemplate="<br>".join([
				"Variability - CHIA: %{x}",
				"Confidence - CHIA: %{y}",
				"In: %{customdata[0]}",
				"Out: %{customdata[1]}",
                "In Len: %{customdata[2]}",
                "Out Len: %{customdata[3]}", 
			])
		)
	elif plot_type == "bleu":
		fig = px.scatter(df, x="Variability - BLEU", y="Confidence - BLEU", custom_data=['In abbv.', 'Out abbv.', 'In Len', 'Out Len'], color='Confidence - BLEU', range_color=[0,1])
		fig.update_layout(yaxis_range=[0, 1])
		fig.update_traces(
			hovertemplate="<br>".join([
				"Variability - BLEU: %{x}",
				"Confidence - BLEU: %{y}",
				"In: %{customdata[0]}",
				"Out: %{customdata[1]}",
                "In Len: %{customdata[2]}",
                "Out Len: %{customdata[3]}", 
			])
		)	
	fig.update_traces(marker=dict(size=3), selector=dict(mode='markers'))
	fig.update_layout(
		autosize=False,
		width=800,
		height=900
	)
	fig.show()

In [11]:
STRING_TRUNCATE = 120

mtrc2abv = {"Inverse PPL": "inv_ppl", "Neg PPL": "neg_ppl", "CHIA": "chia", "BLEU": "bleu"}
crit2abv = {"Easy to Learn": "easy_to_learn", "Ambiguous": "ambiguous", "Hard to Learn": "hard_to_learn", "Random": "random"}
create_fname = lambda m, cr, c_e: f"{mtrc2abv[m]}_{crit2abv[cr]}_{c_e}.pickle"
outputs_path = lambda x: f"../scores/{x}"

In [12]:
DATASET_NAME = "cogs"
OUTPUTS_PATH = outputs_path(DATASET_NAME)
METRIC = "CHIA"
CRITERIA = "Easy to Learn"
CONVERGE_EPOCH = 10

idx_dict, i2s = get_scores(OUTPUTS_PATH, CONVERGE_EPOCH, STRING_TRUNCATE)
df = calculate_statistics(CONVERGE_EPOCH, idx_dict, i2s)
idx_fname = create_fname(METRIC, CRITERIA, CONVERGE_EPOCH)
subset_df = choose_subset(df, METRIC, CRITERIA, DATASET_NAME, idx_fname)

Loading files in: ../scores/cogs


 17%|█▋        | 876/5302 [00:00<00:00, 8759.38it/s]

In vocab dif: {'bench'}
Out vocab dif: {'bench'}
In vocab dif: {'bin'}
Out vocab dif: {'bin'}
In vocab dif: {'storage'}
Out vocab dif: {'storage'}
In vocab dif: {'painting'}
Out vocab dif: {'painting'}
In vocab dif: {'truck'}
Out vocab dif: {'truck'}
In vocab dif: {'threw'}
Out vocab dif: set()
In vocab dif: {'declared'}
Out vocab dif: {'declare'}
In vocab dif: {'giggled'}
Out vocab dif: set()
In vocab dif: {'sheet'}
Out vocab dif: {'sheet'}
In vocab dif: {'pot'}
Out vocab dif: {'pot'}
In vocab dif: {'vehicle'}
Out vocab dif: {'vehicle'}
In vocab dif: {'guard'}
Out vocab dif: {'guard'}
In vocab dif: {'piano'}
Out vocab dif: {'piano'}
In vocab dif: {'confessed'}
Out vocab dif: {'confess'}
In vocab dif: {'tent'}
Out vocab dif: {'tent'}
In vocab dif: {'said'}
Out vocab dif: {'say'}
In vocab dif: {'proved'}
Out vocab dif: {'prove'}
In vocab dif: {'surface'}
Out vocab dif: {'surface'}
In vocab dif: {'pool'}
Out vocab dif: {'pool'}
In vocab dif: {'believed'}
Out vocab dif: {'believe'}
In voc

 47%|████▋     | 2516/5302 [00:00<00:00, 6359.18it/s]

In vocab dif: {'glacier'}
Out vocab dif: {'glacier'}
In vocab dif: {'van'}
Out vocab dif: {'van'}
In vocab dif: {'pedestal'}
Out vocab dif: {'pedestal'}
In vocab dif: {'pit'}
Out vocab dif: {'pit'}
In vocab dif: {'keyboard'}
Out vocab dif: {'keyboard'}
In vocab dif: {'valve'}
Out vocab dif: {'valve'}
In vocab dif: {'tin'}
Out vocab dif: {'tin'}
In vocab dif: {'sofa'}
Out vocab dif: {'sofa'}
In vocab dif: {'hole'}
Out vocab dif: {'hole'}
In vocab dif: {'beast'}
Out vocab dif: {'beast'}
In vocab dif: {'shelf'}
Out vocab dif: {'shelf'}
In vocab dif: {'cage'}
Out vocab dif: {'cage'}
In vocab dif: set()
Out vocab dif: {'10'}
In vocab dif: {'giant'}
Out vocab dif: {'giant'}
In vocab dif: {'crate'}
Out vocab dif: {'crate'}
In vocab dif: {'sink'}
Out vocab dif: {'sink'}
In vocab dif: {'imagined'}
Out vocab dif: {'imagine'}
In vocab dif: {'lemon'}
Out vocab dif: {'lemon'}
In vocab dif: {'tray'}
Out vocab dif: {'tray'}
In vocab dif: {'cassette'}
Out vocab dif: {'cassette'}
In vocab dif: {'cloth'

 77%|███████▋  | 4066/5302 [00:00<00:00, 7161.32it/s]

In vocab dif: {'crib'}
Out vocab dif: {'crib'}
In vocab dif: {'parcel'}
Out vocab dif: {'parcel'}
In vocab dif: {'podium'}
Out vocab dif: {'podium'}
In vocab dif: {'soup'}
Out vocab dif: {'soup'}
In vocab dif: set()
Out vocab dif: {'11'}
In vocab dif: {'corpse'}
Out vocab dif: {'corpse'}
In vocab dif: {'futon'}
Out vocab dif: {'futon'}
In vocab dif: {'well'}
Out vocab dif: {'well'}
In vocab dif: {'skull'}
Out vocab dif: {'skull'}
In vocab dif: {'poster'}
Out vocab dif: {'poster'}
In vocab dif: {'sack'}
Out vocab dif: {'sack'}
In vocab dif: {'whale'}
Out vocab dif: {'whale'}
In vocab dif: {'philosopher'}
Out vocab dif: {'philosopher'}
In vocab dif: {'bush'}
Out vocab dif: {'bush'}
In vocab dif: {'container'}
Out vocab dif: {'container'}
In vocab dif: {'tripod'}
Out vocab dif: {'tripod'}
In vocab dif: {'sock'}
Out vocab dif: {'sock'}
In vocab dif: {'headmaster'}
Out vocab dif: {'headmaster'}
In vocab dif: {'crack'}
Out vocab dif: {'crack'}
In vocab dif: {'hanger'}
Out vocab dif: {'hanger

 91%|█████████ | 4806/5302 [00:00<00:00, 5545.88it/s]

In vocab dif: {'cushion'}
Out vocab dif: {'cushion'}
In vocab dif: {'trunk'}
Out vocab dif: {'trunk'}
In vocab dif: {'blender'}
Out vocab dif: {'blender'}
In vocab dif: {'pile'}
Out vocab dif: {'pile'}
In vocab dif: {'casket'}
Out vocab dif: {'casket'}
In vocab dif: {'dragon'}
Out vocab dif: {'dragon'}
In vocab dif: {'leaflet'}
Out vocab dif: {'leaflet'}
In vocab dif: {'wardrobe'}
Out vocab dif: {'wardrobe'}
In vocab dif: set()
Out vocab dif: {'12'}
In vocab dif: {'panel'}
Out vocab dif: {'panel'}
In vocab dif: {'condo'}
Out vocab dif: {'condo'}
In vocab dif: {'rod'}
Out vocab dif: {'rod'}
In vocab dif: {'trampoline'}
Out vocab dif: {'trampoline'}
In vocab dif: {'trainee'}
Out vocab dif: {'trainee'}


100%|██████████| 5302/5302 [00:00<00:00, 5538.01it/s]


In vocab dif: {'mound'}
Out vocab dif: {'mound'}
In vocab dif: {'notebook'}
Out vocab dif: {'notebook'}
In vocab dif: set()
Out vocab dif: {'13'}
In vocab dif: {'gravel'}
Out vocab dif: {'gravel'}
In vocab dif: set()
Out vocab dif: {'16'}
In vocab dif: {'observe'}
Out vocab dif: {'e', 'b', 'LAMBDA', 'a'}
In vocab dif: {'gasp'}
Out vocab dif: set()


  5%|▍         | 119/2610 [00:00<00:05, 453.25it/s]

113 113





In [13]:
df.describe()

Unnamed: 0,Index,Confidence - Inverse PPL,Variability - Inverse PPL,Confidence - CHIA,Variability - CHIA,Confidence - BLEU,Variability - BLEU,In Len,Out Len
count,7912.0,7912.0,7912.0,7912.0,7912.0,7912.0,7912.0,7912.0,7912.0
mean,12251.921132,0.743155,0.05643365,0.953297,0.001626647,0.865501,0.004002,7.066102,41.562816
std,6969.151019,0.112421,0.02465934,0.029091,0.002019921,0.055878,0.00352,2.657084,20.064013
min,3.0,0.061768,2.471398e-09,0.466622,2.464467e-09,0.282586,0.0,1.0,7.0
25%,6100.5,0.67845,0.03933769,0.940825,0.0006872559,0.848056,0.001786,5.0,21.0
50%,12313.5,0.762664,0.04911568,0.958653,0.001208314,0.878722,0.00317,7.0,41.0
75%,18298.25,0.826527,0.06590771,0.972077,0.001878755,0.900743,0.005038,9.0,54.0
max,24154.0,0.999965,0.1964477,0.999965,0.0494819,0.954123,0.05068,18.0,140.0


In [14]:
subset_df.describe()

Unnamed: 0,Index,Confidence - Inverse PPL,Variability - Inverse PPL,Confidence - CHIA,Variability - CHIA,Confidence - BLEU,Variability - BLEU,In Len,Out Len
count,2610.0,2610.0,2610.0,2610.0,2610.0,2610.0,2610.0,2610.0,2610.0
mean,12502.609962,0.836924,0.0510445,0.977075,0.0007719296,0.898933,0.002148,6.349808,33.65364
std,6876.644266,0.057876,0.02002858,0.009705,0.0005959035,0.028448,0.001784,1.782288,12.300519
min,3.0,0.403,2.471398e-09,0.866334,2.464467e-09,0.616008,0.0,3.0,10.0
25%,6744.0,0.80764,0.03782279,0.972453,0.0004130184,0.884267,0.000997,5.0,20.0
50%,12668.0,0.843157,0.0459358,0.976843,0.0006164206,0.904273,0.001708,6.0,31.0
75%,18421.5,0.873766,0.05997504,0.982664,0.001036837,0.918722,0.002865,8.0,41.0
max,24153.0,0.999965,0.1710564,0.999965,0.01040988,0.954123,0.022026,14.0,86.0


In [15]:
DATASET_NAME = "cfq"
OUTPUTS_PATH = outputs_path(DATASET_NAME)
METRIC = "CHIA"
CRITERIA = "Easy to Learn"
CONVERGE_EPOCH = 20

idx_dict, i2s = get_scores(OUTPUTS_PATH, CONVERGE_EPOCH, STRING_TRUNCATE)
df = calculate_statistics(CONVERGE_EPOCH, idx_dict, i2s)
idx_fname = create_fname(METRIC, CRITERIA, CONVERGE_EPOCH)
subset_df = choose_subset(df, METRIC, CRITERIA, DATASET_NAME, idx_fname)

Loading files in: ../scores/cfq


  6%|▌         | 3813/64148 [00:00<00:11, 5099.40it/s]

In vocab dif: {'M5'}
Out vocab dif: {'M5'}


 12%|█▏        | 7850/64148 [00:01<00:10, 5185.60it/s]

In vocab dif: {'M6'}
Out vocab dif: {'M6'}


 31%|███▏      | 20148/64148 [00:03<00:08, 5093.40it/s]

In vocab dif: set()
Out vocab dif: {'film.film.film_art_direction_by'}
In vocab dif: set()
Out vocab dif: {'film.film.cinematography'}


 39%|███▉      | 25135/64148 [00:04<00:07, 5501.04it/s]

In vocab dif: set()
Out vocab dif: {'?x3'}


 42%|████▏     | 26868/64148 [00:05<00:07, 5173.66it/s]

In vocab dif: {'M7'}
Out vocab dif: {'M7'}


 51%|█████     | 32849/64148 [00:06<00:06, 5138.85it/s]

In vocab dif: set()
Out vocab dif: {'film.film.costume_design_by'}


 81%|████████  | 52055/64148 [00:10<00:02, 4710.51it/s]

In vocab dif: {'M8'}
Out vocab dif: {'M8'}


 90%|█████████ | 57828/64148 [00:11<00:01, 4782.27it/s]

In vocab dif: set()
Out vocab dif: {'?x4'}


 98%|█████████▊| 62965/64148 [00:12<00:00, 4616.46it/s]

In vocab dif: {'M9'}
Out vocab dif: {'M9'}


100%|██████████| 64148/64148 [00:12<00:00, 4975.29it/s]


In vocab dif: set()
Out vocab dif: {'?x5'}


  0%|          | 11/31595 [00:00<00:05, 6308.95it/s]


11 11


In [16]:
df.describe()

Unnamed: 0,Index,Confidence - Inverse PPL,Variability - Inverse PPL,Confidence - CHIA,Variability - CHIA,Confidence - BLEU,Variability - BLEU,In Len,Out Len
count,95743.0,95743.0,95743.0,95743.0,95743.0,95743.0,95743.0,95743.0,95743.0
mean,47871.0,0.596287,0.053109,0.780005,0.015477,0.66893,0.019741,13.53489,27.743208
std,27638.767749,0.109867,0.015851,0.065346,0.00474,0.066137,0.006747,4.614813,9.115588
min,0.0,0.041502,0.000765,0.426212,0.001418,0.378244,0.001241,3.0,11.0
25%,23935.5,0.524393,0.04216,0.739208,0.012072,0.626582,0.014885,10.0,21.0
50%,47871.0,0.609993,0.052956,0.79022,0.015081,0.675332,0.019377,13.0,26.0
75%,71806.5,0.68,0.063758,0.829581,0.018522,0.715932,0.024234,17.0,33.0
max,95742.0,0.871246,0.120869,0.928129,0.039429,0.859264,0.05341,29.0,95.0


In [17]:
subset_df.describe()

Unnamed: 0,Index,Confidence - Inverse PPL,Variability - Inverse PPL,Confidence - CHIA,Variability - CHIA,Confidence - BLEU,Variability - BLEU,In Len,Out Len
count,31595.0,31595.0,31595.0,31595.0,31595.0,31595.0,31595.0,31595.0,31595.0
mean,47718.322488,0.707158,0.060811,0.846431,0.015057,0.7311,0.021315,11.064346,22.944042
std,27707.412628,0.042691,0.013781,0.020224,0.004294,0.035443,0.006234,4.564937,7.525532
min,1.0,0.394109,0.010953,0.640861,0.002831,0.541024,0.002626,3.0,11.0
25%,23721.5,0.678647,0.051537,0.830015,0.011989,0.706743,0.016893,8.0,17.0
50%,47573.0,0.705504,0.060442,0.843403,0.014743,0.728782,0.020933,10.0,21.0
75%,71794.5,0.735152,0.069616,0.859496,0.017839,0.753138,0.025374,12.0,26.0
max,95742.0,0.871246,0.116628,0.928129,0.033906,0.859264,0.05341,26.0,55.0


In [18]:
DATASET_NAME = "scan_length"
OUTPUTS_PATH = outputs_path(DATASET_NAME)
METRIC = "CHIA"
CRITERIA = "Easy to Learn"
CONVERGE_EPOCH = 30

idx_dict, i2s = get_scores(OUTPUTS_PATH, CONVERGE_EPOCH, STRING_TRUNCATE)
df = calculate_statistics(CONVERGE_EPOCH, idx_dict, i2s)
idx_fname = create_fname(METRIC, CRITERIA, CONVERGE_EPOCH)
subset_df = choose_subset(df, METRIC, CRITERIA, DATASET_NAME, idx_fname)

Loading files in: ../scores/scan_length


100%|██████████| 11384/11384 [00:01<00:00, 7617.63it/s]
  0%|          | 0/5606 [00:00<?, ?it/s]


0 0


In [19]:
df.describe()

Unnamed: 0,Index,Confidence - Inverse PPL,Variability - Inverse PPL,Confidence - CHIA,Variability - CHIA,Confidence - BLEU,Variability - BLEU,In Len,Out Len
count,16990.0,16990.0,16990.0,16990.0,16990.0,16990.0,16990.0,16990.0,16990.0
mean,8494.5,0.742467,0.043591,0.819368,0.022703,0.682157,0.022302,7.034726,10.795762
std,4904.734872,0.070584,0.020354,0.056308,0.013535,0.105665,0.01109,1.191665,4.929817
min,0.0,0.310544,0.001331,0.517003,0.000121,0.23912,0.000526,1.0,1.0
25%,4247.25,0.700695,0.027413,0.786796,0.011691,0.620271,0.013616,6.0,7.0
50%,8494.5,0.741937,0.042685,0.817985,0.020885,0.689094,0.020965,7.0,10.0
75%,12741.75,0.788462,0.057986,0.858736,0.031788,0.759686,0.029718,8.0,14.0
max,16989.0,0.954591,0.130763,0.967662,0.087877,0.924969,0.081414,9.0,22.0


In [20]:
subset_df.describe()

Unnamed: 0,Index,Confidence - Inverse PPL,Variability - Inverse PPL,Confidence - CHIA,Variability - CHIA,Confidence - BLEU,Variability - BLEU,In Len,Out Len
count,5606.0,5606.0,5606.0,5606.0,5606.0,5606.0,5606.0,5606.0,5606.0
mean,11323.933999,0.815575,0.026866,0.880675,0.010984,0.769802,0.0145,7.177667,13.899394
std,4516.276229,0.040096,0.012234,0.026784,0.006448,0.0683,0.007759,0.994458,5.184779
min,127.0,0.701768,0.001331,0.841403,0.000121,0.469467,0.000526,3.0,3.0
25%,7937.25,0.787445,0.017822,0.859237,0.00635,0.727538,0.008705,7.0,10.0
50%,12403.0,0.811238,0.025969,0.877404,0.009982,0.781906,0.013173,7.0,14.0
75%,15466.75,0.837989,0.035125,0.897296,0.014873,0.820625,0.018817,8.0,19.0
max,16989.0,0.954591,0.078329,0.967662,0.040925,0.924969,0.080198,9.0,22.0


In [21]:
DATASET_NAME = "scan_jump"
OUTPUTS_PATH = outputs_path(DATASET_NAME)
METRIC = "CHIA"
CRITERIA = "Easy to Learn"
CONVERGE_EPOCH = 30

idx_dict, i2s = get_scores(OUTPUTS_PATH, CONVERGE_EPOCH, STRING_TRUNCATE)
df = calculate_statistics(CONVERGE_EPOCH, idx_dict, i2s)
idx_fname = create_fname(METRIC, CRITERIA, CONVERGE_EPOCH)
subset_df = choose_subset(df, METRIC, CRITERIA, DATASET_NAME, idx_fname)

Loading files in: ../scores/scan_jump


100%|██████████| 9829/9829 [00:00<00:00, 12910.65it/s]
  0%|          | 0/4841 [00:00<?, ?it/s]


0 0


In [22]:
df.describe()

Unnamed: 0,Index,Confidence - Inverse PPL,Variability - Inverse PPL,Confidence - CHIA,Variability - CHIA,Confidence - BLEU,Variability - BLEU,In Len,Out Len
count,14670.0,14670.0,14670.0,14670.0,14670.0,14670.0,14670.0,14670.0,14670.0
mean,7334.5,0.811877,0.03109984,0.870645,0.01570789,0.675831,0.015743,6.643763,12.727812
std,4235.00856,0.092899,0.0222453,0.069127,0.01362532,0.213221,0.011695,2.208942,9.162258
min,0.0,0.276485,3.660198e-10,0.52935,3.663849e-10,0.13165,0.0,1.0,1.0
25%,3667.25,0.751254,0.01256169,0.823701,0.00423943,0.622686,0.006194,6.0,6.0
50%,7334.5,0.798249,0.02819625,0.8616,0.0121125,0.717946,0.013982,7.0,10.0
75%,11001.75,0.869623,0.04748547,0.920733,0.02486033,0.829987,0.023542,8.0,18.0
max,14669.0,0.999992,0.139437,0.999992,0.07669706,0.969229,0.06851,9.0,48.0


In [23]:
subset_df.describe()

Unnamed: 0,Index,Confidence - Inverse PPL,Variability - Inverse PPL,Confidence - CHIA,Variability - CHIA,Confidence - BLEU,Variability - BLEU,In Len,Out Len
count,4841.0,4841.0,4841.0,4841.0,4841.0,4841.0,4841.0,4841.0,4841.0
mean,7299.800248,0.916737,0.008714789,0.949774,0.002979504,0.648876,0.004872,5.727122,17.055154
std,4225.382839,0.058532,0.0079244,0.034721,0.00288292,0.340669,0.004992,3.206995,12.650403
min,0.0,0.77128,3.660198e-10,0.902366,3.663849e-10,0.13165,0.0,1.0,1.0
25%,3619.0,0.87024,0.000128132,0.921348,0.0001196647,0.135335,0.0,1.0,1.0
50%,7344.0,0.89782,0.007814343,0.938508,0.002416191,0.858633,0.003972,7.0,19.0
75%,11006.0,0.997359,0.01416525,0.997428,0.004777632,0.891147,0.007389,8.0,27.0
max,14669.0,0.999992,0.04472808,0.999992,0.01671976,0.969229,0.03622,9.0,48.0


In [24]:
import gc
gc.collect()

69

In [25]:
DATASET_NAME = "pcfg"
OUTPUTS_PATH = outputs_path(DATASET_NAME)
METRIC = "CHIA"
CRITERIA = "Easy to Learn"
CONVERGE_EPOCH = 140

idx_dict, i2s = get_scores(OUTPUTS_PATH, CONVERGE_EPOCH, STRING_TRUNCATE)
df = calculate_statistics(CONVERGE_EPOCH, idx_dict, i2s)
idx_fname = create_fname(METRIC, CRITERIA, CONVERGE_EPOCH)
subset_df = choose_subset(df, METRIC, CRITERIA, DATASET_NAME, idx_fname)

Loading files in: ../scores/pcfg


100%|██████████| 55053/55053 [00:06<00:00, 8208.82it/s]
  0%|          | 0/27115 [00:00<?, ?it/s]


0 0


In [26]:
df.describe()

Unnamed: 0,Index,Confidence - Inverse PPL,Variability - Inverse PPL,Confidence - CHIA,Variability - CHIA,Confidence - BLEU,Variability - BLEU,In Len,Out Len
count,82168.0,82168.0,82168.0,82168.0,82168.0,82168.0,82168.0,82168.0,82168.0
mean,41083.5,0.657339,0.126839,0.712272,0.103254,0.506744,0.061316,17.699883,8.895884
std,23720.002797,0.124807,0.033832,0.102243,0.033217,0.068689,0.033513,9.41067,7.794699
min,0.0,0.096914,0.003238,0.186722,0.001352,0.175278,0.000825,3.0,2.0
25%,20541.75,0.59753,0.105946,0.658073,0.079857,0.472124,0.031961,11.0,4.0
50%,41083.5,0.68025,0.13159,0.726545,0.104896,0.520684,0.060412,16.0,7.0
75%,61625.25,0.743193,0.154147,0.784396,0.130419,0.556863,0.088102,23.0,11.0
max,82167.0,0.98121,0.194359,0.98401,0.17624,0.754878,0.149322,71.0,736.0


In [27]:
subset_df.describe()

Unnamed: 0,Index,Confidence - Inverse PPL,Variability - Inverse PPL,Confidence - CHIA,Variability - CHIA,Confidence - BLEU,Variability - BLEU,In Len,Out Len
count,27115.0,27115.0,27115.0,27115.0,27115.0,27115.0,27115.0,27115.0,27115.0
mean,41071.957994,0.775119,0.097556,0.812889,0.068986,0.474676,0.027068,13.21962,4.033598
std,23666.382964,0.03957,0.025795,0.03428,0.020761,0.069725,0.016396,7.780048,1.447622
min,0.0,0.6713,0.003238,0.76713,0.001352,0.313976,0.000825,3.0,2.0
25%,20694.0,0.743761,0.082042,0.784896,0.055014,0.432303,0.014323,7.0,3.0
50%,40928.0,0.766691,0.101947,0.804836,0.07382,0.485087,0.028109,11.0,4.0
75%,61569.0,0.799565,0.117997,0.835164,0.085151,0.521435,0.040865,17.0,5.0
max,82165.0,0.98121,0.151187,0.98401,0.11622,0.754878,0.088177,70.0,12.0


subset_df_in = set(subset_df["In"].tolist())
subset_df_out = set(subset_df["Out"].tolist())

subset_pkl = read_pickle("../scores/cogs/idx_to_sentences.pickle")

subset_pkl_in = []
subset_pkl_out = []

for i, text in subset_pkl.items():
    subset_pkl_in.append(text["in"])
    subset_pkl_out.append(text["out"])

subset_pkl_in = set(subset_pkl_in)
subset_pkl_out = set(subset_pkl_out)

subset_df_in - subset_pkl_in, len(subset_df_in), len(subset_pkl_in)