In [1]:
import os
import numpy as np
import pandas as pd
from tqdm import tqdm, trange
import torch
import matplotlib.pyplot as plt
import pickle5 as pickle
import plotly.express as px
import itertools
import argparse
import scipy.stats
import scipy.special as special
from typing import Dict, List, Any, Tuple

In [2]:
def seed_everything(seed: int):
    import random, os
    import numpy as np
    
    random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
    
seed_everything(42)

In [3]:
def read_pickle(file_path: str) -> Any:
	with open(file_path, "rb") as handle:
		return pickle.load(handle)

In [4]:
def write_pickle(file: Any, file_path: str) -> None:
    with open(file_path, 'wb') as handle:
        pickle.dump(file, handle, protocol=pickle.HIGHEST_PROTOCOL)

In [5]:
def get_word_freqs(i2s):
    in_v = Counter()
    out_v = Counter()
    
    for txt in i2s["In"]:
        tokens = txt.split()
        in_v.update(tokens)

    for txt in i2s["Out"]:
        tokens = txt.split()
        out_v.update(tokens)
    
    total = sum(in_v.values())
    for k in in_v:
        in_v[k] /= total

    total = sum(out_v.values())
    for k in out_v:
        out_v[k] /= total
        
    return in_v, out_v

In [6]:
def get_rarity(in_txt, out_txt, in_v, out_v):
    in_toks = in_txt.split()
    out_toks = out_txt.split()
    
    in_rarity, out_rarity = 0, 0
    in_len, out_len = len(in_toks), len(out_toks)
    
    for tok in in_toks:
        in_rarity += in_v[tok]
        
    in_rarity /= in_len
    
    for tok in out_toks:
        out_rarity += out_v[tok]
    
    out_rarity /= out_len
    
    return -np.log(in_rarity), -np.log(out_rarity)

In [7]:
STRING_TRUNCATE = 50

def get_scores(dir_path: str, converge_epoch: int, string_truncate: int, min_epoch: int = 3) -> Tuple[Dict[int, Dict[str, List[float]]], Dict[str, List[Any]]]:
    file_list = os.listdir(dir_path)
    idx_to_sentences: Dict[int, Dict[str, str]] = read_pickle(os.path.join(dir_path, "idx_to_sentences.pickle"))

    file_list = [f for f in file_list if f[:5] == "epoch"]
    file_list = [f for f in file_list if int(f.split("_")[0].replace("epoch", "")) > min_epoch and int(f.split("_")[0].replace("epoch", "")) < converge_epoch]
    file_list = sorted(file_list, key= lambda s: int(s.split("_")[1].replace("stepidx", "")))

    # print("Loading files in:", dir_path)
    idxs, ppls, chias, bleus = [], [], [], []
    for file_name in file_list:
        file_path = f"{dir_path}/{file_name}"
        # print(file_name)
        if "ppl" in file_path:
            ppls.extend(read_pickle(file_path).tolist())
        elif "chia" in file_path:
            chias.extend(read_pickle(file_path).tolist())
        elif "bleu" in file_path:
            bleus.extend(read_pickle(file_path))
        elif "idx" in file_path:
            idxs.extend(read_pickle(file_path).tolist())
        else:
            output_csv_name = file_path

    items = list(zip(idxs, ppls, chias, bleus))
    items = sorted(items, key=lambda i: i[0])
    idx_dict: Dict[int, Dict[str, List[float]]] = {}
    for item in items:
        if item[0] not in idx_dict:
            idx_dict[item[0]] = {"inv_ppl": [1 / item[1]], "chia": [item[2]], "bleu": [item[3]]}
        else:
            idx_dict[item[0]]["inv_ppl"].append(1 / item[1])
            idx_dict[item[0]]["chia"].append(item[2])
            idx_dict[item[0]]["bleu"].append(item[3])

    i2s = {"Index": [], "In": [], "Out": [], "In abbv.": [], "Out abbv.": [], "In Len": [], "Out Len": [], "In Rarity": [], "Out Rarity": []}

    for k, v in idx_to_sentences.items():
        i2s["Index"].append(k)
        i2s["In"].append(v["in"])
        i2s["Out"].append(v["out"])
        i2s["In abbv."].append(v["in"][:STRING_TRUNCATE])
        i2s["Out abbv."].append(v["out"][:STRING_TRUNCATE])
        i2s["In Len"].append(len(v["in"].split()))
        i2s["Out Len"].append(len(v["out"].split()))

    in_v, out_v = get_word_freqs(i2s)
    for k, v in idx_to_sentences.items():
        in_rarity, out_rarity = get_rarity(v["in"], v["out"], in_v, out_v)
        i2s["In Rarity"].append(in_rarity)
        i2s["Out Rarity"].append(out_rarity)

    return idx_dict, i2s

In [8]:
from collections import Counter

def create_vocab(df):
	in_v = Counter()
	out_v = Counter()
    
	for idx, txt in df["In"].items():
		tokens = txt.split()
		in_v.update(tokens)
         
	for idx, txt in df["Out"].items():
		tokens = txt.split()
		out_v.update(tokens)

	return set(in_v.keys()), set(out_v.keys()), in_v, out_v

In [9]:
def calculate_statistics(epoch: int, idx_dict: Dict[int, Dict[str, List[float]]], i2s: Dict[str, List[Any]]) -> pd.DataFrame:
	idx_mean_var_dict: Dict[int, Dict[str, Tuple[float, float]]] = {}
	idx_mean_var_list: List[Tuple[int, float, float, float, float, float, float, float, float]] = []
	score_names = ["inv_ppl", "chia", "bleu"]
	for idx, scores in idx_dict.items():
		scores_list = []
		for score_name in score_names:
			score_arr = np.array(scores[score_name][:epoch])
			mean = score_arr.mean()
			var = score_arr.var()
			scores_list.extend([mean, var])
		
		idx_mean_var_list.append(tuple((idx, *scores_list)))

	i2s_df = pd.DataFrame.from_dict(i2s)


	df = pd.DataFrame(idx_mean_var_list, columns =['Index', 'Confidence - Inverse PPL', 'Variability - Inverse PPL', \
													'Confidence - CHIA', 'Variability - CHIA', \
													'Confidence - BLEU', 'Variability - BLEU'])

	cartography = pd.merge(df, i2s_df, on="Index")

	return cartography

In [10]:
def load_scores(dir_path: str, plot_path: str, converge_epoch: int) -> None:
	idx_dict = get_scores(dir_path, plot_path, converge_epoch)
	
	for epoch in trange(3, converge_epoch, 2):
		df = calculate_statistics(epoch, idx_dict)

		plot_types = ["inv_ppl", "chia", "bleu"]

		for plot_type in tqdm(plot_types, "Plots"):
			plot(df, plot_path, str(epoch), plot_type)

In [11]:
def save_subset(subset_df: pd.DataFrame, ds_name: str, subset_fname: str) -> None:
    subset_idx = subset_df["Index"].tolist()
    subset_idx = [int(i) for i in subset_idx]
    subset_idx = set(subset_idx)
    
    os.makedirs(os.path.join("subsets", ds_name), exist_ok=True)
    write_pickle(subset_idx, os.path.join("subsets", ds_name, subset_fname))
    print(f"subset_idx: {len(subset_idx)}")

In [12]:
from pprint import pprint

def choose_subset(df: pd.DataFrame, metric: str, criteria: str, ds_name: str, subset_fname:str, ratio:float = 0.33, write=True) -> pd.DataFrame:
    assert metric in ["Inverse PPL", "Neg PPL", "CHIA", "BLEU"]
    assert criteria in ["Easy to Learn", "Ambiguous", "Hard to Learn", "Random"]
    
    if criteria == "Easy to Learn":
        sort_by = f"Confidence - {metric}"
        ascending = False
    elif criteria == "Ambiguous":
        sort_by = f"Variability - {metric}"
        ascending = False
    elif criteria == "Hard to Learn":
        sort_by = f"Confidence - {metric}"
        ascending = True
        
    if criteria == "Random":
        sorted_df = df.sample(frac=1)
    else:
        sorted_df = df.sort_values(by=[sort_by], ascending=ascending)

    sorted_df = sorted_df.reset_index(drop=True)
    subset_df = sorted_df.iloc[:int(len(df)*ratio),:]
    
    subset_idx = subset_df["Index"].tolist()
    subset_idx = [int(i) for i in subset_idx]
    subset_idx = set(subset_idx)
    print("start subset", len(subset_idx))
    
    all_in_v, all_out_v, _, _ = create_vocab(df)
    subset_in_v, subset_out_v, subset_in_v_counts, subset_out_v_counts = create_vocab(subset_df)

    add_ex_i = []
    remove_ex_i = []
    
    for i in trange(int(len(df)*ratio), len(df)):
        new_in, new_out = sorted_df.iloc[i, 7], sorted_df.iloc[i, 8]
        new_in_tokens, new_out_tokens = set(new_in.split()), set(new_out.split())
        
        if (new_in_tokens - subset_in_v) or (new_out_tokens - subset_out_v):
            # print(f"In vocab dif: {(new_in_tokens - subset_in_v)}")
            # print(f"Out vocab dif: {(new_out_tokens - subset_out_v)}")
            add_ex_i.append(i)
            subset_in_v = subset_in_v.union(new_in_tokens)
            subset_out_v = subset_out_v.union(new_out_tokens)
            subset_in_v_counts.update(new_in.split())
            subset_out_v_counts.update(new_out.split())
            
    in_counter = subset_in_v_counts
    out_counter = subset_out_v_counts
    
    removed_amount = 0
    
    for i in trange(0, int(len(df)*ratio)):
        print(len(remove_ex_i), len(add_ex_i))
        if len(remove_ex_i) == len(add_ex_i):
            break
            
        ex_in, ex_out = sorted_df.iloc[i, 7], sorted_df.iloc[i, 8]
        ex_in_counter, ex_out_counter = Counter(ex_in.split()), Counter(ex_out.split())
        
        upd_in_counter = in_counter - ex_in_counter
        upd_out_counter = out_counter - ex_out_counter
        
        ex_in_words, ex_out_words = list(set(ex_in.split())), list(set(ex_out.split()))
        
        remove = True
        for word in ex_in_words:
            if upd_in_counter[word] <= 1:
                remove = False
        
        for word in ex_out_words:
            if upd_out_counter[word] <= 1:
                remove = False
                
        if remove:
            in_counter = upd_in_counter
            out_counter = upd_out_counter
            remove_ex_i.append(i)
            
    subset_df = pd.concat([subset_df, df.iloc[add_ex_i]])
    subset_df = subset_df.drop(remove_ex_i, axis=0)
    subset_df = subset_df.reset_index(drop=True)
    
    assert all_in_v == set(in_counter.keys()), "The process is wrong"
    assert all_out_v == set(out_counter.keys()), "The process is wrong 2"
    
    if write:
        save_subset(subset_df, ds_name, subset_fname)
    
    print(len(remove_ex_i), len(add_ex_i))
    
    return subset_df

In [13]:
def combine_subsets(df: pd.DataFrame, subset_dfs: List[pd.DataFrame], ds_name: str, subset_fname: str) -> pd.DataFrame:
    
    combined_set = pd.concat(subset_dfs)
    combined_set = combined_set.drop_duplicates(keep="first")
    
    if len(combined_set) > (len(df) / 2):
        combined_set = combined_set.iloc[:int(len(df) / 2)]
    else:
        count = 0
        while len(combined_set) < (len(df) / 2):
            example = df.sample(n=1)
            if not example.iloc[0]["In"] in combined_set['In'].tolist():
                combined_set = combined_set.append(example)
                
    save_subset(combined_set, ds_name, subset_fname)
    
    return combined_set

In [14]:
def plot(df, plot_type="inv_ppl", color_column="_merge"):
	if plot_type == "inv_ppl":
            # print(df["_merge"].unique())
            #df["_merge"] = df["_merge"].cat.remove_categories("right_only")
            # print(df["_merge"].unique())
            #assert '_merge' in df.columns, "_merge not in columns"
            fig = px.scatter(df, x="Variability - Inverse PPL", y="Confidence - Inverse PPL", custom_data=['In abbv.', 'Out abbv.', 'In Len', 'Out Len'], color=color_column) # , range_color=[0,1]
            fig.update_layout(yaxis_range=[0, 1])
            fig.update_traces(
                hovertemplate="<br>".join([
                    "Variability - Inverse PPL: %{x}",
                    "Confidence - Inverse PPL: %{y}",
                    "In: %{customdata[0]}",
                    "Out: %{customdata[1]}",
                    "In Len: %{customdata[2]}",
                    "Out Len: %{customdata[3]}", 
                ])
            )
	elif plot_type == "chia":
		fig = px.scatter(df, x="Variability - CHIA", y="Confidence - CHIA", custom_data=['In abbv.', 'Out abbv.', 'In Len', 'Out Len'], color='Confidence - BLEU', range_color=[0,1])
		fig.update_layout(yaxis_range=[0, 1])
		fig.update_traces(
			hovertemplate="<br>".join([
				"Variability - CHIA: %{x}",
				"Confidence - CHIA: %{y}",
				"In: %{customdata[0]}",
				"Out: %{customdata[1]}",
                "In Len: %{customdata[2]}",
                "Out Len: %{customdata[3]}", 
			])
		)
	elif plot_type == "bleu":
		fig = px.scatter(df, x="Variability - BLEU", y="Confidence - BLEU", custom_data=['In abbv.', 'Out abbv.', 'In Len', 'Out Len'], color='Confidence - BLEU', range_color=[0,1])
		fig.update_layout(yaxis_range=[0, 1])
		fig.update_traces(
			hovertemplate="<br>".join([
				"Variability - BLEU: %{x}",
				"Confidence - BLEU: %{y}",
				"In: %{customdata[0]}",
				"Out: %{customdata[1]}",
                "In Len: %{customdata[2]}",
                "Out Len: %{customdata[3]}", 
			])
		)	
	fig.update_traces(marker=dict(size=3), selector=dict(mode='markers'))
	fig.update_layout(
		autosize=False,
		width=800,
		height=900
	)
	fig.show()

In [15]:
STRING_TRUNCATE = 120

mtrc2abv = {"Inverse PPL": "inv_ppl", "Neg PPL": "neg_ppl", "CHIA": "chia", "BLEU": "bleu"}
crit2abv = {"Easy to Learn": "easy_to_learn", "Ambiguous": "ambiguous", "Hard to Learn": "hard_to_learn", "Random": "random"}
create_fname = lambda m, cr, c_e: f"{mtrc2abv[m]}_{crit2abv[cr]}_{c_e}.pickle"
create_ratio_fname = lambda m, cr, c_e, rto: f"{mtrc2abv[m]}_{crit2abv[cr]}_{c_e}_{rto}.pickle"
create_comb_fname = lambda m, cr1, cr2, c_e: f"{mtrc2abv[m]}_{crit2abv[cr1]}_{crit2abv[cr2]}_{c_e}.pickle"
outputs_path = lambda x: f"../scores/{x}"

In [16]:
# i2s = read_pickle(os.path.join(outputs_path("cfq"), "idx_to_sentences.pickle"))
# i2s_htl = read_pickle(os.path.join(outputs_path("cfq"), "idx_to_sentences_htl_20.pickle"))
# len(i2s), len(i2s_htl)

In [None]:
DATASET_NAMES = ["0/cogs", "0/cfq"]
METRICS = ["Inverse PPL", "CHIA", "BLEU"]
CRITERIA = ["Easy to Learn", "Ambiguous", "Hard to Learn", "Random"]
CONVERGE_EPOCHS = [10, 20]
RATIOS = [0.33]

for DATASET_NAME, CONVERGE_EPOCH in zip(DATASET_NAMES, CONVERGE_EPOCHS):
    OUTPUTS_PATH = outputs_path(DATASET_NAME)
    idx_dict, i2s = get_scores(OUTPUTS_PATH, CONVERGE_EPOCH, STRING_TRUNCATE)
    df = calculate_statistics(CONVERGE_EPOCH, idx_dict, i2s)
    for METRIC in METRICS:
        for CRITERION in CRITERIA:
            for RATIO in RATIOS:
                idx_fname = create_ratio_fname(METRIC, CRITERION, CONVERGE_EPOCH, RATIO)
                subset_df = choose_subset(df, METRIC, CRITERION, DATASET_NAME, idx_fname, ratio=RATIO)

In [22]:
DATASET_NAMES = ["0/cogs", "0/cfq"]
METRICS = ["Inverse PPL", "CHIA", "BLEU"]
CRITERIA = ["Easy to Learn", "Ambiguous", "Hard to Learn", "Random"]
CONVERGE_EPOCHS = [10, 20]
RATIOS = [0.33]

for DATASET_NAME, CONVERGE_EPOCH in zip(DATASET_NAMES, CONVERGE_EPOCHS):
    OUTPUTS_PATH = outputs_path(DATASET_NAME)
    for METRIC in METRICS:
        for CRITERION in CRITERIA:
            for RATIO in RATIOS:
                idx_fname = create_ratio_fname(METRIC, CRITERION, CONVERGE_EPOCH, RATIO)
                subset_df = read_pickle("subsets/" + DATASET_NAME + "/" + idx_fname)
                print(len(subset_df))

7866
7920
7971
7942
7888
7962
7970
7942
7842
7964
7971
7943
31591
31591
31595
31595
31591
31593
31595
31594
31590
31592
31595
31594


In [18]:
DATASET_NAME = "42/cogs"
OUTPUTS_PATH = outputs_path(DATASET_NAME)
METRICS = ["Inverse PPL", "CHIA"] # "BLEU" 
CRITERIA = ["Easy to Learn", "Ambiguous", "Hard to Learn"] # , "Random" 
CONVERGE_EPOCH = 10
RATIOS = [0.5] # 0.33, 

for METRIC in METRICS:
    for CRITERION in CRITERIA:
        for RATIO in RATIOS:
            idx_fname = create_ratio_fname(METRIC, CRITERION, CONVERGE_EPOCH, RATIO)
            print(idx_fname, len(list(read_pickle(os.path.join("subsets", DATASET_NAME, idx_fname)))))
            


inv_ppl_easy_to_learn_10_0.5.pickle 11938
inv_ppl_ambiguous_10_0.5.pickle 12026
inv_ppl_hard_to_learn_10_0.5.pickle 12077
chia_easy_to_learn_10_0.5.pickle 11961
chia_ambiguous_10_0.5.pickle 12072
chia_hard_to_learn_10_0.5.pickle 12077


In [None]:
DATASET_NAME = "cogs"
OUTPUTS_PATH = outputs_path(DATASET_NAME)

METRICS = ["Inverse PPL"]#, "CHIA"]
CRITERIA = ["Hard to Learn", "Ambiguous", "Easy to Learn"] #, "Ambiguous",  "Random"]
COMBINED_CRITERIA = list(itertools.combinations(["Hard to Learn", "Ambiguous", "Easy to Learn"], 2))
RATIOS = [0.5]
CONVERGE_EPOCHS = [10]

for RATIO in RATIOS:
    for CONVERGE_EPOCH in CONVERGE_EPOCHS:
        idx_dict, i2s = get_scores(OUTPUTS_PATH, CONVERGE_EPOCH, STRING_TRUNCATE)
        df = calculate_statistics(CONVERGE_EPOCH, idx_dict, i2s)
        for METRIC in METRICS:
            merge_dfs = []
            for CRITERION in CRITERIA:
                idx_fname = create_ratio_fname(METRIC, CRITERION, CONVERGE_EPOCH, RATIO)
                subset_df = choose_subset(df, METRIC, CRITERION, DATASET_NAME, idx_fname, ratio=RATIO)
                merge_df = pd.merge(df, subset_df, on=["Index", "In", "Out", "In abbv.", "Out abbv.", "In Len", "Out Len", "In Rarity", "Out Rarity", \
                                                       'Confidence - Inverse PPL', 'Variability - Inverse PPL', \
                                                        'Confidence - CHIA', 'Variability - CHIA', \
                                                        'Confidence - BLEU', 'Variability - BLEU'], indicator=f"merge_{crit2abv[CRITERION]}", how='outer')
                merge_dfs.append(merge_df)

            merge_df = merge_dfs[0]
            for i in range(1, len(merge_dfs)):
                merge_df = pd.merge(merge_df, merge_dfs[i], on=["Index", "In", "Out", "In abbv.", "Out abbv.", "In Len", "Out Len", "In Rarity", "Out Rarity", \
                                                        'Confidence - Inverse PPL', 'Variability - Inverse PPL', \
                                                        'Confidence - CHIA', 'Variability - CHIA', \
                                                        'Confidence - BLEU', 'Variability - BLEU'], how='outer')
                print(merge_df.columns)

In [None]:
DATASET_NAME = "cogs"
OUTPUTS_PATH = outputs_path(DATASET_NAME)
METRIC = "CHIA"
CRITERIA = "Hard to Learn"
CONVERGE_EPOCH = 10

idx_dict, i2s = get_scores(OUTPUTS_PATH, CONVERGE_EPOCH, STRING_TRUNCATE)
df = calculate_statistics(CONVERGE_EPOCH, idx_dict, i2s)
idx_fname = create_fname(METRIC, CRITERIA, CONVERGE_EPOCH)
subset_df = choose_subset(df, METRIC, CRITERIA, DATASET_NAME, idx_fname)

In [None]:
DATASET_NAME = "42/cogs"
OUTPUTS_PATH = outputs_path(DATASET_NAME)
METRICS = ["Inverse PPL"] #["CHIA", "BLEU"]
CRITERIA = ["Hard to Learn", "Ambiguous", "Easy to Learn", "Random"]
COMBINED_CRITERIA = list(itertools.combinations(["Hard to Learn", "Ambiguous", "Easy to Learn"], 2))
RATIOS = [0.5]
CONVERGE_EPOCHS = [16]

for RATIO in RATIOS:
    for CONVERGE_EPOCH in CONVERGE_EPOCHS:
        idx_dict, i2s = get_scores(OUTPUTS_PATH, CONVERGE_EPOCH, STRING_TRUNCATE, min_epoch=6)
        df = calculate_statistics(CONVERGE_EPOCH, idx_dict, i2s)
        for METRIC in METRICS:
            for CRITERION in CRITERIA:
                idx_fname = create_ratio_fname(METRIC, CRITERION, CONVERGE_EPOCH, RATIO)
                subset_df = choose_subset(df, METRIC, CRITERION, DATASET_NAME, idx_fname, ratio=RATIO)

In [None]:
df.describe()

In [None]:
subset_df.describe()

In [None]:
DATASET_NAME = "cfq"
OUTPUTS_PATH = outputs_path(DATASET_NAME)

METRICS = ["Inverse PPL"]#, "CHIA"]
CRITERIA = ["Hard to Learn", "Ambiguous", "Easy to Learn"] #, "Ambiguous",  "Random"]
COMBINED_CRITERIA = list(itertools.combinations(["Hard to Learn", "Ambiguous", "Easy to Learn"], 2))
RATIOS = [0.33, 0.5]
CONVERGE_EPOCHS = [20]

for RATIO in RATIOS:
    for CONVERGE_EPOCH in CONVERGE_EPOCHS:
        idx_dict, i2s = get_scores(OUTPUTS_PATH, CONVERGE_EPOCH, STRING_TRUNCATE)
        df = calculate_statistics(CONVERGE_EPOCH, idx_dict, i2s)
        for METRIC in METRICS:
            merge_dfs = []
            for CRITERION in CRITERIA:
                idx_fname = create_ratio_fname(METRIC, CRITERION, CONVERGE_EPOCH, RATIO)
                subset_df = choose_subset(df, METRIC, CRITERION, DATASET_NAME, idx_fname, ratio=RATIO)
                merge_df = pd.merge(df, subset_df, on=["Index", "In", "Out", "In abbv.", "Out abbv.", "In Len", "Out Len", "In Rarity", "Out Rarity", \
                                                       'Confidence - Inverse PPL', 'Variability - Inverse PPL', \
                                                        'Confidence - CHIA', 'Variability - CHIA', \
                                                        'Confidence - BLEU', 'Variability - BLEU'], indicator=f"merge_{crit2abv[CRITERION]}", how='outer')
                # assert len(df) == len(merge_df), f"Original and merged dataset sizes do not match!: Original size: {len(df)}, Merged size: {len(merge_df)}"
                # plot(merge_df, plot_type="inv_ppl")
                merge_dfs.append(merge_df)
                # desc_df = subset_df.describe()
                # print(f"{METRIC} - {CRITERION}: ", f'In Len: {desc_df["In Len"][1]:.2f}, Out Len: {desc_df["Out Len"][1]:.2f}, In Rarity: {desc_df["In Rarity"][1]:.2f}, Out Rarity: {desc_df["Out Rarity"][1]:.2f}')

            merge_df = merge_dfs[0]
            for i in range(1, len(merge_dfs)):
                merge_df = pd.merge(merge_df, merge_dfs[i], on=["Index", "In", "Out", "In abbv.", "Out abbv.", "In Len", "Out Len", "In Rarity", "Out Rarity", \
                                                        'Confidence - Inverse PPL', 'Variability - Inverse PPL', \
                                                        'Confidence - CHIA', 'Variability - CHIA', \
                                                        'Confidence - BLEU', 'Variability - BLEU'], how='outer')
                #merge_subset_df["_merge"] = merge_subset_df["_merge"].cat.remove_categories("right_only")
                print(merge_df.columns)

            merge_df["combined"] = merge_df["merge_ambiguous"].astype(str) + merge_df["merge_easy_to_learn"].astype(str) + merge_df["merge_hard_to_learn"].astype(str)
            plot(merge_df, plot_type="inv_ppl", color_column="combined")

In [None]:
DATASET_NAME = "cfq"
OUTPUTS_PATH = outputs_path(DATASET_NAME)
METRICS = ["BLEU"] #["Inverse PPL", "CHIA"]
CRITERIA = ["Hard to Learn", "Ambiguous", "Easy to Learn"] #, "Ambiguous",  "Random"]
COMBINED_CRITERIA = list(itertools.combinations(["Hard to Learn", "Ambiguous", "Easy to Learn"], 2))
RATIOS = [0.5]
CONVERGE_EPOCHS = [20]

for RATIO in RATIOS:
    for CONVERGE_EPOCH in CONVERGE_EPOCHS:
        idx_dict, i2s = get_scores(OUTPUTS_PATH, CONVERGE_EPOCH, STRING_TRUNCATE, min_epoch=3)
        df = calculate_statistics(CONVERGE_EPOCH, idx_dict, i2s)
        for METRIC in METRICS:
            for CRITERION in CRITERIA:
                idx_fname = create_ratio_fname(METRIC, CRITERION, CONVERGE_EPOCH, RATIO)
                subset_df = choose_subset(df, METRIC, CRITERION, DATASET_NAME, idx_fname, ratio=RATIO)

In [None]:
merge_df["combined"].unique()

In [16]:
DATASET_NAME = "42/cogs"
OUTPUTS_PATH = outputs_path(DATASET_NAME)

METRICS = ["Inverse PPL", "CHIA", "BLEU"]
CRITERIA = ["Hard to Learn", "Easy to Learn", "Ambiguous"]
COMBINED_CRITERIA = list(itertools.combinations(["Hard to Learn", "Ambiguous", "Easy to Learn"], 2))
CONVERGE_EPOCH = 10

for METRIC in METRICS:
    for CRITERIA in COMBINED_CRITERIA:
        subset_dfs = []
        for CRITERION in CRITERIA:
            idx_dict, i2s = get_scores(OUTPUTS_PATH, CONVERGE_EPOCH, STRING_TRUNCATE)
            df = calculate_statistics(CONVERGE_EPOCH, idx_dict, i2s)
            idx_fname = create_fname(METRIC, CRITERION, CONVERGE_EPOCH)
            subset_df = choose_subset(df, METRIC, CRITERION, DATASET_NAME, idx_fname, write=False)
            subset_dfs.append(subset_df)
        idx_fname = create_comb_fname(METRIC, CRITERIA[0], CRITERIA[1], CONVERGE_EPOCH)
        combined_set_df = combine_subsets(df, subset_dfs, DATASET_NAME, idx_fname)
        
        print(len(combined_set_df) / len(df))
        desc_df = subset_df.describe()
        #print(METRIC, CRITERION, f'In Len Mean: {desc_df["In Len"][1]}', f'Out Len Mean: {desc_df["Out Len"][1]}', f'In Rar Mean: {desc_df["In Rarity"][1]}', f'Out Rar Mean: {desc_df["Out Rarity"][1]}')

In [18]:
DATASET_NAME = "42/cogs"
OUTPUTS_PATH = outputs_path(DATASET_NAME)

METRICS = ["Inverse PPL", "CHIA", "BLEU"]
CRITERIA = ["Hard to Learn", "Easy to Learn", "Ambiguous"]
COMBINED_CRITERIA = list(itertools.combinations(["Hard to Learn", "Ambiguous", "Easy to Learn"], 2))
CONVERGE_EPOCH = 10

for METRIC in METRICS:
    for CRITERIA in COMBINED_CRITERIA:
            idx_fname = create_comb_fname(METRIC, CRITERIA[0], CRITERIA[1], CONVERGE_EPOCH)
            print(idx_fname, len(list(read_pickle(os.path.join("subsets", DATASET_NAME, idx_fname)))))


# for METRIC in METRICS:
#    for CRITERION in CRITERIA:
#        for RATIO in RATIOS:
#            idx_fname = create_ratio_fname(METRIC, CRITERION, CONVERGE_EPOCH, RATIO)
#            print(idx_fname, len(list(read_pickle(os.path.join("subsets", DATASET_NAME, idx_fname)))))

inv_ppl_hard_to_learn_ambiguous_10.pickle 12078
inv_ppl_hard_to_learn_easy_to_learn_10.pickle 12077
inv_ppl_ambiguous_easy_to_learn_10.pickle 12077
chia_hard_to_learn_ambiguous_10.pickle 12078
chia_hard_to_learn_easy_to_learn_10.pickle 12077
chia_ambiguous_easy_to_learn_10.pickle 12077
bleu_hard_to_learn_ambiguous_10.pickle 12078
bleu_hard_to_learn_easy_to_learn_10.pickle 12077
bleu_ambiguous_easy_to_learn_10.pickle 12077


In [17]:
DATASET_NAME = "42/cfq"
OUTPUTS_PATH = outputs_path(DATASET_NAME)

METRICS = ["Inverse PPL", "CHIA", "BLEU"]
CRITERIA = ["Hard to Learn", "Easy to Learn", "Ambiguous"]
COMBINED_CRITERIA = list(itertools.combinations(["Hard to Learn", "Ambiguous", "Easy to Learn"], 2))
CONVERGE_EPOCH = 20

for METRIC in METRICS:
    for CRITERIA in COMBINED_CRITERIA:
        subset_dfs = []
        for CRITERION in CRITERIA:
            idx_dict, i2s = get_scores(OUTPUTS_PATH, CONVERGE_EPOCH, STRING_TRUNCATE)
            df = calculate_statistics(CONVERGE_EPOCH, idx_dict, i2s)
            idx_fname = create_fname(METRIC, CRITERION, CONVERGE_EPOCH)
            subset_df = choose_subset(df, METRIC, CRITERION, DATASET_NAME, idx_fname, write=False)
            subset_dfs.append(subset_df)
        idx_fname = create_comb_fname(METRIC, CRITERIA[0], CRITERIA[1], CONVERGE_EPOCH)
        combined_set_df = combine_subsets(df, subset_dfs, DATASET_NAME, idx_fname)
        
        print(len(combined_set_df) / len(df))
        desc_df = subset_df.describe()
        #print(METRIC, CRITERION, f'In Len Mean: {desc_df["In Len"][1]}', f'Out Len Mean: {desc_df["Out Len"][1]}', f'In Rar Mean: {desc_df["In Rarity"][1]}', f'Out Rar Mean: {desc_df["Out Rarity"][1]}')

start subset 31595


100%|██████████| 64148/64148 [00:02<00:00, 22948.48it/s]
  0%|          | 0/31595 [00:00<?, ?it/s]


0 0
0 0
start subset 31595


100%|██████████| 64148/64148 [00:02<00:00, 22210.00it/s]
  0%|          | 7/31595 [00:00<00:05, 5681.14it/s]


0 7
1 7
2 7
3 7
4 7
5 7
6 7
7 7
7 7
subset_idx: 47871
0.4999947776860972
start subset 31595


100%|██████████| 64148/64148 [00:02<00:00, 22664.66it/s]
  0%|          | 0/31595 [00:00<?, ?it/s]


0 0
0 0
start subset 31595


100%|██████████| 64148/64148 [00:02<00:00, 21621.29it/s]
  0%|          | 10/31595 [00:00<00:05, 5838.40it/s]


0 10
1 10
2 10
3 10
4 10
5 10
6 10
7 10
8 10
9 10
10 10
10 10
subset_idx: 47871
0.4999947776860972
start subset 31595


100%|██████████| 64148/64148 [00:02<00:00, 21728.13it/s]
  0%|          | 7/31595 [00:00<00:05, 5346.95it/s]


0 7
1 7
2 7
3 7
4 7
5 7
6 7
7 7
7 7


ERROR:root:Internal Python error in the inspect module.
Below is the traceback from this internal error.



Traceback (most recent call last):
  File "/kuacc/users/oince22/.conda/envs/plotly-2/lib/python3.7/site-packages/IPython/core/interactiveshell.py", line 3457, in run_code
    exec(code_obj, self.user_global_ns, self.user_ns)
  File "/tmp/ipykernel_440432/1737004244.py", line 13, in <module>
    idx_dict, i2s = get_scores(OUTPUTS_PATH, CONVERGE_EPOCH, STRING_TRUNCATE)
  File "/tmp/ipykernel_440432/884308723.py", line 51, in get_scores
    in_rarity, out_rarity = get_rarity(v["in"], v["out"], in_v, out_v)
  File "/tmp/ipykernel_440432/1708361665.py", line 18, in get_rarity
    return -np.log(in_rarity), -np.log(out_rarity)
KeyboardInterrupt

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "/kuacc/users/oince22/.conda/envs/plotly-2/lib/python3.7/site-packages/IPython/core/interactiveshell.py", line 2077, in showtraceback
    stb = value._render_traceback_()
AttributeError: 'KeyboardInterrupt' object has no attribute '_render_t

TypeError: object of type 'NoneType' has no len()

In [None]:
subset_df.describe()

In [None]:
df.describe()

In [None]:
DATASET_NAME = "scan_length"
OUTPUTS_PATH = outputs_path(DATASET_NAME)
METRIC = "CHIA"
CRITERIA = "Hard to Learn"
CONVERGE_EPOCH = 30

idx_dict, i2s = get_scores(OUTPUTS_PATH, CONVERGE_EPOCH, STRING_TRUNCATE)
df = calculate_statistics(CONVERGE_EPOCH, idx_dict, i2s)
idx_fname = create_fname(METRIC, CRITERIA, CONVERGE_EPOCH)
subset_df = choose_subset(df, METRIC, CRITERIA, DATASET_NAME, idx_fname)

In [None]:
df.describe()

In [None]:
subset_df.describe()

In [None]:
DATASET_NAME = "scan_jump"
OUTPUTS_PATH = outputs_path(DATASET_NAME)
METRIC = "CHIA"
CRITERIA = "Hard to Learn"
CONVERGE_EPOCH = 30

idx_dict, i2s = get_scores(OUTPUTS_PATH, CONVERGE_EPOCH, STRING_TRUNCATE)
df = calculate_statistics(CONVERGE_EPOCH, idx_dict, i2s)
idx_fname = create_fname(METRIC, CRITERIA, CONVERGE_EPOCH)
subset_df = choose_subset(df, METRIC, CRITERIA, DATASET_NAME, idx_fname)

In [None]:
df.describe()

In [None]:
subset_df.describe()

In [None]:
import gc
gc.collect()

In [None]:
DATASET_NAME = "pcfg"
OUTPUTS_PATH = outputs_path(DATASET_NAME)
METRIC = "CHIA"
CRITERIA = "Hard to Learn"
CONVERGE_EPOCH = 140

idx_dict, i2s = get_scores(OUTPUTS_PATH, CONVERGE_EPOCH, STRING_TRUNCATE)
df = calculate_statistics(CONVERGE_EPOCH, idx_dict, i2s)
idx_fname = create_fname(METRIC, CRITERIA, CONVERGE_EPOCH)
subset_df = choose_subset(df, METRIC, CRITERIA, DATASET_NAME, idx_fname)

In [None]:
df.describe()

In [None]:
subset_df.describe()

subset_df_in = set(subset_df["In"].tolist())
subset_df_out = set(subset_df["Out"].tolist())

subset_pkl = read_pickle("../scores/cogs/idx_to_sentences.pickle")

subset_pkl_in = []
subset_pkl_out = []

for i, text in subset_pkl.items():
    subset_pkl_in.append(text["in"])
    subset_pkl_out.append(text["out"])

subset_pkl_in = set(subset_pkl_in)
subset_pkl_out = set(subset_pkl_out)

subset_df_in - subset_pkl_in, len(subset_df_in), len(subset_pkl_in)