In [13]:
import os
import numpy as np
import pandas as pd
from tqdm import tqdm, trange
import torch
import matplotlib.pyplot as plt
import pickle5 as pickle
import plotly.express as px
import itertools
import argparse
import scipy.stats
import scipy.special as special
from typing import Dict, List, Any, Tuple

In [14]:
def seed_everything(seed: int):
    """Make the Python-level random sources used in this notebook repeatable.

    Seeds the standard-library ``random`` module and NumPy's global RNG, and
    exports ``PYTHONHASHSEED`` with the same value. Nothing else is seeded
    here (in particular, no torch generator is touched).

    Args:
        seed: Integer seed applied to every source listed above.
    """
    import os
    import random

    import numpy as np

    os.environ['PYTHONHASHSEED'] = str(seed)
    for apply_seed in (random.seed, np.random.seed):
        apply_seed(seed)


seed_everything(42)

In [15]:
def read_pickle(file_path: str) -> Any:
    """Load and return the object stored in the pickle file at ``file_path``.

    Args:
        file_path: Path of the pickle file to read.

    Returns:
        Whatever object the file deserializes to.
    """
    with open(file_path, mode="rb") as pickle_file:
        loaded = pickle.load(pickle_file)
    return loaded

In [33]:
# Spot-check: load one previously written "inlens_*" pickle (cfq, BLEU, ambiguous, epoch 20).
# NOTE(review): absolute, user-specific cluster path — this cell will not run on another machine.
l = read_pickle("/scratch/users/oince22/hpc_run/data-cartography-for-compositionality/code/dataset_gen/transformer_generalization/cartography/subsets/curriculum/cfq/inlens_bleu_ambiguous_20.pickle")

In [34]:
len(l)

95743

In [None]:
l

In [16]:
def write_pickle(file: Any, file_path: str) -> None:
    """Serialize ``file`` to ``file_path`` using the highest pickle protocol.

    Args:
        file: The object to pickle (despite the name, not a file handle).
        file_path: Destination path; an existing file is overwritten.
    """
    with open(file_path, mode='wb') as sink:
        pickler = pickle.Pickler(sink, protocol=pickle.HIGHEST_PROTOCOL)
        pickler.dump(file)

In [17]:
def get_word_freqs(i2s):
    """Compute relative token frequencies for the input and output sentences.

    Sentences are tokenized with ``str.split()``; every token count is divided
    by the total number of tokens on its side (input or output).

    Args:
        i2s: Mapping with an ``"In"`` and an ``"Out"`` entry, each an iterable
            of sentence strings.

    Returns:
        ``(in_v, out_v)``: two ``Counter`` objects mapping token -> relative
        frequency for the input and output side respectively.
    """
    def _relative_freqs(sentences):
        # Count whitespace-separated tokens, then normalize the counts in place.
        counts = Counter()
        for sentence in sentences:
            counts.update(sentence.split())
        n_tokens = sum(counts.values())
        for token, occurrences in list(counts.items()):
            counts[token] = occurrences / n_tokens
        return counts

    return _relative_freqs(i2s["In"]), _relative_freqs(i2s["Out"])

In [18]:
def get_rarity(in_txt, out_txt, in_v, out_v):
    """Score how rare the tokens of an input/output sentence pair are.

    For each side the score is ``-log(mean token frequency)``, so sentences
    made of infrequent tokens get larger values.

    Args:
        in_txt: Input sentence; tokenized with ``str.split()``.
        out_txt: Output sentence; tokenized with ``str.split()``.
        in_v: Mapping token -> frequency for the input side.
        out_v: Mapping token -> frequency for the output side.

    Returns:
        ``(in_rarity, out_rarity)`` as computed by ``-np.log``.
    """
    def _neg_log_mean_freq(text, freqs):
        tokens = text.split()
        accumulated = 0
        # Plain left-to-right accumulation keeps the floating-point result stable.
        for token in tokens:
            accumulated += freqs[token]
        return -np.log(accumulated / len(tokens))

    return _neg_log_mean_freq(in_txt, in_v), _neg_log_mean_freq(out_txt, out_v)

In [19]:
STRING_TRUNCATE = 50

def get_scores(dir_path: str, converge_epoch: int, string_truncate: int, min_epoch: int = 3) -> Tuple[Dict[int, Dict[str, List[float]]], Dict[str, List[Any]]]:
    """Load per-example training scores and sentence metadata from ``dir_path``.

    Only files whose name starts with ``"epoch"`` are considered. The epoch
    number is parsed from the first ``_``-separated field (``epoch<N>``) and
    the step index from the second (``stepidx<M>``). Files are kept when
    ``min_epoch < N < converge_epoch`` and are read in increasing step order.

    Args:
        dir_path: Directory holding the score pickles and
            ``idx_to_sentences.pickle``.
        converge_epoch: Exclusive upper bound on the epochs to load.
        string_truncate: Maximum number of characters kept in the
            ``"In abbv."`` / ``"Out abbv."`` columns.
        min_epoch: Exclusive lower bound on the epochs to load.

    Returns:
        ``(idx_dict, i2s)`` where ``idx_dict`` maps example index ->
        ``{"inv_ppl": [...], "chia": [...], "bleu": [...]}`` (one value per
        loaded record, in step order) and ``i2s`` is a column-oriented dict
        with the sentence text, abbreviations, lengths and rarity scores.
    """
    def _epoch_of(file_name: str) -> int:
        return int(file_name.split("_")[0].replace("epoch", ""))

    def _step_of(file_name: str) -> int:
        return int(file_name.split("_")[1].replace("stepidx", ""))

    idx_to_sentences: Dict[int, Dict[str, str]] = read_pickle(os.path.join(dir_path, "idx_to_sentences.pickle"))

    file_list = [f for f in os.listdir(dir_path) if f.startswith("epoch")]
    file_list = [f for f in file_list if min_epoch < _epoch_of(f) < converge_epoch]
    file_list.sort(key=_step_of)

    idxs, ppls, chias, bleus = [], [], [], []
    print("Read pickles")
    for file_name in tqdm(file_list):
        file_path = os.path.join(dir_path, file_name)
        # Match on the file name only, so that a directory whose path happens
        # to contain "ppl"/"chia"/"bleu" cannot misroute every file.
        # "idx" is tested last because "stepidx" appears in every file name.
        if "ppl" in file_name:
            ppls.extend(read_pickle(file_path).tolist())
        elif "chia" in file_name:
            chias.extend(read_pickle(file_path).tolist())
        elif "bleu" in file_name:
            bleus.extend(read_pickle(file_path))
        elif "idx" in file_name:
            idxs.extend(read_pickle(file_path).tolist())

    # sorted() is stable, so records of one example stay in step order.
    # NOTE(review): zip() silently truncates if the four lists differ in length.
    items = sorted(zip(idxs, ppls, chias, bleus), key=lambda item: item[0])
    idx_dict: Dict[int, Dict[str, List[float]]] = {}

    print("Process items")
    for idx, ppl, chia, bleu in tqdm(items):
        entry = idx_dict.setdefault(idx, {"inv_ppl": [], "chia": [], "bleu": []})
        entry["inv_ppl"].append(1 / ppl)
        entry["chia"].append(chia)
        entry["bleu"].append(bleu)

    i2s = {"Index": [], "In": [], "Out": [], "In abbv.": [], "Out abbv.": [], "In Len": [], "Out Len": [], "In Rarity": [], "Out Rarity": []}

    print("Create items list")
    for k, v in tqdm(idx_to_sentences.items()):
        i2s["Index"].append(k)
        i2s["In"].append(v["in"])
        i2s["Out"].append(v["out"])
        # Honour the caller-supplied limit instead of the module-level constant.
        i2s["In abbv."].append(v["in"][:string_truncate])
        i2s["Out abbv."].append(v["out"][:string_truncate])
        i2s["In Len"].append(len(v["in"].split()))
        i2s["Out Len"].append(len(v["out"].split()))

    # Frequencies need the complete sentence lists, hence the second pass below.
    in_v, out_v = get_word_freqs(i2s)

    print("Process item rarity")
    for k, v in tqdm(idx_to_sentences.items()):
        in_rarity, out_rarity = get_rarity(v["in"], v["out"], in_v, out_v)
        i2s["In Rarity"].append(in_rarity)
        i2s["Out Rarity"].append(out_rarity)

    return idx_dict, i2s

In [20]:
from collections import Counter

def create_vocab(df):
    """Build token vocabularies for the ``"In"`` and ``"Out"`` columns of ``df``.

    Args:
        df: DataFrame with string columns ``"In"`` and ``"Out"``; each cell is
            tokenized with ``str.split()``.

    Returns:
        ``(in_vocab, out_vocab, in_counts, out_counts)``: the two token sets
        followed by the ``Counter`` objects holding raw token counts.
    """
    def _count_tokens(column):
        counts = Counter()
        for _, sentence in column.items():
            counts.update(sentence.split())
        return counts

    in_counts = _count_tokens(df["In"])
    out_counts = _count_tokens(df["Out"])
    return set(in_counts), set(out_counts), in_counts, out_counts

In [21]:
def calculate_statistics(epoch: int, idx_dict: Dict[int, Dict[str, List[float]]], i2s: Dict[str, List[Any]]) -> pd.DataFrame:
    """Summarize each example's score history and join it with its sentence data.

    For every example and every score (``inv_ppl``, ``chia``, ``bleu``) the
    mean ("Confidence") and variance ("Variability") of the first ``epoch``
    recorded values are computed.

    Args:
        epoch: Number of leading records of each score list to include.
        idx_dict: Example index -> score name -> list of recorded values.
        i2s: Column-oriented sentence metadata containing an ``"Index"`` column.

    Returns:
        One row per example index present in both inputs: the six statistic
        columns followed by the columns of ``i2s``.
    """
    score_names = ("inv_ppl", "chia", "bleu")
    stat_columns = ['Index',
                    'Confidence - Inverse PPL', 'Variability - Inverse PPL',
                    'Confidence - CHIA', 'Variability - CHIA',
                    'Confidence - BLEU', 'Variability - BLEU']

    print("Calculate statistics")
    rows = []
    for example_idx, history in tqdm(idx_dict.items()):
        row = [example_idx]
        for name in score_names:
            window = np.array(history[name][:epoch])
            row += [window.mean(), window.var()]
        rows.append(tuple(row))

    stats_df = pd.DataFrame(rows, columns=stat_columns)
    sentences_df = pd.DataFrame.from_dict(i2s)

    return pd.merge(stats_df, sentences_df, on="Index")

In [22]:
def load_scores(dir_path: str, plot_path: str, converge_epoch: int) -> None:
    """Plot the cartography maps for a growing window of training records.

    Scores are loaded once; then for every odd window size from 3 up to (but
    excluding) ``converge_epoch`` the statistics are recomputed and the three
    map types are displayed.

    Args:
        dir_path: Directory containing the score pickles (see ``get_scores``).
        plot_path: Kept for interface compatibility. It is currently unused
            because ``plot`` only displays figures and does not write them.
        converge_epoch: Exclusive upper bound on the epochs to load and on
            the window sizes to plot.
    """
    # get_scores returns (idx_dict, i2s); both are needed by calculate_statistics.
    idx_dict, i2s = get_scores(dir_path, converge_epoch, STRING_TRUNCATE)

    for epoch in trange(3, converge_epoch, 2):
        df = calculate_statistics(epoch, idx_dict, i2s)

        for plot_type in tqdm(["inv_ppl", "chia", "bleu"], "Plots"):
            # The statistics frame has no "_merge" column (plot's default
            # colour column), so colour by BLEU confidence like the other maps.
            plot(df, plot_type=plot_type, color_column="Confidence - BLEU")

In [29]:
def save_subset(subset_df: pd.DataFrame, sort_by: str, ds_name: str, subset_fname: str) -> None:
    """Write the columns of a selected subset as pickles under ``subsets/curriculum/<ds_name>``.

    Four files are written: ``<subset_fname>`` (example indices),
    ``diffs_<subset_fname>`` (values of the ``sort_by`` column),
    ``inlens_<subset_fname>`` and ``outlens_<subset_fname>`` (sentence lengths).
    All values are converted to built-in ``int``/``float`` before pickling.

    Args:
        subset_df: Frame with columns ``"Index"``, ``"In Len"``, ``"Out Len"``
            and the column named by ``sort_by``.
        sort_by: Name of the score column stored in the ``diffs_`` file.
        ds_name: Dataset name, used as the output sub-directory.
        subset_fname: Base file name of the pickles.
    """
    # Pull every column before converting anything, so a missing column fails first.
    raw_idx, raw_diffs, raw_inlens, raw_outlens = (
        subset_df[column].tolist() for column in ("Index", sort_by, "In Len", "Out Len")
    )

    subset_idx = list(map(int, raw_idx))
    subset_diffs = list(map(float, raw_diffs))
    subset_inlens = list(map(int, raw_inlens))
    subset_outlens = list(map(int, raw_outlens))

    print(f"subset_inlens: {len(subset_inlens)}")
    print(f"subset_outlens: {len(subset_outlens)}")

    out_dir = os.path.join("subsets", "curriculum", ds_name)
    os.makedirs(out_dir, exist_ok=True)

    outputs = [
        (subset_fname, subset_idx),
        (f"diffs_{subset_fname}", subset_diffs),
        (f"inlens_{subset_fname}", subset_inlens),
        (f"outlens_{subset_fname}", subset_outlens),
    ]
    for fname, values in outputs:
        write_pickle(values, os.path.join(out_dir, fname))

    print(f"subset_idx: {len(subset_idx)}, subset_diffs: {len(subset_diffs)}")

In [31]:
from pprint import pprint

def choose_subset(df: pd.DataFrame, metric: str, criteria: str, ds_name: str, subset_fname:str, write=True, ratio=None) -> pd.DataFrame:
    """Order ``df`` by a cartography criterion and optionally save the result.

    Criteria:
        * ``"Easy to Learn"``: highest ``Confidence - <metric>`` first.
        * ``"Ambiguous"``: highest ``Variability - <metric>`` first.
        * ``"Hard to Learn"``: lowest ``Confidence - <metric>`` first.
        * ``"Random"``: rows shuffled with ``DataFrame.sample(frac=1)``.

    Args:
        df: Frame containing ``"Index"`` and the statistic columns of ``metric``.
        metric: One of ``"Inverse PPL"``, ``"Neg PPL"``, ``"CHIA"``, ``"BLEU"``.
        criteria: One of the four criteria listed above.
        ds_name: Dataset name forwarded to ``save_subset``.
        subset_fname: File name forwarded to ``save_subset``.
        write: When true, the ordered frame is passed to ``save_subset``.
        ratio: Optional fraction in ``(0, 1]``. When given, only the first
            ``int(len(df) * ratio)`` rows of the ordering are kept. ``None``
            (the default) keeps every row.

    Returns:
        The ordered (and possibly truncated) frame with a fresh 0..n-1 index.

    Raises:
        AssertionError: If ``metric`` or ``criteria`` is not recognized.
        ValueError: If ``ratio`` is given but lies outside ``(0, 1]``.
    """
    assert metric in ["Inverse PPL", "Neg PPL", "CHIA", "BLEU"]
    assert criteria in ["Easy to Learn", "Ambiguous", "Hard to Learn", "Random"]
    if ratio is not None and not 0 < ratio <= 1:
        raise ValueError(f"ratio must be in (0, 1], got {ratio!r}")

    if criteria == "Ambiguous":
        sort_by = f"Variability - {metric}"
        ascending = False
    else:
        # "Random" has no ordering column of its own. It still needs a score
        # column for save_subset, so the confidence column is recorded there;
        # previously sort_by was unbound and saving a random subset crashed.
        sort_by = f"Confidence - {metric}"
        ascending = criteria == "Hard to Learn"

    if criteria == "Random":
        sorted_df = df.sample(frac=1)
    else:
        sorted_df = df.sort_values(by=[sort_by], ascending=ascending)

    sorted_df = sorted_df.reset_index(drop=True)

    if ratio is not None:
        sorted_df = sorted_df.iloc[:int(len(sorted_df) * ratio)]

    if write:
        save_subset(sorted_df, sort_by, ds_name, subset_fname)

    return sorted_df

In [25]:
def combine_subsets(df: pd.DataFrame, subset_dfs: List[pd.DataFrame], ds_name: str, subset_fname: str, sort_by: str = "Confidence - Inverse PPL") -> pd.DataFrame:
    """Merge several subsets into one set of about half the size of ``df`` and save it.

    The subsets are concatenated and exact duplicate rows are dropped. If the
    result has more than ``len(df) / 2`` rows it is cut to the first
    ``int(len(df) / 2)``; otherwise it is topped up with randomly drawn rows
    of ``df`` whose ``"In"`` sentence is not in the set yet, until it holds at
    least ``len(df) / 2`` rows or no unseen inputs remain.

    Args:
        df: The full frame the subsets were taken from.
        subset_dfs: Subsets to combine, in priority order.
        ds_name: Dataset name forwarded to ``save_subset``.
        subset_fname: File name forwarded to ``save_subset``.
        sort_by: Score column stored by ``save_subset`` in the ``diffs_``
            file. ``save_subset`` requires this argument; the previous call
            omitted it and raised ``TypeError``.

    Returns:
        The combined frame that was saved.
    """
    combined_set = pd.concat(subset_dfs).drop_duplicates(keep="first")

    half = len(df) / 2
    if len(combined_set) > half:
        combined_set = combined_set.iloc[:int(half)]
    else:
        # Smallest row count that is >= len(df) / 2, i.e. ceil(len(df) / 2).
        missing = -(-len(df) // 2) - len(combined_set)
        if missing > 0:
            # Draw all filler rows in one go, restricted to unseen, distinct
            # inputs. This replaces a one-row-at-a-time loop built on
            # DataFrame.append, which no longer exists in pandas 2.x and
            # never terminated when too few unseen inputs were available.
            seen_inputs = set(combined_set["In"])
            candidates = df[~df["In"].isin(seen_inputs)].drop_duplicates(subset="In")
            filler = candidates.sample(n=min(missing, len(candidates)))
            combined_set = pd.concat([combined_set, filler])

    save_subset(combined_set, sort_by, ds_name, subset_fname)

    return combined_set

In [26]:
def plot(df, plot_type="inv_ppl", color_column="_merge"):
    """Show an interactive cartography scatter plot (variability vs. confidence).

    Args:
        df: Frame holding the ``"Variability - <metric>"`` and
            ``"Confidence - <metric>"`` columns of the chosen metric plus
            ``"In abbv."``, ``"Out abbv."``, ``"In Len"`` and ``"Out Len"``
            (shown on hover).
        plot_type: ``"inv_ppl"``, ``"chia"`` or ``"bleu"``.
        color_column: Column used to colour the points. Only honoured for
            ``plot_type="inv_ppl"``; the other two map types are always
            coloured by ``"Confidence - BLEU"`` on a fixed 0..1 scale.

    Raises:
        ValueError: If ``plot_type`` is not one of the supported values
            (previously this surfaced as an unbound-variable error).
    """
    metric_labels = {"inv_ppl": "Inverse PPL", "chia": "CHIA", "bleu": "BLEU"}
    if plot_type not in metric_labels:
        raise ValueError(
            f"Unknown plot_type {plot_type!r}; expected one of {sorted(metric_labels)}"
        )

    label = metric_labels[plot_type]
    x_col = f"Variability - {label}"
    y_col = f"Confidence - {label}"

    if plot_type == "inv_ppl":
        color_kwargs = {"color": color_column}
    else:
        color_kwargs = {"color": "Confidence - BLEU", "range_color": [0, 1]}

    fig = px.scatter(
        df,
        x=x_col,
        y=y_col,
        custom_data=['In abbv.', 'Out abbv.', 'In Len', 'Out Len'],
        **color_kwargs,
    )
    fig.update_layout(yaxis_range=[0, 1])
    fig.update_traces(
        hovertemplate="<br>".join([
            f"{x_col}: %{{x}}",
            f"{y_col}: %{{y}}",
            "In: %{customdata[0]}",
            "Out: %{customdata[1]}",
            "In Len: %{customdata[2]}",
            "Out Len: %{customdata[3]}",
        ])
    )
    fig.update_traces(marker=dict(size=3), selector=dict(mode='markers'))
    fig.update_layout(
        autosize=False,
        width=800,
        height=900
    )
    fig.show()

In [19]:
!jupyter nbconvert --to script interactive_plot-Copy3.ipynb

[NbConvertApp] Converting notebook interactive_plot-Copy3.ipynb to script
[NbConvertApp] Writing 25514 bytes to interactive_plot-Copy3.py


In [27]:
STRING_TRUNCATE = 120

# Display name -> short tag used in output file names.
mtrc2abv = {
    "Inverse PPL": "inv_ppl",
    "Neg PPL": "neg_ppl",
    "CHIA": "chia",
    "BLEU": "bleu",
}
crit2abv = {
    "Easy to Learn": "easy_to_learn",
    "Ambiguous": "ambiguous",
    "Hard to Learn": "hard_to_learn",
    "Random": "random",
}


def create_fname(m, cr, c_e):
    """File name for metric ``m``, criterion ``cr`` and converge epoch ``c_e``."""
    return f"{mtrc2abv[m]}_{crit2abv[cr]}_{c_e}.pickle"


def create_ratio_fname(m, cr, c_e, rto):
    """Like ``create_fname`` but with the subset ratio ``rto`` appended."""
    return f"{mtrc2abv[m]}_{crit2abv[cr]}_{c_e}_{rto}.pickle"


def create_comb_fname(m, cr1, cr2, c_e):
    """File name for a subset combining criteria ``cr1`` and ``cr2``."""
    return f"{mtrc2abv[m]}_{crit2abv[cr1]}_{crit2abv[cr2]}_{c_e}.pickle"


def outputs_path(x):
    """Relative directory holding the score files of dataset ``x``."""
    return f"../scores/{x}"

In [17]:
# i2s = read_pickle(os.path.join(outputs_path("cfq"), "idx_to_sentences.pickle"))
# i2s_htl = read_pickle(os.path.join(outputs_path("cfq"), "idx_to_sentences_htl_20.pickle"))
# len(i2s), len(i2s_htl)

In [32]:
# Main run: for every dataset, load the scores once, compute the statistics,
# then order (and save, since choose_subset writes by default) the examples
# for each metric/criterion pair.
# CONVERGE_EPOCHS and MIN_EPOCHS are parallel to DATASET_NAMES: the three
# lists are zipped together below, so position i belongs to dataset i.
DATASET_NAMES = ["cfq", "cogs", "pcfg"] #, "scan_length", "scan_jump", "pcfg"
METRICS = ["Inverse PPL", "BLEU"] #  "CHIA",
CRITERIA = ["Ambiguous", "Easy to Learn", "Hard to Learn"]
CONVERGE_EPOCHS = [20, 10, 110] # , 30, 30, 140
MIN_EPOCHS = [3, 3, 44]

for CONVERGE_EPOCH, MIN_EPOCH, DATASET_NAME in zip(CONVERGE_EPOCHS, MIN_EPOCHS, DATASET_NAMES):
    OUTPUTS_PATH = outputs_path(DATASET_NAME)
    idx_dict, i2s = get_scores(OUTPUTS_PATH, CONVERGE_EPOCH, STRING_TRUNCATE, min_epoch=MIN_EPOCH)
    # CONVERGE_EPOCH is reused here as the number of leading records to average over.
    df = calculate_statistics(CONVERGE_EPOCH, idx_dict, i2s)
    for METRIC in METRICS:
        for CRITERION in CRITERIA:
            print(METRIC, CRITERION)
            idx_fname = create_fname(METRIC, CRITERION, CONVERGE_EPOCH)
            subset_df = choose_subset(df, METRIC, CRITERION, DATASET_NAME, idx_fname)

Read pickles


100%|██████████| 6016/6016 [00:11<00:00, 533.53it/s]


Process items


100%|██████████| 1540096/1540096 [00:03<00:00, 414043.97it/s]


Create items list


100%|██████████| 95743/95743 [00:00<00:00, 224422.95it/s]


Process item rarity


100%|██████████| 95743/95743 [00:01<00:00, 94224.38it/s] 


Calculate statistics


100%|██████████| 95743/95743 [00:06<00:00, 14367.26it/s]


Inverse PPL Ambiguous
subset_inlens: 95743
subset_outlens: 95743
subset_idx: 95743, subset_diffs: 95743
Inverse PPL Easy to Learn
subset_inlens: 95743
subset_outlens: 95743
subset_idx: 95743, subset_diffs: 95743
Inverse PPL Hard to Learn
subset_inlens: 95743
subset_outlens: 95743
subset_idx: 95743, subset_diffs: 95743
BLEU Ambiguous
subset_inlens: 95743
subset_outlens: 95743
subset_idx: 95743, subset_diffs: 95743
BLEU Easy to Learn
subset_inlens: 95743
subset_outlens: 95743
subset_idx: 95743, subset_diffs: 95743
BLEU Hard to Learn
subset_inlens: 95743
subset_outlens: 95743
subset_idx: 95743, subset_diffs: 95743
Read pickles


100%|██████████| 4536/4536 [00:08<00:00, 529.29it/s]


Process items


100%|██████████| 145152/145152 [00:00<00:00, 238096.06it/s]


Create items list


100%|██████████| 24155/24155 [00:00<00:00, 169663.85it/s]


Process item rarity


100%|██████████| 24155/24155 [00:00<00:00, 90986.21it/s]


Calculate statistics


100%|██████████| 24155/24155 [00:01<00:00, 14215.56it/s]


Inverse PPL Ambiguous
subset_inlens: 24155
subset_outlens: 24155
subset_idx: 24155, subset_diffs: 24155
Inverse PPL Easy to Learn
subset_inlens: 24155
subset_outlens: 24155
subset_idx: 24155, subset_diffs: 24155
Inverse PPL Hard to Learn
subset_inlens: 24155
subset_outlens: 24155
subset_idx: 24155, subset_diffs: 24155
BLEU Ambiguous
subset_inlens: 24155
subset_outlens: 24155
subset_idx: 24155, subset_diffs: 24155
BLEU Easy to Learn
subset_inlens: 24155
subset_outlens: 24155
subset_idx: 24155, subset_diffs: 24155
BLEU Hard to Learn
subset_inlens: 24155
subset_outlens: 24155
subset_idx: 24155, subset_diffs: 24155
Read pickles


 22%|██▏       | 143997/667680 [05:59<21:46, 400.89it/s]  


KeyboardInterrupt: 

In [None]:
# Single run: cfq, Inverse PPL, hard-to-learn ordering at converge epoch 20.
# Note that CRITERIA is a single string in this cell, not a list.
DATASET_NAME = "cfq"
OUTPUTS_PATH = outputs_path(DATASET_NAME)
METRIC = "Inverse PPL"
CRITERIA = "Hard to Learn"
CONVERGE_EPOCH = 20

idx_dict, i2s = get_scores(OUTPUTS_PATH, CONVERGE_EPOCH, STRING_TRUNCATE)
df = calculate_statistics(CONVERGE_EPOCH, idx_dict, i2s)
idx_fname = create_fname(METRIC, CRITERIA, CONVERGE_EPOCH)
subset_df = choose_subset(df, METRIC, CRITERIA, DATASET_NAME, idx_fname)

In [None]:
subset_df.head()

In [None]:
# Preview the first 50 example indices of the current subset, as built-in ints.
sorted_idx = subset_df["Index"].tolist()
sorted_idx = [int(i) for i in sorted_idx]
sorted_idx[:50]

In [None]:
# cogs: for each ratio, select a subset per criterion, mark its membership in
# the full frame via an outer merge with an indicator column, then join the
# three indicator columns into one "combined" label used to colour the map.
# NOTE(review): get_scores/calculate_statistics receive no argument that
# depends on RATIO, so the scores are reloaded once per ratio.
DATASET_NAME = "cogs"
OUTPUTS_PATH = outputs_path(DATASET_NAME)

METRICS = ["Inverse PPL"]#, "CHIA"]
CRITERIA = ["Hard to Learn", "Ambiguous", "Easy to Learn"] #, "Ambiguous",  "Random"]
COMBINED_CRITERIA = list(itertools.combinations(["Hard to Learn", "Ambiguous", "Easy to Learn"], 2))
RATIOS = [0.33, 0.5]
CONVERGE_EPOCHS = [7]

for RATIO in RATIOS:
    for CONVERGE_EPOCH in CONVERGE_EPOCHS:
        idx_dict, i2s = get_scores(OUTPUTS_PATH, CONVERGE_EPOCH, STRING_TRUNCATE)
        df = calculate_statistics(CONVERGE_EPOCH, idx_dict, i2s)
        for METRIC in METRICS:
            merge_dfs = []
            for CRITERION in CRITERIA:
                idx_fname = create_ratio_fname(METRIC, CRITERION, CONVERGE_EPOCH, RATIO)
                # RATIO is forwarded to choose_subset as the `ratio` keyword.
                subset_df = choose_subset(df, METRIC, CRITERION, DATASET_NAME, idx_fname, ratio=RATIO)
                # The indicator column is named merge_<criterion tag>, e.g. merge_ambiguous.
                merge_df = pd.merge(df, subset_df, on=["Index", "In", "Out", "In abbv.", "Out abbv.", "In Len", "Out Len", "In Rarity", "Out Rarity", \
                                                       'Confidence - Inverse PPL', 'Variability - Inverse PPL', \
                                                        'Confidence - CHIA', 'Variability - CHIA', \
                                                        'Confidence - BLEU', 'Variability - BLEU'], indicator=f"merge_{crit2abv[CRITERION]}", how='outer')
                # assert len(df) == len(merge_df), f"Original and merged dataset sizes do not match!: Original size: {len(df)}, Merged size: {len(merge_df)}"
                # plot(merge_df, plot_type="inv_ppl")
                merge_dfs.append(merge_df)
                # desc_df = subset_df.describe()
                # print(f"{METRIC} - {CRITERION}: ", f'In Len: {desc_df["In Len"][1]:.2f}, Out Len: {desc_df["Out Len"][1]:.2f}, In Rarity: {desc_df["In Rarity"][1]:.2f}, Out Rarity: {desc_df["Out Rarity"][1]:.2f}')

            # Fold the per-criterion frames into one frame carrying all indicator columns.
            merge_df = merge_dfs[0]
            for i in range(1, len(merge_dfs)):
                merge_df = pd.merge(merge_df, merge_dfs[i], on=["Index", "In", "Out", "In abbv.", "Out abbv.", "In Len", "Out Len", "In Rarity", "Out Rarity", \
                                                        'Confidence - Inverse PPL', 'Variability - Inverse PPL', \
                                                        'Confidence - CHIA', 'Variability - CHIA', \
                                                        'Confidence - BLEU', 'Variability - BLEU'], how='outer')
                #merge_subset_df["_merge"] = merge_subset_df["_merge"].cat.remove_categories("right_only")
                print(merge_df.columns)

            # The three column names are hard-coded, so CRITERIA must contain exactly these criteria.
            merge_df["combined"] = merge_df["merge_ambiguous"].astype(str) + merge_df["merge_easy_to_learn"].astype(str) + merge_df["merge_hard_to_learn"].astype(str)
            plot(merge_df, plot_type="inv_ppl", color_column="combined")

In [None]:
# Single run: cogs, CHIA, hard-to-learn ordering at converge epoch 10.
# CRITERIA is a single string in this cell, not a list.
DATASET_NAME = "cogs"
OUTPUTS_PATH = outputs_path(DATASET_NAME)
METRIC = "CHIA"
CRITERIA = "Hard to Learn"
CONVERGE_EPOCH = 10

idx_dict, i2s = get_scores(OUTPUTS_PATH, CONVERGE_EPOCH, STRING_TRUNCATE)
df = calculate_statistics(CONVERGE_EPOCH, idx_dict, i2s)
idx_fname = create_fname(METRIC, CRITERIA, CONVERGE_EPOCH)
subset_df = choose_subset(df, METRIC, CRITERIA, DATASET_NAME, idx_fname)

In [None]:
# cogs: ratio-based subsets for Inverse PPL, including the "Random" criterion,
# using epochs strictly between 6 and 16.
# COMBINED_CRITERIA is defined but not read in this cell.
DATASET_NAME = "cogs"
OUTPUTS_PATH = outputs_path(DATASET_NAME)
METRICS = ["Inverse PPL"] #["CHIA", "BLEU"]
CRITERIA = ["Hard to Learn", "Ambiguous", "Easy to Learn", "Random"]
COMBINED_CRITERIA = list(itertools.combinations(["Hard to Learn", "Ambiguous", "Easy to Learn"], 2))
RATIOS = [0.5]
CONVERGE_EPOCHS = [16]

for RATIO in RATIOS:
    for CONVERGE_EPOCH in CONVERGE_EPOCHS:
        idx_dict, i2s = get_scores(OUTPUTS_PATH, CONVERGE_EPOCH, STRING_TRUNCATE, min_epoch=6)
        df = calculate_statistics(CONVERGE_EPOCH, idx_dict, i2s)
        for METRIC in METRICS:
            for CRITERION in CRITERIA:
                idx_fname = create_ratio_fname(METRIC, CRITERION, CONVERGE_EPOCH, RATIO)
                # RATIO is forwarded to choose_subset as the `ratio` keyword.
                subset_df = choose_subset(df, METRIC, CRITERION, DATASET_NAME, idx_fname, ratio=RATIO)

In [None]:
df.describe()

In [None]:
subset_df.describe()

In [None]:
# cfq: for each ratio, select a subset per criterion, mark its membership in
# the full frame via an outer merge with an indicator column, then join the
# three indicator columns into one "combined" label used to colour the map.
# NOTE(review): get_scores/calculate_statistics receive no argument that
# depends on RATIO, so the scores are reloaded once per ratio.
DATASET_NAME = "cfq"
OUTPUTS_PATH = outputs_path(DATASET_NAME)

METRICS = ["Inverse PPL"]#, "CHIA"]
CRITERIA = ["Hard to Learn", "Ambiguous", "Easy to Learn"] #, "Ambiguous",  "Random"]
COMBINED_CRITERIA = list(itertools.combinations(["Hard to Learn", "Ambiguous", "Easy to Learn"], 2))
RATIOS = [0.33, 0.5]
CONVERGE_EPOCHS = [20]

for RATIO in RATIOS:
    for CONVERGE_EPOCH in CONVERGE_EPOCHS:
        idx_dict, i2s = get_scores(OUTPUTS_PATH, CONVERGE_EPOCH, STRING_TRUNCATE)
        df = calculate_statistics(CONVERGE_EPOCH, idx_dict, i2s)
        for METRIC in METRICS:
            merge_dfs = []
            for CRITERION in CRITERIA:
                idx_fname = create_ratio_fname(METRIC, CRITERION, CONVERGE_EPOCH, RATIO)
                # RATIO is forwarded to choose_subset as the `ratio` keyword.
                subset_df = choose_subset(df, METRIC, CRITERION, DATASET_NAME, idx_fname, ratio=RATIO)
                # The indicator column is named merge_<criterion tag>, e.g. merge_ambiguous.
                merge_df = pd.merge(df, subset_df, on=["Index", "In", "Out", "In abbv.", "Out abbv.", "In Len", "Out Len", "In Rarity", "Out Rarity", \
                                                       'Confidence - Inverse PPL', 'Variability - Inverse PPL', \
                                                        'Confidence - CHIA', 'Variability - CHIA', \
                                                        'Confidence - BLEU', 'Variability - BLEU'], indicator=f"merge_{crit2abv[CRITERION]}", how='outer')
                # assert len(df) == len(merge_df), f"Original and merged dataset sizes do not match!: Original size: {len(df)}, Merged size: {len(merge_df)}"
                # plot(merge_df, plot_type="inv_ppl")
                merge_dfs.append(merge_df)
                # desc_df = subset_df.describe()
                # print(f"{METRIC} - {CRITERION}: ", f'In Len: {desc_df["In Len"][1]:.2f}, Out Len: {desc_df["Out Len"][1]:.2f}, In Rarity: {desc_df["In Rarity"][1]:.2f}, Out Rarity: {desc_df["Out Rarity"][1]:.2f}')

            # Fold the per-criterion frames into one frame carrying all indicator columns.
            merge_df = merge_dfs[0]
            for i in range(1, len(merge_dfs)):
                merge_df = pd.merge(merge_df, merge_dfs[i], on=["Index", "In", "Out", "In abbv.", "Out abbv.", "In Len", "Out Len", "In Rarity", "Out Rarity", \
                                                        'Confidence - Inverse PPL', 'Variability - Inverse PPL', \
                                                        'Confidence - CHIA', 'Variability - CHIA', \
                                                        'Confidence - BLEU', 'Variability - BLEU'], how='outer')
                #merge_subset_df["_merge"] = merge_subset_df["_merge"].cat.remove_categories("right_only")
                print(merge_df.columns)

            # The three column names are hard-coded, so CRITERIA must contain exactly these criteria.
            merge_df["combined"] = merge_df["merge_ambiguous"].astype(str) + merge_df["merge_easy_to_learn"].astype(str) + merge_df["merge_hard_to_learn"].astype(str)
            plot(merge_df, plot_type="inv_ppl", color_column="combined")

In [None]:
# cfq: ratio-based subsets for BLEU at converge epoch 20.
# COMBINED_CRITERIA is defined but not read in this cell.
DATASET_NAME = "cfq"
OUTPUTS_PATH = outputs_path(DATASET_NAME)
METRICS = ["BLEU"] #["Inverse PPL", "CHIA"]
CRITERIA = ["Hard to Learn", "Ambiguous", "Easy to Learn"] #, "Ambiguous",  "Random"]
COMBINED_CRITERIA = list(itertools.combinations(["Hard to Learn", "Ambiguous", "Easy to Learn"], 2))
RATIOS = [0.5]
CONVERGE_EPOCHS = [20]

for RATIO in RATIOS:
    for CONVERGE_EPOCH in CONVERGE_EPOCHS:
        idx_dict, i2s = get_scores(OUTPUTS_PATH, CONVERGE_EPOCH, STRING_TRUNCATE, min_epoch=3)
        df = calculate_statistics(CONVERGE_EPOCH, idx_dict, i2s)
        for METRIC in METRICS:
            for CRITERION in CRITERIA:
                idx_fname = create_ratio_fname(METRIC, CRITERION, CONVERGE_EPOCH, RATIO)
                # RATIO is forwarded to choose_subset as the `ratio` keyword.
                subset_df = choose_subset(df, METRIC, CRITERION, DATASET_NAME, idx_fname, ratio=RATIO)

In [None]:
merge_df["combined"].unique()

In [None]:
# cfq: build and save one combined subset per metric and per pair of criteria.
DATASET_NAME = "cfq"
OUTPUTS_PATH = outputs_path(DATASET_NAME)

METRICS = ["Inverse PPL", "CHIA"]
CRITERIA = ["Hard to Learn", "Easy to Learn", "Ambiguous",  "Random"]
COMBINED_CRITERIA = list(itertools.combinations(["Hard to Learn", "Ambiguous", "Easy to Learn"], 2))
CONVERGE_EPOCH = 20

# The scores depend only on the dataset and the converge epoch, so load them
# once instead of once per (metric, pair, criterion).
idx_dict, i2s = get_scores(OUTPUTS_PATH, CONVERGE_EPOCH, STRING_TRUNCATE)
df = calculate_statistics(CONVERGE_EPOCH, idx_dict, i2s)

for METRIC in METRICS:
    # A separate loop name keeps the CRITERIA list above from being shadowed.
    for CRITERIA_PAIR in COMBINED_CRITERIA:
        subset_dfs = []
        for CRITERION in CRITERIA_PAIR:
            idx_fname = create_fname(METRIC, CRITERION, CONVERGE_EPOCH)
            # write=False: only the combined set is saved, not the individual orderings.
            subset_df = choose_subset(df, METRIC, CRITERION, DATASET_NAME, idx_fname, write=False)
            subset_dfs.append(subset_df)
        idx_fname = create_comb_fname(METRIC, CRITERIA_PAIR[0], CRITERIA_PAIR[1], CONVERGE_EPOCH)
        combined_set_df = combine_subsets(df, subset_dfs, DATASET_NAME, idx_fname)

        # Fraction of the dataset covered by the combined set.
        print(len(combined_set_df) / len(df))

In [None]:
# cfq: build and save one combined subset per metric and per pair of criteria.
DATASET_NAME = "cfq"
OUTPUTS_PATH = outputs_path(DATASET_NAME)

METRICS = ["Inverse PPL", "CHIA"]
CRITERIA = ["Hard to Learn", "Easy to Learn", "Ambiguous",  "Random"]
COMBINED_CRITERIA = list(itertools.combinations(["Hard to Learn", "Ambiguous", "Easy to Learn"], 2))
CONVERGE_EPOCH = 20

# The scores depend only on the dataset and the converge epoch, so load them
# once instead of once per (metric, pair, criterion).
idx_dict, i2s = get_scores(OUTPUTS_PATH, CONVERGE_EPOCH, STRING_TRUNCATE)
df = calculate_statistics(CONVERGE_EPOCH, idx_dict, i2s)

for METRIC in METRICS:
    # A separate loop name keeps the CRITERIA list above from being shadowed.
    for CRITERIA_PAIR in COMBINED_CRITERIA:
        subset_dfs = []
        for CRITERION in CRITERIA_PAIR:
            idx_fname = create_fname(METRIC, CRITERION, CONVERGE_EPOCH)
            # write=False: only the combined set is saved, not the individual orderings.
            subset_df = choose_subset(df, METRIC, CRITERION, DATASET_NAME, idx_fname, write=False)
            subset_dfs.append(subset_df)
        idx_fname = create_comb_fname(METRIC, CRITERIA_PAIR[0], CRITERIA_PAIR[1], CONVERGE_EPOCH)
        combined_set_df = combine_subsets(df, subset_dfs, DATASET_NAME, idx_fname)

        # Fraction of the dataset covered by the combined set.
        print(len(combined_set_df) / len(df))

In [None]:
subset_df.describe()

In [None]:
df.describe()

In [None]:
# Single run: scan_length, CHIA, hard-to-learn ordering at converge epoch 30.
# CRITERIA is a single string in this cell, not a list.
DATASET_NAME = "scan_length"
OUTPUTS_PATH = outputs_path(DATASET_NAME)
METRIC = "CHIA"
CRITERIA = "Hard to Learn"
CONVERGE_EPOCH = 30

idx_dict, i2s = get_scores(OUTPUTS_PATH, CONVERGE_EPOCH, STRING_TRUNCATE)
df = calculate_statistics(CONVERGE_EPOCH, idx_dict, i2s)
idx_fname = create_fname(METRIC, CRITERIA, CONVERGE_EPOCH)
subset_df = choose_subset(df, METRIC, CRITERIA, DATASET_NAME, idx_fname)

In [None]:
df.describe()

In [None]:
subset_df.describe()

In [None]:
# Single run: scan_jump, CHIA, hard-to-learn ordering at converge epoch 30.
# CRITERIA is a single string in this cell, not a list.
DATASET_NAME = "scan_jump"
OUTPUTS_PATH = outputs_path(DATASET_NAME)
METRIC = "CHIA"
CRITERIA = "Hard to Learn"
CONVERGE_EPOCH = 30

idx_dict, i2s = get_scores(OUTPUTS_PATH, CONVERGE_EPOCH, STRING_TRUNCATE)
df = calculate_statistics(CONVERGE_EPOCH, idx_dict, i2s)
idx_fname = create_fname(METRIC, CRITERIA, CONVERGE_EPOCH)
subset_df = choose_subset(df, METRIC, CRITERIA, DATASET_NAME, idx_fname)

In [None]:
df.describe()

In [None]:
subset_df.describe()

In [None]:
import gc
gc.collect()

In [None]:
# Single run: pcfg, CHIA, hard-to-learn ordering at converge epoch 140.
# CRITERIA is a single string in this cell, not a list.
DATASET_NAME = "pcfg"
OUTPUTS_PATH = outputs_path(DATASET_NAME)
METRIC = "CHIA"
CRITERIA = "Hard to Learn"
CONVERGE_EPOCH = 140

idx_dict, i2s = get_scores(OUTPUTS_PATH, CONVERGE_EPOCH, STRING_TRUNCATE)
df = calculate_statistics(CONVERGE_EPOCH, idx_dict, i2s)
idx_fname = create_fname(METRIC, CRITERIA, CONVERGE_EPOCH)
subset_df = choose_subset(df, METRIC, CRITERIA, DATASET_NAME, idx_fname)

In [None]:
df.describe()

In [None]:
# Consistency check: which input sentences of the current subset are missing
# from the cogs idx_to_sentences pickle?
# NOTE(review): the reference file is hard-coded to cogs, while subset_df is
# whatever the most recently executed cell produced — confirm both refer to
# the same dataset before reading anything into the difference.
# This bare expression is not the last one in the cell, so its result is not displayed.
subset_df.describe()

subset_df_in = set(subset_df["In"].tolist())
subset_df_out = set(subset_df["Out"].tolist())

subset_pkl = read_pickle("../scores/cogs/idx_to_sentences.pickle")

subset_pkl_in = []
subset_pkl_out = []

for i, text in subset_pkl.items():
    subset_pkl_in.append(text["in"])
    subset_pkl_out.append(text["out"])

subset_pkl_in = set(subset_pkl_in)
subset_pkl_out = set(subset_pkl_out)

# Displays: (inputs only in the subset, #unique subset inputs, #unique pickle inputs).
subset_df_in - subset_pkl_in, len(subset_df_in), len(subset_pkl_in)