In [1]:
# Required format:

# 							FLEURS	EdAcc	CV
# ET
# duseqs,w2v2-att4-1000
# phoneseqs,vl107,att=8
# ET+PS_combo
# ET+duseq_linear
# ET+duseqembed
# ET+PS_linear


In [2]:
from collections import defaultdict
import pickle as pkl
import numpy as np
import os, sys

In [3]:
def read_edacc_results(predictions_dir):
    """Compute per-accent and aggregate accuracy from a pickled EdAcc predictions file.

    Looks for ``edacc_predictions.pkl`` in ``predictions_dir`` and falls back to
    ``predictions.pkl``. The pickle is indexed with the keys ``"preds"``,
    ``"accents"`` and ``"labels"``, which are iterated in parallel.

    Args:
        predictions_dir: Directory containing the predictions pickle.

    Returns:
        A tuple ``(results_by_accent, macro_avg, micro_avg, std_dev)``:
        ``results_by_accent`` maps accent -> ``{"correct": int, "total": int}``
        (with "american" folded into "us"); ``macro_avg`` / ``std_dev`` are the
        mean / standard deviation of per-accent accuracy over accents with at
        least 10 samples; ``micro_avg`` is overall accuracy over all samples.

    Raises:
        SystemExit: If neither predictions file exists (a message is printed first).
    """
    predictions_path = predictions_dir + "/edacc_predictions.pkl"
    if not os.path.exists(predictions_path):
        predictions_path = predictions_dir + "/predictions.pkl"

    if not os.path.exists(predictions_path):
        print(f"No predictions found in the specified directory: {predictions_dir}")
        sys.exit(1)

    # NOTE(review): pickle.load executes arbitrary code; only use on trusted files.
    with open(predictions_path, "rb") as f:
        eval_data = pkl.load(f)

    results_by_accent = defaultdict(lambda: defaultdict(int))
    for prediction, accent, label in zip(eval_data["preds"], eval_data["accents"], eval_data["labels"]):
        if prediction == label:
            results_by_accent[accent]["correct"] += 1
        results_by_accent[accent]["total"] += 1

    # Merge "american" into "us". Done in place so the "us" entry stays a
    # defaultdict(int): a missing "correct" count then reads as 0 instead of
    # raising KeyError, and no empty "us" entry is invented when neither
    # accent occurs in the data.
    if "american" in results_by_accent:
        american_counts = results_by_accent.pop("american")
        for key, count in american_counts.items():
            results_by_accent["us"][key] += count

    # Per-accent accuracy
    acc_by_accent = {accent: results_by_accent[accent]["correct"] / results_by_accent[accent]["total"]
                     for accent in results_by_accent}

    # Macro statistics ignore accents with < 10 samples
    macro_accs = [acc for accent, acc in acc_by_accent.items() if results_by_accent[accent]["total"] >= 10]

    macro_avg = np.mean(macro_accs)
    micro_avg = sum(results_by_accent[accent]["correct"] for accent in results_by_accent) \
        / sum(results_by_accent[accent]["total"] for accent in results_by_accent)
    std_dev = np.std(macro_accs)

    return results_by_accent, macro_avg, micro_avg, std_dev



def read_edacc_for_bootstrap_ci(predictions_dir):
    '''
    Load EdAcc predictions as per-sample records.

    Looks for ``edacc_predictions.pkl`` in ``predictions_dir`` and falls back to
    ``predictions.pkl``. The pickle is indexed with the keys ``"preds"``,
    ``"labels"``, ``"accents"`` and ``"audio_files"``.

    Returns a list of tuples [(pred, label, accent, audio_file)], one per sample.
    Exits via sys.exit(1), after printing a message, if no predictions file exists.

    NOTE(review): a speaker-level bootstrap would need a speaker id per record;
    presumably it can be derived from audio_file -- confirm against the data.
    '''
    predictions_path = predictions_dir + "/edacc_predictions.pkl"
    if not os.path.exists(predictions_path):
        predictions_path = predictions_dir + "/predictions.pkl"

    if not os.path.exists(predictions_path):
        print(f"No predictions found in the specified directory: {predictions_dir}")
        sys.exit(1)

    # NOTE(review): pickle.load executes arbitrary code; only use on trusted files.
    with open(predictions_path, "rb") as f:
        eval_data = pkl.load(f)

    # zip already yields (pred, label, accent, audio_file) tuples
    data = list(zip(eval_data["preds"], eval_data["labels"], eval_data["accents"], eval_data["audio_files"]))

    return data

    


    
    



In [4]:

def read_cv_results(predictions_dir):
    """Score the ``cv`` predictions stored in ``predictions_dir``.

    Reads ``cv_predictions.pkl`` and walks its ``"preds"``, ``"accents"`` and
    ``"labels"`` entries in parallel, tallying correct/total counts per accent.

    Returns:
        ``(results_by_accent, macro_avg, micro_avg, std_dev)`` where the macro
        average and standard deviation only consider accents with at least 10
        samples, and the micro average is overall accuracy across all samples.
    """
    with open(predictions_dir + "/cv_predictions.pkl", "rb") as handle:
        payload = pkl.load(handle)

    results_by_accent = defaultdict(lambda: defaultdict(int))
    samples = zip(payload["preds"], payload["accents"], payload["labels"])
    for predicted, accent_name, gold in samples:
        tally = results_by_accent[accent_name]
        if predicted == gold:
            tally["correct"] += 1
        tally["total"] += 1

    # Accuracy of every accent, regardless of its sample count
    accuracy = {}
    for accent_name, tally in results_by_accent.items():
        accuracy[accent_name] = tally["correct"] / tally["total"]

    # Only accents with >= 10 samples feed the macro statistics
    well_sampled = [
        value for accent_name, value in accuracy.items()
        if results_by_accent[accent_name]["total"] >= 10
    ]

    macro_avg = np.mean(well_sampled)
    n_correct = sum([tally["correct"] for tally in results_by_accent.values()])
    n_total = sum([tally["total"] for tally in results_by_accent.values()])
    micro_avg = n_correct / n_total
    std_dev = np.std(well_sampled)

    return results_by_accent, macro_avg, micro_avg, std_dev


In [12]:

def read_cv_from_hf_results(predictions_dir):
    """Score the ``cv_from_hf`` predictions stored in ``predictions_dir``.

    Reads ``cv_from_hf_predictions.pkl``; its ``"preds"``, ``"accents"`` and
    ``"labels"`` entries are consumed in parallel and counted per accent.

    Returns:
        ``(results_by_accent, macro_avg, micro_avg, std_dev)``. ``macro_avg`` and
        ``std_dev`` are taken over the accuracies of accents with >= 10 samples;
        ``micro_avg`` is total correct divided by total samples.
    """
    pickle_path = predictions_dir + "/cv_from_hf_predictions.pkl"
    with open(pickle_path, "rb") as fh:
        loaded = pkl.load(fh)

    results_by_accent = defaultdict(lambda: defaultdict(int))
    for guess, variety, truth in zip(loaded["preds"], loaded["accents"], loaded["labels"]):
        counts = results_by_accent[variety]
        if guess == truth:
            counts["correct"] += 1
        counts["total"] += 1

    per_accent_acc = {
        variety: counts["correct"] / counts["total"]
        for variety, counts in results_by_accent.items()
    }

    # Accents below the 10-sample threshold are excluded from macro stats
    kept = []
    for variety, acc in per_accent_acc.items():
        if results_by_accent[variety]["total"] >= 10:
            kept.append(acc)

    macro_avg = np.mean(kept)
    correct_sum = sum([results_by_accent[variety]["correct"] for variety in results_by_accent])
    total_sum = sum([results_by_accent[variety]["total"] for variety in results_by_accent])
    micro_avg = correct_sum / total_sum
    std_dev = np.std(kept)

    return results_by_accent, macro_avg, micro_avg, std_dev


def read_cv_from_hf_l2_results(predictions_dir):
    """Score ``cv_from_hf`` predictions restricted to a fixed accent whitelist.

    Reads ``cv_from_hf_predictions.pkl`` and keeps only samples whose
    ``"<label>_<accent with surrounding whitespace stripped>"`` string is in
    the hard-coded ``l2_accents`` set; everything else is skipped.

    NOTE(review): the whitelist test uses the stripped accent, but counts are
    grouped under the raw (unstripped) accent string -- confirm this is intended.

    Returns:
        ``(results_by_accent, macro_avg, micro_avg, std_dev)`` computed over the
        retained samples; macro average / std only use accents with >= 10 samples.
    """
    with open(predictions_dir + "/cv_from_hf_predictions.pkl", "rb") as fh:
        eval_data = pkl.load(fh)

    l2_accents = {
        "de_Amerikanisches Deutsch",
        "de_Britisches Deutsch",
        "de_Schweizerdeutsch",
        "de_Französisch Deutsch",
        "de_Italienisch Deutsch",
        "de_Polnisch Deutsch",
        "de_Russisch Deutsch",
        "fr_Français de Suisse",
        "fr_Français des États-Unis",
        "fr_Français du Royaume-Uni",
        "fr_Français d’Allemagne",
    }

    results_by_accent = defaultdict(lambda: defaultdict(int))
    for hyp, accent, ref in zip(eval_data["preds"], eval_data["accents"], eval_data["labels"]):
        whitelist_key = f"{ref}_{accent.strip()}"
        if whitelist_key in l2_accents:
            if hyp == ref:
                results_by_accent[accent]["correct"] += 1
            results_by_accent[accent]["total"] += 1

    acc_by_accent = {}
    for accent in results_by_accent:
        stats = results_by_accent[accent]
        acc_by_accent[accent] = stats["correct"] / stats["total"]

    # Macro statistics skip accents with fewer than 10 retained samples
    qualifying = [acc for accent, acc in acc_by_accent.items()
                  if results_by_accent[accent]["total"] >= 10]

    macro_avg = np.mean(qualifying)
    micro_avg = (
        sum([stats["correct"] for stats in results_by_accent.values()])
        / sum([stats["total"] for stats in results_by_accent.values()])
    )
    std_dev = np.std(qualifying)

    return results_by_accent, macro_avg, micro_avg, std_dev



In [6]:
def read_fleurs_results(predictions_dir):
    """Score the FLEURS test predictions stored in ``predictions_dir``.

    Reads ``fleurs_test_predictions.pkl`` and tallies correct/total counts per
    *label* (results are grouped by the gold label, not by the accent field,
    which is iterated but unused).

    Returns:
        ``(results_by_lang, macro_avg, micro_avg, std_dev)``. Macro average and
        standard deviation cover labels with at least 10 samples; the micro
        average is overall accuracy across all samples.
    """
    with open(predictions_dir + "/fleurs_test_predictions.pkl", "rb") as fh:
        eval_data = pkl.load(fh)

    results_by_lang = defaultdict(lambda: defaultdict(int))
    # "accents" stays in the zip so the iteration length matches the shortest of the three
    for hyp, _accent, lang in zip(eval_data["preds"], eval_data["accents"], eval_data["labels"]):
        lang_counts = results_by_lang[lang]
        if hyp == lang:
            lang_counts["correct"] += 1
        lang_counts["total"] += 1

    acc_by_lang = {}
    for lang, lang_counts in results_by_lang.items():
        acc_by_lang[lang] = lang_counts["correct"] / lang_counts["total"]

    # Labels with fewer than 10 samples do not contribute to macro statistics
    macro_pool = [acc for lang, acc in acc_by_lang.items()
                  if results_by_lang[lang]["total"] >= 10]

    macro_avg = np.mean(macro_pool)
    hits = sum([lang_counts["correct"] for lang_counts in results_by_lang.values()])
    seen = sum([lang_counts["total"] for lang_counts in results_by_lang.values()])
    micro_avg = hits / seen
    std_dev = np.std(macro_pool)

    return results_by_lang, macro_avg, micro_avg, std_dev

In [13]:
# Required format:

# 							FLEURS	EdAcc	CV
# ET
# duseqs,w2v2-att4-1000
# phoneseqs,vl107,att=8
# ET+phoneseqs_combo
# ET+duseq_train
# ET+duseqembed
# ET+phoneseqs_train


# Approach name -> directory holding that approach's prediction pickles.
approach2dir = {
    "ET": "/home/hltcoe/nbafna/projects/mitigating-accent-bias-in-lid/prelim_evals/preds/formatted",
    "duseqs": "/exp/nbafna/projects/mitigating-accent-bias-in-lid/wav2vec2_intermediate_outputs/vl107/wav2vec2-base-layer8-1000/cnn-attentions-linear-4/lid_model_outputs/",
    "ET+duseq-train": "/exp/nbafna/projects/mitigating-accent-bias-in-lid/reps_phoneseqs_duseqs_exps/vl107/ecapa-tdnn_wav2vec2-xlsr-53-espeak-cv-ft_wav2vec2-base-layer8-1000/attentions-linear-4/reps-phoneseq-duseqs_lid_model_outputs",
    "ET+duseqembed-train": "/exp/nbafna/projects/mitigating-accent-bias-in-lid/reps_phoneseqs_duseqembed_exps/vl107/ecapa-tdnn_wav2vec2-xlsr-53-espeak-cv-ft_wav2vec2-base-layer8-1000/attentions-linear-4/reps-phoneseq-duseqs_lid_model_outputs",
    "phoneseqs": "/exp/nbafna/projects/mitigating-accent-bias-in-lid/phoneseq_exps/vl107/wav2vec2-xlsr-53-espeak-cv-ft/attentions-linear-8/phoneseq_lid_model_outputs/",
    "ET+phoneseqs": "/exp/nbafna/projects/mitigating-accent-bias-in-lid/dists-phoneseq-systemcombo_exps/vl107/ecapa-tdnn_wav2vec2-xlsr-53-espeak-cv-ft/attentions-linear-8/lid_model_outputs/",
    "ET+phoneseqs-train": "/exp/nbafna/projects/mitigating-accent-bias-in-lid/reps-phoneseq_exps/vl107/ecapa-tdnn_wav2vec2-xlsr-53-espeak-cv-ft/attentions-linear-8/reps-phoneseq_lid_model_outputs/",
}

# Dataset name -> reader returning (results_by_variety, macro_avg, micro_avg, std_dev).
dataset_to_processor = {
    "fleurs_test": read_fleurs_results,
    "cv": read_cv_results,
    "edacc": read_edacc_results,
    "cv_from_hf": read_cv_from_hf_results,
    "cv_from_hf_l2": read_cv_from_hf_l2_results,
}

# results[approach][dataset] -> {"macro_avg", "micro_avg", "std_dev"}, as percentages
# rounded to one decimal place.
results = defaultdict(lambda: defaultdict(dict))
# The loop variable is deliberately not called `dir`: at notebook (global) scope
# that would shadow the builtin dir() for every later cell.
for approach, preds_dir in approach2dir.items():
    for dataset in ["fleurs_test", "edacc", "cv", "cv_from_hf", "cv_from_hf_l2"]:
        results_by_variety, macro_avg, micro_avg, std_dev = dataset_to_processor[dataset](preds_dir)
        results[approach][dataset]["macro_avg"] = round(macro_avg*100, 1)
        results[approach][dataset]["micro_avg"] = round(micro_avg*100, 1)
        results[approach][dataset]["std_dev"] = round(std_dev*100, 1)
        


In [14]:
import pandas as pd

# One (micro_avg, macro_avg, std_dev) triple per approach/dataset pair.
results_formatted = {
    approach: {
        dataset: (scores["micro_avg"], scores["macro_avg"], scores["std_dev"])
        for dataset, scores in per_dataset.items()
    }
    for approach, per_dataset in results.items()
}
df = pd.DataFrame(results_formatted)

# Transpose so that approaches are rows and datasets are columns.
df.T

Unnamed: 0,fleurs_test,edacc,cv,cv_from_hf,cv_from_hf_l2
ET,"(89.3, 89.5, 17.2)","(52.7, 57.0, 26.0)","(77.0, 68.7, 25.2)","(83.0, 81.3, 19.0)","(69.8, 63.3, 20.9)"
duseqs,"(49.6, 49.8, 18.3)","(46.1, 49.8, 17.7)","(73.5, 68.3, 16.9)","(49.6, 51.5, 16.3)","(50.5, 48.0, 16.5)"
ET+duseq-train,"(84.7, 84.9, 18.9)","(54.6, 59.3, 23.8)","(78.3, 71.4, 19.0)","(82.3, 81.0, 17.8)","(70.9, 65.3, 22.7)"
ET+duseqembed-train,"(84.3, 84.2, 20.2)","(57.3, 61.6, 23.0)","(79.3, 72.2, 18.0)","(77.9, 73.9, 20.0)","(68.6, 61.0, 22.6)"
phoneseqs,"(52.9, 52.5, 22.7)","(41.9, 46.4, 22.6)","(76.2, 71.4, 14.3)","(53.4, 56.1, 18.7)","(49.1, 53.0, 16.7)"
ET+phoneseqs,"(89.5, 89.5, 17.8)","(56.7, 61.0, 25.9)","(82.2, 77.2, 19.0)","(83.0, 82.7, 17.1)","(70.5, 68.4, 19.1)"
ET+phoneseqs-train,"(86.6, 86.4, 18.2)","(60.9, 65.1, 24.8)","(88.0, 84.8, 10.9)","(84.6, 81.9, 15.4)","(74.8, 75.7, 14.0)"


In [13]:

# Render every cell as "micro & macro±std" (chr(177) is the plus-minus sign).
# Dictionary keys inside the f-string use single quotes: reusing the enclosing
# double quotes is only accepted from Python 3.12 on and is a SyntaxError before.
results_formatted = {
    approach: {
        dataset: f"{scores['micro_avg']} & {scores['macro_avg']}{chr(177)}{scores['std_dev']}"
        for dataset, scores in per_dataset.items()
    }
    for approach, per_dataset in results.items()
}
df = pd.DataFrame(results_formatted)

# Approaches as rows, datasets as columns; emitted as a LaTeX tabular.
print(df.T.to_latex())

\begin{tabular}{llll}
\toprule
 & fleurs_test & edacc & cv \\
\midrule
ET & 89.3 & 89.5±17.2 & 52.7 & 57.0±26.0 & 77.0 & 68.7±25.2 \\
duseqs & 49.6 & 49.8±18.3 & 46.1 & 49.8±17.7 & 73.5 & 68.3±16.9 \\
ET+duseq-train & 84.7 & 84.9±18.9 & 54.6 & 59.3±23.8 & 78.3 & 71.4±19.0 \\
ET+duseqembed-train & 84.3 & 84.2±20.2 & 57.3 & 61.6±23.0 & 79.3 & 72.2±18.0 \\
phoneseqs & 53.0 & 52.6±22.7 & 41.9 & 46.5±22.6 & 76.3 & 71.4±14.3 \\
ET+phoneseqs & 89.5 & 89.5±17.8 & 56.7 & 61.0±25.9 & 82.2 & 77.2±19.0 \\
ET+phoneseqs-train & 86.6 & 86.4±18.2 & 60.9 & 65.1±24.8 & 88.0 & 84.8±10.9 \\
\bottomrule
\end{tabular}



In [56]:
# Table rows as a raw string so that the trailing "\\" LaTeX row terminators are
# kept verbatim; in a normal string literal each "\\" collapses to a single
# backslash, which yields invalid table rows.
s = r'''ET & 89.3 & 89.5±17.2 & 77.0 & 68.7±25.2 & 52.7 & 57.0±26.0 \\
duseqs & 49.6 & 49.8±18.3 & 73.5 & 68.3±16.9 & 46.1 & 49.8±17.7 \\
ET+duseq-train & 85.8 & 85.8±18.8 & 74.5 & 66.4±19.9 & 49.4 & 53.6±22.7 \\
ET+duseqembed-train & 85.8 & 85.8±18.9 & 79.9 & 73.3±19.2 & 59.0 & 63.1±22.4 \\
phoneseqs & 53.0 & 52.6±22.7 & 76.3 & 71.4±14.3 & 41.9 & 46.5±22.6 \\
ET+phoneseqs & 89.5 & 89.5±17.8 & 82.2 & 77.2±19.0 & 56.7 & 61.0±25.9 \\
ET+phoneseqs-train & 86.6 & 86.4±18.2 & 88.0 & 84.8±10.9 & 60.9 & 65.1±24.8 \\'''
# Swap the literal plus-minus character for its LaTeX math-mode macro.
print(s.replace("±", "$\\pm$"))

ET & 89.3 & 89.5$\pm$17.2 & 77.0 & 68.7$\pm$25.2 & 52.7 & 57.0$\pm$26.0 \
duseqs & 49.6 & 49.8$\pm$18.3 & 73.5 & 68.3$\pm$16.9 & 46.1 & 49.8$\pm$17.7 \
ET+duseq-train & 85.8 & 85.8$\pm$18.8 & 74.5 & 66.4$\pm$19.9 & 49.4 & 53.6$\pm$22.7 \
ET+duseqembed-train & 85.8 & 85.8$\pm$18.9 & 79.9 & 73.3$\pm$19.2 & 59.0 & 63.1$\pm$22.4 \
phoneseqs & 53.0 & 52.6$\pm$22.7 & 76.3 & 71.4$\pm$14.3 & 41.9 & 46.5$\pm$22.6 \
ET+phoneseqs & 89.5 & 89.5$\pm$17.8 & 82.2 & 77.2$\pm$19.0 & 56.7 & 61.0$\pm$25.9 \
ET+phoneseqs-train & 86.6 & 86.4$\pm$18.2 & 88.0 & 84.8$\pm$10.9 & 60.9 & 65.1$\pm$24.8 \


In [20]:
######## ARCHIVE: Moving ET Fleurs position into consistent format
# Concatenates every pickle found in `dirpath` into one combined
# fleurs_test_predictions.pkl under `outdir`.

import os
import pickle as pkl

dirpath = "/home/hltcoe/nbafna/projects/mitigating-accent-bias-in-lid/prelim_evals/preds/fleurs_test_predictions"
outdir = "/home/hltcoe/nbafna/projects/mitigating-accent-bias-in-lid/prelim_evals/preds/formatted/"

all_labels = []
all_preds = []
all_accents = []
for filepath in os.listdir(dirpath):
    print(filepath)
    predictions_path = os.path.join(dirpath, filepath)
    with open(predictions_path, "rb") as f:
        eval_data = pkl.load(f)
    # Append this file's entries to the running lists
    all_labels.extend(eval_data["labels"])
    all_preds.extend(eval_data["preds"])
    all_accents.extend(eval_data["accents"])

# Write the merged lists in the same {"labels", "preds", "accents"} layout
with open(os.path.join(outdir, "fleurs_test_predictions.pkl"), "wb") as f:
    pkl.dump({"labels": all_labels, "preds": all_preds, "accents": all_accents}, f)


ar_predictions.pkl
bg_predictions.pkl
af_predictions.pkl
am_predictions.pkl
az_predictions.pkl
bn_predictions.pkl
as_predictions.pkl
bs_predictions.pkl
ca_predictions.pkl
be_predictions.pkl
ceb_predictions.pkl
cs_predictions.pkl
el_predictions.pkl
en_predictions.pkl
da_predictions.pkl
de_predictions.pkl
es_predictions.pkl
et_predictions.pkl
cy_predictions.pkl
fr_predictions.pkl
fi_predictions.pkl
fa_predictions.pkl
gl_predictions.pkl
hi_predictions.pkl
is_predictions.pkl
gu_predictions.pkl
hr_predictions.pkl
ha_predictions.pkl
id_predictions.pkl
hu_predictions.pkl
hy_predictions.pkl
ja_predictions.pkl
it_predictions.pkl
ko_predictions.pkl
ka_predictions.pkl
km_predictions.pkl
kk_predictions.pkl
kn_predictions.pkl
lo_predictions.pkl
lb_predictions.pkl
ln_predictions.pkl
lt_predictions.pkl
lv_predictions.pkl
mn_predictions.pkl
mk_predictions.pkl
ms_predictions.pkl
ml_predictions.pkl
nl_predictions.pkl
mr_predictions.pkl
mt_predictions.pkl
mi_predictions.pkl
ne_predictions.pkl
pa_predicti