In [1]:
import os, sys
import json
import pickle as pkl
import numpy as np
from collections import defaultdict
from scipy.stats import ttest_ind
from scipy.stats import ttest_rel
from scipy.stats import chi2_contingency
import numpy as np
import pandas as pd
sys.path.append("/home/hltcoe/nbafna/projects/mitigating-accent-bias-in-lid/utils/misc/")
from get_client_id_from_audio_file import get_speaker_id_from_audio_file

In [2]:
cv_from_hf_l2_accents = {"de_Amerikanisches Deutsch",
        "de_Brasilianisches Deutsch",
        "de_Britisches Deutsch",
        "de_Niederländisch Deutsch",
        "de_Französisch Deutsch",
        "de_Kanadisches Deutsch",
        "de_Italienisch Deutsch",
        "de_Polnisch Deutsch",
        "de_Russisch Deutsch",
        "de_Tschechisch Deutsch",
        "fr_Français de Roumanie",
        "fr_Français d’Autriche",
        "fr_Français d’Italie",
        "fr_Français des États-Unis",
        "fr_Français du Royaume-Uni",
        "fr_Français d’Allemagne"}

cv_l1_accents = {"canada", "us", "england", "australia", "newzealand", "scotland", "wales", "ireland"}

edacc_l1_accents = {"us", "uk", "irish", "scottish", "american"}

In [3]:

def read_edacc_results(predictions_dir):
    
    predictions_path = predictions_dir + "/edacc_predictions.pkl"
    if not os.path.exists(predictions_path):
        predictions_path = predictions_dir + "/predictions.pkl"

    if not os.path.exists(predictions_path):
        print(f"No predictions found in the specified directory: {predictions_dir}")
        sys.exit(1)

    with open(predictions_path, "rb") as f:
        eval_data = pkl.load(f)

    results_by_accent = defaultdict(lambda: defaultdict(int))
    # print(list(zip(eval_data["preds"], eval_data["accents"], eval_data["labels"])))
    for prediction, accent, label in zip(eval_data["preds"], eval_data["accents"], eval_data["labels"]):
        if accent in edacc_l1_accents:
            continue
        if prediction == label:
            results_by_accent[accent]["correct"] += 1
        results_by_accent[accent]["total"] += 1
    # %%

    # # Merge "us" and "american"  accents
    # results_by_accent["us"] = {k: results_by_accent["us"].get(k, 0) + results_by_accent["american"].get(k, 0) for k in set(results_by_accent["us"]) | set(results_by_accent["american"])}
    # # results_by_accent["us"]["total"] = sum([results_by_accent["us"]["total"], results_by_accent["american"]["total"]])
    # del results_by_accent["american"]

    # Compute macro-average accuracy, ignoring accents with < 10 samples
    acc_by_accent = {accent: results_by_accent[accent]["correct"]/results_by_accent[accent]["total"] \
                        for accent in results_by_accent}

    macro_avg = np.mean([acc for accent, acc in acc_by_accent.items() if results_by_accent[accent]["total"] >= 10])
    micro_avg = sum([results_by_accent[accent]["correct"] for accent in results_by_accent]) \
        / sum([results_by_accent[accent]["total"] for accent in results_by_accent])
    std_dev = np.std([acc for accent, acc in acc_by_accent.items() if results_by_accent[accent]["total"] >= 10])
    
    return results_by_accent, macro_avg, micro_avg, std_dev




def read_cv_results(predictions_dir):
    predictions_path = predictions_dir + "/cv_predictions.pkl"
    with open(predictions_path, "rb") as f:
        eval_data = pkl.load(f)

    results_by_accent = defaultdict(lambda: defaultdict(int))
    # print(list(zip(eval_data["preds"], eval_data["accents"], eval_data["labels"])))
    for prediction, accent, label in zip(eval_data["preds"], eval_data["accents"], eval_data["labels"]):
        if accent in cv_l1_accents:
            continue
        if prediction == label:
            results_by_accent[accent]["correct"] += 1
        results_by_accent[accent]["total"] += 1

    # Compute macro-average accuracy, ignoring accents with < 10 samples
    acc_by_accent = {accent: results_by_accent[accent]["correct"]/results_by_accent[accent]["total"] \
                        for accent in results_by_accent}

    macro_avg = np.mean([acc for accent, acc in acc_by_accent.items() if results_by_accent[accent]["total"] >= 10])
    micro_avg = sum([results_by_accent[accent]["correct"] for accent in results_by_accent]) \
        / sum([results_by_accent[accent]["total"] for accent in results_by_accent])
    std_dev = np.std([acc for accent, acc in acc_by_accent.items() if results_by_accent[accent]["total"] >= 10])
    
    return results_by_accent, macro_avg, micro_avg, std_dev



def read_cv_from_hf_results(predictions_dir):
    predictions_path = predictions_dir + "/cv_from_hf_predictions.pkl"
    with open(predictions_path, "rb") as f:
        eval_data = pkl.load(f)

    results_by_accent = defaultdict(lambda: defaultdict(int))
    # print(list(zip(eval_data["preds"], eval_data["accents"], eval_data["labels"])))
    for prediction, accent, label in zip(eval_data["preds"], eval_data["accents"], eval_data["labels"]):
        if prediction == label:
            results_by_accent[accent]["correct"] += 1
        results_by_accent[accent]["total"] += 1

    # Compute macro-average accuracy, ignoring accents with < 10 samples
    acc_by_accent = {accent: results_by_accent[accent]["correct"]/results_by_accent[accent]["total"] \
                        for accent in results_by_accent}

    macro_avg = np.mean([acc for accent, acc in acc_by_accent.items() if results_by_accent[accent]["total"] >= 10])
    micro_avg = sum([results_by_accent[accent]["correct"] for accent in results_by_accent]) \
        / sum([results_by_accent[accent]["total"] for accent in results_by_accent])
    std_dev = np.std([acc for accent, acc in acc_by_accent.items() if results_by_accent[accent]["total"] >= 10])
    
    return results_by_accent, macro_avg, micro_avg, std_dev


def read_cv_from_hf_l2_results(predictions_dir):
    predictions_path = predictions_dir + "/cv_from_hf_predictions.pkl"
    with open(predictions_path, "rb") as f:
        eval_data = pkl.load(f)

    results_by_accent = defaultdict(lambda: defaultdict(int))

    # print(list(zip(eval_data["preds"], eval_data["accents"], eval_data["labels"])))
    for prediction, accent, label in zip(eval_data["preds"], eval_data["accents"], eval_data["labels"]):
        if f"{label}_{accent.strip()}" not in cv_from_hf_l2_accents:
            continue
        if prediction == label:
            results_by_accent[accent]["correct"] += 1
        results_by_accent[accent]["total"] += 1

    # Compute macro-average accuracy, ignoring accents with < 10 samples
    acc_by_accent = {accent: results_by_accent[accent]["correct"]/results_by_accent[accent]["total"] \
                        for accent in results_by_accent}

    macro_avg = np.mean([acc for accent, acc in acc_by_accent.items() if results_by_accent[accent]["total"] >= 10])
    micro_avg = sum([results_by_accent[accent]["correct"] for accent in results_by_accent]) \
        / sum([results_by_accent[accent]["total"] for accent in results_by_accent])
    std_dev = np.std([acc for accent, acc in acc_by_accent.items() if results_by_accent[accent]["total"] >= 10])
    
    return results_by_accent, macro_avg, micro_avg, std_dev


def read_fleurs_results(predictions_dir):

    predictions_path = predictions_dir + "/fleurs_test_predictions.pkl"

    with open(predictions_path, "rb") as f:
        eval_data = pkl.load(f)

    results_by_lang = defaultdict(lambda: defaultdict(int))
    # print(list(zip(eval_data["preds"], eval_data["accents"], eval_data["labels"])))
    for prediction, accent, label in zip(eval_data["preds"], eval_data["accents"], eval_data["labels"]):
        if prediction == label:
            results_by_lang[label]["correct"] += 1
        results_by_lang[label]["total"] += 1
    
    # Compute macro-average accuracy, ignoring accents with < 10 samples
    acc_by_accent = {lang: results_by_lang[lang]["correct"]/results_by_lang[lang]["total"] \
                        for lang in results_by_lang}

    macro_avg = np.mean([acc for lang, acc in acc_by_accent.items() if results_by_lang[lang]["total"] >= 10])
    micro_avg = sum([results_by_lang[lang]["correct"] for lang in results_by_lang]) \
        / sum([results_by_lang[lang]["total"] for lang in results_by_lang])
    std_dev = np.std([acc for lang, acc in acc_by_accent.items() if results_by_lang[lang]["total"] >= 10])
    
    return results_by_lang, macro_avg, micro_avg, std_dev

    
    



In [4]:
def format_percentage(x):
    return round(x * 100, 1)

def get_eval_data(dataset_name, predictions_dir, lang = None, accent = None):

    if dataset_name == "edacc":
        predictions_path = predictions_dir + "/edacc_predictions.pkl"
        if not os.path.exists(predictions_path):
            predictions_path = predictions_dir + "/predictions.pkl"

        if not os.path.exists(predictions_path):
            print(f"No predictions found in the specified directory: {predictions_dir}")
            sys.exit(1)

    elif dataset_name == "cv":
        predictions_path = predictions_dir + "/cv_predictions.pkl"


    elif dataset_name in {"cv_from_hf", "cv_from_hf_l2"}:
        predictions_path = predictions_dir + "/cv_from_hf_predictions.pkl"


    elif dataset_name == "fleurs_test":
        predictions_path = predictions_dir + "/fleurs_test_predictions.pkl"

    with open(predictions_path, "rb") as f:
        eval_data = pkl.load(f)
    
    
    data = []
    for pred, label, accent, audio_file in zip(eval_data["preds"], eval_data["labels"], eval_data["accents"], eval_data["audio_files"]):
        if lang is not None and label != lang:
            continue
        if dataset_name == "cv_from_hf_l2":
            if f"{label}_{accent.strip()}" not in cv_from_hf_l2_accents:
                continue
        if dataset_name == "cv":
            if accent in cv_l1_accents:
                continue
        if dataset_name == "edacc":
            if accent in edacc_l1_accents:
                continue
        speaker_id = get_speaker_id_from_audio_file(dataset_name, audio_file=audio_file, accent=accent, lang=label)
        data.append((pred, label, speaker_id))

    # print(f"Number of samples in the evaluation data: {len(data)}")
    # print(f"Number of speakers in the evaluation data: {len(set([x[2] for x in data]))}")
    return data


def compute_accuracy(pred_labels):
    """Computes accuracy given a list of (pred, label) tuples."""
    correct = sum(1 for pred, label in pred_labels if pred == label)
    return correct / len(pred_labels) if pred_labels else 0.0

def calculate_bootstrap_ci(system_data, n_bootstraps=1000, confidence=0.95):
    """
    Computes bootstrap confidence interval for the accuracy of a single LID system.
    
    :param system_data: List of (pred, label, speaker_id) tuples
    :param n_bootstraps: Number of bootstrap resamples
    :param confidence: Confidence level (e.g., 0.95 for 95% CI)
    :return: (lower_bound, upper_bound) of the accuracy CI
    """
    
    # Organize data by speaker
    speaker_data = {}
    for pred, label, speaker in system_data:
        if speaker not in speaker_data:
            speaker_data[speaker] = []
        speaker_data[speaker].append((pred, label))  # Store predictions and labels per speaker
    
    speakers = list(speaker_data.keys())  # Unique speakers
    boot_accuracies = []

    # Bootstrap resampling
    for _ in range(n_bootstraps):
        resampled_speakers = np.random.choice(speakers, size=len(speakers), replace=True)
        
        # Collect resampled predictions
        resampled_preds = []
        for speaker in resampled_speakers:
            resampled_preds.extend(speaker_data[speaker])

        # Compute accuracy for the resampled dataset
        boot_accuracies.append(compute_accuracy(resampled_preds))

    mean_accuracy = np.mean(boot_accuracies)
    lower = np.percentile(boot_accuracies, (1 - confidence) / 2 * 100)
    upper = np.percentile(boot_accuracies, (1 + confidence) / 2 * 100)

    return mean_accuracy, lower, upper


def get_bootstrap_ci(dataset_name, predictions_dir, lang = None, accent = None):
    '''
    Should return a list of tuples [(pred, label, speaker_id)]
    '''
    
    data = get_eval_data(dataset_name, predictions_dir)
    mean_accuracy, lower, upper = calculate_bootstrap_ci(data)

    return format_percentage(mean_accuracy), format_percentage(lower), format_percentage(upper)



def compute_speaker_accuracy(system_data):
    """
    Computes per-speaker accuracy.
    
    :param system_data: List of (pred, label, speaker_id) tuples
    :return: Dictionary {speaker: accuracy}
    """
    speaker_results = {}
    for pred, label, speaker in system_data:
        if speaker not in speaker_results:
            speaker_results[speaker] = []
        speaker_results[speaker].append(pred == label)  # Store binary correctness
    
    # Compute per-speaker accuracy
    return {speaker: np.mean(correct_list) for speaker, correct_list in speaker_results.items()}

def t_test_speaker_level(system1_data, system2_data):
    """
    Perform an independent t-test on speaker-level accuracies.
    
    :param system1_data: List of (pred, label, speaker_id) tuples for System 1
    :param system2_data: List of (pred, label, speaker_id) tuples for System 2
    :return: t-statistic, p-value
    """
    acc_sys1 = list(compute_speaker_accuracy(system1_data).values())
    acc_sys2 = list(compute_speaker_accuracy(system2_data).values())

    # Welch's t-test (assumes unequal variances)
    t_stat, p_value = ttest_ind(acc_sys1, acc_sys2, equal_var=False)
    
    return t_stat, p_value



def compute_samplewise_correctness(preds, labels):
    """Converts (pred, label) pairs into binary correctness (1 for correct, 0 for incorrect)."""
    return [1 if pred == label else 0 for pred, label in zip(preds, labels)]

def paired_t_test(system1_data, system2_data):
    """
    Performs a paired t-test on per-sample accuracy between two systems.
    
    :param system1_data: List of (pred, label) tuples for System 1
    :param system2_data: List of (pred, label) tuples for System 2
    :return: t-statistic, p-value
    """
    assert len(system1_data) == len(system2_data), "Both systems must have predictions for the same samples."
    
    # Extract predictions and labels
    preds1, labels1 = zip(*system1_data)
    preds2, labels2 = zip(*system2_data)

    # Convert to binary correctness (1 = correct, 0 = incorrect)
    correct1 = compute_samplewise_correctness(preds1, labels1)
    correct2 = compute_samplewise_correctness(preds2, labels2)

    # Perform a paired t-test (Welch’s version handles unequal variances)
    t_stat, p_value = ttest_rel(correct1, correct2)

    return t_stat, p_value


def mcnemar_test(system1_data, system2_data):
    """
    Perform McNemar's test to compare two classifiers' error distributions.
    
    :param system1_data: List of (pred, label) tuples for System 1
    :param system2_data: List of (pred, label) tuples for System 2
    :return: Chi-squared statistic, p-value
    """
    assert len(system1_data) == len(system2_data), "Both systems must have predictions for the same samples."
    
    # Initialize contingency table counts
    A = B = C = D = 0
    
    for (pred1, label), (pred2, _) in zip(system1_data, system2_data):
        correct1 = pred1 == label
        correct2 = pred2 == label
        
        if correct1 and correct2:
            A += 1  # Both correct
        elif correct1 and not correct2:
            B += 1  # System 1 correct, System 2 incorrect
        elif not correct1 and correct2:
            C += 1  # System 1 incorrect, System 2 correct
        else:
            D += 1  # Both incorrect
    
    # McNemar's test only uses B and C
    contingency_table = [[A, B], [C, D]]
    
    chi2_stat, p_value, _, _ = chi2_contingency(contingency_table, correction=True)

    return chi2_stat, p_value



def significance_statistics(dataset_name, predictions_dir1, predictions_dir2, lang = None, accent = None, test_name = "ttest"):
    '''
    Should return a list of tuples [(pred, label, speaker_id)]
    '''
    
    data1 = get_eval_data(dataset_name, predictions_dir1, lang, accent)
    data2 = get_eval_data(dataset_name, predictions_dir2, lang, accent)
    # print(f"Number of speakers in system 1: {len(set([x[2] for x in data1]))}")
    # print(f"Number of speakers in system 2: {len(set([x[2] for x in data2]))}")

    if test_name == "ttest":
        stat, p_value = t_test_speaker_level(data1, data2)
    
    elif test_name == "paired_ttest":
        # Let's sort both lists by speaker_id

        data1 = sorted(data1, key=lambda x: x[2])
        data2 = sorted(data2, key=lambda x: x[2])

        assert [x[2] for x in data1] == [x[2] for x in data2], "Speaker IDs do not match between the two systems."

        data1 = [(x[0], x[1]) for x in data1]
        data2 = [(x[0], x[1]) for x in data2]
        stat, p_value = paired_t_test(data1, data2)

    elif test_name == "mcnemar":
        data1 = sorted(data1, key=lambda x: x[2])
        data2 = sorted(data2, key=lambda x: x[2])

        assert [x[2] for x in data1] == [x[2] for x in data2], "Speaker IDs do not match between the two systems."

        data1 = [(x[0], x[1]) for x in data1]
        data2 = [(x[0], x[1]) for x in data2]

        stat, p_value = mcnemar_test(data1, data2)

    

    return stat, p_value

In [6]:
# Required format:

# 							FLEURS	EdAcc	CV
# ET
# duseqs,w2v2-att4-1000
# phoneseqs,vl107,att=8
# ET+phoneseqs_combo
# ET+duseq_train
# ET+duseqembed
# ET+phoneseqs_train


approach2dir = {
    "ET": "/home/hltcoe/nbafna/projects/mitigating-accent-bias-in-lid/prelim_evals/preds/formatted",
    "duseqs": "/exp/nbafna/projects/mitigating-accent-bias-in-lid/wav2vec2_intermediate_outputs/vl107/wav2vec2-base-layer8-1000/cnn-attentions-linear-4/lid_model_outputs/",
    "ET+duseq-train": "/exp/nbafna/projects/mitigating-accent-bias-in-lid/reps_phoneseqs_duseqs_exps/vl107/ecapa-tdnn_wav2vec2-xlsr-53-espeak-cv-ft_wav2vec2-base-layer8-1000/attentions-linear-4/reps-phoneseq-duseqs_lid_model_outputs",
    "ET+duseqembed-train": "/exp/nbafna/projects/mitigating-accent-bias-in-lid/reps_phoneseqs_duseqembed_exps/vl107/ecapa-tdnn_wav2vec2-xlsr-53-espeak-cv-ft_wav2vec2-base-layer8-1000/attentions-linear-4/reps-phoneseq-duseqs_lid_model_outputs",
    "phoneseqs": "/exp/nbafna/projects/mitigating-accent-bias-in-lid/phoneseq_exps/vl107/wav2vec2-xlsr-53-espeak-cv-ft/attentions-linear-8/phoneseq_lid_model_outputs/",
    "ET+phoneseqs": "/exp/nbafna/projects/mitigating-accent-bias-in-lid/dists-phoneseq-systemcombo_exps/vl107/ecapa-tdnn_wav2vec2-xlsr-53-espeak-cv-ft/attentions-linear-8/lid_model_outputs/",
    "ET+phoneseqs-train": "/exp/nbafna/projects/mitigating-accent-bias-in-lid/reps-phoneseq_exps/vl107/ecapa-tdnn_wav2vec2-xlsr-53-espeak-cv-ft/attentions-linear-8/reps-phoneseq_lid_model_outputs/",

}

dataset_to_processor = {"fleurs_test": read_fleurs_results, "cv": read_cv_results,\
                         "edacc": read_edacc_results, "cv_from_hf": read_cv_from_hf_results, \
                            "cv_from_hf_l2": read_cv_from_hf_l2_results}

results = defaultdict(lambda: defaultdict(dict))
for approach, dir in approach2dir.items():
    # for dataset in ["fleurs_test", "edacc", "cv", "cv_from_hf", "cv_from_hf_l2"]:
    for dataset in ["fleurs_test", "edacc", "cv", "cv_from_hf_l2"]:
        results_by_variety, macro_avg, micro_avg, std_dev = dataset_to_processor[dataset](dir)
        results[approach][dataset]["macro_avg"] = format_percentage(macro_avg)
        results[approach][dataset]["micro_avg"] = format_percentage(micro_avg)
        results[approach][dataset]["std_dev"] = format_percentage(std_dev)
        mean_accuracy, lower, upper = get_bootstrap_ci(dataset, dir)
        results[approach][dataset]["bootstrap_mean"] = mean_accuracy
        results[approach][dataset]["bootstrap_lower"] = lower
        results[approach][dataset]["bootstrap_upper"] = upper
        # Significance test against ET for each dataset and approach
        results[approach][dataset]["paired_ttest_sig"] = False
        if approach == "ET":
            continue
        stat, p_value = significance_statistics(dataset, approach2dir["ET"], dir, test_name="mcnemar")
        if p_value < 0.05:
            results[approach][dataset]["paired_ttest_sig"] = True

        


In [7]:
approach2key = {
    "ET": r"\et",
    "duseqs": r"\duseq",
    "ET+duseq-train": r"\etdutrain",
    "ET+duseqembed-train": r"\etduembedtrain",
    "phoneseqs": r"\phoneseq",
    "ET+phoneseqs": r"\etps",
    "ET+phoneseqs-train": r"\etpstrain",
}
approaches = ["ET",  "ET+phoneseqs-train",  "ET+phoneseqs", "phoneseqs", "duseqs", "ET+duseq-train", "ET+duseqembed-train",]

results_formatted = {approach2key[approach]: {dataset: f"{results[approach][dataset]["bootstrap_mean"]} ({results[approach][dataset]["bootstrap_lower"]},{results[approach][dataset]["bootstrap_upper"]}){r"$^\dagger$" if (approach != "ET" and not results[approach][dataset]["paired_ttest_sig"]) else ""}  & {results[approach][dataset]["macro_avg"]}{chr(177)}{results[approach][dataset]["std_dev"]}" for dataset in results[approach]} for approach in approaches}
df = pd.DataFrame(results_formatted)

print(df.T.to_latex())

\begin{tabular}{lllll}
\toprule
 & fleurs_test & edacc & cv & cv_from_hf_l2 \\
\midrule
\et & 89.3 (89.0,89.6)  & 89.5±17.2 & 47.6 (39.8,55.6)  & 54.8±26.5 & 31.7 (22.5,43.4)  & 48.8±18.0 & 63.7 (54.0,73.1)  & 68.4±22.2 \\
\etpstrain & 86.6 (86.2,87.0)  & 86.4±18.2 & 57.3 (48.6,66.5)  & 63.4±25.5 & 68.3 (60.8,75.6)  & 80.6±11.4 & 73.0 (63.2,80.7)  & 76.0±13.9 \\
\etps & 89.5 (89.2,89.9)  & 89.5±17.8 & 52.0 (44.5,60.7)  & 59.0±26.4 & 44.5 (35.3,54.9)  & 63.8±15.6 & 66.8 (56.6,75.5)  & 73.6±20.3 \\
\phoneseq & 52.9 (52.1,53.7)  & 52.5±22.7 & 37.3 (30.5,44.1)  & 44.4±22.6 & 45.5 (39.0,53.1)  & 64.5±13.2 & 48.7 (40.4,56.2)  & 51.6±14.9 \\
\duseq & 49.6 (48.9,50.4)  & 49.8±18.3 & 42.5 (37.5,47.8)  & 48.4±18.0 & 47.0 (37.8,55.5)  & 63.1±12.5 & 48.0 (41.1,55.0)  & 48.2±19.2 \\
\etdutrain & 84.7 (84.2,85.1)  & 84.9±18.9 & 50.7 (43.0,57.9)  & 57.6±24.3 & 46.5 (38.0,56.3)  & 63.5±14.9 & 68.6 (60.1,76.2)  & 70.0±22.7 \\
\etduembedtrain & 84.2 (83.8,84.7)  & 84.2±20.2 & 53.2 (46.3,60.8)  & 60.0±23

In [13]:
approach2key = {
    "ET": r"\et",
    "duseqs": r"\duseq",
    "ET+duseq-train": r"\etdutrain",
    "ET+duseqembed-train": r"\etduembedtrain",
    "phoneseqs": r"\phoneseq",
    "ET+phoneseqs": r"\etps",
    "ET+phoneseqs-train": r"\etpstrain",
}
approaches = ["ET",  "ET+phoneseqs-train",  "ET+phoneseqs", "phoneseqs", "duseqs", "ET+duseq-train", "ET+duseqembed-train",]

results_formatted = {approach2key[approach]: {dataset: f"{results[approach][dataset]["micro_avg"]} ({results[approach][dataset]["bootstrap_lower"]},{results[approach][dataset]["bootstrap_upper"]}){r"$^\dagger$" if (approach != "ET" and not results[approach][dataset]["paired_ttest_sig"]) else ""}  & {results[approach][dataset]["macro_avg"]}{chr(177)}{results[approach][dataset]["std_dev"]}" for dataset in results[approach]} for approach in approaches}
df = pd.DataFrame(results_formatted)

print(df.T.to_latex())

\begin{tabular}{lllll}
\toprule
 & fleurs_test & edacc & cv & cv_from_hf_l2 \\
\midrule
\et & 89.3 (89.0,89.6)  & 89.5±17.2 & 47.7 (40.1,56.2)  & 55.8±26.8 & 33.9 (23.7,48.4)  & 57.0±24.6 & 63.6 (54.2,73.1)  & 68.4±22.2 \\
\etpstrain & 86.6 (86.1,87.0)  & 86.4±18.2 & 57.1 (48.9,65.6)  & 64.0±25.4 & 68.8 (61.2,76.1)  & 81.6±10.7 & 72.8 (63.3,81.0)  & 76.0±13.9 \\
\etps & 89.5 (89.2,89.9)$^\dagger$  & 89.5±17.8 & 52.0 (43.9,60.5)  & 59.8±26.5 & 46.1 (37.3,57.2)  & 69.1±18.6 & 66.8 (56.6,75.7)  & 73.6±20.3 \\
\phoneseq & 52.9 (52.1,53.7)  & 52.5±22.7 & 37.3 (30.5,44.2)  & 45.2±22.8 & 47.1 (40.9,54.8)  & 67.3±13.6 & 48.8 (40.6,56.0)  & 51.6±14.9 \\
\duseq & 49.6 (48.9,50.3)  & 49.8±18.3 & 42.6 (37.3,48.0)  & 48.6±17.8 & 48.5 (39.4,56.7)  & 66.3±13.5 & 48.1 (40.9,55.2)  & 48.2±19.2 \\
\etdutrain & 84.7 (84.3,85.1)  & 84.9±18.9 & 50.7 (43.0,58.1)  & 58.3±24.4 & 47.8 (39.3,58.0)  & 67.0±15.6 & 68.4 (59.8,76.6)  & 70.0±22.7 \\
\etduembedtrain & 84.3 (83.8,84.7)  & 84.2±20.2 & 53.4 (45.9,60.9) 

In [12]:
approach2dir = {
    "ET": "/home/hltcoe/nbafna/projects/mitigating-accent-bias-in-lid/prelim_evals/preds/formatted",
    "duseqs": "/exp/nbafna/projects/mitigating-accent-bias-in-lid/wav2vec2_intermediate_outputs/vl107/wav2vec2-base-layer8-1000/cnn-attentions-linear-4/lid_model_outputs/",
    "ET+duseq-train": "/exp/nbafna/projects/mitigating-accent-bias-in-lid/reps_phoneseqs_duseqs_exps/vl107/ecapa-tdnn_wav2vec2-xlsr-53-espeak-cv-ft_wav2vec2-base-layer8-1000/attentions-linear-4/reps-phoneseq-duseqs_lid_model_outputs",
    "ET+duseqembed-train": "/exp/nbafna/projects/mitigating-accent-bias-in-lid/reps_phoneseqs_duseqembed_exps/vl107/ecapa-tdnn_wav2vec2-xlsr-53-espeak-cv-ft_wav2vec2-base-layer8-1000/attentions-linear-4/reps-phoneseq-duseqs_lid_model_outputs",
    "phoneseqs": "/exp/nbafna/projects/mitigating-accent-bias-in-lid/phoneseq_exps/vl107/wav2vec2-xlsr-53-espeak-cv-ft/attentions-linear-8/phoneseq_lid_model_outputs/",
    "ET+phoneseqs": "/exp/nbafna/projects/mitigating-accent-bias-in-lid/dists-phoneseq-systemcombo_exps/vl107/ecapa-tdnn_wav2vec2-xlsr-53-espeak-cv-ft/attentions-linear-8/lid_model_outputs/",
    "ET+phoneseqs-train": "/exp/nbafna/projects/mitigating-accent-bias-in-lid/reps-phoneseq_exps/vl107/ecapa-tdnn_wav2vec2-xlsr-53-espeak-cv-ft/attentions-linear-8/reps-phoneseq_lid_model_outputs/",
}
# datasets = ["cv_from_hf", "edacc", "cv", "fleurs_test", "cv_from_hf_l2"]
datasets = ["cv_from_hf_l2"]
for dataset in datasets:
    for approach, dir in approach2dir.items():
        print(f"{dataset} {approach}")
        mean_accuracy, lower, upper = get_bootstrap_ci(dataset, dir)
        print(f"Mean accuracy: {mean_accuracy}")
        print(f"CI: ({lower}, {upper})")
        print("\n")


cv_from_hf_l2 ET
Mean accuracy: 70.7
CI: (65.1, 76.0)


cv_from_hf_l2 duseqs
Mean accuracy: 50.4
CI: (45.9, 55.0)


cv_from_hf_l2 ET+duseq-train
Mean accuracy: 71.6
CI: (66.1, 76.9)


cv_from_hf_l2 ET+duseqembed-train
Mean accuracy: 69.3
CI: (63.7, 74.3)


cv_from_hf_l2 phoneseqs
Mean accuracy: 49.1
CI: (43.6, 54.5)


cv_from_hf_l2 ET+phoneseqs
Mean accuracy: 71.3
CI: (65.3, 76.5)


cv_from_hf_l2 ET+phoneseqs-train
Mean accuracy: 74.8
CI: (69.4, 80.1)




In [26]:
results_by_variety, macro_avg, micro_avg, std_dev = dataset_to_processor["cv"](approach2dir["CV"])

In [27]:
results_by_variety

defaultdict(<function __main__.read_edacc_results.<locals>.<lambda>()>,
            {'sinhalese': defaultdict(int, {'total': 81, 'correct': 20}),
             'lithuanian': defaultdict(int, {'total': 388, 'correct': 219}),
             'bulgarian': defaultdict(int, {'correct': 44, 'total': 267}),
             'chinese': defaultdict(int, {'correct': 239, 'total': 343}),
             'shona': defaultdict(int, {'total': 57, 'correct': 21}),
             'catalan': defaultdict(int, {'total': 511, 'correct': 150}),
             'spanish': defaultdict(int, {'total': 642, 'correct': 310}),
             'romanian': defaultdict(int, {'correct': 107, 'total': 384}),
             'indian': defaultdict(int, {'correct': 216, 'total': 369}),
             'colombian': defaultdict(int, {'total': 135, 'correct': 98}),
             'nigerian': defaultdict(int, {'total': 1113, 'correct': 368}),
             'french': defaultdict(int, {'correct': 68, 'total': 167}),
             'mexican': defaultdict(int