In [42]:
import sys
import datasets
import json
import pickle as pkl
import os

In [None]:
# Merging VL107
#
# For each language directory under `training_units/`, join the records stored in
# that language's `kmeans_data.pkl` with the rows of its `{lang}_transcriptions.jsonl`
# (paired on audio file name) and write the combined rows to `{lang}_data.jsonl`.

langs = os.listdir("/exp/nbafna/projects/mitigating-accent-bias-in-lid/wav2vec2_intermediate_outputs/vl107/wav2vec2-base-layer8-1000/training_units/")
assert len(langs) == 107, f"Expected 107 language directories, found {len(langs)}"

# NOTE(review): `langs[2:]` skips the first two entries returned by `os.listdir`,
# whose order is arbitrary, so this code does not pin down WHICH two languages are
# skipped. Presumably a leftover from resuming an earlier run -- confirm, and drop
# the slice if all 107 languages should be merged on a fresh run.
for lang in langs[2:]:
    print(f"Processing {lang}")

    input_path_et_trans = f"/exp/nbafna/projects/mitigating-accent-bias-in-lid/reps_and_phoneseqs/vl107/ecapa-tdnn_wav2vec2-xlsr-53-espeak-cv-ft/{lang}_transcriptions.jsonl"
    input_path_ssl_units = f"/exp/nbafna/projects/mitigating-accent-bias-in-lid/wav2vec2_intermediate_outputs/vl107/wav2vec2-base-layer8-1000/training_units/{lang}/kmeans_data.pkl"
    outfile = f"/exp/nbafna/projects/mitigating-accent-bias-in-lid/reps_phoneseqs_duseqs_exps/vl107/ecapa-tdnn_wav2vec2-xlsr-53-espeak-cv-ft_wav2vec2-base-layer8-kmeans1000/{lang}_data.jsonl"

    # Create the output directory up front so the final write cannot fail on a
    # missing folder.
    os.makedirs(os.path.dirname(outfile), exist_ok=True)

    # Loading the SSL unit data as an HF dataset.
    # pickle can execute arbitrary code on load; only read files produced by a trusted pipeline.
    with open(input_path_ssl_units, 'rb') as f:
        ssl_units_data = pkl.load(f)

    duseq_dataset = datasets.Dataset.from_dict(ssl_units_data)
    et_phoneseq_datset = datasets.load_dataset('json', data_files=input_path_et_trans)["train"]

    # Both sources must contain the same number of rows, otherwise pairing by
    # sorted position below would be meaningless.
    assert len(duseq_dataset) == len(et_phoneseq_datset), (
        f"{lang}: {len(duseq_dataset)} unit rows vs {len(et_phoneseq_datset)} transcription rows"
    )
    print(f"Length of datasets: {len(duseq_dataset)}")

    # Plain-dict records; the unit side keeps only the file name and the sequence.
    unit_records = [
        {"audio_file": row["all_audio_files"], "sequences": row["sequences"]}
        for row in duseq_dataset
    ]
    phone_records = [dict(row) for row in et_phoneseq_datset]

    # Sort both sides by audio file name so matching rows end up at the same index.
    unit_records = sorted(unit_records, key=lambda rec: rec["audio_file"])
    phone_records = sorted(phone_records, key=lambda rec: rec["audio_file"])

    # Merge the two datasets: copy every transcription field onto the unit record.
    # NOTE(review): a transcription field named "sequences" would silently replace
    # the unit sequence here -- confirm the JSONL schema has no such field.
    for unit_rec, phone_rec in zip(unit_records, phone_records):
        assert unit_rec["audio_file"] == phone_rec["audio_file"], (
            f"{lang}: audio file mismatch: {unit_rec['audio_file']} != {phone_rec['audio_file']}"
        )
        unit_rec.update(phone_rec)

    # Pivot the row records into columns; column names come from the first record,
    # so an empty language would raise IndexError here.
    merged_columns = {col: [rec[col] for rec in unit_records] for col in unit_records[0]}

    merged_dataset = datasets.Dataset.from_dict(merged_columns)
    print(f"Length of merged dataset: {len(merged_dataset)}")

    # Save the dataset
    merged_dataset.to_json(outfile)


Processing am
Length of datasets: 32368
Length of merged dataset: 32368


Creating json from Arrow format:   0%|          | 0/33 [00:00<?, ?ba/s]

Processing as
Length of datasets: 61797
Length of merged dataset: 61797


Creating json from Arrow format:   0%|          | 0/62 [00:00<?, ?ba/s]

Processing ar
Length of datasets: 23947
Length of merged dataset: 23947


Creating json from Arrow format:   0%|          | 0/24 [00:00<?, ?ba/s]

Processing af
Length of datasets: 42892
Length of merged dataset: 42892


Creating json from Arrow format:   0%|          | 0/43 [00:00<?, ?ba/s]

Processing ab
Length of datasets: 4137
Length of merged dataset: 4137


Creating json from Arrow format:   0%|          | 0/5 [00:00<?, ?ba/s]

Processing az
Length of datasets: 23200
Length of merged dataset: 23200


Creating json from Arrow format:   0%|          | 0/24 [00:00<?, ?ba/s]

Processing bg
Length of datasets: 20011
Length of merged dataset: 20011


Creating json from Arrow format:   0%|          | 0/21 [00:00<?, ?ba/s]

Processing bn
Length of datasets: 21623
Length of merged dataset: 21623


Creating json from Arrow format:   0%|          | 0/22 [00:00<?, ?ba/s]

Processing bo
Length of datasets: 41135
Length of merged dataset: 41135


Creating json from Arrow format:   0%|          | 0/42 [00:00<?, ?ba/s]

Processing br
Length of datasets: 16285
Length of merged dataset: 16285


Creating json from Arrow format:   0%|          | 0/17 [00:00<?, ?ba/s]

Processing bs
Length of datasets: 42810
Length of merged dataset: 42810


Creating json from Arrow format:   0%|          | 0/43 [00:00<?, ?ba/s]

Processing ca
Length of datasets: 35036
Length of merged dataset: 35036


Creating json from Arrow format:   0%|          | 0/36 [00:00<?, ?ba/s]

Processing ceb
Length of datasets: 2288
Length of merged dataset: 2288


Creating json from Arrow format:   0%|          | 0/3 [00:00<?, ?ba/s]

Processing cs
Length of datasets: 26321
Length of merged dataset: 26321


Creating json from Arrow format:   0%|          | 0/27 [00:00<?, ?ba/s]

Processing cy
Length of datasets: 30444
Length of merged dataset: 30444


Creating json from Arrow format:   0%|          | 0/31 [00:00<?, ?ba/s]

Processing da
Length of datasets: 10934
Length of merged dataset: 10934


Creating json from Arrow format:   0%|          | 0/11 [00:00<?, ?ba/s]

Processing de
Length of datasets: 17245
Length of merged dataset: 17245


Creating json from Arrow format:   0%|          | 0/18 [00:00<?, ?ba/s]

Processing el
Length of datasets: 25732
Length of merged dataset: 25732


Creating json from Arrow format:   0%|          | 0/26 [00:00<?, ?ba/s]

Processing en
Length of datasets: 21401
Length of merged dataset: 21401


Creating json from Arrow format:   0%|          | 0/22 [00:00<?, ?ba/s]

Processing eo
Length of datasets: 3801
Length of merged dataset: 3801


Creating json from Arrow format:   0%|          | 0/4 [00:00<?, ?ba/s]

Processing es
Length of datasets: 16963
Length of merged dataset: 16963


Creating json from Arrow format:   0%|          | 0/17 [00:00<?, ?ba/s]

Processing et
Length of datasets: 16520
Length of merged dataset: 16520


Creating json from Arrow format:   0%|          | 0/17 [00:00<?, ?ba/s]

Processing eu
Length of datasets: 11422
Length of merged dataset: 11422


Creating json from Arrow format:   0%|          | 0/12 [00:00<?, ?ba/s]

Processing fa
Length of datasets: 23112
Length of merged dataset: 23112


Creating json from Arrow format:   0%|          | 0/24 [00:00<?, ?ba/s]

Processing fi
Length of datasets: 14546
Length of merged dataset: 14546


Creating json from Arrow format:   0%|          | 0/15 [00:00<?, ?ba/s]

Processing fo
Length of datasets: 27147
Length of merged dataset: 27147


Creating json from Arrow format:   0%|          | 0/28 [00:00<?, ?ba/s]

Processing fr
Length of datasets: 27549
Length of merged dataset: 27549


Creating json from Arrow format:   0%|          | 0/28 [00:00<?, ?ba/s]

Processing gl
Length of datasets: 28660
Length of merged dataset: 28660


Creating json from Arrow format:   0%|          | 0/29 [00:00<?, ?ba/s]

Processing gn
Length of datasets: 742
Length of merged dataset: 742


Creating json from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

Processing gv
Length of datasets: 1621
Length of merged dataset: 1621


Creating json from Arrow format:   0%|          | 0/2 [00:00<?, ?ba/s]

Processing gu
Length of datasets: 18371
Length of merged dataset: 18371


Creating json from Arrow format:   0%|          | 0/19 [00:00<?, ?ba/s]

Processing ha
Length of datasets: 36465
Length of merged dataset: 36465


Creating json from Arrow format:   0%|          | 0/37 [00:00<?, ?ba/s]

Processing haw
Length of datasets: 4362
Length of merged dataset: 4362


Creating json from Arrow format:   0%|          | 0/5 [00:00<?, ?ba/s]

Processing hi
Length of datasets: 32808
Length of merged dataset: 32808


Creating json from Arrow format:   0%|          | 0/33 [00:00<?, ?ba/s]

Processing hr
Length of datasets: 48060
Length of merged dataset: 48060


Creating json from Arrow format:   0%|          | 0/49 [00:00<?, ?ba/s]

Processing ht
Length of datasets: 38273
Length of merged dataset: 38273


Creating json from Arrow format:   0%|          | 0/39 [00:00<?, ?ba/s]

Processing hu
Length of datasets: 29330
Length of merged dataset: 29330


Creating json from Arrow format:   0%|          | 0/30 [00:00<?, ?ba/s]

Processing hy
Length of datasets: 28474
Length of merged dataset: 28474


Creating json from Arrow format:   0%|          | 0/29 [00:00<?, ?ba/s]

Processing ia
Length of datasets: 975
Length of merged dataset: 975


Creating json from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

Processing id
Length of datasets: 15015
Length of merged dataset: 15015


Creating json from Arrow format:   0%|          | 0/16 [00:00<?, ?ba/s]

Processing is
Length of datasets: 36025
Length of merged dataset: 36025


Creating json from Arrow format:   0%|          | 0/37 [00:00<?, ?ba/s]

Processing it
Length of datasets: 20755
Length of merged dataset: 20755


Creating json from Arrow format:   0%|          | 0/21 [00:00<?, ?ba/s]

Processing iw
Length of datasets: 38562
Length of merged dataset: 38562


Creating json from Arrow format:   0%|          | 0/39 [00:00<?, ?ba/s]

Processing ja
Length of datasets: 21969
Length of merged dataset: 21969


Creating json from Arrow format:   0%|          | 0/22 [00:00<?, ?ba/s]

Processing jw
Length of datasets: 17887
Length of merged dataset: 17887


Creating json from Arrow format:   0%|          | 0/18 [00:00<?, ?ba/s]

Processing ka
Length of datasets: 40431
Length of merged dataset: 40431


Creating json from Arrow format:   0%|          | 0/41 [00:00<?, ?ba/s]

Processing kk
Length of datasets: 31303
Length of merged dataset: 31303


Creating json from Arrow format:   0%|          | 0/32 [00:00<?, ?ba/s]

Processing km
Length of datasets: 14767
Length of merged dataset: 14767


Creating json from Arrow format:   0%|          | 0/15 [00:00<?, ?ba/s]

Processing kn
Length of datasets: 18314
Length of merged dataset: 18314


Creating json from Arrow format:   0%|          | 0/19 [00:00<?, ?ba/s]

Processing ko
Length of datasets: 30082
Length of merged dataset: 30082


Creating json from Arrow format:   0%|          | 0/31 [00:00<?, ?ba/s]

Processing la
Length of datasets: 24760
Length of merged dataset: 24760


Creating json from Arrow format:   0%|          | 0/25 [00:00<?, ?ba/s]

Processing lb
Length of datasets: 31129
Length of merged dataset: 31129


Creating json from Arrow format:   0%|          | 0/32 [00:00<?, ?ba/s]

Processing ln
Length of datasets: 35222
Length of merged dataset: 35222


Creating json from Arrow format:   0%|          | 0/36 [00:00<?, ?ba/s]

Processing lo
Length of datasets: 16042
Length of merged dataset: 16042


Creating json from Arrow format:   0%|          | 0/17 [00:00<?, ?ba/s]

Processing lt
Length of datasets: 33462
Length of merged dataset: 33462


Creating json from Arrow format:   0%|          | 0/34 [00:00<?, ?ba/s]

Processing lv
Length of datasets: 18213
Length of merged dataset: 18213


Creating json from Arrow format:   0%|          | 0/19 [00:00<?, ?ba/s]

Processing mg
Length of datasets: 42055
Length of merged dataset: 42055


Creating json from Arrow format:   0%|          | 0/43 [00:00<?, ?ba/s]

Processing mi
Length of datasets: 12320
Length of merged dataset: 12320


Creating json from Arrow format:   0%|          | 0/13 [00:00<?, ?ba/s]

Processing mk
Length of datasets: 45258
Length of merged dataset: 45258


Creating json from Arrow format:   0%|          | 0/46 [00:00<?, ?ba/s]

Processing ml
Length of datasets: 19282
Length of merged dataset: 19282


Creating json from Arrow format:   0%|          | 0/20 [00:00<?, ?ba/s]

Processing mn
Length of datasets: 28493
Length of merged dataset: 28493


Creating json from Arrow format:   0%|          | 0/29 [00:00<?, ?ba/s]

Processing mr
Length of datasets: 33008
Length of merged dataset: 33008


Creating json from Arrow format:   0%|          | 0/34 [00:00<?, ?ba/s]

Processing ms
Length of datasets: 31088
Length of merged dataset: 31088


Creating json from Arrow format:   0%|          | 0/32 [00:00<?, ?ba/s]

Processing mt
Length of datasets: 27042
Length of merged dataset: 27042


Creating json from Arrow format:   0%|          | 0/28 [00:00<?, ?ba/s]

Processing my
Length of datasets: 15868
Length of merged dataset: 15868


Creating json from Arrow format:   0%|          | 0/16 [00:00<?, ?ba/s]

Processing ne
Length of datasets: 28754
Length of merged dataset: 28754


Creating json from Arrow format:   0%|          | 0/29 [00:00<?, ?ba/s]

Processing nl
Length of datasets: 15835
Length of merged dataset: 15835


Creating json from Arrow format:   0%|          | 0/16 [00:00<?, ?ba/s]

Processing nn
Length of datasets: 21992
Length of merged dataset: 21992


Creating json from Arrow format:   0%|          | 0/22 [00:00<?, ?ba/s]

Processing no
Length of datasets: 42204
Length of merged dataset: 42204


Creating json from Arrow format:   0%|          | 0/43 [00:00<?, ?ba/s]

Processing oc
Length of datasets: 6228
Length of merged dataset: 6228


Creating json from Arrow format:   0%|          | 0/7 [00:00<?, ?ba/s]

Processing pa
Length of datasets: 21066
Length of merged dataset: 21066


Creating json from Arrow format:   0%|          | 0/22 [00:00<?, ?ba/s]

Processing pl
Length of datasets: 32836
Length of merged dataset: 32836


Creating json from Arrow format:   0%|          | 0/33 [00:00<?, ?ba/s]

Processing ps
Length of datasets: 19793
Length of merged dataset: 19793


Creating json from Arrow format:   0%|          | 0/20 [00:00<?, ?ba/s]

Processing pt
Length of datasets: 26070
Length of merged dataset: 26070


Creating json from Arrow format:   0%|          | 0/27 [00:00<?, ?ba/s]

Processing ro
Length of datasets: 25739
Length of merged dataset: 25739


Creating json from Arrow format:   0%|          | 0/26 [00:00<?, ?ba/s]

Processing ru
Length of datasets: 31816
Length of merged dataset: 31816


Creating json from Arrow format:   0%|          | 0/32 [00:00<?, ?ba/s]

Processing sa
Length of datasets: 4787
Length of merged dataset: 4787


Creating json from Arrow format:   0%|          | 0/5 [00:00<?, ?ba/s]

Processing sco
Length of datasets: 967
Length of merged dataset: 967


Creating json from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

Processing sd
Length of datasets: 31667
Length of merged dataset: 31667


Creating json from Arrow format:   0%|          | 0/32 [00:00<?, ?ba/s]

Processing si
Length of datasets: 26930
Length of merged dataset: 26930


Creating json from Arrow format:   0%|          | 0/27 [00:00<?, ?ba/s]

Processing sk
Length of datasets: 15790
Length of merged dataset: 15790


Creating json from Arrow format:   0%|          | 0/16 [00:00<?, ?ba/s]

Processing sl
Length of datasets: 49662
Length of merged dataset: 49662


Creating json from Arrow format:   0%|          | 0/50 [00:00<?, ?ba/s]

Processing sn
Length of datasets: 11856
Length of merged dataset: 11856


Creating json from Arrow format:   0%|          | 0/12 [00:00<?, ?ba/s]

Processing so
Length of datasets: 41300
Length of merged dataset: 41300


Creating json from Arrow format:   0%|          | 0/42 [00:00<?, ?ba/s]

Processing sq
Length of datasets: 28068
Length of merged dataset: 28068


Creating json from Arrow format:   0%|          | 0/29 [00:00<?, ?ba/s]

Processing sr
Length of datasets: 20583
Length of merged dataset: 20583


Creating json from Arrow format:   0%|          | 0/21 [00:00<?, ?ba/s]

Processing su
Length of datasets: 21823
Length of merged dataset: 21823


Creating json from Arrow format:   0%|          | 0/22 [00:00<?, ?ba/s]

Processing sv
Length of datasets: 13798
Length of merged dataset: 13798


Creating json from Arrow format:   0%|          | 0/14 [00:00<?, ?ba/s]

Processing sw
Length of datasets: 25427
Length of merged dataset: 25427


Creating json from Arrow format:   0%|          | 0/26 [00:00<?, ?ba/s]

Processing ta
Length of datasets: 20504
Length of merged dataset: 20504


Creating json from Arrow format:   0%|          | 0/21 [00:00<?, ?ba/s]

Processing te
Length of datasets: 30933
Length of merged dataset: 30933


Creating json from Arrow format:   0%|          | 0/31 [00:00<?, ?ba/s]

Processing tg
Length of datasets: 26387
Length of merged dataset: 26387


Creating json from Arrow format:   0%|          | 0/27 [00:00<?, ?ba/s]

Processing th
Length of datasets: 22732
Length of merged dataset: 22732


Creating json from Arrow format:   0%|          | 0/23 [00:00<?, ?ba/s]

Processing tk
Length of datasets: 36146
Length of merged dataset: 36146


Creating json from Arrow format:   0%|          | 0/37 [00:00<?, ?ba/s]

Processing tl
Length of datasets: 35493
Length of merged dataset: 35493


Creating json from Arrow format:   0%|          | 0/36 [00:00<?, ?ba/s]

Processing tr
Length of datasets: 23760
Length of merged dataset: 23760


Creating json from Arrow format:   0%|          | 0/24 [00:00<?, ?ba/s]

Processing tt
Length of datasets: 40086
Length of merged dataset: 40086


Creating json from Arrow format:   0%|          | 0/41 [00:00<?, ?ba/s]

Processing uk
Length of datasets: 21593
Length of merged dataset: 21593


Creating json from Arrow format:   0%|          | 0/22 [00:00<?, ?ba/s]

Processing ur
Length of datasets: 18591
Length of merged dataset: 18591


Creating json from Arrow format:   0%|          | 0/19 [00:00<?, ?ba/s]

Processing uz
Length of datasets: 18186
Length of merged dataset: 18186


Creating json from Arrow format:   0%|          | 0/19 [00:00<?, ?ba/s]

Processing vi
Length of datasets: 23859
Length of merged dataset: 23859


Creating json from Arrow format:   0%|          | 0/24 [00:00<?, ?ba/s]

Processing war
Length of datasets: 3750
Length of merged dataset: 3750


Creating json from Arrow format:   0%|          | 0/4 [00:00<?, ?ba/s]

Processing yi
Length of datasets: 17672
Length of merged dataset: 17672


Creating json from Arrow format:   0%|          | 0/18 [00:00<?, ?ba/s]

Processing yo
Length of datasets: 36549
Length of merged dataset: 36549


Creating json from Arrow format:   0%|          | 0/37 [00:00<?, ?ba/s]

Processing zh
Length of datasets: 17805
Length of merged dataset: 17805


Creating json from Arrow format:   0%|          | 0/18 [00:00<?, ?ba/s]

In [45]:
# Merging fleurs_test
#
# Per language: join the records from `kmeans_data.pkl` with the rows of
# `{lang}_transcriptions.jsonl` (matched on audio file name) and write one
# `{lang}_data.jsonl` into the fleurs_test output folder.

langs = os.listdir("/exp/nbafna/projects/mitigating-accent-bias-in-lid/wav2vec2_intermediate_outputs/vl107/wav2vec2-base-layer8-1000/fleurs_test_eval_units")
assert len(langs) == 78

for lang in langs:
    print(f"Processing {lang}")

    input_path_et_trans = f"/exp/nbafna/projects/mitigating-accent-bias-in-lid/reps_and_phoneseqs/fleurs_test/ecapa-tdnn_wav2vec2-xlsr-53-espeak-cv-ft/{lang}_transcriptions.jsonl"
    input_path_ssl_units = f"/exp/nbafna/projects/mitigating-accent-bias-in-lid/wav2vec2_intermediate_outputs/vl107/wav2vec2-base-layer8-1000/fleurs_test_eval_units/{lang}/kmeans_data.pkl"
    outdir = f"/exp/nbafna/projects/mitigating-accent-bias-in-lid/reps_phoneseqs_duseqs_exps/vl107/ecapa-tdnn_wav2vec2-xlsr-53-espeak-cv-ft_wav2vec2-base-layer8-kmeans1000/fleurs_test_dataset/"
    os.makedirs(outdir, exist_ok=True)
    outfile = os.path.join(outdir, f"{lang}_data.jsonl")

    # Unit side: unpickle the data and wrap it in an HF dataset.
    with open(input_path_ssl_units, 'rb') as pkl_file:
        ssl_units_data = pkl.load(pkl_file)
    duseq_dataset = datasets.Dataset.from_dict(ssl_units_data)

    # Transcription side: read the JSONL file and take its "train" split.
    et_phoneseq_datset = datasets.load_dataset('json', data_files=input_path_et_trans)["train"]

    # Both sides must have the same number of rows.
    assert len(duseq_dataset) == len(et_phoneseq_datset)
    print(f"Length of datasets: {len(duseq_dataset)}")

    # Plain-dict records; the unit side keeps only the file name and the sequence.
    unit_records = []
    for row in duseq_dataset:
        unit_records.append({"audio_file": row["all_audio_files"], "sequences": row["sequences"]})
    phone_records = [dict(row) for row in et_phoneseq_datset]

    # Order both lists by audio file name so matching rows share an index.
    unit_records.sort(key=lambda rec: rec["audio_file"])
    phone_records.sort(key=lambda rec: rec["audio_file"])

    # Copy every transcription field onto the unit record with the same file name.
    for unit_rec, phone_rec in zip(unit_records, phone_records):
        assert unit_rec["audio_file"] == phone_rec["audio_file"]
        unit_rec.update(phone_rec)

    # Pivot the row records into columns, taking column names from the first record.
    merged_columns = {col: [rec[col] for rec in unit_records] for col in unit_records[0]}

    merged_dataset = datasets.Dataset.from_dict(merged_columns)
    print(f"Length of merged dataset: {len(merged_dataset)}")

    # Write the merged rows to disk.
    merged_dataset.to_json(outfile)


Processing af
Length of datasets: 417
Length of merged dataset: 417


Creating json from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

Processing am
Length of datasets: 710
Length of merged dataset: 710


Creating json from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

Processing ar
Length of datasets: 566
Length of merged dataset: 566


Creating json from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

Processing as
Length of datasets: 1594
Length of merged dataset: 1594


Creating json from Arrow format:   0%|          | 0/2 [00:00<?, ?ba/s]

Processing az
Length of datasets: 1491
Length of merged dataset: 1491


Creating json from Arrow format:   0%|          | 0/2 [00:00<?, ?ba/s]

Processing be
Length of datasets: 1947
Length of merged dataset: 1947


Creating json from Arrow format:   0%|          | 0/2 [00:00<?, ?ba/s]

Processing bg
Length of datasets: 802
Length of merged dataset: 802


Creating json from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

Processing bn
Length of datasets: 1609
Length of merged dataset: 1609


Creating json from Arrow format:   0%|          | 0/2 [00:00<?, ?ba/s]

Processing bs
Length of datasets: 1431
Length of merged dataset: 1431


Creating json from Arrow format:   0%|          | 0/2 [00:00<?, ?ba/s]

Processing ca
Length of datasets: 1410
Length of merged dataset: 1410


Creating json from Arrow format:   0%|          | 0/2 [00:00<?, ?ba/s]

Processing ceb
Length of datasets: 1055
Length of merged dataset: 1055


Creating json from Arrow format:   0%|          | 0/2 [00:00<?, ?ba/s]

Processing cs
Length of datasets: 1120
Length of merged dataset: 1120


Creating json from Arrow format:   0%|          | 0/2 [00:00<?, ?ba/s]

Processing cy
Length of datasets: 2065
Length of merged dataset: 2065


Creating json from Arrow format:   0%|          | 0/3 [00:00<?, ?ba/s]

Processing da
Length of datasets: 1323
Length of merged dataset: 1323


Creating json from Arrow format:   0%|          | 0/2 [00:00<?, ?ba/s]

Processing de
Length of datasets: 1460
Length of merged dataset: 1460


Creating json from Arrow format:   0%|          | 0/2 [00:00<?, ?ba/s]

Processing el
Length of datasets: 823
Length of merged dataset: 823


Creating json from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

Processing en
Length of datasets: 747
Length of merged dataset: 747


Creating json from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

Processing es
Length of datasets: 1397
Length of merged dataset: 1397


Creating json from Arrow format:   0%|          | 0/2 [00:00<?, ?ba/s]

Processing et
Length of datasets: 1335
Length of merged dataset: 1335


Creating json from Arrow format:   0%|          | 0/2 [00:00<?, ?ba/s]

Processing fa
Length of datasets: 1770
Length of merged dataset: 1770


Creating json from Arrow format:   0%|          | 0/2 [00:00<?, ?ba/s]

Processing fi
Length of datasets: 1518
Length of merged dataset: 1518


Creating json from Arrow format:   0%|          | 0/2 [00:00<?, ?ba/s]

Processing fr
Length of datasets: 825
Length of merged dataset: 825


Creating json from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

Processing gl
Length of datasets: 1119
Length of merged dataset: 1119


Creating json from Arrow format:   0%|          | 0/2 [00:00<?, ?ba/s]

Processing gu
Length of datasets: 1259
Length of merged dataset: 1259


Creating json from Arrow format:   0%|          | 0/2 [00:00<?, ?ba/s]

Processing ha
Length of datasets: 1700
Length of merged dataset: 1700


Creating json from Arrow format:   0%|          | 0/2 [00:00<?, ?ba/s]

Processing hi
Length of datasets: 621
Length of merged dataset: 621


Creating json from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

Processing hr
Length of datasets: 1092
Length of merged dataset: 1092


Creating json from Arrow format:   0%|          | 0/2 [00:00<?, ?ba/s]

Processing hu
Length of datasets: 1412
Length of merged dataset: 1412


Creating json from Arrow format:   0%|          | 0/2 [00:00<?, ?ba/s]

Processing hy
Length of datasets: 1343
Length of merged dataset: 1343


Creating json from Arrow format:   0%|          | 0/2 [00:00<?, ?ba/s]

Processing id
Length of datasets: 1085
Length of merged dataset: 1085


Creating json from Arrow format:   0%|          | 0/2 [00:00<?, ?ba/s]

Processing is
Length of datasets: 84
Length of merged dataset: 84


Creating json from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

Processing it
Length of datasets: 1673
Length of merged dataset: 1673


Creating json from Arrow format:   0%|          | 0/2 [00:00<?, ?ba/s]

Processing ja
Length of datasets: 1090
Length of merged dataset: 1090


Creating json from Arrow format:   0%|          | 0/2 [00:00<?, ?ba/s]

Processing ka
Length of datasets: 1364
Length of merged dataset: 1364


Creating json from Arrow format:   0%|          | 0/2 [00:00<?, ?ba/s]

Processing kk
Length of datasets: 1876
Length of merged dataset: 1876


Creating json from Arrow format:   0%|          | 0/2 [00:00<?, ?ba/s]

Processing km
Length of datasets: 1508
Length of merged dataset: 1508


Creating json from Arrow format:   0%|          | 0/2 [00:00<?, ?ba/s]

Processing kn
Length of datasets: 1476
Length of merged dataset: 1476


Creating json from Arrow format:   0%|          | 0/2 [00:00<?, ?ba/s]

Processing ko
Length of datasets: 621
Length of merged dataset: 621


Creating json from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

Processing lb
Length of datasets: 1185
Length of merged dataset: 1185


Creating json from Arrow format:   0%|          | 0/2 [00:00<?, ?ba/s]

Processing ln
Length of datasets: 1316
Length of merged dataset: 1316


Creating json from Arrow format:   0%|          | 0/2 [00:00<?, ?ba/s]

Processing lo
Length of datasets: 643
Length of merged dataset: 643


Creating json from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

Processing lt
Length of datasets: 1297
Length of merged dataset: 1297


Creating json from Arrow format:   0%|          | 0/2 [00:00<?, ?ba/s]

Processing lv
Length of datasets: 1285
Length of merged dataset: 1285


Creating json from Arrow format:   0%|          | 0/2 [00:00<?, ?ba/s]

Processing mi
Length of datasets: 2991
Length of merged dataset: 2991


Creating json from Arrow format:   0%|          | 0/3 [00:00<?, ?ba/s]

Processing mk
Length of datasets: 1460
Length of merged dataset: 1460


Creating json from Arrow format:   0%|          | 0/2 [00:00<?, ?ba/s]

Processing ml
Length of datasets: 1870
Length of merged dataset: 1870


Creating json from Arrow format:   0%|          | 0/2 [00:00<?, ?ba/s]

Processing mn
Length of datasets: 1256
Length of merged dataset: 1256


Creating json from Arrow format:   0%|          | 0/2 [00:00<?, ?ba/s]

Processing mr
Length of datasets: 1805
Length of merged dataset: 1805


Creating json from Arrow format:   0%|          | 0/2 [00:00<?, ?ba/s]

Processing ms
Length of datasets: 991
Length of merged dataset: 991


Creating json from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

Processing mt
Length of datasets: 1666
Length of merged dataset: 1666


Creating json from Arrow format:   0%|          | 0/2 [00:00<?, ?ba/s]

Processing my
Length of datasets: 1867
Length of merged dataset: 1867


Creating json from Arrow format:   0%|          | 0/2 [00:00<?, ?ba/s]

Processing ne
Length of datasets: 1008
Length of merged dataset: 1008


Creating json from Arrow format:   0%|          | 0/2 [00:00<?, ?ba/s]

Processing nl
Length of datasets: 398
Length of merged dataset: 398


Creating json from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

Processing oc
Length of datasets: 2210
Length of merged dataset: 2210


Creating json from Arrow format:   0%|          | 0/3 [00:00<?, ?ba/s]

Processing pa
Length of datasets: 824
Length of merged dataset: 824


Creating json from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

Processing pl
Length of datasets: 865
Length of merged dataset: 865


Creating json from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

Processing ps
Length of datasets: 813
Length of merged dataset: 813


Creating json from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

Processing pt
Length of datasets: 1476
Length of merged dataset: 1476


Creating json from Arrow format:   0%|          | 0/2 [00:00<?, ?ba/s]

Processing ro
Length of datasets: 1087
Length of merged dataset: 1087


Creating json from Arrow format:   0%|          | 0/2 [00:00<?, ?ba/s]

Processing ru
Length of datasets: 1125
Length of merged dataset: 1125


Creating json from Arrow format:   0%|          | 0/2 [00:00<?, ?ba/s]

Processing sd
Length of datasets: 1486
Length of merged dataset: 1486


Creating json from Arrow format:   0%|          | 0/2 [00:00<?, ?ba/s]

Processing sk
Length of datasets: 1184
Length of merged dataset: 1184


Creating json from Arrow format:   0%|          | 0/2 [00:00<?, ?ba/s]

Processing sl
Length of datasets: 973
Length of merged dataset: 973


Creating json from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

Processing sn
Length of datasets: 1821
Length of merged dataset: 1821


Creating json from Arrow format:   0%|          | 0/2 [00:00<?, ?ba/s]

Processing so
Length of datasets: 1857
Length of merged dataset: 1857


Creating json from Arrow format:   0%|          | 0/2 [00:00<?, ?ba/s]

Processing sr
Length of datasets: 925
Length of merged dataset: 925


Creating json from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

Processing sv
Length of datasets: 1023
Length of merged dataset: 1023


Creating json from Arrow format:   0%|          | 0/2 [00:00<?, ?ba/s]

Processing sw
Length of datasets: 913
Length of merged dataset: 913


Creating json from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

Processing ta
Length of datasets: 987
Length of merged dataset: 987


Creating json from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

Processing te
Length of datasets: 642
Length of merged dataset: 642


Creating json from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

Processing tg
Length of datasets: 1154
Length of merged dataset: 1154


Creating json from Arrow format:   0%|          | 0/2 [00:00<?, ?ba/s]

Processing th
Length of datasets: 1533
Length of merged dataset: 1533


Creating json from Arrow format:   0%|          | 0/2 [00:00<?, ?ba/s]

Processing tr
Length of datasets: 1196
Length of merged dataset: 1196


Creating json from Arrow format:   0%|          | 0/2 [00:00<?, ?ba/s]

Processing uk
Length of datasets: 994
Length of merged dataset: 994


Creating json from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

Processing ur
Length of datasets: 349
Length of merged dataset: 349


Creating json from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

Processing uz
Length of datasets: 1296
Length of merged dataset: 1296


Creating json from Arrow format:   0%|          | 0/2 [00:00<?, ?ba/s]

Processing vi
Length of datasets: 1370
Length of merged dataset: 1370


Creating json from Arrow format:   0%|          | 0/2 [00:00<?, ?ba/s]

Processing yo
Length of datasets: 1848
Length of merged dataset: 1848


Creating json from Arrow format:   0%|          | 0/2 [00:00<?, ?ba/s]

In [46]:
# Merging EdAcc dataset
#
# For the single entry under `eval_units/`: join the records from `kmeans_data.pkl`
# with the rows of `{lang}_transcriptions.jsonl` (matched on audio file name) and
# write `{lang}_data.jsonl` into the edacc output folder.

langs = os.listdir("/exp/nbafna/projects/mitigating-accent-bias-in-lid/wav2vec2_intermediate_outputs/vl107/wav2vec2-base-layer8-1000/eval_units")
assert len(langs) == 1

for lang in langs:
    print(f"Processing {lang}")

    input_path_et_trans = f"/exp/nbafna/projects/mitigating-accent-bias-in-lid/reps_and_phoneseqs/edacc/ecapa-tdnn_wav2vec2-xlsr-53-espeak-cv-ft/{lang}_transcriptions.jsonl"
    input_path_ssl_units = f"/exp/nbafna/projects/mitigating-accent-bias-in-lid/wav2vec2_intermediate_outputs/vl107/wav2vec2-base-layer8-1000/eval_units/{lang}/kmeans_data.pkl"
    outdir = f"/exp/nbafna/projects/mitigating-accent-bias-in-lid/reps_phoneseqs_duseqs_exps/vl107/ecapa-tdnn_wav2vec2-xlsr-53-espeak-cv-ft_wav2vec2-base-layer8-kmeans1000/edacc_dataset/"
    os.makedirs(outdir, exist_ok=True)
    outfile = os.path.join(outdir, f"{lang}_data.jsonl")

    # Unit side: unpickle the data and wrap it in an HF dataset.
    with open(input_path_ssl_units, 'rb') as pkl_file:
        ssl_units_data = pkl.load(pkl_file)
    duseq_dataset = datasets.Dataset.from_dict(ssl_units_data)

    # Transcription side: read the JSONL file and take its "train" split.
    et_phoneseq_datset = datasets.load_dataset('json', data_files=input_path_et_trans)["train"]

    # Both sides must have the same number of rows.
    assert len(duseq_dataset) == len(et_phoneseq_datset)
    print(f"Length of datasets: {len(duseq_dataset)}")

    # Plain-dict records; the unit side keeps only the file name and the sequence.
    unit_records = []
    for row in duseq_dataset:
        unit_records.append({"audio_file": row["all_audio_files"], "sequences": row["sequences"]})
    phone_records = [dict(row) for row in et_phoneseq_datset]

    # Order both lists by audio file name so matching rows share an index.
    unit_records.sort(key=lambda rec: rec["audio_file"])
    phone_records.sort(key=lambda rec: rec["audio_file"])

    # Copy every transcription field onto the unit record with the same file name.
    for unit_rec, phone_rec in zip(unit_records, phone_records):
        assert unit_rec["audio_file"] == phone_rec["audio_file"]
        unit_rec.update(phone_rec)

    # Pivot the row records into columns, taking column names from the first record.
    merged_columns = {col: [rec[col] for rec in unit_records] for col in unit_records[0]}

    merged_dataset = datasets.Dataset.from_dict(merged_columns)
    print(f"Length of merged dataset: {len(merged_dataset)}")

    # Write the merged rows to disk.
    merged_dataset.to_json(outfile)


Processing en
Length of datasets: 13213
Length of merged dataset: 13213


Creating json from Arrow format:   0%|          | 0/14 [00:00<?, ?ba/s]

In [47]:
# Merging CV dataset
#
# Same join as for the other evaluation sets: SSL discrete-unit sequences from
# cv_eval_units are matched with the phone-sequence records on the audio file
# path, and the combined rows are written to <outdir>/<lang>_data.jsonl.
# Variable names are kept unchanged because they are notebook globals.

langs = os.listdir("/exp/nbafna/projects/mitigating-accent-bias-in-lid/wav2vec2_intermediate_outputs/vl107/wav2vec2-base-layer8-1000/cv_eval_units")
assert len(langs) == 1

for lang in langs:
    print(f"Processing {lang}")

    # Input files for this language, plus the output location (created on demand).
    input_path_et_trans = f"/exp/nbafna/projects/mitigating-accent-bias-in-lid/reps_and_phoneseqs/cv/ecapa-tdnn_wav2vec2-xlsr-53-espeak-cv-ft/{lang}_transcriptions.jsonl"
    input_path_ssl_units = f"/exp/nbafna/projects/mitigating-accent-bias-in-lid/wav2vec2_intermediate_outputs/vl107/wav2vec2-base-layer8-1000/cv_eval_units/{lang}/kmeans_data.pkl"
    outdir = f"/exp/nbafna/projects/mitigating-accent-bias-in-lid/reps_phoneseqs_duseqs_exps/vl107/ecapa-tdnn_wav2vec2-xlsr-53-espeak-cv-ft_wav2vec2-base-layer8-kmeans1000/cv_dataset/"
    outfile = os.path.join(outdir, f"{lang}_data.jsonl")
    os.makedirs(outdir, exist_ok=True)

    # Unpickle the SSL unit data and wrap it in an HF dataset.
    with open(input_path_ssl_units, 'rb') as f:
        ssl_units_data = pkl.load(f)
    duseq_dataset = datasets.Dataset.from_dict(ssl_units_data)

    # The JSON loader exposes a single "train" split holding the phone-sequence records.
    et_phoneseq_datset = datasets.load_dataset('json', data_files=input_path_et_trans)["train"]

    # Both sources must describe the same number of rows.
    assert len(duseq_dataset) == len(et_phoneseq_datset)
    print(f"Length of datasets: {len(duseq_dataset)}")

    # Flatten both datasets into lists of plain dicts; the unit rows keep only
    # the audio path (under the key "audio_file") and the unit sequence.
    duseq_datase_tuples = [
        {"audio_file": unit_row["all_audio_files"], "sequences": unit_row["sequences"]}
        for unit_row in duseq_dataset
    ]
    et_phoneseq_datset_tuples = [dict(phone_row) for phone_row in et_phoneseq_datset]

    # Bring both lists into the same order by sorting on the audio file path.
    duseq_datase_tuples_sorted = list(duseq_datase_tuples)
    duseq_datase_tuples_sorted.sort(key=lambda row: row["audio_file"])
    et_phoneseq_datset_tuples_sorted = list(et_phoneseq_datset_tuples)
    et_phoneseq_datset_tuples_sorted.sort(key=lambda row: row["audio_file"])

    # Row-by-row join: the paths must agree at every position; the phone-sequence
    # fields are then folded into the unit row in place.
    for row_idx, unit_row in enumerate(duseq_datase_tuples_sorted):
        phone_row = et_phoneseq_datset_tuples_sorted[row_idx]
        assert unit_row["audio_file"] == phone_row["audio_file"]
        unit_row.update(phone_row)

    # Pivot the row dicts into columns (column set taken from the first row)
    # and build the merged HF dataset.
    merged_columns = {
        column: [row[column] for row in duseq_datase_tuples_sorted]
        for column in duseq_datase_tuples_sorted[0].keys()
    }
    merged_dataset = datasets.Dataset.from_dict(merged_columns)
    print(f"Length of merged dataset: {len(merged_dataset)}")

    # Persist the merged rows to the output file.
    merged_dataset.to_json(outfile)


Processing en
Length of datasets: 12012
Length of merged dataset: 12012


Creating json from Arrow format:   0%|          | 0/13 [00:00<?, ?ba/s]

In [26]:
# Display the number of rows in `duseq_dataset`.
# NOTE(review): `duseq_dataset` is a notebook global assigned inside the merge
# loops; this cell's execution count (26) predates those cells, so the saved
# output presumably reflects an earlier kernel state -- confirm before relying on it.
len(duseq_dataset)

21401

In [2]:
# Merging cv_from_hf dataset
#
# For each listed language, join the SSL discrete-unit sequences from
# cv_from_hf_eval_units with the phone-sequence records on the audio file path
# and write the combined rows to <outdir>/<lang>_data.jsonl.
# Variable names are kept unchanged because they are notebook globals.
import os
import pickle as pkl
import datasets


langs = ["es", "fr", "it", "de"]

for lang in langs:
    print(f"Processing {lang}")

    # Input files for this language, plus the output location (created on demand).
    input_path_et_trans = f"/exp/nbafna/projects/mitigating-accent-bias-in-lid/reps_and_phoneseqs/cv_from_hf/ecapa-tdnn_wav2vec2-xlsr-53-espeak-cv-ft/{lang}_transcriptions.jsonl"
    input_path_ssl_units = f"/exp/nbafna/projects/mitigating-accent-bias-in-lid/wav2vec2_intermediate_outputs/vl107/wav2vec2-base-layer8-1000/cv_from_hf_eval_units/{lang}/kmeans_data.pkl"
    # NOTE(review): this directory name ends in "...layer8-1000/" whereas the EdAcc
    # and CV merge cells write under "...layer8-kmeans1000/" -- confirm whether the
    # difference is intentional before pointing downstream readers at it.
    outdir = f"/exp/nbafna/projects/mitigating-accent-bias-in-lid/reps_phoneseqs_duseqs_exps/vl107/ecapa-tdnn_wav2vec2-xlsr-53-espeak-cv-ft_wav2vec2-base-layer8-1000/cv_from_hf_dataset/"
    outfile = os.path.join(outdir, f"{lang}_data.jsonl")
    os.makedirs(outdir, exist_ok=True)

    # Unpickle the SSL unit data and wrap it in an HF dataset.
    with open(input_path_ssl_units, 'rb') as f:
        ssl_units_data = pkl.load(f)
    duseq_dataset = datasets.Dataset.from_dict(ssl_units_data)

    # The JSON loader exposes a single "train" split holding the phone-sequence records.
    et_phoneseq_datset = datasets.load_dataset('json', data_files=input_path_et_trans)["train"]

    # Both sources must describe the same number of rows.
    assert len(duseq_dataset) == len(et_phoneseq_datset)
    print(f"Length of datasets: {len(duseq_dataset)}")

    # Flatten both datasets into lists of plain dicts; the unit rows keep only
    # the audio path (under the key "audio_file") and the unit sequence.
    duseq_datase_tuples = [
        {"audio_file": unit_row["all_audio_files"], "sequences": unit_row["sequences"]}
        for unit_row in duseq_dataset
    ]
    et_phoneseq_datset_tuples = [dict(phone_row) for phone_row in et_phoneseq_datset]

    # Bring both lists into the same order by sorting on the audio file path.
    duseq_datase_tuples_sorted = list(duseq_datase_tuples)
    duseq_datase_tuples_sorted.sort(key=lambda row: row["audio_file"])
    et_phoneseq_datset_tuples_sorted = list(et_phoneseq_datset_tuples)
    et_phoneseq_datset_tuples_sorted.sort(key=lambda row: row["audio_file"])

    # Row-by-row join: the paths must agree at every position; the phone-sequence
    # fields are then folded into the unit row in place.
    for row_idx, unit_row in enumerate(duseq_datase_tuples_sorted):
        phone_row = et_phoneseq_datset_tuples_sorted[row_idx]
        assert unit_row["audio_file"] == phone_row["audio_file"]
        unit_row.update(phone_row)

    # Pivot the row dicts into columns (column set taken from the first row)
    # and build the merged HF dataset.
    merged_columns = {
        column: [row[column] for row in duseq_datase_tuples_sorted]
        for column in duseq_datase_tuples_sorted[0].keys()
    }
    merged_dataset = datasets.Dataset.from_dict(merged_columns)
    print(f"Length of merged dataset: {len(merged_dataset)}")

    # Persist the merged rows to the output file.
    merged_dataset.to_json(outfile)




Processing es
Length of datasets: 2002
Length of merged dataset: 2002


Creating json from Arrow format:   0%|          | 0/3 [00:00<?, ?ba/s]

Processing fr
Length of datasets: 1796
Length of merged dataset: 1796


Creating json from Arrow format:   0%|          | 0/2 [00:00<?, ?ba/s]

Processing it
Length of datasets: 1521
Length of merged dataset: 1521


Creating json from Arrow format:   0%|          | 0/2 [00:00<?, ?ba/s]

Processing de
Length of datasets: 2183
Length of merged dataset: 2183


Creating json from Arrow format:   0%|          | 0/3 [00:00<?, ?ba/s]

In [19]:
# Display the number of rows in `et_phoneseq_datset`.
# NOTE(review): `et_phoneseq_datset` is a notebook global assigned inside the merge
# loops; this cell's execution count (19) predates those cells, so the saved
# output presumably reflects an earlier kernel state -- confirm before relying on it.
len(et_phoneseq_datset)

21401

In [27]:
# Flatten both HF datasets into lists of plain dicts, one dict per row.
# The SSL-unit rows keep only the audio path (stored under "audio_file",
# read from "all_audio_files") and the unit sequence.
duseq_datase_tuples = [
    {"audio_file": unit_row["all_audio_files"], "sequences": unit_row["sequences"]}
    for unit_row in duseq_dataset
]

# The phone-sequence rows are shallow-copied with every field preserved.
et_phoneseq_datset_tuples = [dict(phone_row) for phone_row in et_phoneseq_datset]

In [32]:
# Order both row lists by audio file path so that matching rows share an index.
# The originals are left untouched; each sorted list is a fresh copy.
duseq_datase_tuples_sorted = list(duseq_datase_tuples)
duseq_datase_tuples_sorted.sort(key=lambda row: row["audio_file"])

et_phoneseq_datset_tuples_sorted = list(et_phoneseq_datset_tuples)
et_phoneseq_datset_tuples_sorted.sort(key=lambda row: row["audio_file"])


In [35]:
# Join the two sorted lists position by position. Each pair must refer to the
# same audio file; the phone-sequence fields are then folded into the unit row
# in place, so `duseq_datase_tuples_sorted` ends up holding the merged rows.
for row_idx, unit_row in enumerate(duseq_datase_tuples_sorted):
    phone_row = et_phoneseq_datset_tuples_sorted[row_idx]
    assert unit_row["audio_file"] == phone_row["audio_file"]
    unit_row.update(phone_row)

In [37]:
# Inspect which fields the first merged row carries after the join.
duseq_datase_tuples_sorted[0].keys()

dict_keys(['audio_file', 'sequences', 'lang', 'accent', 'reps', 'phone_sequence'])

In [40]:
# Build the merged HF dataset: pivot the list of row dicts into one list per
# column (the column set is taken from the first row) and hand that to
# Dataset.from_dict.
merged_columns = {
    column: [row[column] for row in duseq_datase_tuples_sorted]
    for column in duseq_datase_tuples_sorted[0].keys()
}

merged_dataset = datasets.Dataset.from_dict(merged_columns)

In [41]:
# Display the first row of the merged dataset as a sanity check on its fields.
merged_dataset[0]

{'audio_file': '/exp/jvillalba/corpora/voxlingua107/en/-BwrRlUdfEs__U__S0---0003.940-0020.570.wav',
 'sequences': [543,
  184,
  834,
  278,
  646,
  164,
  3,
  191,
  333,
  224,
  663,
  443,
  707,
  994,
  134,
  166,
  857,
  703,
  113,
  543,
  158,
  855,
  42,
  498,
  198,
  798,
  754,
  316,
  687,
  816,
  675,
  913,
  221,
  703,
  563,
  152,
  175,
  857,
  478,
  828,
  828,
  603,
  599,
  235,
  40,
  725,
  284,
  756,
  244,
  990,
  883,
  121,
  55,
  399,
  300,
  552,
  401,
  353,
  391],
 'lang': 'en',
 'accent': '-',
 'reps': [14.436662674,
  33.307849884,
  -8.6984453201,
  29.9989471436,
  13.7374649048,
  43.9486465454,
  47.6941871643,
  23.8075466156,
  28.8990192413,
  -12.1511459351,
  0.1968344599,
  31.1025733948,
  39.9544067383,
  45.9391555786,
  -24.8860301971,
  48.2760543823,
  3.2955844402,
  11.74929142,
  -27.6994476318,
  -9.8814573288,
  0.2955344319,
  14.5435390472,
  26.2771892548,
  46.0649490356,
  34.5133132935,
  -25.7256126404,
