In [3]:

from datasets import load_dataset, Dataset, DatasetDict, Audio
from collections import defaultdict, Counter

import numpy as np
import torch
import torchaudio

In [4]:
# Login using e.g. `huggingface-cli login` to access this dataset

def map_resample(example):

    waveform = torch.tensor(example["audio"]["array"])
    example["audio"]["array"] = torchaudio.transforms.Resample(48_000, 16_000)(waveform)
    example["audio"]["array"] = example["audio"]["array"].numpy().tolist()
    example["audio"]["sampling_rate"] = 16_000
    return example


def get_accented_dataset(lang):
    print(f"Processing {lang}")
    ds = load_dataset("mozilla-foundation/common_voice_13_0", lang, streaming=True)
    print(f"Converting to dataset")
    filtered_ds = ds['train'].filter(lambda x: x.get('accent'))
    subset = filtered_ds.take(5000)
    dataset = Dataset.from_list(list(subset))
    # print(f"Printing accents...")
    # accents = [x['accent'] for x in dataset]
    # print(f"Accents: {Counter(accents)}")
    # print(f"Total: {len(accents)}")

    # Resample the audio to 16kHz
    print(f"Resampling...")
    dataset = dataset.map(map_resample)

    # Rename columns
    
    formatted_dataset = dataset.rename_column("audio", "signal")
    formatted_dataset = formatted_dataset.rename_column("locale", "lang")
    formatted_dataset = formatted_dataset.rename_column("path", "audio_file")
    formatted_dataset = formatted_dataset.rename_column("sentence", "text_transcription")

    formatted_dataset = formatted_dataset.remove_columns([c for c in formatted_dataset.column_names \
                                        if c not in ["signal", "accent", "lang", "audio_file", \
                                                     "client_id", "text_transcription"]])

    # Save the dataset
    formatted_dataset.save_to_disk(f"/exp/nbafna/data/commonvoice/accented_data/{lang}/{lang}_accented_samples-5k")


In [5]:
for langs in ["it"]:
    get_accented_dataset(langs)

Processing it


You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this dataset from the next major release of `datasets`.


Converting to dataset


Reading metadata...: 162637it [00:02, 66660.49it/s]
'(ReadTimeoutError("HTTPSConnectionPool(host='huggingface.co', port=443): Read timed out. (read timeout=10)"), '(Request ID: 7e7beeb5-4a3f-4962-8c73-acc8474b6c89)')' thrown while requesting GET https://huggingface.co/datasets/mozilla-foundation/common_voice_13_0/resolve/main/audio/it/train/it_train_0.tar
Retrying in 1s [Retry 1/5].


Resampling...


Map:   0%|          | 0/5000 [00:00<?, ? examples/s]

Saving the dataset (0/7 shards):   0%|          | 0/5000 [00:00<?, ? examples/s]

In [16]:
subset = filtered_ds.take(5000)

In [17]:
dataset = Dataset.from_list(list(subset))

Reading metadata...: 280329it [00:04, 63770.88it/s]


In [14]:
dataset[100]

{'client_id': '071c47edc2f5e950e51416ba7580983fc30af12730da2d71eb14ade56a670b6fbc2e40349855a8b336ff2064e6b7798402ffa9d75f201df80e003fb76ccdc9c0',
 'path': 'es_train_0/common_voice_es_32107071.mp3',
 'audio': {'array': [2.2737367544323206e-13,
   5.5706550483591855e-12,
   6.480149750132114e-12,
   1.5916157281026244e-12,
   1.8189894035458565e-12,
   3.069544618483633e-12,
   3.637978807091713e-12,
   9.094947017729282e-13,
   -2.5011104298755527e-12,
   0.0,
   5.9117155615240335e-12,
   5.9117155615240335e-12,
   -2.5011104298755527e-12,
   -7.958078640513122e-12,
   -3.637978807091713e-12,
   1.0459189070388675e-11,
   2.3078428057488054e-11,
   1.6825651982799172e-11,
   -6.480149750132114e-12,
   -2.2964741219766438e-11,
   -2.319211489520967e-11,
   -1.864464138634503e-11,
   -1.8246737454319373e-11,
   -1.5006662579253316e-11,
   -5.684341886080801e-13,
   1.7905676941154525e-11,
   2.8194335754960775e-11,
   3.0809133022557944e-11,
   4.013145371573046e-11,
   4.320099833421409

In [18]:
accents = [x['accent'] for x in dataset]
from collections import Counter
Counter(accents)

Counter({'México': 1216,
         'Andino-Pacífico: Colombia, Perú, Ecuador, oeste de Bolivia y Venezuela andina': 861,
         'Rioplatense: Argentina, Uruguay, este de Bolivia, Paraguay': 546,
         'Caribe: Cuba, Venezuela, Puerto Rico, República Dominicana, Panamá, Colombia caribeña, México caribeño, Costa del golfo de México': 463,
         'España: Norte peninsular (Asturias, Castilla y León, Cantabria, País Vasco, Navarra, Aragón, La Rioja, Guadalajara, Cuenca)': 427,
         'España: Centro-Sur peninsular (Madrid, Toledo, Castilla-La Mancha)': 415,
         'América central': 354,
         'Chileno: Chile, Cuyo': 257,
         'España: Sur peninsular (Andalucia, Extremadura, Murcia)': 203,
         'España: Islas Canarias': 199,
         'Español de Filipinas': 13,
         'English': 10,
         'México centro,CDMX, México': 10,
         'Caribe: Cuba, Venezuela, Puerto Rico, República Dominicana, Panamá, Colombia caribeña, México caribeño, Costa del golfo de México,rare

In [27]:
import torchaudio
import torch
def map_resample(example):

    waveform = torch.tensor(example["audio"]["array"])
    example["audio"]["array"] = torchaudio.transforms.Resample(48_000, 16_000)(waveform)
    example["audio"]["array"] = example["audio"]["array"].numpy().tolist()
    example["audio"]["sampling_rate"] = 16_000
    return example




In [28]:
dataset = dataset.map(map_resample, batch_size=1)

Map:   0%|          | 0/5000 [00:00<?, ? examples/s]

In [32]:
dataset_formatted = dataset.rename_columns({
    "audio": "signal",
    "accent": "accent",
    "locale": "lang",
    "path": "audio_file",
    "sentence": "text_transcription"
})
# Keep only the columns we need
dataset_formatted = dataset_formatted.remove_columns([c for c in dataset.column_names if c not in ["signal", "accent", "lang", "audio_file", "text_transcription"]])


ValueError: Column name ['sentence', 'path', 'audio', 'locale'] not in the dataset. Current columns in the dataset: ['client_id', 'audio_file', 'signal', 'text_transcription', 'up_votes', 'down_votes', 'age', 'gender', 'accent', 'lang', 'segment', 'variant']

In [7]:
# Print the first 10 examples from IterableDataset
# accented_speech = Dataset.from_dict({"signal": [], "accent": [], "lang": [], "audio_file": []})
signals = [x["audio"] for x in dataset]
accents = [x["accent"] for x in dataset]
langs = [x["locale"] for x in dataset]
audio_files = [x["path"] for x in dataset]
text_transcriptions = [x["sentence"] for x in dataset]
formatted_dataset = {"signal": signals, "accent": accents, "lang": langs,\
                      "audio_file": audio_files, "text_transcription": text_transcriptions}
accented_speech = Dataset.from_dict(formatted_dataset)



Reading metadata...: 280329it [00:03, 76447.04it/s]
Exception ignored from cffi callback <function SoundFile._init_virtual_io.<locals>.vio_read at 0x2aabc8186700>:
Traceback (most recent call last):
  File "/home/hltcoe/nbafna/.conda/envs/accent_bias/lib/python3.12/site-packages/soundfile.py", line 1241, in vio_read
    @_ffi.callback("sf_vio_read")

KeyboardInterrupt: 


Exception ignored from cffi callback <function SoundFile._init_virtual_io.<locals>.vio_read at 0x2aabc2879580>:
Traceback (most recent call last):
  File "/home/hltcoe/nbafna/.conda/envs/accent_bias/lib/python3.12/site-packages/soundfile.py", line 1241, in vio_read
    @_ffi.callback("sf_vio_read")

KeyboardInterrupt: 


In [44]:
formatted_dataset[0]["signal"].keys()

dict_keys(['array', 'path', 'sampling_rate'])

In [1]:
def map_remove_values(example):
    example["signal"] = {
        "array": example["signal"]["array"],
        "sampling_rate": example["signal"]["sampling_rate"],
        "path": example["signal"]["path"]
    }
    return example

In [39]:
# formatted_dataset = dataset.rename_column("audio", "signal")
# formatted_dataset = formatted_dataset.rename_column("locale", "lang")
# formatted_dataset = formatted_dataset.rename_column("path", "audio_file")
# formatted_dataset = formatted_dataset.rename_column("sentence", "text_transcription")

formatted_dataset = formatted_dataset.remove_columns([c for c in formatted_dataset.column_names if c not in ["signal", "accent", "lang", "audio_file", "text_transcription"]])

In [2]:
from datasets import load_from_disk
for lang in ["es", "fr", "de"]:
    formatted_dataset = load_from_disk(f"/exp/nbafna/data/commonvoice/accented_data/{lang}/{lang}_accented_5k")
    formatted_dataset = formatted_dataset.map(map_remove_values)
    formatted_dataset.save_to_disk(f"/exp/nbafna/data/commonvoice/accented_data/{lang}/{lang}_accented_5k")

Map:   0%|          | 0/5000 [00:00<?, ? examples/s]

PermissionError: Tried to overwrite /exp/nbafna/data/commonvoice/accented_data/es/es_accented_5k but a dataset can't overwrite itself.

In [46]:
# Save to JSONL
formatted_dataset.save_to_disk("/exp/nbafna/data/commonvoice/accented_data/es/es_accented_5k")

Saving the dataset (0/8 shards):   0%|          | 0/5000 [00:00<?, ? examples/s]