In [1]:
# !pip install datasets

## Contents
- Fair Speech Dataset (Meta)
- Common Accent Dataset
- Speech Accent Archive
- **GLOBE**

# Fair Speech Dataset (Meta)

*   Source: https://ai.meta.com/datasets/speech-fairness-dataset/
*   Evaluated separately by Shruti, so there is no analysis for it in this section.



# Common Accent Dataset

*   Source: https://huggingface.co/datasets/DTU54DL/common-accent
*   The accent classes are heavily unbalanced (see the counts below), and the dataset doesn't include gender labels.



In [None]:
from datasets import load_dataset

In [None]:
# Load the Common Accent dataset and turn its train and test splits into
# pandas DataFrames so they can be inspected with the usual pandas tools.
common_accent = load_dataset("DTU54DL/common-accent")
common_accent_train, common_accent_test = (
    common_accent[split_name].to_pandas() for split_name in ("train", "test")
)

In [None]:
# Tally how many training rows carry each accent label; the printed table is
# what shows how unbalanced the classes are.
accent_counts = common_accent_train["accent"].value_counts()
print(accent_counts)

accent
India and South Asia (India, Pakistan, Sri Lanka)                                                                                                      5731
German English,Non native speaker                                                                                                                      3024
Southern African (South Africa, Zimbabwe, Namibia)                                                                                                      309
Filipino                                                                                                                                                295
Singaporean English                                                                                                                                     206
Hong Kong English                                                                                                                                       203
Malaysian English                                        

# Speech Accent Archive


*   Source: https://www.kaggle.com/datasets/rtatman/speech-accent-archive/
*   Has a highly diverse set of voices, but the number of speakers per group (age, native language, or country) is too small.



In [None]:
# !pip install kaggle

In [None]:
from google.colab import files

import pandas as pd
import os
import zipfile

In [None]:
files.upload() # Upload the kaggle.json file containing your API key.
os.makedirs('/root/.kaggle', exist_ok=True)
!mv kaggle.json /root/.kaggle/
!chmod 600 /root/.kaggle/kaggle.json

In [None]:
# Download the Speech Accent Archive as speech-accent-archive.zip into the
# current working directory using the Kaggle CLI.
# NOTE(review): presumably relies on the API token placed in /root/.kaggle/ —
# confirm before running on a fresh runtime.
!kaggle datasets download -d rtatman/speech-accent-archive

Dataset URL: https://www.kaggle.com/datasets/rtatman/speech-accent-archive
License(s): CC-BY-NC-SA-4.0
Downloading speech-accent-archive.zip to /content
 99% 855M/865M [00:09<00:00, 94.8MB/s]
100% 865M/865M [00:09<00:00, 91.9MB/s]


In [None]:
# Unpack the downloaded archive into a folder named after the dataset.
with zipfile.ZipFile(file='speech-accent-archive.zip', mode='r') as zip_ref:
    zip_ref.extractall(path='speech-accent-archive')

In [None]:
# Sanity check: show the top-level entries produced by the extraction.
extracted_files = os.listdir(path='speech-accent-archive')
print(extracted_files)

['reading-passage.txt', 'recordings', 'speakers_all.csv']


In [None]:
# Speaker metadata for the archive.
#   * keep only rows whose 'file_missing?' flag is False;
#   * drop the columns that the demographic counts below do not use, including
#     the 'Unnamed: N' columns (presumably empty CSV artifacts — verify).
speech_accent_archive = (
    pd.read_csv('speech-accent-archive/speakers_all.csv')
    .loc[lambda speakers: speakers['file_missing?'].eq(False)]
    .drop(columns=[
        'age_onset',
        'file_missing?',
        'filename',
        'speakerid',
        'Unnamed: 9',
        'Unnamed: 10',
        'Unnamed: 11',
    ])
)
speech_accent_archive.head()

Unnamed: 0,age,birthplace,native_language,sex,country
32,27.0,"virginia, south africa",afrikaans,female,south africa
33,40.0,"pretoria, south africa",afrikaans,male,south africa
34,43.0,"pretoria, transvaal, south africa",afrikaans,male,south africa
35,26.0,"pretoria, south africa",afrikaans,male,south africa
36,19.0,"cape town, south africa",afrikaans,male,south africa


In [None]:
# Number of speakers per age, restricted to ages that occur more than 20 times.
# The threshold applies to the count, not to the age value itself.
age_counts = (
    speech_accent_archive['age']
    .value_counts()
    .loc[lambda counts: counts > 20]
)
print(age_counts.to_string())

age
20.0    131
21.0    125
19.0    123
18.0    105
22.0    104
25.0    100
23.0     98
24.0     88
26.0     75
29.0     67
31.0     63
27.0     59
28.0     56
30.0     55
32.0     51
35.0     40
33.0     40
39.0     37
37.0     36
38.0     36
52.0     34
34.0     34
36.0     34
43.0     31
50.0     30
47.0     28
46.0     27
42.0     27
53.0     26
40.0     26
48.0     24
54.0     23
44.0     23
41.0     22
45.0     21


In [None]:
# Number of speakers per native language, restricted to languages with more
# than 20 speakers.
native_language_counts = (
    speech_accent_archive['native_language']
    .value_counts()
    .loc[lambda counts: counts > 20]
)
print(native_language_counts.to_string())

native_language
english       579
spanish       162
arabic        102
mandarin       65
french         63
korean         52
portuguese     48
russian        48
dutch          47
turkish        37
german         36
polish         34
italian        33
japanese       27
macedonian     26
farsi          23
cantonese      23
vietnamese     22


In [None]:
# Number of speakers per country, restricted to countries with more than 20
# speakers.
country_counts = (
    speech_accent_archive['country']
    .value_counts()
    .loc[lambda counts: counts > 20]
)
print(country_counts.to_string())

country
usa             391
china            88
uk               67
india            58
canada           54
south korea      51
brazil           39
belgium          36
turkey           35
poland           34
australia        33
saudi arabia     33
germany          32
italy            32
ethiopia         31
russia           31
france           28
macedonia        26
japan            26
philippines      23
spain            22
iraq             22
nigeria          22
colombia         22
iran             22
vietnam          22
pakistan         21


# GLOBE

*   Compared to commonly used English corpora, such as **LibriTTS** and **VCTK**, GLOBE is unique in its inclusion of utterances from 23,519 speakers and covers 164 accents worldwide, along with detailed metadata for these speakers.
*   Compared to Common Voice, the corpus it is derived from, GLOBE significantly improves the quality of the speech data through rigorous filtering and enhancement processes, while also populating all missing speaker metadata.
*   Source: https://huggingface.co/datasets/MushanW/GLOBE_V2



In [None]:
import os
from datasets import load_from_disk, load_dataset

In [2]:
# NOTE(review): this cell is currently disabled — every line is commented out,
# so `globe_2` is never defined.
# NOTE(review): before re-enabling, check the loading logic. `load_from_disk`
# reads directories written by `save_to_disk`, not a `load_dataset` cache_dir,
# so the last line is expected to fail as written — confirm against the
# Hugging Face `datasets` documentation.
# dataset_path = "globe_v2_data"

# if not os.path.exists(dataset_path):
#   dataset = load_dataset("MushanW/GLOBE_V2", cache_dir=dataset_path)

# globe_2 = load_from_disk(dataset_path)