In [1]:
#from pydub import AudioSegment
from mutagen.mp3 import MP3
import pandas as pd
import numpy as np
import os

In [2]:
def create_speaker_df(file_path):
    """Load the speaker metadata CSV and keep only speakers with a recording.

    Parameters
    ----------
    file_path : str or path-like
        CSV file describing all of the speakers. It must contain a
        ``speakerid`` column (used as the index) and a ``file_missing?``
        column.

    Returns
    -------
    pandas.DataFrame
        The rows whose ``file_missing?`` value equals ``False``, indexed by
        ``speakerid``. The ``file_missing?`` column is removed, as are the
        ``Unnamed: 9`` / ``Unnamed: 10`` / ``Unnamed: 11`` columns when the
        file has them.
    """
    speakers = pd.read_csv(file_path, index_col='speakerid')

    # Explicit comparison with False (rather than `~column`) so that rows
    # holding a missing or non-boolean flag are excluded instead of raising.
    has_recording = speakers['file_missing?'].eq(False)

    # The "Unnamed" columns are presumably empty artefacts of the CSV layout;
    # errors='ignore' keeps the function usable on files that lack them.
    unwanted_columns = ['file_missing?', 'Unnamed: 9', 'Unnamed: 10', 'Unnamed: 11']

    # Build a new frame instead of dropping in place on a filtered selection,
    # which is what provokes pandas' chained-assignment warnings.
    return speakers.loc[has_recording].drop(columns=unwanted_columns, errors='ignore')

# Build the cleaned speaker table from the archive's speaker metadata CSV.
speakers_df = create_speaker_df(file_path='speech-accent-archive/speakers_all.csv')

In [3]:
# Paths of all mp3 recordings, each prefixed with the recordings directory.
# os.listdir returns every directory entry in arbitrary order, so entries that
# are not mp3 files (e.g. hidden OS files) are skipped -- every path in this
# list is later opened as an MP3 -- and the result is sorted so that the order
# does not depend on the file system.
MP3_DIR = 'speech-accent-archive/mp3_recordings/'
mp3_list = sorted(
    MP3_DIR + entry
    for entry in os.listdir(MP3_DIR)
    if entry.lower().endswith('.mp3')
)

In [4]:
# Create a temp df holding each recording's name and its audio length.
# The rows are collected in a plain list and the frame is built once:
# DataFrame.append was removed in pandas 2.0, and re-creating the frame on
# every iteration is quadratic in the number of files.
audio_records = []
for mp3_path in mp3_list:
    # 'some/dir/name.mp3' -> 'name': strip the directory and the extension only,
    # so a name that merely contains '.mp3' elsewhere is left intact.
    recording_name = os.path.splitext(os.path.basename(mp3_path))[0]
    # info.length is the duration reported by mutagen (seconds, per its docs).
    audio_records.append({
        'filename': recording_name,
        'audio_length': MP3(mp3_path).info.length,
    })

# Passing `columns` keeps both columns present even when no mp3 file was found.
temp_df = pd.DataFrame(audio_records, columns=['filename', 'audio_length'])

In [5]:
# Attach each recording's audio length to its speaker row, matching on the
# shared 'filename' column (default inner join: unmatched rows are dropped).
speakers_df = speakers_df.merge(temp_df, on='filename')

In [6]:
# Remove, in place, every row that has a NaN in any column, then display the
# per-column NaN count to confirm that none are left.
speakers_df.dropna(axis='index', how='any', inplace=True)
speakers_df.isnull().sum()

age                0
age_onset          0
birthplace         0
filename           0
native_language    0
sex                0
country            0
audio_length       0
dtype: int64

In [7]:
# Round every audio length to a whole number, then show how many recordings
# fall on each of the 15 smallest rounded lengths.
speakers_df['rounded_length'] = speakers_df['audio_length'].round(decimals=0)
speakers_df['rounded_length'].value_counts().sort_index().head(n=15)

16.0      3
17.0      9
18.0     36
19.0     83
20.0    110
21.0    164
22.0    178
23.0    177
24.0    154
25.0    129
26.0    139
27.0    116
28.0    105
29.0     94
30.0     71
Name: rounded_length, dtype: int64

In [9]:
# Preview the first five rows of the speaker table.
speakers_df.head(n=5)

Unnamed: 0,age,age_onset,birthplace,filename,native_language,sex,country,audio_length,rounded_length
0,27.0,9.0,"virginia, south africa",afrikaans1,afrikaans,female,south africa,20.772268,21.0
1,40.0,5.0,"pretoria, south africa",afrikaans2,afrikaans,male,south africa,21.961497,22.0
2,43.0,4.0,"pretoria, transvaal, south africa",afrikaans3,afrikaans,male,south africa,26.88,27.0
3,26.0,8.0,"pretoria, south africa",afrikaans4,afrikaans,male,south africa,23.471179,23.0
4,19.0,6.0,"cape town, south africa",afrikaans5,afrikaans,male,south africa,20.252404,20.0


In [10]:
# Number of distinct (non-missing) countries among the speakers.
speakers_df['country'].nunique()

176

In [8]:
# Keep the speakers whose native language contains the text 'english', then
# count how many of them come from each country.
english_mask = speakers_df['native_language'].str.contains('english')
english_speakers_df = speakers_df.loc[english_mask]
english_speakers_df['country'].value_counts()

usa                     373
uk                       65
canada                   44
australia                32
ireland                  11
new zealand               8
jamaica                   5
singapore                 4
south africa              4
india                     3
panama                    3
philippines               2
malaysia                  2
guyana                    2
the bahamas               1
bolivia                   1
italy                     1
germany                   1
nigeria                   1
trinidad                  1
papua new guinea          1
virginia                  1
ghana                     1
antigua and barbuda       1
pakistan                  1
barbados                  1
us virgin islands         1
liberia                   1
isle of man               1
united arab emirates      1
fiji                      1
belize                    1
lebanon                   1
switzerland               1
spain                     1
Name: country, dtype