In [1]:
#from pydub import AudioSegment
from mutagen.mp3 import MP3
import pandas as pd
import numpy as np
import os

In [2]:
def create_speaker_df(file_path):
    """Load the speakers CSV and keep only the speakers that have an mp3 file.

    Parameters
    ----------
    file_path : str
        Path to the CSV file that lists all of the speakers. It must contain
        the columns 'speakerid', 'file_missing?', 'Unnamed: 9', 'Unnamed: 10'
        and 'Unnamed: 11'.

    Returns
    -------
    pandas.DataFrame
        The speakers whose 'file_missing?' value is False, indexed by
        'speakerid', without the 'file_missing?' and 'Unnamed: *' columns.
    """
    unwanted_columns = ['file_missing?', 'Unnamed: 9', 'Unnamed: 10', 'Unnamed: 11']
    all_speakers = pd.read_csv(file_path, index_col='speakerid')
    # Element-wise comparison: True for every row that does have a recording.
    has_recording = all_speakers['file_missing?'] == False
    return all_speakers[has_recording].drop(columns=unwanted_columns)

# Load the cleaned speaker table from the archive's CSV (path is relative to the working directory)
speakers_df = create_speaker_df('speech-accent-archive/speakers_all.csv')

In [3]:
# Paths of all mp3 recordings in the archive.
# os.listdir returns every directory entry in arbitrary order, so keep only the
# '.mp3' entries (other entries such as hidden files are not recordings) and
# sort them so the list is the same on every run.
mp3_dir = 'speech-accent-archive/mp3_recordings/'
mp3_list = sorted(
    mp3_dir + entry
    for entry in os.listdir(mp3_dir)
    if entry.endswith('.mp3')
)

In [4]:
# Build a frame with one row per recording: the file name without its '.mp3'
# extension (the key used to join with the speakers table) and the length that
# mutagen reports for the file (`info.length`).
# The rows are collected in a list and the frame is created once:
# DataFrame.append was removed in pandas 2.0, and re-creating the frame on
# every iteration copies all previous rows each time.
audio_records = []
for mp3_path in mp3_list:
    base_name = os.path.basename(mp3_path)
    # Strip only the trailing extension, not every '.mp3' inside the name.
    if base_name.endswith('.mp3'):
        base_name = base_name[:-len('.mp3')]
    audio_records.append({'filename': base_name, 'audio_length': MP3(mp3_path).info.length})

# Passing `columns` keeps both columns present even when no recording was found.
temp_df = pd.DataFrame(audio_records, columns=['filename', 'audio_length'])

In [5]:
# Attach the recording length to each speaker by matching on the shared
# 'filename' column (inner join: only rows present in both frames are kept).
speakers_df = speakers_df.merge(temp_df, how='inner', on='filename')
speakers_df

Unnamed: 0,age,age_onset,birthplace,filename,native_language,sex,country,audio_length
0,27.0,9.0,"virginia, south africa",afrikaans1,afrikaans,female,south africa,20.772268
1,40.0,5.0,"pretoria, south africa",afrikaans2,afrikaans,male,south africa,21.961497
2,43.0,4.0,"pretoria, transvaal, south africa",afrikaans3,afrikaans,male,south africa,26.880000
3,26.0,8.0,"pretoria, south africa",afrikaans4,afrikaans,male,south africa,23.471179
4,19.0,6.0,"cape town, south africa",afrikaans5,afrikaans,male,south africa,20.252404
...,...,...,...,...,...,...,...,...
2133,46.0,5.0,"lagos, nigeria",yoruba3,yoruba,female,nigeria,35.032562
2134,46.0,12.0,"lagos, nigeria",yoruba4,yoruba,male,nigeria,34.287687
2135,47.0,2.0,"ibadan, nigeria",yoruba5,yoruba,female,nigeria,41.367732
2136,31.0,1.0,"bethel, alaska, usa",yupik1,yupik,female,usa,24.118549


In [6]:
# Number of missing (NaN) entries in each column of speakers_df
speakers_df.isnull().sum(axis=0)

age                0
age_onset          0
birthplace         4
filename           0
native_language    0
sex                0
country            5
audio_length       0
dtype: int64

In [7]:
# Discard every row that has at least one missing value,
# then re-count the NaN entries per column to confirm none are left.
speakers_df = speakers_df.dropna(axis='index', how='any')
speakers_df.isnull().sum(axis=0)

age                0
age_onset          0
birthplace         0
filename           0
native_language    0
sex                0
country            0
audio_length       0
dtype: int64

In [62]:
# Store each recording length rounded to a whole number in a new column,
# then show how many recordings fall on each rounded value (ascending).
speakers_df['rounded_length'] = speakers_df['audio_length'].round(0)

speakers_df['rounded_length'].value_counts().sort_index()

16.0       3
17.0       9
18.0      36
19.0      83
20.0     110
21.0     164
22.0     178
23.0     177
24.0     154
25.0     129
26.0     139
27.0     116
28.0     105
29.0      94
30.0      71
31.0      71
32.0      52
33.0      59
34.0      53
35.0      45
36.0      33
37.0      32
38.0      19
39.0      22
40.0      31
41.0      19
42.0      17
43.0      18
44.0       7
45.0       8
46.0       8
47.0      10
48.0       5
49.0      12
50.0       4
51.0       6
52.0       3
53.0       7
54.0       1
55.0       2
56.0       2
58.0       2
59.0       3
60.0       1
61.0       2
63.0       1
64.0       1
65.0       2
66.0       1
67.0       2
77.0       1
78.0       1
85.0       1
107.0      1
Name: rounded_length, dtype: int64

In [63]:
# Select the speakers whose native language contains 'english'
# and count how many of them come from each country.
is_english = speakers_df['native_language'].str.contains('english')
english_speakers_df = speakers_df.loc[is_english]
english_speakers_df['country'].value_counts()

usa                     373
uk                       65
canada                   44
australia                32
ireland                  11
new zealand               8
jamaica                   5
south africa              4
singapore                 4
panama                    3
india                     3
malaysia                  2
philippines               2
guyana                    2
liberia                   1
antigua and barbuda       1
papua new guinea          1
us virgin islands         1
italy                     1
fiji                      1
isle of man               1
pakistan                  1
the bahamas               1
bolivia                   1
united arab emirates      1
spain                     1
switzerland               1
nigeria                   1
lebanon                   1
belize                    1
germany                   1
ghana                     1
barbados                  1
trinidad                  1
virginia                  1
Name: country, dtype

In [None]:
# NOTE(review): this whole cell is commented out, and so is the pydub import in
# the first cell, so AudioSegment is not available as the notebook stands.
# Problems visible in the disabled code, to fix before re-enabling it:
#   * `src` is computed but never used; it is built from wav_path, so it looks
#     like the intended destination rather than a source path.
#   * `dst` opens wav_path itself (the folder) instead of a file inside it.
#   * AudioSegment.from_mp3 is given the bare name `f`, without mp3_path.
#   * `wav_f` is never filled and nothing is returned, yet the call below
#     stores the result in `recordings`.
# Changes mp3 files into wave files
# mp3_path: folder where all of the mp3 recordings are contained
# wav_path: folder where all of the wav files should go
#def create_wav(mp3_path, wav_path):
#    wav_f = []
#    mp3_list = os.listdir(mp3_path)
#    for f in mp3_list:
#        src = wav_path + f.replace('mp3', 'wav')
#        dst = open(wav_path, 'w+')
#        sound = AudioSegment.from_mp3(f)
#        sound.export(dst, format='wav')

#recordings = create_wav('speech-accent-archive/mp3_recordings/', 'speech-accent-archive/wav_recordings/')