In [19]:
#from pydub import AudioSegment
from mutagen.mp3 import MP3
import pandas as pd
import numpy as np
import os

In [20]:
def create_speaker_df(file_path):
    """Load the speakers CSV and keep only speakers that have a recording.

    Parameters
    ----------
    file_path : str
        Path to the CSV file listing all speakers. It must contain a
        'speakerid' column (used as the index) and a 'file_missing?' column.

    Returns
    -------
    pandas.DataFrame
        A new frame indexed by speakerid, restricted to rows whose
        'file_missing?' value is False, with 'file_missing?' and the
        'Unnamed: 9/10/11' columns removed.
    """
    speakers_raw = pd.read_csv(file_path, index_col='speakerid')
    has_recording = speakers_raw['file_missing?'].eq(False)
    unwanted_columns = ['file_missing?', 'Unnamed: 9', 'Unnamed: 10', 'Unnamed: 11']
    # errors='ignore': the 'Unnamed: N' columns are presumably header-less
    # leftovers in the CSV; a cleaned-up file without them should not fail.
    # drop() without inplace returns a new frame instead of mutating a
    # filtered selection.
    return speakers_raw.loc[has_recording].drop(columns=unwanted_columns, errors='ignore')

# Load the cleaned speaker table from the archive's master CSV.
speakers_df = create_speaker_df(file_path='speech-accent-archive/speakers_all.csv')

In [21]:
# Collect the relative path of every mp3 recording.
# os.listdir returns every directory entry in arbitrary order, so keep only
# '.mp3' files (stray entries such as '.DS_Store' are not recordings) and sort
# the names so repeated runs produce the list in the same order.
MP3_DIR = 'speech-accent-archive/mp3_recordings/'
mp3_list = sorted(
    MP3_DIR + entry
    for entry in os.listdir(MP3_DIR)
    if entry.endswith('.mp3')
)

In [22]:
# Build a temp df holding, for each recording, its filename (directory and
# '.mp3' extension stripped, so it lines up with the 'filename' column used
# for the merge) and its length as reported by mutagen's audio.info.length
# (presumably seconds).
# Rows are collected in a list and the frame is built once: DataFrame.append
# was removed in pandas 2.0, and appending inside a loop re-copies the whole
# frame on every iteration.
audio_records = []
for mp3_path in mp3_list:
    recording_name = os.path.splitext(os.path.basename(mp3_path))[0]
    audio_records.append({
        'filename': recording_name,
        'audio_length': MP3(mp3_path).info.length,
    })
# Passing columns= keeps both columns present even when mp3_list is empty.
temp_df = pd.DataFrame(audio_records, columns=['filename', 'audio_length'])

In [23]:
# Attach each speaker's audio_length by matching on the shared 'filename'
# column; only speakers with a matching recording are kept (inner join).
# NOTE(review): a column-on-column merge returns a fresh default index, so the
# speakerid index set by create_speaker_df is presumably lost here -- confirm
# whether it should be preserved.
speakers_df = speakers_df.merge(temp_df, how='inner', on='filename')

In [24]:
# Discard every row that has at least one missing value, then display the
# per-column count of missing values that remain.
speakers_df = speakers_df.dropna(axis='index', how='any')
speakers_df.isnull().sum()

age                0
age_onset          0
birthplace         0
filename           0
native_language    0
sex                0
country            0
audio_length       0
dtype: int64

In [27]:
# Round each recording's length to zero decimals, then display how many
# recordings fall on each of the 15 smallest rounded lengths.
speakers_df['rounded_length'] = speakers_df['audio_length'].round(0)
(
    speakers_df['rounded_length']
    .value_counts()
    .sort_index()
    .head(15)
)

16.0      3
17.0      9
18.0     36
19.0     83
20.0    110
21.0    164
22.0    178
23.0    177
24.0    154
25.0    129
26.0    139
27.0    116
28.0    105
29.0     94
30.0     71
Name: rounded_length, dtype: int64

In [14]:
# One frame per rounded length from 20 through 28: the generator yields the
# matching rows for each value in order, and the unpacking binds them to the
# correspondingly named variables.
(
    len_20s_df,
    len_21s_df,
    len_22s_df,
    len_23s_df,
    len_24s_df,
    len_25s_df,
    len_26s_df,
    len_27s_df,
    len_28s_df,
) = (
    speakers_df[speakers_df['rounded_length'] == float(whole_length)]
    for whole_length in range(20, 29)
)

In [None]:
# Keep the speakers whose native_language contains 'english', then display
# how many of them come from each country.
is_english_native = speakers_df['native_language'].str.contains('english')
english_speakers_df = speakers_df.loc[is_english_native]
english_speakers_df['country'].value_counts()