In [41]:
import numpy as np
import pandas as pd
import librosa
import os

In [42]:
# Load the speaker metadata CSV into a dataframe
df = pd.read_csv('speakers_all.csv')

# Preview the first 5 rows (rich display, since this is the cell's last expression)
df.head(5)

Unnamed: 0,age,age_onset,birthplace,filename,native_language,sex,speakerid,country,file_missing?,Unnamed: 9,Unnamed: 10,Unnamed: 11
0,24.0,12.0,"koussi, senegal",balanta,balanta,male,788,senegal,True,,,
1,18.0,10.0,"buea, cameroon",cameroon,cameroon,male,1953,cameroon,True,,,
2,48.0,8.0,"hong, adamawa, nigeria",fulfulde,fulfulde,male,1037,nigeria,True,,,
3,42.0,42.0,"port-au-prince, haiti",haitian,haitian,male,1165,haiti,True,,,
4,40.0,35.0,"port-au-prince, haiti",haitian,haitian,male,1166,haiti,True,,,


In [43]:
# Show how many rows fall under each native_language, then under each country
print(
    df['native_language'].value_counts(),
    df['country'].value_counts(),
    sep='\n',
)

native_language
english     579
spanish     162
arabic      102
mandarin     65
french       63
           ... 
kalanga       1
kabyle        1
jola          1
irish         1
zulu          1
Name: count, Length: 214, dtype: int64
country
usa         393
china        88
uk           67
india        59
canada       54
           ... 
namibia       1
romanian      1
burundi       1
rwanda        1
benin         1
Name: count, Length: 176, dtype: int64


In [48]:
# Native languages with fewer than this many rows are dropped from the dataframe
MIN_SPEAKERS_PER_LANGUAGE = 25


def split_by_native_english(speakers):
    """Split filenames into native-english and non-native-english groups.

    Only rows where ``file_missing?`` equals False are kept.

    Parameters
    ----------
    speakers : pandas.DataFrame
        Frame with ``native_language``, ``file_missing?`` and ``filename`` columns.

    Returns
    -------
    tuple of pandas.Series
        ``(english, non_english)`` -- the ``filename`` values of rows whose
        ``native_language`` is / is not ``'english'``.
    """
    # .eq(False) keeps the original `== False` semantics (missing values do not match)
    has_recording = speakers['file_missing?'].eq(False)
    is_english = speakers['native_language'].eq('english')
    return (
        speakers.loc[is_english & has_recording, 'filename'],
        speakers.loc[~is_english & has_recording, 'filename'],
    )


# Create two classes of data, native-english and non-native-english speakers,
# and grab the filenames of both classes (only rows whose file is not missing)
english, non_english = split_by_native_english(df)

print('Number of english: ', len(english))
print('Number of non-english: ', len(non_english))

# Print total number of rows in the dataframe (rows with a missing file included)
print('Total number of files: ', len(df))

# Create a list of native languages that have fewer than MIN_SPEAKERS_PER_LANGUAGE rows
language_counts = df['native_language'].value_counts()
native_languages = language_counts[language_counts < MIN_SPEAKERS_PER_LANGUAGE].index.tolist()

# Remove the rows belonging to those rare native languages.
# NOTE: this rebinds `df`, so anything displayed from `df` in earlier cells is stale after this point.
df = df[~df['native_language'].isin(native_languages)]

# Recompute both classes on the filtered dataframe
english, non_english = split_by_native_english(df)

print('Number of english: ', len(english))
print('Number of non-english: ', len(non_english))


Number of english:  579
Number of non-english:  780
Total number of files:  1359
Number of english:  579
Number of non-english:  780


In [49]:
# Helper that turns one audio file into a single MFCC feature vector
def extract_mfcc(audio_file, n_mfcc=13):
    """Load an audio file and return its MFCCs averaged along axis 1.

    Parameters
    ----------
    audio_file : str
        Path handed to ``librosa.load`` (called with its default settings).
    n_mfcc : int, optional
        Number of coefficients requested from ``librosa.feature.mfcc``
        (default 13).

    Returns
    -------
    numpy.ndarray
        The mean of the MFCC matrix over axis 1, i.e. one value per row
        of the matrix returned by ``librosa.feature.mfcc``.
    """
    waveform, rate = librosa.load(audio_file)
    coefficients = librosa.feature.mfcc(y=waveform, sr=rate, n_mfcc=n_mfcc)
    return coefficients.mean(axis=1)

In [50]:
def collect_mfcc_features(filenames, n_mfcc=20, recordings_dir='recordings/recordings/'):
    """Compute one MFCC feature vector per recording that exists on disk.

    Parameters
    ----------
    filenames : iterable of str
        Recording names without directory or extension.
    n_mfcc : int, optional
        Number of coefficients passed on to ``extract_mfcc`` (default 20).
    recordings_dir : str, optional
        Prefix prepended to every name; ``'.mp3'`` is appended.

    Returns
    -------
    numpy.ndarray
        The stacked ``extract_mfcc`` results, one row per file found.
        When no file is found an empty ``(0, n_mfcc)`` array is returned,
        so the result can still be concatenated with a 2-D feature matrix.
    """
    features = []
    for name in filenames:
        path = recordings_dir + name + '.mp3'

        # Files listed in the metadata but absent on disk are skipped silently
        if not os.path.exists(path):
            continue

        features.append(extract_mfcc(path, n_mfcc=n_mfcc))

        # Progress indicator: print the running count every 100 processed files
        if len(features) % 100 == 0:
            print(len(features))

    if not features:
        return np.empty((0, n_mfcc))
    return np.array(features)


# Grab the MFCCs for all audio files in the english class
english_mfcc = collect_mfcc_features(english)

# Grab the MFCCs for all audio files in the non-english class
non_english_mfcc = collect_mfcc_features(non_english)

# Print the shape of the arrays
print(english_mfcc.shape)
print(non_english_mfcc.shape)

100
200
300
400
500
100
200
300
400
500
600
700
(579, 20)
(780, 20)


In [51]:
# Label vectors: 1 for every english sample, 0 for every non-english sample
english_labels = np.ones(len(english_mfcc))
non_english_labels = np.zeros(len(non_english_mfcc))

# Stack features and labels in the same order (english first, then non-english)
X = np.concatenate([english_mfcc, non_english_mfcc], axis=0)
y = np.concatenate([english_labels, non_english_labels], axis=0)

# Report the shapes of the combined feature matrix and label vector
print(X.shape, y.shape, sep='\n')



(1359, 20)
(1359,)


In [None]:
# Persist the feature matrix and the label vector together in a single .npz archive,
# stored under the keys 'X' and 'y'
np.savez('mfcc_data_20.npz', **{'X': X, 'y': y})