In [15]:
import numpy as np
import pandas as pd
import librosa
import os
from AudioAugmentations import *


In [16]:
# Load the speaker metadata table from its CSV file into a DataFrame
speakers_csv_path = 'speakers_all.csv'
df = pd.read_csv(speakers_csv_path)

# Preview the first five rows (last expression, so the notebook renders it)
df.head(5)

Unnamed: 0,age,age_onset,birthplace,filename,native_language,sex,speakerid,country,file_missing?,Unnamed: 9,Unnamed: 10,Unnamed: 11
0,24.0,12.0,"koussi, senegal",balanta,balanta,male,788,senegal,True,,,
1,18.0,10.0,"buea, cameroon",cameroon,cameroon,male,1953,cameroon,True,,,
2,48.0,8.0,"hong, adamawa, nigeria",fulfulde,fulfulde,male,1037,nigeria,True,,,
3,42.0,42.0,"port-au-prince, haiti",haitian,haitian,male,1165,haiti,True,,,
4,40.0,35.0,"port-au-prince, haiti",haitian,haitian,male,1166,haiti,True,,,


In [17]:
# Show how many speakers there are per native language, then per country
for column in ('native_language', 'country'):
    print(df[column].value_counts())

native_language
english     579
spanish     162
arabic      102
mandarin     65
french       63
           ... 
kalanga       1
kabyle        1
jola          1
irish         1
zulu          1
Name: count, Length: 214, dtype: int64
country
usa         393
china        88
uk           67
india        59
canada       54
           ... 
namibia       1
romanian      1
burundi       1
rwanda        1
benin         1
Name: count, Length: 176, dtype: int64


In [18]:
# Split the speakers into two classes by native language -- English vs. everything
# else -- and keep only the filenames whose recording is present
# (i.e. the file_missing? column is False).

# Languages with fewer speakers than this are dropped before balancing
MIN_SPEAKERS_PER_LANGUAGE = 20


def available_filenames(frame, class_mask):
    """Return the `filename` column of the rows selected by `class_mask`
    whose `file_missing?` flag is False."""
    return frame[class_mask & (frame['file_missing?'] == False)]['filename']


is_english = df['native_language'] == 'english'
english = available_filenames(df, is_english)
non_english = available_filenames(df, ~is_english)

print('Number of english: ', len(english))
print('Number of non-english: ', len(non_english))

# Print total number of rows in the table (this count includes missing recordings)
print('Total number of files: ', len(df))

# Collect the native languages that have fewer than MIN_SPEAKERS_PER_LANGUAGE speakers
language_counts = df['native_language'].value_counts()
native_languages = language_counts[language_counts < MIN_SPEAKERS_PER_LANGUAGE].index.tolist()

# Drop every speaker of those under-represented languages.
# NOTE: df is rebound to the filtered frame, so re-running this cell without
# re-running the CSV load makes the first set of counts above reflect the filtered data.
df = df[~df['native_language'].isin(native_languages)]

# Re-select both classes from the filtered table
is_english = df['native_language'] == 'english'
english = available_filenames(df, is_english)
non_english = available_filenames(df, ~is_english)

# Balance the classes: draw (with a fixed seed) as many non-English filenames
# as there are English ones
non_english = non_english.sample(n=len(english), random_state=42)

print('Number of english: ', len(english))
print('Number of non-english: ', len(non_english))


Number of english:  579
Number of non-english:  1559
Total number of files:  2172
Number of english:  579
Number of non-english:  579


In [19]:
# Create a helper function for extracting the MFCCs from the audio files
def extract_mfcc_1d(audio_file, n_mfcc=40, sample_rate=16000, load=True):
    """Summarise one recording as a vector of time-averaged MFCCs.

    Parameters
    ----------
    audio_file
        When ``load`` is true, whatever ``librosa.load`` accepts (the notebook
        passes a file path). When ``load`` is false, the already-decoded samples,
        handed straight to ``librosa.feature.mfcc`` as ``y``.
    n_mfcc
        Number of MFCC coefficients to compute.
    sample_rate
        Rate passed to ``librosa.feature.mfcc`` when ``load`` is false.
        Gotcha: when ``load`` is true this argument is ignored -- it is replaced
        by the rate ``librosa.load`` returns, and ``librosa.load`` is called
        without an explicit ``sr``.
    load
        Whether ``audio_file`` still has to be decoded with ``librosa.load``.

    Returns
    -------
    The MFCC matrix (computed with ``n_fft=512``, ``hop_length=256``) averaged
    over axis 1, i.e. one mean value per coefficient.
    """
    if load:
        samples, rate = librosa.load(audio_file)
    else:
        samples, rate = audio_file, sample_rate

    coefficients = librosa.feature.mfcc(
        y=samples, sr=rate, n_mfcc=n_mfcc, n_fft=512, hop_length=256
    )
    return np.mean(coefficients, axis=1)

In [20]:
# Hold out 125 filenames per class (fixed seed) to form the validation split
english_validation = english.sample(n=125, random_state=42)
non_english_validation = non_english.sample(n=125, random_state=42)

# Everything that was not held out stays in the training pool.
# NOTE: english / non_english are rebound here, so re-running this cell on its own
# draws a second hold-out from the already reduced pools.
english = english[~english.index.isin(english_validation.index)]
non_english = non_english[~non_english.index.isin(non_english_validation.index)]

# Stack the two classes: English filenames first, then non-English
val_set = pd.concat(objs=[english_validation, non_english_validation])
train_set = pd.concat(objs=[english, non_english])

# Preview the validation filenames (up to the first 200 rows)
val_set.head(200)


613       english324
886       english570
923        english82
828       english518
718       english419
            ...     
1681       romanian4
1640    portuguese42
1700       russian19
1314        korean22
2047       turkish13
Name: filename, Length: 200, dtype: object

In [21]:
# Build the MFCC feature vectors and labels for the training split (with data
# augmentation) and for the validation split (no augmentation).


def _mfcc_variants(signal, sample_rate, augment):
    """Return the list of 40-coefficient mean-MFCC vectors for one decoded recording.

    Without augmentation the list holds only the vector of the recording itself.
    With augmentation it holds six vectors, in this order: original,
    pitch_shift(..., 2), pitch_shift(..., -2), original + background noise,
    pitch_shift(..., 2) + noise, pitch_shift(..., -2) + noise.
    """
    def mean_mfcc(samples):
        return extract_mfcc_1d(samples, n_mfcc=40, load=False, sample_rate=sample_rate)

    variants = [mean_mfcc(signal)]
    if not augment:
        return variants

    # Each pitch shift is computed once and reused for its clean and noisy variant.
    # Assumes pitch_shift / add_background_noise (from AudioAugmentations) return a
    # new array and leave their input unmodified -- TODO confirm.
    # NOTE(review): the third pitch_shift argument is presumably a step count in
    # semitones and 0.005 a noise level -- confirm against AudioAugmentations.
    shifted_up = pitch_shift(signal, sample_rate, 2)
    shifted_down = pitch_shift(signal, sample_rate, -2)

    variants.append(mean_mfcc(shifted_up))
    variants.append(mean_mfcc(shifted_down))
    for source in (signal, shifted_up, shifted_down):
        variants.append(mean_mfcc(add_background_noise(source, 0.005)))
    return variants


def _featurize_split(files, augment):
    """Compute MFCC vectors and 0/1 labels for every filename in `files`.

    Filenames whose .mp3 is not found under recordings/recordings/ are skipped.
    Returns two parallel lists: the feature vectors and their labels
    (1 when the filename contains 'english', otherwise 0).
    """
    features = []
    labels = []
    processed = 0
    for file in files:
        # Check if file_name exists, if it doesn't, skip it
        file_name = 'recordings/recordings/' + file + '.mp3'
        if not os.path.exists(file_name):
            continue

        # Decode the recording once; every variant below is derived from this signal
        signal, sample_rate = librosa.load(file_name)
        variants = _mfcc_variants(signal, sample_rate, augment)
        features.extend(variants)

        # If the file name has english in it, it's an english speaker, otherwise
        # it's a non-english speaker; every variant of a file shares its label
        label = 1 if 'english' in file else 0
        labels.extend([label] * len(variants))

        # Print a running count of processed files to keep track of progress
        processed += 1
        if processed % 100 == 0:
            print(processed)
    return features, labels


# Training split: original recording plus five augmented variants per file
train_set_mfcc, train_set_labels = _featurize_split(train_set, augment=True)

# Validation split: no need to apply data augmentation
val_set_mfcc, val_set_labels = _featurize_split(val_set, augment=False)

# Convert the lists to numpy arrays
train_set_mfcc = np.array(train_set_mfcc)
val_set_mfcc = np.array(val_set_mfcc)

# Convert the labels to numpy arrays
train_set_labels = np.array(train_set_labels)
val_set_labels = np.array(val_set_labels)

# Print shapes of first and last mfcc in both arrays
print(train_set_mfcc[0].shape)
print(train_set_mfcc[-1].shape)

print(val_set_mfcc[0].shape)
print(val_set_mfcc[-1].shape)

# Print the shape of the arrays
print("Size of training set: ", train_set_mfcc.shape)
print("Size of validation set: ", val_set_mfcc.shape)

# Count the number of english and non-english speakers in the training and validation sets
print("Number of english speakers in training set: ", np.sum(train_set_labels))
print("Number of non-english speakers in training set: ", len(train_set_labels) - np.sum(train_set_labels))

print("Number of english speakers in validation set: ", np.sum(val_set_labels))
print("Number of non-english speakers in validation set: ", len(val_set_labels) - np.sum(val_set_labels))


100
200
300
400
500
600
700
800
900
100
200
(40,)
(40,)
(40,)
(40,)
Size of training set:  (5448, 40)
Size of validation set:  (250, 40)
Number of english speakers in training set:  2724
Number of non-english speakers in training set:  2724
Number of english speakers in validation set:  125
Number of non-english speakers in validation set:  125


In [22]:
# # Add a label of 1 to the english class, and 0 to the non-english class
# english_labels = np.ones(english_mfcc.shape[0])
# non_english_labels = np.zeros(non_english_mfcc.shape[0])

# # Combine the english and non-english data into one array
# X = np.concatenate((english_mfcc, non_english_mfcc))
# y = np.concatenate((english_labels, non_english_labels))

# # Print the shape of the combined array
# print(X.shape)
# print(y.shape)



In [23]:
# Persist both splits (features and labels) into a single .npz archive
arrays_to_save = {
    'X_train': train_set_mfcc,
    'y_train': train_set_labels,
    'X_val': val_set_mfcc,
    'y_val': val_set_labels,
}
np.savez('mfcc_data_40_augmented.npz', **arrays_to_save)