In [9]:
import numpy as np
import pandas as pd
import librosa
import os

In [10]:
# Read the speaker metadata table into a DataFrame
df = pd.read_csv(filepath_or_buffer='speakers_all.csv')

# Show the top five rows as this cell's output
df.head(n=5)

Unnamed: 0,age,age_onset,birthplace,filename,native_language,sex,speakerid,country,file_missing?,Unnamed: 9,Unnamed: 10,Unnamed: 11
0,24.0,12.0,"koussi, senegal",balanta,balanta,male,788,senegal,True,,,
1,18.0,10.0,"buea, cameroon",cameroon,cameroon,male,1953,cameroon,True,,,
2,48.0,8.0,"hong, adamawa, nigeria",fulfulde,fulfulde,male,1037,nigeria,True,,,
3,42.0,42.0,"port-au-prince, haiti",haitian,haitian,male,1165,haiti,True,,,
4,40.0,35.0,"port-au-prince, haiti",haitian,haitian,male,1166,haiti,True,,,


In [11]:
# Report how many rows fall under each native language, then each country
for column in ('native_language', 'country'):
    print(df[column].value_counts())

native_language
english     579
spanish     162
arabic      102
mandarin     65
french       63
           ... 
kalanga       1
kabyle        1
jola          1
irish         1
zulu          1
Name: count, Length: 214, dtype: int64
country
usa         393
china        88
uk           67
india        59
canada       54
           ... 
namibia       1
romanian      1
burundi       1
rwanda        1
benin         1
Name: count, Length: 176, dtype: int64


In [12]:
# Build two classes of recordings keyed on native language (English vs. any
# other language) and keep only the filenames whose 'file_missing?' flag is False.
def _split_by_english(frame):
    """Return (english, non_english) filename Series for rows with file_missing? == False."""
    has_file = frame['file_missing?'].eq(False)
    speaks_english = frame['native_language'].eq('english')
    english_files = frame.loc[speaks_english & has_file, 'filename']
    other_files = frame.loc[~speaks_english & has_file, 'filename']
    return english_files, other_files


def _report_class_sizes(english_files, other_files):
    """Print the number of filenames in each class."""
    print('Number of english: ', len(english_files))
    print('Number of non-english: ', len(other_files))


english, non_english = _split_by_english(df)
_report_class_sizes(english, non_english)

# Row count of the metadata table (rows flagged as missing are included here)
print('Total number of files: ', len(df))

# Collect the native languages that have fewer than 25 speakers in the table
speaker_counts = df['native_language'].value_counts()
native_languages = speaker_counts.loc[speaker_counts.lt(25)].index.tolist()

# Drop every row belonging to one of those under-represented languages
df = df.loc[~df['native_language'].isin(native_languages)]

# Recompute both classes on the filtered table and report the new sizes
english, non_english = _split_by_english(df)
_report_class_sizes(english, non_english)


Number of english:  579
Number of non-english:  1559
Total number of files:  2172
Number of english:  579
Number of non-english:  780


In [13]:
# Helper function to calculate the spectrogram
def calculate_spectrogram(audio_file, n_fft=2048, hop_length=512):
    """Load an audio file and return the magnitude of its STFT.

    Parameters
    ----------
    audio_file : path passed straight to ``librosa.load`` (default settings).
    n_fft : FFT window size forwarded to ``librosa.stft``.
    hop_length : hop size forwarded to ``librosa.stft``.

    Returns
    -------
    The element-wise absolute value of the STFT matrix.
    """
    # The sample rate returned by the loader is not needed for a plain STFT.
    waveform, _ = librosa.load(audio_file)
    stft_matrix = librosa.stft(waveform, n_fft=n_fft, hop_length=hop_length)
    magnitude = np.abs(stft_matrix)
    return magnitude

def truncate_spectrogram(spectrogram, fixed_length):
    """Keep at most the first ``fixed_length`` columns (second axis) of a 2-D spectrogram.

    Inputs with fewer than ``fixed_length`` columns keep all of their columns.
    """
    column_window = slice(None, fixed_length)
    return spectrogram[:, column_window]

def calculate_mel_spectrogram(audio_file, n_fft=2048, hop_length=512, n_mels=128):
    """Load an audio file and compute its mel spectrogram.

    Parameters
    ----------
    audio_file : path passed straight to ``librosa.load`` (default settings).
    n_fft, hop_length, n_mels : forwarded to ``librosa.feature.melspectrogram``.

    Returns
    -------
    Whatever ``librosa.feature.melspectrogram`` returns for the loaded signal.
    """
    waveform, sr = librosa.load(audio_file)
    mel_kwargs = dict(y=waveform, sr=sr, n_fft=n_fft, hop_length=hop_length, n_mels=n_mels)
    return librosa.feature.melspectrogram(**mel_kwargs)

def calculate_mel_spectrogram_no_load(signal, sample_rate, n_fft=2048, hop_length=512, n_mels=128):
    """Compute the mel spectrogram of an already-loaded signal.

    Parameters
    ----------
    signal : audio samples, passed as ``y`` to ``librosa.feature.melspectrogram``.
    sample_rate : passed as ``sr``.
    n_fft, hop_length, n_mels : forwarded unchanged.
    """
    mel_kwargs = {
        'y': signal,
        'sr': sample_rate,
        'n_fft': n_fft,
        'hop_length': hop_length,
        'n_mels': n_mels,
    }
    return librosa.feature.melspectrogram(**mel_kwargs)

def normalize_spectrogram(spectrogram):
    """Standardize a spectrogram to zero mean and unit standard deviation.

    The mean and standard deviation are taken over all elements of the array.

    A constant input (e.g. a silent clip) has a standard deviation of exactly
    zero; dividing by it would fill the result with NaN. In that case the
    centered values (all zeros) are returned instead.

    Parameters
    ----------
    spectrogram : array-like of numbers.

    Returns
    -------
    Array of the same shape as the input.
    """
    mean = np.mean(spectrogram)
    std_dev = np.std(spectrogram)
    # Dividing by 1 instead of 0 keeps the result dtype of the normal path
    # while avoiding the 0/0 -> NaN outcome for constant inputs.
    scale = std_dev if std_dev != 0 else 1
    return (spectrogram - mean) / scale

def min_max_normalize_spectrogram(spectrogram):
    """Rescale a spectrogram linearly so its minimum maps to 0 and its maximum to 1.

    The minimum and maximum are taken over all elements of the array.

    When every element is identical (max == min) the range is zero and the
    division would produce NaN; such inputs are mapped to all zeros instead.

    Parameters
    ----------
    spectrogram : non-empty array-like of numbers.

    Returns
    -------
    Array of the same shape as the input.
    """
    min_value = np.min(spectrogram)
    value_range = np.max(spectrogram) - min_value
    # Dividing by 1 instead of 0 keeps the result dtype of the normal path
    # while avoiding the 0/0 -> NaN outcome for constant inputs.
    divisor = value_range if value_range != 0 else 1
    return (spectrogram - min_value) / divisor

def pitch_shift(audio_data, sample_rate, pitch_shift_steps):
    """Thin wrapper around ``librosa.effects.pitch_shift``.

    ``audio_data`` is passed as ``y``, ``sample_rate`` as ``sr`` and
    ``pitch_shift_steps`` as ``n_steps``; the library's result is returned as-is.
    """
    shift_kwargs = {'y': audio_data, 'sr': sample_rate, 'n_steps': pitch_shift_steps}
    shifted_audio = librosa.effects.pitch_shift(**shift_kwargs)
    return shifted_audio

def add_background_noise(audio_data, noise_amplitude, rng=None):
    """Return ``audio_data`` with zero-mean Gaussian noise added to every sample.

    Parameters
    ----------
    audio_data : array-like of samples, any shape.
    noise_amplitude : standard deviation of the noise.
    rng : optional object exposing ``normal(loc, scale, size)`` (for example a
        ``numpy.random.Generator``). When omitted, NumPy's global random state
        is used, which is what this function has always drawn from.

    Returns
    -------
    Array with the same shape as ``audio_data``.
    """
    generator = np.random if rng is None else rng
    # Size the noise from the full shape, not len(): len() only covers the
    # first axis, so multi-channel input would get noise that cannot be
    # broadcast against the signal.
    noise = generator.normal(0, noise_amplitude, size=np.shape(audio_data))
    return audio_data + noise


In [14]:
# Build mel-spectrogram arrays for the english and non-english classes, plus an
# augmented set (pitch-shifted, noisy, and noisy pitch-shifted copies of every clip).

# Tunable settings for this stage
RECORDINGS_DIR = 'recordings/recordings/'
N_FFT = 2048
HOP_LENGTH = 512
PITCH_SHIFT_STEPS = 2
NOISE_AMPLITUDE = 0.005
ENGLISH_LABEL = 1
NON_ENGLISH_LABEL = 0
AUGMENTATION_SEED = 42


def _collect_spectrograms(filenames, label):
    """Compute original and augmented mel spectrograms for one class.

    Files that do not exist under RECORDINGS_DIR are skipped. Each existing
    file is decoded once and pitch-shifted once; the results are reused for
    every variant.

    Returns
    -------
    (originals, augmented, labels) : three lists. ``augmented`` holds, per
    file and in this order, the pitch-shifted, noisy, and noisy pitch-shifted
    spectrograms; ``labels`` holds ``label`` once per augmented spectrogram.
    """
    originals, augmented, labels = [], [], []
    processed = 0
    for name in filenames:
        path = RECORDINGS_DIR + name + '.mp3'
        if not os.path.exists(path):
            continue

        signal, sample_rate = librosa.load(path)
        originals.append(
            calculate_mel_spectrogram_no_load(signal=signal, sample_rate=sample_rate, n_fft=N_FFT, hop_length=HOP_LENGTH)
        )

        # Pitch shifting is the expensive step, so do it once and reuse it.
        shifted = pitch_shift(signal, sample_rate, PITCH_SHIFT_STEPS)
        variants = (
            shifted,
            add_background_noise(signal, NOISE_AMPLITUDE),
            add_background_noise(shifted, NOISE_AMPLITUDE),
        )
        for variant in variants:
            augmented.append(
                calculate_mel_spectrogram_no_load(signal=variant, sample_rate=sample_rate, n_fft=N_FFT, hop_length=HOP_LENGTH)
            )
            labels.append(label)

        # Progress counter: one line per 100 processed files
        processed += 1
        if processed % 100 == 0:
            print(processed)
    return originals, augmented, labels


def _prepare_batch(spectrograms, length):
    """Truncate to ``length`` frames, min-max normalize, add a leading channel axis, and stack."""
    return np.array([
        np.expand_dims(min_max_normalize_spectrogram(truncate_spectrogram(spec, length)), axis=0)
        for spec in spectrograms
    ])


# Seed NumPy's global RNG so the noise augmentation is the same on every run
np.random.seed(AUGMENTATION_SEED)

# English class first, then non-english; augmented samples keep that order
english_spectro, augmented_spectro, augmented_labels = _collect_spectrograms(english, ENGLISH_LABEL)
non_english_spectro, _non_english_augmented, _non_english_aug_labels = _collect_spectrograms(non_english, NON_ENGLISH_LABEL)
augmented_spectro.extend(_non_english_augmented)
augmented_labels.extend(_non_english_aug_labels)

# Find the shortest spectrogram (in frames) across every set
_all_spectro = english_spectro + non_english_spectro + augmented_spectro
if not _all_spectro:
    raise ValueError('No recordings were found under ' + RECORDINGS_DIR)
min_length = min(spec.shape[1] for spec in _all_spectro)

# Truncate to the common length, min-max normalize each spectrogram, add the
# channel dimension as the first axis, and convert each set to a numpy array
english_spectro = _prepare_batch(english_spectro, min_length)
non_english_spectro = _prepare_batch(non_english_spectro, min_length)
augmented_spectro = _prepare_batch(augmented_spectro, min_length)

# Print shapes of the first and last spectrogram in each set
for _batch in (english_spectro, non_english_spectro, augmented_spectro):
    print(_batch[0].shape)
    print(_batch[-1].shape)

# Print the shape of the arrays
print("Size of english_spectros: ", english_spectro.shape)
print("Size of non_english_spectros: ", non_english_spectro.shape)
print("Size of augmented_spectros: ", augmented_spectro.shape)


100
200
300
400
500
100
200
300
400
500
600
700
(1, 128, 709)
(1, 128, 709)
(1, 128, 709)
(1, 128, 709)
(1, 128, 709)
(1, 128, 709)
Size of english_spectros:  (579, 1, 128, 709)
Size of non_english_spectros:  (780, 1, 128, 709)
Size of augmented_spectros:  (4077, 1, 128, 709)


In [15]:
# Label vectors: 1.0 for every english sample, 0.0 for every non-english sample
english_labels = np.ones(len(english_spectro))
non_english_labels = np.zeros(len(non_english_spectro))
augmented_labels = np.array(augmented_labels)

# Stack both classes (english first) into a single feature array and label vector
X = np.concatenate([english_spectro, non_english_spectro], axis=0)
y = np.concatenate([english_labels, non_english_labels], axis=0)

# Show the shapes of the combined feature array and label vector
for combined in (X, y):
    print(combined.shape)



(1359, 1, 128, 709)
(1359,)


In [16]:
# Write the original and augmented features and labels into one .npz archive
arrays_to_save = {
    'X': X,
    'y': y,
    'X_augmented': augmented_spectro,
    'y_augmented': augmented_labels,
}
np.savez('mel_spectro_data_min_max_norm_augmented.npz', **arrays_to_save)