# Classification of Native and Non-Native English Accents

In [30]:
import os
import IPython.display as ipd
import librosa
import librosa.display
import matplotlib.pyplot as plt
import torch
import torchaudio
import torchaudio.transforms as TaudioT
import torch.nn.functional as F
import numpy as np
import random


## Data Exploration

The different accents we have in our dataset

In [4]:
data_path = './accentdb_extended_combined/data/'
accents = [accent for accent in os.listdir(data_path) if accent[0] != '.']
print(accents)
print('count: ', len(accents))

['indian', 'malayalam', 'australian', 'odiya', 'welsh', 'telugu', 'bangla', 'british', 'american']
count:  9


The number of audio samples we have for each accents

In [5]:
for accent in accents:
    files = [file for file in os.listdir(data_path + accent + '/') if file[0] != '.']
    print((accent, len(files)))

('indian', 1484)
('malayalam', 2393)
('australian', 1484)
('odiya', 747)
('welsh', 742)
('telugu', 1515)
('bangla', 1528)
('british', 1484)
('american', 5936)


Let's take a sample wave file and listen to what the recording sounds like.

In [6]:
wave_file = "./accentdb_extended_combined/data/indian/indian_s01_008.wav"
# Audio is part of IPython's disply module providing audio controls
ipd.Audio(filename=wave_file)

Let's see a pressure vs time graph of the wave file

In [None]:
# librosa is a python package for music and audio analysis
# load converts a wave file to audio time series numpy array
# return:
# x -> numpy ndarray; audio time series multi channel supported
# sr -> scalar value; sample rate
audio, sample_rate = librosa.load(wave_file)
plt.figure(figsize=(14, 5))
# librosa.display.waveshow(x, sr=sr)
plt.show()

Mel-frequency cepstral coefficients (MFCC) is known in the audio signal analisys field to be the best representation for human speech audio signals.  Let's convert this wave file into an MFCC frame, and see what it looks like.

In [None]:
# librosa's feature extraction module mfcc converter
# returns a numpy array of mfcc sequence
mfccs = librosa.feature.mfcc(y=audio, sr=sample_rate, n_mfcc=30)
print(mfccs.shape)

plt.figure(figsize=(14, 7))
librosa.display.specshow(mfccs, sr=sample_rate, x_axis='time')
plt.show()

Exploring meta data

In [None]:
for accent in accents:
    path = data_path + accent + '/'
    print(accent)
    directory = [file for file in os.listdir(path) if file[0] != '.']
    for file in directory:
        file_path = path + file
        print(torchaudio.info(file_path))
        audio, sample_rate = librosa.load(file_path)
        plt.figure(figsize=(14, 5))
        librosa.display.waveshow(audio, sr=sample_rate)
        plt.show()
        break

We see difference in sample rate (22050 vs 48000) and number of channels (mono vs stereo) which translates into difference in dimensions of resulting MFCC representation which is an issue for our modeling process that we need to address.

In [165]:
# helper functions
def mono_to_stereo(signal):
    x, sr = signal
    if x.shape[0] == 2:
        return signal
    stereo = torch.cat([x, x])
    return stereo, sr

# convert to mono
def stereo_to_mono(signal):
    audio, sample_rate = signal
    print("Before convert to mono", audio.shape, sample_rate)
    if audio.shape[0] == 1:
        return signal
    audio = torch.mean(audio)
    return audio, sample_rate
   


# resample
def resample(signal, target_sample_rate=22050):
    audio, sample_rate = signal
    print("Before resample", audio.shape, sample_rate)
    if sample_rate == target_sample_rate:
        return signal
    else:
        resample = TaudioT.Resample(sample_rate, target_sample_rate)
        audio = resample(audio)
        return audio, target_sample_rate

def resample1(signal, new_sr=22050):
    x, sr = signal
    if sr == new_sr:
        return signal
    channel_1 = TaudioT.Resample(sr, new_sr)(x[:1, :])
    # channel_2 = TaudioT.Resample(sr, new_sr)(x[1:, :])
    new_x = torch.cat([channel_1])
    return new_x, new_sr

# adjust samples
def adjust_samples(signal, target_num_samples= 66150):
    audio, sample_rate = signal

    # check shape and sample rate
    print("Before adjust sample", audio.shape, sample_rate)

    if audio.shape[0] > target_num_samples:
        # Crop
        audio = audio[ :target_num_samples]
        return audio, sample_rate
    elif audio.shape[0] < target_num_samples:
        # Pad
        audio = F.pad(audio, (0, target_num_samples - audio.shape[0]))
        return audio, sample_rate
    return audio, sample_rate  
    

def limit_length(signal, ms=3000):
    audio, sample_rate = signal
    rows, audio_len = audio.shape
    max_len = sample_rate // 1000 * ms
    if audio_len > max_len:
        # audio = audio[:, :max_len]
        audio = audio[:, :max_len]
    elif audio_len < max_len:
        diff = max_len - audio_len
        append_start_len = random.randint(0, diff)
        append_stop_len = diff - append_start_len
        append_start = torch.zeros((rows, append_start_len))
        append_stop = torch.zeros((rows, append_stop_len))

        audio = torch.cat((append_start, audio, append_stop), 1)
    return audio, sample_rate

def convert_to_mel(signal, target_sample_rate= 66150):
    audio, sample_rate = signal
    melspectrogram = TaudioT.MelSpectrogram(
        sample_rate = sample_rate, 
        n_mels = 20, 
        n_fft = 512, 
        hop_length = None)
    mfcc_frames = melspectrogram(audio)
    spec = TaudioT.AmplitudeToDB(top_db=80)(mfcc_frames)
    return spec
    
def mfcc(signal):
    x, sr = signal
    melkwargs = {
        "n_fft": 512, "n_mels": 20, "hop_length": None, "mel_scale": "htk"
    }
    mfcc_transformer = TaudioT.MFCC(
        sample_rate = sr,
        n_mfcc = 20, 
        melkwargs=melkwargs
    )
    mfcc_frames = mfcc_transformer(x)
    spec = TaudioT.AmplitudeToDB(top_db=80)(mfcc_frames)
    return spec


Let's convert one audio sample to see what we are dealing with

In [167]:
signal = torchaudio.load(wave_file)
signal = mono_to_stereo(signal)
signal = resample(signal)
signal = limit_length(signal)
mfcc_frame = mfcc(signal)

print(mfcc_frame.shape)

mfcc_frame

Before resample torch.Size([2, 85995]) 22050
torch.Size([2, 20, 258])


tensor([[[-60.6520, -60.6520, -60.6520,  ..., -60.6520, -60.6520, -60.6520],
         [-51.1751, -51.1751, -51.1751,  ...,  17.6589,  17.0803,  15.8415],
         [-60.6520, -60.6520, -60.6520,  ...,   9.8607,  12.2282,  15.3684],
         ...,
         [-42.0936, -42.0936, -42.0936,  ...,  -3.7579,   5.5471,   3.9661],
         [-60.6520, -60.6520, -60.6520,  ...,  -2.3994, -60.6520,   2.3849],
         [-39.7187, -39.7187, -39.7187,  ..., -60.6520, -22.3304, -60.6520]],

        [[-60.6520, -60.6520, -60.6520,  ..., -60.6520, -60.6520, -60.6520],
         [-51.1751, -51.1751, -51.1751,  ...,  17.6589,  17.0803,  15.8415],
         [-60.6520, -60.6520, -60.6520,  ...,   9.8607,  12.2282,  15.3684],
         ...,
         [-42.0936, -42.0936, -42.0936,  ...,  -3.7579,   5.5471,   3.9661],
         [-60.6520, -60.6520, -60.6520,  ...,  -2.3994, -60.6520,   2.3849],
         [-39.7187, -39.7187, -39.7187,  ..., -60.6520, -22.3304, -60.6520]]])

Let's convert the rest of the audio samples.  Since "welsh" has the lowest number of samples at 742, to keep balance in our dataset, we will convert 742 samples from each accent.

In [163]:
def convert(wave):
    signal = torchaudio.load(wave)
    signal = stereo_to_mono(signal)
    signal = resample1(signal)
    signal = adjust_samples(signal)
    spec = mfcc(signal)
    return spec

X_full = []
y_full = []

for accent in accents:
    # test with a smaller count
    count = 5
    for file in os.listdir(data_path + accent + '/'):
        if file[0] != '.':
            wave_file_path = data_path + accent + '/' + file
            X_full.append(convert(wave_file_path))
            y_full.append(accents.index(accent))
            count -= 1
        if count == 0: break


Before convert to mono torch.Size([1, 81585]) 22050
Before resample torch.Size([1, 81585]) 22050
Before adjust sample torch.Size([1, 81585]) 22050
Before convert to mono torch.Size([1, 79380]) 22050
Before resample torch.Size([1, 79380]) 22050
Before adjust sample torch.Size([1, 79380]) 22050
Before convert to mono torch.Size([1, 74970]) 22050
Before resample torch.Size([1, 74970]) 22050
Before adjust sample torch.Size([1, 74970]) 22050
Before convert to mono torch.Size([1, 74970]) 22050
Before resample torch.Size([1, 74970]) 22050
Before adjust sample torch.Size([1, 74970]) 22050
Before convert to mono torch.Size([1, 97020]) 22050
Before resample torch.Size([1, 97020]) 22050
Before adjust sample torch.Size([1, 97020]) 22050
Before convert to mono torch.Size([2, 249600]) 48000
Before resample torch.Size([]) 48000


IndexError: tuple index out of range

In [154]:
# X_full = np.stack(X_full)
# y_full = np.array(y_full)
X_full[0].shape
y_full
Y_full = []

non_native_indexes = [1,3,5,6]

for index in y_full: 
    if index in non_native_indexes:
        Y_full.append("non_native_english")
    else: 
        Y_full.append("native_english")

Check the counts of non_native_english and native_english. Should be 2968 and 3710

In [155]:
print(Y_full.count("non_native_english"))
print(Y_full.count("native_english"))


20
25


Check that the shapes of the entries in X_full are the same

In [160]:
for i in range(20):
    print(X_full[i].shape)

torch.Size([1, 20, 578])
torch.Size([1, 20, 569])
torch.Size([1, 20, 552])
torch.Size([1, 20, 552])
torch.Size([1, 20, 638])
torch.Size([20, 259])
torch.Size([20, 259])
torch.Size([20, 259])
torch.Size([20, 259])
torch.Size([20, 259])
torch.Size([1, 20, 560])
torch.Size([1, 20, 535])
torch.Size([1, 20, 543])
torch.Size([1, 20, 543])
torch.Size([1, 20, 569])
torch.Size([20, 259])
torch.Size([20, 259])
torch.Size([20, 259])
torch.Size([20, 259])
torch.Size([20, 259])


In [None]:
X_full.shape

In [None]:
y_full.shape

In [None]:
np.save('./dataset.npy', X_full)