#### Data Processing: Part 3

This notebook contains the functions and code used to process the audio files obtained from commonvoice.mozilla.org.

These samples were obtained later in the process than the first four sources, and were originally intended for use exclusively as validation samples. They were, in general, shorter than the previous samples, so they didn't need to be split into many pieces. They also went through a different process: near the end of the project I began to prioritize storage space over runtime, so each file was transformed from audio to time series to MFCC in a single pass, without saving the intermediate results.

In [2]:
import warnings
warnings.filterwarnings('ignore')

In [3]:
import os
import pickle
from os import listdir

import librosa
import numpy as np
from librosa.feature import mfcc

In [27]:
def audio_file_listing(source, lang_abbr):
    """
    Lists the path names for all the audio files for a given source and language.
    
    Only files whose names end in '.mp3' or '.wav' are listed, and the listing is
    sorted so that repeated runs see the files in the same order.
    
    Parameters:
        source (str) : the folder name where a source's audio is saved
        lang_abbr (str) : the two letter abbreviation of the language to list audio samples of
    
    Returns:
        file_listing (numpy array) : the path names for all the source's audio files of the specified language
    """
    base_path = '../audio/' + source + '/' + lang_abbr + '/'
    # Match on the extension rather than on a substring, so names such as
    # 'clip.mp3.txt' are not mistaken for audio. listdir() returns names in
    # arbitrary order, hence the sort.
    audio_names = sorted(
        name for name in listdir(base_path) if name.endswith(('.mp3', '.wav')))
    return np.array([base_path + name for name in audio_names])

In [28]:
def save_mfcc_target(folder, source, abbr, n_mfcc=10):
    """
    Saves the master mfcc array and corresponding target array
    
    The arrays are pickled to '../data/[folder]/[source]_[abbr]_mfcc.pkl' and
    '../data/[folder]/[source]_[abbr]_target.pkl'. The output folder is created
    if it does not exist yet.
    
    Parameters:
        folder (str) : the file path to save to
        source (str) : the folder name where a source's audio is saved
        abbr (str) : the two letter abbreviation of the language to save
        n_mfcc (int) : the number of Mel frequency cepstrum coefficients to be calculated (default 10)
    """
    audio_in_path = '../audio/' + source + '/' + abbr + '/'
    out_dir = '../data/' + folder + '/'
    mfcc_out_path = out_dir + source + '_' + abbr + '_mfcc.pkl'
    target_out_path = out_dir + source + '_' + abbr + '_target.pkl'
    
    # Make sure the destination exists BEFORE the long-running computation, so
    # a missing folder cannot throw the finished arrays away at save time.
    os.makedirs(out_dir, exist_ok=True)
    
    mfcc_arr, target_arr = mfcc_and_target(audio_in_path, n_mfcc)
    
    with open(mfcc_out_path, 'wb') as f:
        pickle.dump(mfcc_arr, f)
    with open(target_out_path, 'wb') as g:
        pickle.dump(target_arr, g)

In [29]:
def add_mfcc(mfcc_arr, file, sample_n, n_mfcc=10):
    """
    Adds a sample's mfcc to the master array
    
    The audio is loaded at 16 kHz, trimmed with librosa.effects.trim, and then
    either cut to its first 5 seconds or padded on both sides to exactly
    5 seconds before the mfcc is computed. The master array is modified in place;
    nothing is returned. If anything goes wrong for this sample, its slot is
    filled with NaNs so the caller can drop it later.
    
    Parameters:
        mfcc_arr (numpy array) : the master array to add to
        file (str) : the file path of the audio sample to add
        sample_n (int) : the sample number of the file, used to index the mfcc to be added
        n_mfcc (int) : the number of Mel frequency cepstrum coefficients to be calculated (default 10)
    """
    sample_rate = 16000
    target_len = 5 * sample_rate  # number of samples in a 5 second clip
    # Width of the NaN placeholder. Presumably the frame count librosa yields
    # for a 5 s clip at 16 kHz -- confirm before changing the clip length.
    n_frames = 157

    try:
        timeseries, _ = librosa.load(file, sr=sample_rate)
        trimmed, _ = librosa.effects.trim(timeseries)

        if len(trimmed) > target_len:
            sample = trimmed[:target_len]
        else:
            # Split the missing samples between both ends; when the amount is
            # odd the extra one goes at the end.
            pad = (target_len - len(trimmed)) / 2
            sample = np.pad(trimmed, (int(np.floor(pad)), int(np.ceil(pad))))

        # Pass the signal by keyword: recent librosa releases reject a
        # positional signal, and the resulting TypeError would otherwise be
        # swallowed below and turn every sample into NaNs.
        mfcc_arr[sample_n] = mfcc(y=sample, sr=sample_rate, n_mfcc=n_mfcc)
    except Exception:
        # Best effort: mark the sample as failed instead of aborting the run.
        # Exception (not a bare except) so Ctrl-C still interrupts the loop.
        mfcc_arr[sample_n] = np.full((n_mfcc, n_frames), np.nan)

In [30]:
def mfcc_and_target(path_in, n_mfcc=10):
    """
    Creates arrays of mfccs and language target values
    
    Every audio file found for the source/language encoded in path_in is
    processed exactly once, in listing order. An empty folder yields empty
    arrays rather than an error.
    
    Parameters
        path_in (str) : file path to folder of audio files ('../audio/[source]/[abbr]/')
        n_mfcc (int) : number of mel frequency cepstrum coefficients to be calculated
    
    Returns
        mfcc_arr (numpy array) : array of mfcc matrices for each file, shape (n_files, n_mfcc, 157)
        target_arr (numpy array) : array of language abbreviation target values corresponding to mfcc_arr
    """
    # The last two path components are the source and the language; stripping
    # the trailing slash first makes this work with or without one.
    path_parts = path_in.rstrip('/').split('/')
    source = path_parts[-2]
    abbr = path_parts[-1]
    
    audio_files = audio_file_listing(source, abbr)
    size = len(audio_files)
    
    # Pre-filled with NaN; add_mfcc overwrites one slot per file in place.
    mfcc_arr = np.full((size, n_mfcc, 157), np.nan)
    
    # A plain loop instead of np.vectorize: vectorize makes an extra probing
    # call on the first element (processing the first file twice) and raises
    # on empty input when otypes is not given.
    for sample_n, file in enumerate(audio_files):
        add_mfcc(mfcc_arr, file, sample_n, n_mfcc)
    
    target_arr = np.full(size, abbr)
    
    return mfcc_arr, target_arr

In [31]:
%%time
# Build and pickle the mfcc and target arrays for the 'en' clips of 'v_commonvoice'.
save_mfcc_target(folder='validation', source='v_commonvoice', abbr='en')

CPU times: user 38min 40s, sys: 1min 18s, total: 39min 58s
Wall time: 26min 14s


In [13]:
%%time
# Build and pickle the mfcc and target arrays for the 'es' clips of 'v_commonvoice'.
save_mfcc_target(folder='validation', source='v_commonvoice', abbr='es')

CPU times: user 39min 3s, sys: 1min 38s, total: 40min 42s
Wall time: 30min 25s


In [32]:
%%time
# Build and pickle the mfcc and target arrays for the 'fr' clips of 'v_commonvoice'.
save_mfcc_target(folder='validation', source='v_commonvoice', abbr='fr')

CPU times: user 36min 22s, sys: 1min 31s, total: 37min 54s
Wall time: 24min 49s


In [33]:
%%time
# Build and pickle the mfcc and target arrays for the 'ru' clips of 'v_commonvoice'.
save_mfcc_target(folder='validation', source='v_commonvoice', abbr='ru')

CPU times: user 43min 55s, sys: 1min 42s, total: 45min 37s
Wall time: 33min 8s


In [9]:
%%time
# Build and pickle the mfcc and target arrays for the 'zh' clips of 'v_commonvoice'.
save_mfcc_target(folder='validation', source='v_commonvoice', abbr='zh')

CPU times: user 52min 23s, sys: 1min 51s, total: 54min 14s
Wall time: 43min 5s


After a long wait, all the MFCC and target arrays had been saved. However, because samples that raised an exception were stored as arrays of NaNs, I had to reload the files to remove those entries. 

In [5]:
valpath = '../data/validation/'

# Reload only the raw pickles written by save_mfcc_target ('v_commonvoice_*').
# The cleaned 'commonvoice_*' files are saved into this same folder and also end
# in 'mfcc.pkl' / 'target.pkl', so an unfiltered listing would pick them up
# again when the notebook is re-run.
raw_prefix = 'v_commonvoice_'
raw_names = [f for f in listdir(valpath) if f.startswith(raw_prefix)]

# Sorted so that the i-th mfcc file and the i-th target file share a language.
mfcc_val = sorted(valpath + f for f in raw_names if f.endswith('mfcc.pkl'))
target_val = sorted(valpath + f for f in raw_names if f.endswith('target.pkl'))

assert len(mfcc_val) == len(target_val), 'every mfcc pickle needs a matching target pickle'

In [6]:
def reload(path):
    """
    Reads a pickled object back from disk
    
    Parameters:
        path (str) : the file path of the pickle
    
    Returns:
        the object that was stored in the pickle
    """
    # NOTE: unpickling can execute arbitrary code -- only use on files this project wrote.
    with open(path, mode='rb') as pickled:
        contents = pickle.load(pickled)
    return contents

In [7]:
val_mfcc_reloaded = [reload(f) for f in mfcc_val]
val_target_reloaded = [reload(f) for f in target_val]

In [15]:
# For each reloaded language, drop the samples whose mfcc contains NaNs (the
# placeholder written when a file failed to process) and save the cleaned mfcc
# array together with a new target array of the matching length.
for mfcc_loaded, target_loaded in zip(val_mfcc_reloaded, val_target_reloaded):
    if len(target_loaded) == 0:
        continue  # no samples for this language, so there is no label to read
    abbr = target_loaded[0]
    # Decide per sample rather than per element: indexing with the element-wise
    # mask would flatten the array to 1-D and lose the (sample, coefficient,
    # frame) layout. A reshape(-1, n_mfcc, 157) by a later loader is still valid.
    sample_has_nan = np.isnan(mfcc_loaded).reshape(len(mfcc_loaded), -1).any(axis=1)
    X = mfcc_loaded[~sample_has_nan]
    # One label per surviving sample; no hard-coded 157 / 10 needed.
    y = np.full(len(X), abbr)
    with open('../data/validation/commonvoice_' + abbr + '_mfcc.pkl', 'wb') as f:
        pickle.dump(X, f)
    with open('../data/validation/commonvoice_' + abbr + '_target.pkl', 'wb') as g:
        pickle.dump(y, g)