In [2]:
import librosa
import numpy as np
import pickle
from os import listdir

In [21]:
def timeseries_file_listing(lang_abbr, folder=None):
    """Return the paths of the pickled time series stored for one language.

    The directory listed is ``../data/timeseries/<lang_abbr>/`` or, when
    ``folder`` is given, ``../data/timeseries/<lang_abbr>/<folder>/``.

    Parameters
    ----------
    lang_abbr : str
        Language abbreviation naming the sub-directory to list.
    folder : str, optional
        Extra sub-directory below the language directory.

    Returns
    -------
    numpy.ndarray
        1-D string array holding ``base_path + filename`` for every directory
        entry whose name ends in ``.pkl``, sorted by file name.

    Raises
    ------
    FileNotFoundError
        If the directory does not exist (raised by ``os.listdir``).
    """
    base_path = '../data/timeseries/' + lang_abbr + '/'
    if folder is not None:
        base_path += folder + '/'

    # os.listdir returns entries in arbitrary order, so sort to make the
    # result reproducible. endswith (rather than a substring test) keeps
    # names such as 'clip.pkl.bak' out of the listing.
    names = sorted(f for f in listdir(base_path) if f.endswith('.pkl'))

    # dtype=str keeps a string array even when the directory has no matches.
    return np.array([base_path + f for f in names], dtype=str)

In [22]:
def timeseries_to_mfcc(path, array, index, n_mfcc=20, sr=16000):
    """Load one pickled time series and store its MFCC matrix in ``array[index]``.

    Parameters
    ----------
    path : str
        Path of a pickle file containing the audio time series.
    array : numpy.ndarray
        Pre-allocated output array; ``array[index]`` is overwritten in place,
        so its shape must be compatible with the MFCC matrix (librosa
        documents the result as ``(n_mfcc, n_frames)``).
    index : int
        Position along the first axis of ``array`` to write to.
    n_mfcc : int, optional
        Number of MFCC coefficients to compute.
    sr : int, optional
        Sampling rate handed to librosa; defaults to 16000.

    Returns
    -------
    None
        The result is written into ``array``.
    """
    with open(path, 'rb') as f:
        # NOTE(review): pickle.load can execute arbitrary code; only point
        # this at files produced by this project.
        reloaded = pickle.load(f)

    # Use the public entry point librosa.feature.mfcc. The previous
    # librosa.effects.feature.mfcc only resolves when the effects module
    # happens to import the feature submodule. The signal is passed as
    # y=... because recent librosa releases make it keyword-only.
    array[index] = librosa.feature.mfcc(y=reloaded, sr=sr, n_mfcc=n_mfcc)

In [23]:
# input is language abbreviation, (optional n_mfcc)

def batch_mfcc_save(abbr, n_mfcc=20, folder=None, n_frames=157):
    """Compute MFCCs for every time series of one language and pickle the results.

    Two pickle files are written: the stacked MFCC array and a target array
    that repeats ``abbr`` once per sample.

    Parameters
    ----------
    abbr : str
        Language abbreviation; selects the input directory and is used as the
        label stored in the target array.
    n_mfcc : int, optional
        Number of MFCC coefficients per frame.
    folder : str, optional
        Sub-folder used for both the input listing and the output paths
        (``../data/mfcc/<folder>/`` and ``../data/target/<folder>/``).
    n_frames : int, optional
        Length of the last axis of the pre-allocated MFCC array. Defaults to
        157, the value previously hard-coded.
        NOTE(review): every clip must produce exactly this many frames or the
        per-clip assignment fails - confirm against the clip length used
        upstream.

    Returns
    -------
    tuple of (str, str)
        ``(mfcc_file_name, target_file_name)`` of the two files written.
    """
    # timeseries_file_listing creates the array of input path names
    path_arr = timeseries_file_listing(abbr, folder)

    # pre-allocate one (n_mfcc, n_frames) slot per input file
    size = len(path_arr)
    mfcc_arr = np.zeros((size, n_mfcc, n_frames))

    # Plain loop instead of np.vectorize: vectorize evaluates the first
    # element an extra time to infer an output dtype (so the first file was
    # processed twice) and raises on an empty listing.
    for index, path in enumerate(path_arr):
        timeseries_to_mfcc(path, mfcc_arr, index, n_mfcc=n_mfcc)

    # one label per sample, all equal to the language abbreviation
    target_arr = np.full(size, abbr)

    # Build both output names up front. Previously the folder=None branch
    # assigned the target path to mfcc_file_name, leaving target_file_name
    # undefined and making the default call fail.
    subdir = '' if folder is None else folder + '/'
    stem = abbr + '_' + str(size) + 'n_'
    mfcc_file_name = '../data/mfcc/' + subdir + stem + str(n_mfcc) + 'mfcc.pkl'
    target_file_name = '../data/target/' + subdir + stem + 'target.pkl'

    with open(mfcc_file_name, 'wb') as f:
        pickle.dump(mfcc_arr, f)
    with open(target_file_name, 'wb') as g:
        pickle.dump(target_arr, g)

    # will be able to load those, concat alphabetically to preserve matches
    return mfcc_file_name, target_file_name

In [24]:
def save_all(abbr_arr, n_mfcc=20, folder=None):
    """Run ``batch_mfcc_save`` once for each language abbreviation.

    Parameters
    ----------
    abbr_arr : array-like of str
        Language abbreviations to process; a single string is treated as one
        abbreviation.
    n_mfcc : int, optional
        Number of MFCC coefficients, forwarded to ``batch_mfcc_save``.
    folder : str, optional
        Sub-folder, forwarded to ``batch_mfcc_save``.

    Returns
    -------
    None
        The files written by ``batch_mfcc_save`` are the only result.
    """
    # Plain loop instead of np.vectorize: vectorize calls the function an
    # extra time on the first element to infer the output dtype, which ran
    # the whole batch for the first language twice. atleast_1d + ravel keep
    # scalar and n-d inputs working as they did under vectorize.
    for abbr in np.atleast_1d(abbr_arr).ravel():
        batch_mfcc_save(str(abbr), n_mfcc, folder)

In [7]:
%%time
# Default run for 'en': n_mfcc=20, no sub-folder. The returned
# (mfcc path, target path) pair is echoed as the cell output.
# NOTE(review): this cell's execution count is lower than that of the cells
# defining the functions, so the recorded output is from an earlier run.
batch_mfcc_save('en')

CPU times: user 2min 57s, sys: 6.23 s, total: 3min 3s
Wall time: 1min 52s


('../data/mfcc/en_12130n_20mfcc.pkl', '../data/target/en_12130n_target.pkl')

In [8]:
%%time
# Default run for 'zh': n_mfcc=20, no sub-folder; output paths are echoed.
batch_mfcc_save('zh')

CPU times: user 2min 43s, sys: 5.83 s, total: 2min 48s
Wall time: 1min 45s


('../data/mfcc/zh_11067n_20mfcc.pkl', '../data/target/zh_11067n_target.pkl')

In [9]:
%%time
# Default run for 'fr': n_mfcc=20, no sub-folder; output paths are echoed.
batch_mfcc_save('fr')

CPU times: user 2min 48s, sys: 6.03 s, total: 2min 54s
Wall time: 1min 50s


('../data/mfcc/fr_10894n_20mfcc.pkl', '../data/target/fr_10894n_target.pkl')

In [10]:
%%time
# Default run for 'es': n_mfcc=20, no sub-folder; output paths are echoed.
batch_mfcc_save('es')

CPU times: user 2min 44s, sys: 5.78 s, total: 2min 50s
Wall time: 1min 41s


('../data/mfcc/es_11763n_20mfcc.pkl', '../data/target/es_11763n_target.pkl')

In [11]:
%%time
# Default run for 'ru': n_mfcc=20, no sub-folder; output paths are echoed.
batch_mfcc_save('ru')

CPU times: user 2min 45s, sys: 5.69 s, total: 2min 50s
Wall time: 1min 41s


('../data/mfcc/ru_11848n_20mfcc.pkl', '../data/target/ru_11848n_target.pkl')

In [7]:
# Repeat the five languages with n_mfcc=5 (second positional argument).
# Only the last call's return value is echoed as the cell output.
batch_mfcc_save('en', 5)
batch_mfcc_save('zh', 5)
batch_mfcc_save('fr', 5)
batch_mfcc_save('es', 5)
batch_mfcc_save('ru', 5)

('../data/mfcc/ru_11848n_5mfcc.pkl', '../data/target/ru_11848n_target.pkl')

In [25]:
# Process the 'validation' sub-folder of all five languages with n_mfcc=10;
# outputs go under ../data/mfcc/validation/ and ../data/target/validation/.
save_all(np.array(['en', 'es', 'fr', 'ru', 'zh']), 10, 'validation')