In [1]:
import librosa
import numpy as np
import pickle
from os import listdir


In [2]:
def timeseries_file_listing(lang_abbr):
    """Return the paths of the pickled time series stored for one language.

    Parameters
    ----------
    lang_abbr : str
        Language abbreviation; names the sub-directory of
        ``../data/timeseries/`` that is scanned.

    Returns
    -------
    numpy.ndarray
        One path (``base_path + file name``) per directory entry whose name
        ends in ``.pkl``, in sorted order.

    Raises
    ------
    OSError
        Propagated from ``listdir`` (e.g. ``FileNotFoundError`` when the
        language directory does not exist).
    """
    base_path = '../data/timeseries/' + lang_abbr + '/'
    # Match on the suffix: a substring test would also accept names that
    # merely contain '.pkl', such as 'clip.pkl.bak'.
    # listdir() returns entries in arbitrary order, so sort to make the row
    # order of everything derived from this listing reproducible.
    file_names = sorted(f for f in listdir(base_path) if f.endswith('.pkl'))
    return np.array([base_path + f for f in file_names])

In [7]:
def timeseries_to_mfcc(path, array, index, n_mfcc=20, sr=16000):
    """Load one pickled time series and store its MFCC matrix in ``array[index]``.

    Parameters
    ----------
    path : str
        Pickle file holding the time series to transform.
    array : numpy.ndarray
        Pre-allocated output; ``array[index]`` is overwritten in place.
    index : int
        Slot of ``array`` that receives the MFCC matrix.
    n_mfcc : int, optional
        Number of coefficients requested from librosa (default 20).
    sr : int, optional
        Sampling rate handed to librosa (default 16000, the value that was
        previously hard-coded).

    Returns
    -------
    None
        The result is written into ``array``; nothing is returned.

    Notes
    -----
    The assignment into ``array[index]`` fails with a ``ValueError`` if the
    MFCC matrix does not match (or broadcast to) the slot's shape.
    """
    # NOTE(security): pickle.load can execute arbitrary code; only point this
    # at files produced by this project.
    with open(path, 'rb') as f:
        signal = pickle.load(f)
    # Call mfcc through its public namespace (librosa.feature) rather than via
    # librosa.effects, and pass the signal by keyword: recent librosa releases
    # make `y` keyword-only.
    array[index] = librosa.feature.mfcc(y=signal, sr=sr, n_mfcc=n_mfcc)

In [4]:
# input is language abbreviation, (optional n_mfcc)

def batch_mfcc_save(abbr, n_mfcc=20, n_frames=157):
    """Compute MFCCs for every time series of one language and pickle the results.

    Parameters
    ----------
    abbr : str
        Language abbreviation; selects the input files and is used both as
        the target label and in the output file names.
    n_mfcc : int, optional
        Number of MFCC coefficients per frame (default 20).
    n_frames : int, optional
        Number of frames each MFCC matrix is expected to have (default 157,
        the value that was previously hard-coded).
        NOTE(review): 157 presumably corresponds to 5-second clips at 16 kHz
        with librosa's default hop length -- confirm against the data.

    Returns
    -------
    tuple of (str, str)
        Paths of the pickled MFCC array, shaped ``(n_files, n_mfcc, n_frames)``,
        and of the pickled target array, which holds ``abbr`` once per file.

    Notes
    -----
    Writes into ``../data/mfcc/`` and ``../data/target/``; both directories
    must already exist.
    """
    # timeseries_file_listing creates array of path names
    path_arr = timeseries_file_listing(abbr)

    # use length of array (and n_mfcc / n_frames) to create array of zeros
    size = len(path_arr)
    mfcc_arr = np.zeros((size, n_mfcc, n_frames))

    # Fill one slot per file with a plain loop. np.vectorize is unsuitable for
    # side-effecting calls: without `otypes` it invokes the function an extra
    # time on the first element (processing the first file twice) and it
    # raises on an empty listing.
    for index, path in enumerate(path_arr):
        timeseries_to_mfcc(path, mfcc_arr, index, n_mfcc=n_mfcc)

    # use length of array and lang abbr to create target array
    target_arr = np.full(size, abbr)

    # save both using language name and number of elements (+ mfcc or + target)
    mfcc_file_name = '../data/mfcc/' + abbr + str(size) + '_mfcc.pkl'
    with open(mfcc_file_name, 'wb') as f:
        pickle.dump(mfcc_arr, f)

    target_file_name = '../data/target/' + abbr + str(size) + '_target.pkl'
    with open(target_file_name, 'wb') as g:
        pickle.dump(target_arr, g)

    # will be able to load those, concat alphabetically to preserve matches
    return mfcc_file_name, target_file_name

In [5]:
%%time
# Compute and pickle the MFCC and target arrays for the 'en' time series;
# the displayed tuple holds the two output file paths.
batch_mfcc_save('en')

CPU times: user 3min 15s, sys: 5.83 s, total: 3min 21s
Wall time: 2min 5s


('../data/mfcc/en12280_mfcc.pkl', '../data/target/en12280_target.pkl')

In [6]:
%%time
# Compute and pickle the MFCC and target arrays for the 'zh' time series;
# the displayed tuple holds the two output file paths.
batch_mfcc_save('zh')

CPU times: user 1min 5s, sys: 1.99 s, total: 1min 7s
Wall time: 45.1 s


('../data/mfcc/zh3544_mfcc.pkl', '../data/target/zh3544_target.pkl')

In [7]:
%%time
# Compute and pickle the MFCC and target arrays for the 'fr' time series;
# the displayed tuple holds the two output file paths.
batch_mfcc_save('fr')

CPU times: user 1min 46s, sys: 3.06 s, total: 1min 50s
Wall time: 1min 11s


('../data/mfcc/fr6175_mfcc.pkl', '../data/target/fr6175_target.pkl')

In [10]:
# batch_mfcc_save('ar')

('../data/mfcc/ar57_mfcc.pkl', '../data/target/ar57_target.pkl')

In [8]:
%%time
# Compute and pickle the MFCC and target arrays for the 'es' time series;
# the displayed tuple holds the two output file paths.
batch_mfcc_save('es')

CPU times: user 1min 54s, sys: 3.88 s, total: 1min 58s
Wall time: 1min 7s


('../data/mfcc/es8171_mfcc.pkl', '../data/target/es8171_target.pkl')

In [9]:
%%time
# Compute and pickle the MFCC and target arrays for the 'ru' time series;
# the displayed tuple holds the two output file paths.
batch_mfcc_save('ru')

CPU times: user 2min 3s, sys: 3.85 s, total: 2min 7s
Wall time: 1min 17s


('../data/mfcc/ru8081_mfcc.pkl', '../data/target/ru8081_target.pkl')