In [3]:
import librosa
import numpy as np
import pickle
from os import listdir

In [5]:
def timeseries_file_listing(lang_abbr):
    """Return the pickled time-series file paths for one language.

    Parameters
    ----------
    lang_abbr : str
        Language abbreviation; it names the sub-directory of
        ``../data/timeseries/`` that is listed.

    Returns
    -------
    numpy.ndarray
        1-D array of path strings (``base_path + file name``) for every file
        whose name ends in ``.pkl``, in sorted file-name order.

    Raises
    ------
    OSError
        Propagated from ``listdir`` when the directory cannot be listed.
    """
    base_path = '../data/timeseries/' + lang_abbr + '/'
    # listdir returns entries in arbitrary order; sort so the listing (and any
    # array built from it) is reproducible across runs and machines.
    # endswith avoids matching names that merely contain '.pkl' (e.g. 'x.pkl.bak').
    pkl_names = sorted(name for name in listdir(base_path) if name.endswith('.pkl'))
    # Explicit string dtype keeps the result a string array even when empty.
    return np.array([base_path + name for name in pkl_names], dtype=str)

In [6]:
def timeseries_to_mfcc(path, array, index, n_mfcc=20, sr=16000):
    """Load one pickled time series and store its MFCC matrix in ``array[index]``.

    Parameters
    ----------
    path : str
        Path of the pickle file holding the audio time series.
    array : indexable container (e.g. numpy.ndarray)
        Pre-allocated output; it is modified in place at position ``index``.
    index : int
        Position in ``array`` that receives the MFCC matrix.
    n_mfcc : int, optional
        Number of MFCC coefficients requested from librosa (default 20).
    sr : int, optional
        Sampling rate handed to librosa. Defaults to 16000, the value that
        used to be hard-coded here.

    Returns
    -------
    None
        The result is written into ``array``.

    Raises
    ------
    ValueError
        If the MFCC matrix cannot be stored in ``array[index]`` (for example
        because its shape does not match the pre-allocated slot).
    """
    # SECURITY NOTE: pickle.load can execute arbitrary code embedded in the
    # file; only point this at pickles you created yourself.
    with open(path, 'rb') as f:
        reloaded = pickle.load(f)

    # Call the documented entry point librosa.feature.mfcc rather than reaching
    # it through librosa.effects, and pass the signal as the keyword ``y``:
    # newer librosa releases no longer accept it positionally.
    mfcc = librosa.feature.mfcc(y=reloaded, sr=sr, n_mfcc=n_mfcc)

    try:
        array[index] = mfcc
    except ValueError as exc:
        # Re-raise with the offending file named, so one bad clip in a large
        # batch can be identified.
        raise ValueError(
            'cannot store MFCC of shape {} from {!r} at index {}: {}'.format(
                np.shape(mfcc), path, index, exc)
        ) from exc

In [12]:
def batch_mfcc_save(abbr, n_mfcc=20, n_frames=157):
    """Compute MFCCs for every time series of one language and pickle them.

    Parameters
    ----------
    abbr : str
        Language abbreviation; selects the input files through
        ``timeseries_file_listing`` and is used as the label for every row.
    n_mfcc : int, optional
        Number of MFCC coefficients per frame (default 20).
    n_frames : int, optional
        Number of frames pre-allocated per clip. Defaults to 157, the value
        that used to be hard-coded here.

    Returns
    -------
    tuple of (str, str)
        Paths of the pickled MFCC array and of the pickled target array.

    Raises
    ------
    ValueError
        If no time-series files are found for ``abbr``.
    """
    # timeseries_file_listing creates array of path names
    path_arr = timeseries_file_listing(abbr)

    size = len(path_arr)
    if size == 0:
        # Fail early and clearly, before anything is written to disk.
        raise ValueError('no time-series files found for language ' + repr(abbr))

    # one (n_mfcc, n_frames) slot per file, filled in place below
    mfcc_arr = np.zeros((size, n_mfcc, n_frames))

    # A plain loop instead of np.vectorize: vectorize is meant for functions
    # that return values, and without ``otypes`` it makes an extra probing
    # call on the first element, so the first file was processed twice.
    for index, path in enumerate(path_arr):
        timeseries_to_mfcc(path, mfcc_arr, index, n_mfcc=n_mfcc)

    # one label per row, all equal to the language abbreviation
    target_arr = np.full(size, abbr)

    # save both using language name and number of elements
    mfcc_file_name = '../data/mfcc/' + abbr + str(size) + '_mfcc.pkl'
    with open(mfcc_file_name, 'wb') as f:
        pickle.dump(mfcc_arr, f)

    # NOTE(review): the suffix is '_mfcc' here as well, not '_target'; the two
    # files are only told apart by directory. Kept as-is because other code
    # may load this file by name -- confirm before renaming.
    target_file_name = '../data/target/' + abbr + str(size) + '_mfcc.pkl'
    with open(target_file_name, 'wb') as g:
        pickle.dump(target_arr, g)

    # will be able to load those, concat alphabetically to preserve matches
    return mfcc_file_name, target_file_name