#### Data Processing: Part 1

This notebook contains the functions used to transform the audio files from the first four sources into time series and save them for further processing.

In [None]:
# ignoring of warnings to surpress librosa's notification about using audioreader
import warnings
warnings.filterwarnings('ignore')

In [None]:
import os
import pickle
from os import listdir

import librosa
import numpy as np

In [None]:
def audio_file_listing(source, lang_abbr):
    """
    Lists the path names for all the audio files for a given source and language.

    Directory entries are sorted by name before they are numbered, so a given file
    receives the same sample number on every run and on every machine.

    Parameters:
        source (str) : the folder name where a source's audio is saved
        lang_abbr (str) : the two letter abbreviation of the language to list audio samples of

    Returns:
        file_listing (numpy array) : one string per '.mp3' / '.wav' file, of the format 'n___f' where
            n is the zero-padded position of the file in the sorted directory listing and f is
            the file path; the array is empty (string dtype) when no audio files are found
    """
    base_path = '../audio/' + source + '/' + lang_abbr + '/'

    # listdir returns entries in arbitrary order; sorting keeps the numbering reproducible
    entries = sorted(listdir(base_path))

    # the index counts every directory entry (not only audio files), so non-audio entries
    # leave gaps in the numbering; only names that end in an audio extension are kept
    audio_extensions = ('.mp3', '.wav')
    return np.array(
        [f'{i:03}___{base_path}{f}' for i, f in enumerate(entries) if f.endswith(audio_extensions)],
        dtype=str)

In [None]:
def split_and_save(fname):
    """
    Saves a given audio sample into timeseries format by either segmenting it into 5 sec pieces or padding it to reach 5 sec

    The audio is loaded at 16 kHz and leading/trailing silence is trimmed. Samples whose source
    folder is '2_everytongue' additionally have their first and last 5 seconds removed. A sample
    shorter than 5 seconds is zero-padded evenly on both sides; a longer one is cut into consecutive
    5 second pieces and any remainder shorter than 5 seconds is discarded. Every piece is pickled to
    '../data/timeseries/{lang}/{lang}{source}{n:03}{i:03}.pkl' (n = sample number, i = piece index);
    the language folder is created if it does not exist yet.

    Processing is best-effort: a file that cannot be parsed, loaded or saved is reported with a
    printed message and skipped, and a sample with no audio left after trimming is skipped.

    Parameters:
        fname (str) : a string of format 'n___f' where n is the sample number and f is the file
            path of the audio file to save, in the format '../audio/n_source/lang/sample.mp3'

    Returns:
        None

    Raises:
        ValueError : if the part of fname before the first '___' is not an integer
    """
    target_sr = 16000
    clip_seconds = 5

    # input is a string of format 'n___f'; only the first '___' separates n from the path
    n, _, path = fname.partition('___')
    n = int(n)  # np.int was removed from NumPy, the builtin int is the replacement

    # splitting path from format '../audio/n_source/lang/sample.mp3'
    parts = path.split('/')
    if len(parts) < 4:
        print(f'skipping {path!r}: path is not of the format ../audio/n_source/lang/sample')
        return
    source_dir = parts[2]
    source = source_dir.split('_')[0]
    lang = parts[3]

    try:
        # load the full sample
        full, sr = librosa.load(path, sr=target_sr)

        # trim silence with librosa's default threshold; the sample rate must not be passed
        # here because the second argument of trim is the silence threshold (top_db)
        trim, _ = librosa.effects.trim(full)

        # everytongue samples need to have music intros/outros removed
        if source_dir == '2_everytongue':
            trim = trim[clip_seconds * sr : -clip_seconds * sr]
    except Exception as err:
        # Exception (not a bare except) so that interrupting the kernel still works
        print(f'skipping {path!r}: could not load audio ({err!r})')
        return

    # nothing left to save (e.g. an everytongue sample of 10 seconds or less)
    if len(trim) == 0:
        print(f'skipping {path!r}: no audio left after trimming')
        return

    clip_len = clip_seconds * sr
    if len(trim) < clip_len:
        # pad evenly on both sides; an odd remainder goes to the end
        missing = clip_len - len(trim)
        pieces = [np.pad(trim, [missing // 2, missing - missing // 2])]
    else:
        # cut sample into 5 second clips, dropping the incomplete clip at the end
        pieces = [trim[start : start + clip_len]
                  for start in range(0, len(trim) - clip_len + 1, clip_len)]

    out_dir = f'../data/timeseries/{lang}'
    try:
        os.makedirs(out_dir, exist_ok=True)
    except OSError as err:
        print(f'skipping {path!r}: could not create {out_dir!r} ({err!r})')
        return

    for i, sample in enumerate(pieces):
        save_name = f'{out_dir}/{lang}{source}{n:03}{i:03}.pkl'
        try:
            with open(save_name, 'wb') as f:
                pickle.dump(sample, f)
        except Exception as err:
            # keep going so that one failed write does not lose the remaining pieces
            print(f'could not save {save_name!r} ({err!r})')

In [None]:
# otypes is given explicitly so that np.vectorize does not make an extra call to split_and_save
# on the first path just to infer the output type (which would process that file twice), and so
# that an empty listing is accepted instead of raising a ValueError
v_split_and_save = np.vectorize(split_and_save, otypes=[object])

In [None]:
# path listings for source 1 (audiolingua), one per language code
en1, zh1 = (audio_file_listing('1_audiolingua', lang) for lang in ('en', 'zh'))

In [None]:
# process every audiolingua listing; looping (instead of ending the cell on a bare call)
# also keeps the cell from echoing an array of None values
for listing in (en1, zh1):
    v_split_and_save(listing)

In [None]:
# path listings for source 3 (omniglot), one per language code
en3, fr3, ru3, es3, zh3 = (
    audio_file_listing('3_omniglot', lang) for lang in ('en', 'fr', 'ru', 'es', 'zh'))

In [None]:
# process every omniglot listing; looping (instead of ending the cell on a bare call)
# also keeps the cell from echoing an array of None values
for listing in (en3, fr3, ru3, es3, zh3):
    v_split_and_save(listing)

In [None]:
# path listings for source 4 (voxforge), one per language code
en4, fr4, ru4, es4 = (
    audio_file_listing('4_voxforge', lang) for lang in ('en', 'fr', 'ru', 'es'))

In [None]:
# process every voxforge listing; looping (instead of ending the cell on a bare call)
# also keeps the cell from echoing an array of None values
for listing in (en4, fr4, ru4, es4):
    v_split_and_save(listing)

In [None]:
# path listings for source 2 (everytongue), one per language code
zh2, es2, fr2, ru2 = (
    audio_file_listing('2_everytongue', lang) for lang in ('zh', 'es', 'fr', 'ru'))

In [None]:
# process every everytongue listing; looping (instead of ending the cell on a bare call)
# also keeps the cell from echoing an array of None values
for listing in (zh2, es2, fr2, ru2):
    v_split_and_save(listing)