In [1]:
import pickle
import warnings
from os import listdir

import librosa
import numpy as np

In [2]:
# https://towardsdatascience.com/understanding-audio-data-fourier-transform-fft-spectrogram-and-speech-recognition-a4072d228520
def spectrogram(samples, sample_rate, stride_ms = 10.0, 
                          window_ms = 20.0, max_freq = 8000, eps = 1e-14):
    """Compute a log power spectrogram of a 1-D signal.

    The signal is cut into overlapping windows of ``window_ms`` milliseconds
    taken every ``stride_ms`` milliseconds. Each window is Hann-weighted,
    transformed with a real FFT, squared and scaled, and the natural log of
    the result (plus ``eps``) is returned.

    Parameters
    ----------
    samples : array-like, 1-D
        The signal. Trailing samples that do not fill a whole stride are dropped.
    sample_rate : int or float
        Samples per second of ``samples``.
    stride_ms : float
        Hop between consecutive windows, in milliseconds.
    window_ms : float
        Length of each window, in milliseconds.
    max_freq : float
        Frequency bins above this value (in Hz) are discarded.
    eps : float
        Added before the log so that zero power does not produce ``-inf``.

    Returns
    -------
    numpy.ndarray
        Array of shape ``(n_freq_bins, n_windows)``.

    Raises
    ------
    ValueError
        If ``samples`` is not 1-D, if the window or stride is shorter than one
        sample, or if ``samples`` is shorter than one window.
    """
    samples = np.asarray(samples)
    if samples.ndim != 1:
        raise ValueError('samples must be a 1-D array')

    stride_size = int(0.001 * sample_rate * stride_ms)
    window_size = int(0.001 * sample_rate * window_ms)
    if stride_size < 1 or window_size < 1:
        raise ValueError('stride_ms and window_ms must each span at least one sample')
    if len(samples) < window_size:
        raise ValueError(
            f'need at least {window_size} samples for one window, got {len(samples)}')

    # Extract strided windows: drop the tail that does not fill a whole stride,
    # then view the signal as a (window_size, n_windows) matrix without copying.
    truncate_size = (len(samples) - window_size) % stride_size
    samples = samples[:len(samples) - truncate_size]
    n_windows = (len(samples) - window_size) // stride_size + 1
    item_stride = samples.strides[0]
    windows = np.lib.stride_tricks.as_strided(
        samples,
        shape = (window_size, n_windows),
        strides = (item_stride, item_stride * stride_size),
        writeable = False)  # overlapping view; never write through it

    # Sanity check of the strided view; only possible when a second window exists.
    if n_windows > 1:
        assert np.all(windows[:, 1] == samples[stride_size:(stride_size + window_size)])

    # Window weighting, squared Fast Fourier Transform (fft), scaling
    weighting = np.hanning(window_size)[:, None]

    fft = np.fft.rfft(windows * weighting, axis=0)
    fft = np.absolute(fft)
    fft = fft**2

    # Interior bins are doubled; the first and last rfft bins are treated as
    # unpaired (DC / Nyquist), which holds for an even window_size.
    scale = np.sum(weighting**2) * sample_rate
    fft[1:-1, :] *= (2.0 / scale)
    fft[(0, -1), :] /= scale

    # Prepare fft frequency list
    freqs = float(sample_rate) / window_size * np.arange(fft.shape[0])

    # Compute spectrogram feature: keep bins up to and including max_freq
    ind = np.where(freqs <= max_freq)[0][-1] + 1
    specgram = np.log(fft[:ind, :] + eps)
    return specgram

In [3]:
def audio_file_listing(source, lang_abbr):
    """List the mp3 files of one source/language as 'NNN___path' strings.

    Reads the directory '../audio/<source>/<lang_abbr>/'. NNN is the
    zero-padded position of the file in the raw directory listing (counted
    before filtering, so numbers may skip), followed by '___' and the
    relative path of the file.

    Only names that END in '.mp3' are kept; a substring match would also pick
    up entries such as 'clip.mp3.part'.
    """
    base_path = '../audio/' + source + '/' + lang_abbr + '/'
    return [
        f'{i:03}' + '___' + base_path + f
        for i, f in enumerate(listdir(base_path))
        if f.endswith('.mp3')
    ]

In [4]:
def timeseries_file_listing(lang_abbr):
    """Return the relative paths of the pickled clips for one language.

    Reads '../data/timeseries/<lang_abbr>/' and keeps only names that END in
    '.pkl'; a substring match would also pick up entries such as
    'clip.pkl.tmp'.
    """
    base_path = '../data/timeseries/' + lang_abbr + '/'
    return [base_path + f for f in listdir(base_path) if f.endswith('.pkl')]

In [5]:
def split_and_save(fname):
    """Load one audio file, trim silence, and pickle it as 5-second clips.

    Parameters
    ----------
    fname : str
        String of the form 'n___f', where n is the sample number and f is the
        file path in the format '../audio/n_source/lang/sample.mp3'. The path
        itself may contain '___'; only the first occurrence separates n from f.

    Each full 5-second clip is written to
    '../data/timeseries/<lang>/<lang><source><n:03><clip:03>.pkl'; a trailing
    remainder shorter than 5 seconds is discarded. Returns None.

    Failures to load a file or to write a clip are reported with a warning and
    otherwise skipped, so one bad file does not abort a batch run.
    """
    # split at the first '___' only, so paths containing '___' stay intact
    index_text, _, path = fname.partition('___')
    # plain int(): the np.int alias was removed from NumPy
    n = int(index_text)

    try:
        # load the full sample, resampled to 16 kHz
        full, sr = librosa.load(path, sr=16000)
        # trim leading/trailing silence; the second positional argument of
        # trim() is top_db, not the sample rate, so the default threshold is used
        trim, _ = librosa.effects.trim(full)
    except Exception as exc:
        warnings.warn(f'split_and_save: could not load {path!r}: {exc!r}')
        return

    # splitting path from format '../audio/n_source/lang/sample.mp3'
    split = path.split('/')
    source = split[2].split('_')[0]
    lang = split[3]

    # cut sample into 5 second clips and save
    clip_len = 5 * int(sr)
    for i in range(len(trim) // clip_len):
        sample = trim[i * clip_len : (i + 1) * clip_len]
        save_name = f'../data/timeseries/{lang}/{lang}{source}{n:03}{i:03}.pkl'
        try:
            with open(save_name, 'wb') as f:
                pickle.dump(sample, f)
        except Exception as exc:
            warnings.warn(f'split_and_save: could not write {save_name!r}: {exc!r}')

In [6]:
def timeseries_to_spectrogram(path, sample_rate=16000):
    """Convert one pickled clip into a pickled spectrogram.

    Parameters
    ----------
    path : str
        Path of the input pickle, in the form '../data/timeseries/lang/file.pkl'.
        The output path is the same string with 'timeseries' replaced by
        'spectrogram'.
    sample_rate : int
        Sample rate passed to ``spectrogram``; defaults to 16000.

    Returns None. Any failure is reported with a warning and the file is
    skipped, so one bad file does not abort a batch run.
    """
    # path comes in as ../data/timeseries/lang/file.pkl
    new_path = path.replace('timeseries', 'spectrogram')

    try:
        with open(path, 'rb') as f:
            # NOTE(security): pickle.load can execute arbitrary code; only use
            # this on files written by this pipeline.
            reload = pickle.load(f)

        # compute before opening the output so a failure leaves no empty file
        spec = spectrogram(reload, sample_rate)

        with open(new_path, 'wb') as f:
            pickle.dump(spec, f)
    except Exception as exc:
        warnings.warn(f'timeseries_to_spectrogram: skipped {path!r}: {exc!r}')

In [7]:
# English mp3 listing from the '1_audiolingua' source, as 'NNN___path' strings
en1 = np.asarray(audio_file_listing(source='1_audiolingua', lang_abbr='en'))

In [8]:
# Inspect the listing: each entry is '<index>___<relative path to the mp3>'
en1

array(['000___../audio/1_audiolingua/en/hello_my_name_is_samantha.mp3',
       '001___../audio/1_audiolingua/en/the_great_gatsby_favourite_book.mp3',
       '002___../audio/1_audiolingua/en/sophie_-_what_i_like_to_wear.mp3',
       '003___../audio/1_audiolingua/en/binge-watching_series_on_netflix_renee.mp3',
       '004___../audio/1_audiolingua/en/jason_if.mp3',
       '005___../audio/1_audiolingua/en/photojournalism___jordan.mp3',
       '006___../audio/1_audiolingua/en/sean.mp3',
       '007___../audio/1_audiolingua/en/india.mp3',
       '008___../audio/1_audiolingua/en/sophia_-_queen-2.mp3',
       '009___../audio/1_audiolingua/en/gwen.mp3',
       '010___../audio/1_audiolingua/en/fiona_sport.mp3',
       '011___../audio/1_audiolingua/en/emily_-_childhood_in_australia.mp3',
       '012___../audio/1_audiolingua/en/job_interview_-1.mp3',
       '013___../audio/1_audiolingua/en/sarah_introduces_herself.mp3',
       '014___../audio/1_audiolingua/en/rachel_s_latest_movie.mp3',
       '01

In [18]:
# Chinese mp3 listing from the '1_audiolingua' source, as 'NNN___path' strings
zh1 = np.asarray(audio_file_listing(source='1_audiolingua', lang_abbr='zh'))

In [31]:
# Element-wise wrapper so a whole listing array can be processed in one call.
# otypes is given explicitly: without it np.vectorize calls the function an
# extra time on the first element to infer the output type (writing that
# file's clips twice) and refuses empty input.
v_split_and_save = np.vectorize(split_and_save, otypes=[object])

In [40]:
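# One-off preprocessing step, left disabled (presumably because the clips already
# exist on disk). Uncomment to regenerate the 5-second English clips:
# v_split_and_save(en1)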

In [41]:
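# One-off preprocessing step, left disabled (presumably because the clips already
# exist on disk). Uncomment to regenerate the 5-second Chinese clips:
# v_split_and_save(zh1)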

In [57]:
# Paths of the pickled clips for each language
en1_t = np.asarray(timeseries_file_listing(lang_abbr='en'))
zh1_t = np.asarray(timeseries_file_listing(lang_abbr='zh'))

In [58]:
# Element-wise wrapper so a whole array of clip paths can be converted in one call.
# otypes is given explicitly: without it np.vectorize calls the function an
# extra time on the first element to infer the output type (converting that
# clip twice) and refuses empty input.
v_timeseries_to_spectrogram = np.vectorize(timeseries_to_spectrogram, otypes=[object])

In [59]:
# Convert every English clip to a spectrogram pickle. The function returns
# None per file, so the displayed array carries no information; the results
# are the files written under the 'spectrogram' directory.
v_timeseries_to_spectrogram(en1_t)

array([None, None, None, ..., None, None, None], dtype=object)

In [60]:
%%time
# Convert every Chinese clip to a spectrogram pickle (timed). The returned
# array of None carries no information; the results are the files on disk.
v_timeseries_to_spectrogram(zh1_t)

CPU times: user 10.8 s, sys: 3.28 s, total: 14.1 s
Wall time: 31.7 s


array([None, None, None, ..., None, None, None], dtype=object)