In [1]:
import glob
import os
import numpy as np
from sklearn.model_selection import train_test_split
from tqdm import tqdm
import augmentation
import scipy
import librosa

In [2]:
wavs = glob.glob('data/*.wav')
train_X, test_X = train_test_split(wavs, test_size = 0.2)
len(train_X), len(test_X)

(2240, 560)

In [3]:
if not os.path.exists('augment'):
    os.makedirs('augment')

In [None]:
for i in tqdm(range(len(train_X))):
    wav = train_X[i]
    try:
        root, ext = os.path.splitext(wav)
        root = root.split('/')[1]
        root = '%s/%s' % ('augment', root)
        sample_rate, samples = scipy.io.wavfile.read(wav)
        aug = augmentation.change_pitch_speech(samples)
        librosa.output.write_wav(
            '%s-1%s' % (root, ext),
            aug.astype('float32'),
            sample_rate,
            norm = True,
        )
        aug = augmentation.change_amplitude(samples)
        librosa.output.write_wav(
            '%s-2%s' % (root, ext),
            aug.astype('float32'),
            sample_rate,
            norm = True,
        )

        aug = augmentation.add_noise(samples)
        librosa.output.write_wav(
            '%s-3%s' % (root, ext),
            aug.astype('float32'),
            sample_rate,
            norm = True,
        )

        aug = augmentation.add_hpss(samples)
        librosa.output.write_wav(
            '%s-4%s' % (root, ext),
            aug.astype('float32'),
            sample_rate,
            norm = True,
        )

        aug = augmentation.strech(samples)
        librosa.output.write_wav(
            '%s-5%s' % (root, ext),
            aug.astype('float32'),
            sample_rate,
            norm = True,
        )

        aug = augmentation.random_augmentation(samples)
        librosa.output.write_wav(
            '%s-6%s' % (root, ext),
            aug.astype('float32'),
            sample_rate,
            norm = True,
        )
    except:
        pass


In [9]:
import soundfile

sampling_rate = 22050
n_fft = 2048
frame_shift = 0.0125
frame_length = 0.05
hop_length = int(sampling_rate * frame_shift)
win_length = int(sampling_rate * frame_length)
n_mels = 80
reduction_factor = 5

def compute_spectrogram_feature(
    samples,
    sample_rate = 16000,
    stride_ms = 10.0,
    window_ms = 20.0,
    max_freq = None,
    eps = 1e-14,
):
    if max_freq is None:
        max_freq = sample_rate / 2
    if max_freq > sample_rate / 2:
        raise ValueError(
            'max_freq must not be greater than half of sample rate.'
        )

    if stride_ms > window_ms:
        raise ValueError('Stride size must not be greater than window size.')

    stride_size = int(0.001 * sample_rate * stride_ms)
    window_size = int(0.001 * sample_rate * window_ms)

    # Extract strided windows
    truncate_size = (len(samples) - window_size) % stride_size
    samples = samples[: len(samples) - truncate_size]
    nshape = (window_size, (len(samples) - window_size) // stride_size + 1)
    nstrides = (samples.strides[0], samples.strides[0] * stride_size)
    windows = np.lib.stride_tricks.as_strided(
        samples, shape = nshape, strides = nstrides
    )
    assert np.all(
        windows[:, 1] == samples[stride_size : (stride_size + window_size)]
    )

    # Window weighting, squared Fast Fourier Transform (fft), scaling
    weighting = np.hanning(window_size)[:, None]
    fft = np.fft.rfft(windows * weighting, axis = 0)
    fft = np.absolute(fft)
    fft = fft ** 2
    scale = np.sum(weighting ** 2) * sample_rate
    fft[1:-1, :] *= 2.0 / scale
    fft[(0, -1), :] /= scale
    # Prepare fft frequency list
    freqs = float(sample_rate) / window_size * np.arange(fft.shape[0])

    # Compute spectrogram feature
    ind = np.where(freqs <= max_freq)[0][-1] + 1
    specgram = np.log(fft[:ind, :] + eps)
    return np.transpose(specgram, (1, 0))


def get_spectrogram(fpath):
    y, sr = librosa.load(fpath, sr = sampling_rate)
    D = librosa.stft(
        y = y, n_fft = n_fft, hop_length = hop_length, win_length = win_length
    )
    magnitude = np.abs(D)
    power = magnitude ** 2
    S = librosa.feature.melspectrogram(S = power, n_mels = n_mels)
    return np.transpose(S.astype(np.float32))


def reduce_frames(x, r_factor):
    T, C = x.shape
    num_paddings = reduction_factor - (T % r_factor) if T % r_factor != 0 else 0
    padded = np.pad(x, [[0, num_paddings], [0, 0]], 'constant')
    return np.reshape(padded, (-1, C * r_factor))

In [10]:
spectrogram = get_spectrogram(wavs[0])
spectrogram = reduce_frames(spectrogram, reduction_factor)
spectrogram.shape

(33, 400)

In [11]:
augments = glob.glob('augment/*.wav')

In [12]:
data, _ = soundfile.read(augments[1])
spectrogram = compute_spectrogram_feature(data)
spectrogram

array([[-19.36971185, -16.57376858, -18.71725863, ..., -23.41501521,
        -23.68784775, -24.03949327],
       [-15.61767618, -16.41841276, -19.80099433, ..., -20.95772327,
        -21.39394458, -24.18905672],
       [-15.02933384, -15.46920427, -19.5143906 , ..., -20.94875192,
        -21.02911394, -21.28899505],
       ...,
       [-32.2361913 , -32.2361913 , -32.2361913 , ..., -32.2361913 ,
        -32.2361913 , -32.2361913 ],
       [-32.2361913 , -32.2361913 , -32.2361913 , ..., -32.2361913 ,
        -32.2361913 , -32.2361913 ],
       [-32.2361913 , -32.2361913 , -32.2361913 , ..., -32.2361913 ,
        -32.2361913 , -32.2361913 ]])

In [15]:
if not os.path.exists('spectrogram-train'):
    os.mkdir('spectrogram-train')

if not os.path.exists('spectrogram-test'):
    os.mkdir('spectrogram-test')

In [16]:
from tqdm import tqdm

for i in tqdm(range(len(train_X))):
    i = train_X[i]
    loc = 'spectrogram-train/%s.npy'%(os.path.basename(i).split('.')[0])
    
    spectrogram = get_spectrogram(i)
    spectrogram = reduce_frames(spectrogram, reduction_factor)
    np.save(loc, spectrogram)
    
for i in tqdm(range(len(augments))):
    i = augments[i]
    loc = 'spectrogram-train/%s.npy'%(os.path.basename(i).split('.')[0])
    spectrogram = get_spectrogram(i)
    spectrogram = reduce_frames(spectrogram, reduction_factor)
    np.save(loc, spectrogram)

100%|██████████| 2240/2240 [02:26<00:00, 16.25it/s]
100%|██████████| 13642/13642 [15:26<00:00, 13.94it/s]


In [17]:
for i in tqdm(range(len(test_X))):
    i = test_X[i]
    loc = 'spectrogram-test/%s.npy'%(os.path.basename(i).split('.')[0])
    spectrogram = get_spectrogram(i)
    spectrogram = reduce_frames(spectrogram, reduction_factor)
    np.save(loc, spectrogram)

100%|██████████| 560/560 [00:36<00:00, 15.41it/s]
