In [1]:
import glob
import os
import librosa
import matplotlib.pyplot as plt
import tensorflow as tf
import numpy as np
from matplotlib.pyplot import specgram
import time
%matplotlib inline
plt.style.use('ggplot')

# Figure typography for the whole notebook, applied in a single update so the
# settings can be read (and tuned) as one table.
plt.rcParams.update({
    'font.family': 'serif',
    'font.serif': 'Ubuntu',
    'font.monospace': 'Ubuntu Mono',
    'font.size': 12,
    'axes.labelsize': 11,
    'axes.labelweight': 'bold',
    'axes.titlesize': 12,
    'xtick.labelsize': 9,
    'ytick.labelsize': 9,
    'legend.fontsize': 11,
    'figure.titlesize': 13,
})

## Load Data

In [11]:
# NOTE: this cell is disabled. The commented-out code below uses Python 2
# print statements and calls `load_sounds`, which is not defined in the
# visible part of this notebook; the output shown under the cell is left over
# from an earlier run. The augmentation cells further down take their input
# from the .npy files loaded in a later cell, not from `sounds`.
# parent_dir = '../../data/UrbanSound8K/audio/'
# # sub_dirs= ['fold1','fold2', 'fold3', 'fold4', 'fold5', 'fold6', 'fold7', 'fold8','fold9', 'fold10']
# sub_dirs= ['fold1']
# print "---Extracting features.... ---"
# start_time = time.time()
# sounds, labels, file_names = load_sounds(parent_dir,sub_dirs)
# print "---Loading time: {0} seconds ---".format(time.time() - start_time)

---Extracting features.... ---
---Loading time: 142.936600924 seconds ---


In [3]:
# All Folds
# Clips, labels and source file names are read from the cached .npy files in
# one pass; allow_pickle=True is required by np.load for object arrays, so
# only load files you produced yourself.
train_x, train_y, train_files = [
    np.load(npy_path, allow_pickle=True)
    for npy_path in (
        "cnn_train_features_full.npy",
        "cnn_train_labels_full.npy",
        "cnn_train_file_names_full.npy",
    )
]

## Time Stretching

In [5]:
# Number of fold directories (fold1 .. fold<N_FOLDS>) that time_stretching and
# pitch_shifting write into; also used by them to size the per-fold chunks.
N_FOLDS = 10

In [4]:
# Rates handed one by one to librosa.effects.time_stretch
# (per librosa's docs, rate < 1 slows the clip down and rate > 1 speeds it up).
stretching_factors = [0.81, 0.93, 1.07, 1.23]
# Sample rate used when writing the augmented WAV files and for pitch shifting.
# assumes the cached clips were decoded at this rate -- TODO confirm
sr = 22050

In [40]:

def time_stretching(data, names, factors,
                    out_dir='../../data/UrbanSound8K/audio/time_stretching_audio'):
    """Write a time-stretched WAV copy of every clip for every stretch factor.

    Clips are split by position into consecutive chunks of
    ``floor(len(data) / (N_FOLDS - 1))`` items; chunk ``k`` is written to
    ``<out_dir>/fold<k+1>/``. Each output file is named
    ``<source stem>-ts-<factor index>.wav``.

    NOTE(review): the fold is derived from the clip's position, not from the
    fold it originally came from, and the divisor is ``N_FOLDS - 1`` -- confirm
    this matches how the cached arrays were built.

    Parameters
    ----------
    data : sequence
        Audio signals; ``data[idx]`` is passed to
        ``librosa.effects.time_stretch``.
        (presumably 1-D float waveforms sampled at ``sr`` -- TODO confirm)
    names : sequence of str
        Source file path of each clip, parallel to ``data``. The part after the
        last '/' and before the first '.' becomes the output stem.
    factors : sequence of float
        Stretch rates. The *index* of the rate (not its value) is embedded in
        the output file name.
    out_dir : str, optional
        Root directory that receives the ``fold<k>`` sub-directories. Defaults
        to the location the notebook has always used.

    Notes
    -----
    Reads the notebook-level globals ``N_FOLDS`` and ``sr``. Returns nothing;
    the only effect is the files written to disk.
    """
    # librosa.output was removed in librosa 0.8. Its write_wav delegated to
    # scipy.io.wavfile.write, so use that directly when it is unavailable.
    legacy_output = getattr(librosa, 'output', None)
    if legacy_output is not None:
        write_wav = legacy_output.write_wav
    else:
        from scipy.io import wavfile

        def write_wav(path, y, rate):
            wavfile.write(path, rate, y)

    data_length = np.shape(data)[0]
    n_patches = int(np.floor(data_length / (N_FOLDS - 1)))
    for i in range(len(factors)):
        for fold in range(N_FOLDS):
            fold_start = fold * n_patches
            # Clamp the chunk to the data instead of breaking out mid-loop.
            fold_end = min(fold_start + n_patches, data_length)
            if fold_end <= fold_start:
                continue
            fold_dir = out_dir + '/fold' + str(fold + 1)
            # The writer does not create missing directories.
            if not os.path.isdir(fold_dir):
                os.makedirs(fold_dir)
            for idx in range(fold_start, fold_end):
                # `rate` is keyword-only from librosa 0.10 on; the keyword form
                # is accepted by older releases as well.
                aug = librosa.effects.time_stretch(data[idx], rate=factors[i])
                name = names[idx].split('/')[-1].split('.')[0] + "-ts-" + str(i)
                write_wav(os.path.join(fold_dir, name + '.wav'), aug, sr)

In [41]:
# Writes one stretched WAV per (clip, factor) pair to disk; nothing is returned.
time_stretching(train_x, train_files, stretching_factors)

## Pitch Shifting

In [42]:
# Step offsets handed one by one to librosa.effects.pitch_shift as n_steps
# (fractional values allowed; semitones under librosa's default
# bins_per_octave=12). Negative values shift down, positive values shift up.
pitch_shifting_steps = [-3.5, -2.5, -2, -1, 1, 2, 2.5, 3.5]

In [43]:
def pitch_shifting(data, names, steps,
                   out_dir='../../data/UrbanSound8K/audio/pitch_shifting_audio'):
    """Write a pitch-shifted WAV copy of every clip for every step offset.

    Clips are split by position into consecutive chunks of
    ``floor(len(data) / (N_FOLDS - 1))`` items; chunk ``k`` is written to
    ``<out_dir>/fold<k+1>/``. Each output file is named
    ``<source stem>-ps-<step index>.wav``.

    NOTE(review): the fold is derived from the clip's position, not from the
    fold it originally came from, and the divisor is ``N_FOLDS - 1`` -- confirm
    this matches how the cached arrays were built.

    Parameters
    ----------
    data : sequence
        Audio signals; ``data[idx]`` is passed to
        ``librosa.effects.pitch_shift``.
        (presumably 1-D float waveforms sampled at ``sr`` -- TODO confirm)
    names : sequence of str
        Source file path of each clip, parallel to ``data``. The part after the
        last '/' and before the first '.' becomes the output stem.
    steps : sequence of float
        Pitch offsets passed as ``n_steps``. The *index* of the offset (not its
        value) is embedded in the output file name.
    out_dir : str, optional
        Root directory that receives the ``fold<k>`` sub-directories. Defaults
        to the location the notebook has always used.

    Notes
    -----
    Reads the notebook-level globals ``N_FOLDS`` and ``sr``. Returns nothing;
    the only effect is the files written to disk.
    """
    # librosa.output was removed in librosa 0.8. Its write_wav delegated to
    # scipy.io.wavfile.write, so use that directly when it is unavailable.
    legacy_output = getattr(librosa, 'output', None)
    if legacy_output is not None:
        write_wav = legacy_output.write_wav
    else:
        from scipy.io import wavfile

        def write_wav(path, y, rate):
            wavfile.write(path, rate, y)

    data_length = np.shape(data)[0]
    n_patches = int(np.floor(data_length / (N_FOLDS - 1)))
    for i in range(len(steps)):
        for fold in range(N_FOLDS):
            fold_start = fold * n_patches
            # Clamp the chunk to the data instead of breaking out mid-loop.
            fold_end = min(fold_start + n_patches, data_length)
            if fold_end <= fold_start:
                continue
            fold_dir = out_dir + '/fold' + str(fold + 1)
            # The writer does not create missing directories.
            if not os.path.isdir(fold_dir):
                os.makedirs(fold_dir)
            for idx in range(fold_start, fold_end):
                # `sr` and `n_steps` are keyword-only from librosa 0.10 on; the
                # keyword form is accepted by older releases as well.
                aug = librosa.effects.pitch_shift(data[idx], sr=sr, n_steps=steps[i])
                name = names[idx].split('/')[-1].split('.')[0] + "-ps-" + str(i)
                write_wav(os.path.join(fold_dir, name + '.wav'), aug, sr)

In [44]:
# Writes one pitch-shifted WAV per (clip, step) pair to disk; nothing is returned.
pitch_shifting(train_x, train_files, pitch_shifting_steps)

## Dynamic Range Compression

In [3]:
# Load one fold1 clip as (waveform, sample rate).
# NOTE: this rebinds the notebook-level `sr` that time_stretching and
# pitch_shifting read when they write their output.
sound_clip,sr = librosa.load('../../data/UrbanSound8K/audio/fold1/7061-6-0-0.wav')

In [4]:
# Number of samples in the loaded waveform.
sound_clip.shape

(49613,)

In [5]:
# Mel spectrogram with non-overlapping frames (hop_length == n_fft).
# The waveform is passed as `y=` because librosa >= 0.10 makes this argument
# keyword-only; older releases accept the keyword form as well.
melspec = librosa.feature.melspectrogram(y=sound_clip, sr=sr, n_fft=512, hop_length=512)

In [6]:
# (mel bands, frames) of the spectrogram computed above.
melspec.shape

(128, 97)

In [7]:
# Repeat the length check on a second fold1 clip.
# NOTE: like the first load, this rebinds the notebook-level `sr`.
sound_clip, sr = librosa.load('../../data/UrbanSound8K/audio/fold1/17913-4-0-1.wav')
sound_clip.shape

(88200,)

In [8]:
# Same spectrogram settings as for the first clip, so the frame counts can be
# compared. The waveform is passed as `y=` because librosa >= 0.10 makes this
# argument keyword-only; older releases accept the keyword form as well.
melspec = librosa.feature.melspectrogram(y=sound_clip, sr=sr, n_fft=512, hop_length=512)
np.shape(melspec)

(128, 173)