In [1]:
import glob
import os
import librosa
import cPickle as pickle
import matplotlib.pyplot as plt
import tensorflow as tf
import numpy as np
from matplotlib.pyplot import specgram
import time
from sklearn.cross_validation import train_test_split
%matplotlib inline
plt.style.use('ggplot')

plt.rcParams['font.family'] = 'serif'
plt.rcParams['font.serif'] = 'Ubuntu'
plt.rcParams['font.monospace'] = 'Ubuntu Mono'
plt.rcParams['font.size'] = 12
plt.rcParams['axes.labelsize'] = 11
plt.rcParams['axes.labelweight'] = 'bold'
plt.rcParams['axes.titlesize'] = 12
plt.rcParams['xtick.labelsize'] = 9
plt.rcParams['ytick.labelsize'] = 9
plt.rcParams['legend.fontsize'] = 11
plt.rcParams['figure.titlesize'] = 13

## 1.1 Load audio data

In [2]:
# ------------
# DONE RUNNING
# ------------

SAMPLING_RATE = 44100

def load_sounds(parent_dir,sub_dirs,file_ext="wav", split=True):
    print "Load raw audio files as numpy array"
    print "Loading...."
    start_time = time.time()
    data = []
    for l, sub_dir in enumerate(sub_dirs):
        namePath = []
        sounds = []
        labels = []
        path = os.path.join(parent_dir, sub_dir)
        folder = os.listdir(path)
        for audio in folder:
            if(audio.split('.')[-1] == file_ext):
                file_path = os.path.join(path, audio)
                sound_clip, _ = librosa.load(file_path, sr=SAMPLING_RATE)
                sounds.append(sound_clip)
                name_list = file_path.split('/')
                namePath.append('/'.join(name_list[-2:]))

                label = name_list[-1].split('-')[1]
                labels.append(label)
        sounds = np.array(sounds)
        labels = np.array(labels,dtype = np.int)
        namePath = np.array(namePath, dtype=np.string_)
        data.append({'audio': sounds, 'label': labels, 'file names': namePath})
        print ("{0} is loaded successfully".format(sub_dir))
    return dict(zip(sub_dirs, data))

In [3]:
# ------------
# DONE RUNNING
# ------------

# Only for the first time running
parent_dir = '../data/UrbanSound8K/audio/original_audio/'
# sub_dirs= ['fold1','fold2', 'fold3', 'fold4', 'fold5', 'fold6', 'fold7', 'fold8','fold9', 'fold10']

sub_dirs= ['fold1']
print "---Loading Sounds.... ---"
start_time = time.time()
data = load_sounds(parent_dir,sub_dirs)
print "---Loading time: {0} seconds ---".format(time.time() - start_time)

---Loading Sounds.... ---
Load raw audio files as numpy array
Loading....
fold1 is loaded successfully
---Loading time: 170.979458094 seconds ---


In [3]:
# ------------
# DONE RUNNING
# ------------

# Only for the first time running
parent_dir = '../data/UrbanSound8K/audio/original_audio/'
sub_dirs= ['fold1','fold2', 'fold3', 'fold4', 'fold5', 'fold6', 'fold7', 'fold8','fold9', 'fold10']

# sub_dirs= ['fold1', 'fold2']
print "---Loading Sounds.... ---"
start_time = time.time()
data = load_sounds(parent_dir,sub_dirs)
print "---Loading time: {0} seconds ---".format(time.time() - start_time)

---Loading Sounds.... ---
Load raw audio files as numpy array
Loading....
fold1 is loaded successfully
fold2 is loaded successfully
fold3 is loaded successfully
fold4 is loaded successfully
fold5 is loaded successfully
fold6 is loaded successfully
fold7 is loaded successfully
fold8 is loaded successfully
fold9 is loaded successfully
fold10 is loaded successfully
---Loading time: 1450.36918497 seconds ---


In [18]:
with open("full_fold_data.pickle", 'wb') as f:
    pickle.dump(data, f, protocol=pickle.HIGHEST_PROTOCOL)

fold1/155202-9-0-42.wav
9
[ True  True  True ...,  True  True  True]


## 1.6 Load Manually prepared test set

In [8]:
parent_dir = '../data'
utube_test_set = load_sounds(parent_dir=parent_dir, sub_dirs=['youtube_test_set'])

Load raw audio files as numpy array
Loading....
youtube_test_set is loaded successfully


In [9]:
# import cPickle as pickle
# with open("utube_test_set.pickle", 'wb') as f:
#     pickle.dump(utube_test_set, f, protocol=pickle.HIGHEST_PROTOCOL)

In [11]:
x, sr = librosa.load('../data/youtube_test_set/carhorn_14-1-.wav', sr=None)
print sr

44100


## 2. Data Augmentation

In [2]:
with open("full_fold_data_44100hz.pickle", "rb") as f:
    dataset = pickle.load(f)

In [3]:
dataset['fold1']['audio'].shape

(873,)

In [4]:
sub_dirs= ['fold1','fold2', 'fold3', 'fold4', 'fold5', 'fold6', 'fold7', 'fold8','fold9', 'fold10']

In [5]:
def stretch_time(audio_data, time_stretch_factors):
    n_samples = audio_data['audio'].shape[0]
    audio = [0]*n_samples*len(time_stretch_factors)
    labels = [0]*n_samples*len(time_stretch_factors)
    names = [0]*n_samples*len(time_stretch_factors)
    i = 0
    for factor in time_stretch_factors:
        for idx in range(n_samples):
            audio[i] = librosa.effects.time_stretch(audio_data['audio'][idx], factor)
            names[i] = audio_data['file names'][idx]
            labels[i] = audio_data['label'][idx]
            i += 1
    audio = np.array(audio)
    labels = np.array(labels,dtype = np.int)
    names = np.array(names, dtype=np.string_)
    return {'audio': audio, 'label': labels, 'file names': names}

def pitch_shift(audio_data, pitch_shift_values):
    n_samples = audio_data['audio'].shape[0]
    audio = [0]*n_samples*len(pitch_shift_values)
    labels = [0]*n_samples*len(pitch_shift_values)
    names = [0]*n_samples*len(pitch_shift_values)
    i = 0
    for val in pitch_shift_values:
        for idx in range(n_samples):
            audio[i] = librosa.effects.pitch_shift(y=audio_data['audio'][idx],
                                                                          sr=44100, 
                                                                          n_steps=val)
            names[i] = audio_data['file names'][idx]
            labels[i] = audio_data['label'][idx]
            i += 1
    audio = np.array(audio)
    labels = np.array(labels,dtype = np.int)
    names = np.array(names, dtype=np.string_)
    return {'audio': audio, 'label': labels, 'file names': names}

def augmentate_data(audio_data, folders, time_stretch_factors, pitch_shift_values):
    start_time = time.time()
    for folder in folders:
        print ("Augmenting {0}......".format(folder))
        time_stretching_data = stretch_time(audio_data[folder], time_stretch_factors)
        pitch_shifting_data = pitch_shift(audio_data[folder], pitch_shift_values)
        data = {"time": time_stretching_data, "pitch": pitch_shifting_data}
        saved_file = "augmented_audio/" + folder + "_augmented_data.pickle"
        with open(saved_file, 'wb') as f:
            pickle.dump(data, f, protocol=pickle.HIGHEST_PROTOCOL)
        print ("--- Save {0} augmented data successfully".format(folder))
        print ("--- Elapsed time: {0} seconds ---".format(time.time() - start_time))
    print ("Total running time: {0} seconds".format(time.time() - start_time))

In [6]:
time_stretch_factors = [0.81, 0.93, 1.07, 1.23]
pitch_shift_values = [-3.5, -2.5, -2, -1, 1, 2, 2.5, 3.5]
folders= ['fold1','fold2', 'fold3', 'fold4', 'fold5', 'fold6', 'fold7', 'fold8','fold9', 'fold10']
augmentate_data(dataset, folders, time_stretch_factors, pitch_shift_values)

Augmenting fold1......
--- Save fold1 augmented data successfully
--- Elapsed time: 1764.41101599 seconds ---
Augmenting fold2......
--- Save fold2 augmented data successfully
--- Elapsed time: 3540.72552204 seconds ---
Augmenting fold3......
--- Save fold3 augmented data successfully
--- Elapsed time: 5440.17721415 seconds ---
Augmenting fold4......
--- Save fold4 augmented data successfully
--- Elapsed time: 7420.86635208 seconds ---
Augmenting fold5......
--- Save fold5 augmented data successfully
--- Elapsed time: 9309.1092782 seconds ---
Augmenting fold6......
--- Save fold6 augmented data successfully
--- Elapsed time: 10984.2175982 seconds ---
Augmenting fold7......
--- Save fold7 augmented data successfully
--- Elapsed time: 12691.8318851 seconds ---
Augmenting fold8......
--- Save fold8 augmented data successfully
--- Elapsed time: 14289.4338422 seconds ---
Augmenting fold9......
--- Save fold9 augmented data successfully
--- Elapsed time: 15931.949013 seconds ---
Augmenting f

In [7]:
with open("augmented_audio/fold1_augmented_data.pickle", "rb") as f:
    dataset = pickle.load(f)


In [8]:
dataset['time']['audio'].shape

(3492,)

In [9]:
dataset['time']['file names'].shape

(3492,)

In [10]:
dataset['time']['file names']

array(['fold1/180937-7-3-11.wav', 'fold1/193394-3-0-4.wav',
       'fold1/180937-7-1-3.wav', ..., 'fold1/78360-4-0-0.wav',
       'fold1/108041-9-0-11.wav', 'fold1/101415-3-0-2.wav'], 
      dtype='|S24')

In [11]:
dataset['time']['label'].shape

(3492,)

In [12]:
dataset['time']['label']

array([7, 3, 7, ..., 4, 9, 3])