In [2]:
# imports
from pathlib import Path
import numpy as np
from scipy.signal import resample
import scipy.io.wavfile
import keras
from keras.models import Sequential, Model
from keras.layers import Dense, Activation, Flatten, Convolution1D, Dropout, BatchNormalization, MaxPooling1D, AveragePooling1D
from keras import regularizers
from keras import initializers
from keras.optimizers import Adam, SGD, Adagrad
from keras.utils import np_utils
from keras.callbacks import TensorBoard

Using TensorFlow backend.


In [3]:
## load wav as arrays
## Root directory of the dataset audio; load_wav_fold reads the 'fold<N>'
## sub-directories below it and save_to_npy writes its .npy files directly into it.
audio_path = Path('../dataset/UrbanSound8K/audio')
def load_wav_fold(fold_id):
    """Read every readable ``.wav`` clip of one fold into memory.

    Clips are looked up in ``audio_path / ('fold' + str(fold_id))``.  The
    label of a clip is the second '-'-separated field of its file name and is
    returned as a string.  Multi-channel clips are reduced to their first
    channel.  Files for which ``scipy.io.wavfile.read`` raises ``ValueError``
    are skipped; a single warning lists them so the loss is visible.

    Parameters
    ----------
    fold_id : int or str
        Suffix appended to ``'fold'`` to form the directory name.

    Returns
    -------
    x_arr : numpy.ndarray
        1-D object array holding one 1-D waveform per clip.  It is built
        explicitly because clips may differ in length and NumPy refuses to
        create such ragged arrays implicitly.
    y_arr : numpy.ndarray
        Label strings, aligned with ``x_arr``.
    """
    import warnings  # local import: only needed to report skipped files

    fold_dir = audio_path.joinpath('fold' + str(fold_id))
    waveforms = []
    labels = []
    skipped = []
    # Sorted so the sample order does not depend on the directory listing order.
    for wav_file in sorted(fold_dir.iterdir()):
        if not wav_file.is_file() or wav_file.suffix != '.wav':
            continue
        try:
            _, samples = scipy.io.wavfile.read(str(wav_file))
        except ValueError:
            # Best effort: remember the file and carry on with the rest.
            skipped.append(wav_file.name)
            continue
        if samples.ndim > 1:
            # keep only the first channel
            samples = samples[:, 0]
        waveforms.append(samples)
        ## get label from file name
        labels.append(wav_file.name.split('-')[1])

    if skipped:
        warnings.warn('fold{}: skipped {} unreadable wav file(s): {}'.format(
            fold_id, len(skipped), ', '.join(skipped)))

    # One slot per clip; filled element-wise so equal-length clips are not
    # silently stacked into a 2-D array.
    x_arr = np.empty(len(waveforms), dtype=object)
    for idx, samples in enumerate(waveforms):
        x_arr[idx] = samples
    y_arr = np.array(labels)
    return x_arr, y_arr

def apply_resample(foldarr, outdim):
    """Resample every waveform in ``foldarr`` to ``outdim`` samples.

    Parameters
    ----------
    foldarr : numpy.ndarray
        Either an object array with one waveform per slot (clips of
        different lengths), or a regular numeric array with one waveform
        per row.
    outdim : int
        Number of samples each waveform has after resampling.

    Returns
    -------
    numpy.ndarray
        For object arrays (and arrays with fewer than two dimensions) the
        input itself, with each slot replaced in place.  For regular numeric
        arrays a new array whose second axis has length ``outdim``; the
        input is left untouched.
    """
    if foldarr.dtype == object or foldarr.ndim < 2:
        # Slot-wise replacement; each slot may hold a different length.
        for idx in range(foldarr.shape[0]):
            foldarr[idx] = resample(foldarr[idx], outdim)
        return foldarr
    # A regular array cannot take rows of a new length in place, and an
    # integer dtype would truncate the resampled values, so resample along
    # the sample axis into a fresh array instead.
    return resample(foldarr, outdim, axis=1)

def get_xtrain_mean(x_train):
    """Return the mean of ``x_train`` taken over axis 0.

    The result has one entry per remaining dimension (e.g. one per sample
    position) and can be subtracted from the data to zero-center it.
    """
    return np.mean(x_train, axis=0)

def combine_samples(arrs):
    """Concatenate a collection of arrays along their first axis.

    Parameters
    ----------
    arrs : sequence of numpy.ndarray
        An object array or a plain list/tuple of arrays that agree in every
        dimension except the first, e.g. one array per fold.

    Returns
    -------
    numpy.ndarray
        All items joined along axis 0.  The join starts from a zero-length
        ``np.zeros`` array, so the result is promoted to at least float64.
        An empty ``arrs`` is returned unchanged.
    """
    if len(arrs) < 1:
        return arrs
    # Zero-length seed with the same trailing dimensions as the first item.
    seed_shape = list(arrs[0].shape)
    seed_shape[0] = 0
    combined = np.zeros(seed_shape)
    print("combinde", combined.shape)
    # One concatenate call instead of re-copying the growing result per item.
    return np.concatenate([combined] + list(arrs), axis=0)

def save_to_npy(x, y, fold_id):
    """Write the arrays of one fold to ``audio_path``.

    ``x`` goes to ``fold<fold_id>_x.npy`` and ``y`` to ``fold<fold_id>_y.npy``.
    """
    for name_template, payload in (('fold{}_x.npy', x), ('fold{}_y.npy', y)):
        np.save(audio_path.joinpath(name_template.format(fold_id)), payload)

In [4]:
# for i in range(10, 11):    
#     x_arr, y_arr = load_wav_fold(i)
#     print("x_arr", x_arr.shape)
#     x_arr = apply_resample(x_arr, 32000)
#     save_to_npy(x_arr, y_arr, i)

In [5]:
## global parameters

### models to test
* VERY DEEP CONVOLUTIONAL NEURAL NETWORKS FOR RAW WAVEFORMS

They have a clearly defined structure, and their data have similar dimensions.

* Raw Waveform-based Audio Classification Using Sample-level CNN Architectures
* SAMPLE-LEVEL DEEP CONVOLUTIONAL NEURAL NETWORKS FOR MUSIC AUTO-TAGGING USING RAW WAVEFORMS

Relatively simple architecture.

#### test raw waveform input first
* input: 2500 * 1 waveform (normalized: centered at 0, variance 1)
* conv layer: with/without overlapping. In the paper:
    * filter size 3, stride 3, 128 filters
    * filter size 80, stride 4, 256 filters
* batch normalization: after every conv layer
* max pool
    * stride of 3? 4?


### input
or we can make the input as 625 * 4

In [6]:
## model
## 1d conv, size 4 filter, 64 filters, stride 2
## output 1250 * 64
## batch norm
## maxpool 2 * 1
## output 625 * 64
## 1d conv, size 3 filter, stride 2, 128 filters
## maxpool 2 * 1
## output 312 * 64
## 1d conv, size 3 filter, stride 3, 128 filters
## output 104 * 128
## maxpool 2 * 1
## output 52 * 128
## 1d conv, size 3 filter, stride 2, 256 filters

In [7]:
## one-time parameters for the model below
## L2 regularization strengths
ker_reg = 0.001  ## kernel penalty, passed to regularizers.l2 by the conv layers
act_reg = 0.1    ## NOTE(review): not referenced by any cell shown here
## kernel_initializer
ker_init = initializers.glorot_normal(seed=None)
## input shape: (samples per clip, channels)
in_shape = (32000, 1)
## optimizer: the learning rate is handed to the constructor; assigning
## opt.lr afterwards would replace the variable the optimizer created with a
## plain Python float
opt = Adam(lr=0.0001)
## number of output classes
OUTPUT_SIZE = 10
## training epochs
epochs = 25
## seed for the numpy shuffle of the dataset
SEED = 2018
## callback: write training logs for TensorBoard to ./logs
model_callback = TensorBoard(log_dir='./logs', histogram_freq=0, write_graph=True, write_images=False)

In [8]:
## model
## input: clips resampled to 32000 * 1
model = Sequential()

## Four conv blocks, each: Conv1D (stride 2, 'same' padding, relu, L2 kernel
## penalty) -> batch norm (applied after the activation) -> pooling.
## Only the first block sets input_shape and uses ker_init.
## Sequence lengths (the first five match the recorded model.summary()):
##   block 1: 32000 -> conv 16000 -> max pool 4      -> 4000 * 64
##   block 2:  4000 -> conv  2000 -> max pool 4      ->  500 * 128
##   block 3:   500 -> conv   250 -> max pool 4      ->   62 * 256
##   block 4:    62 -> conv    31 -> average pool 16 ->    1 * 512
## columns: filters, kernel_size, pooling layer, pool_size, extra Conv1D kwargs
conv_blocks = [
    (64, 80, MaxPooling1D, 4, {'input_shape': in_shape, 'kernel_initializer': ker_init}),
    (128, 3, MaxPooling1D, 4, {}),
    (256, 3, MaxPooling1D, 4, {}),
    (512, 3, AveragePooling1D, 16, {}),
]
for n_filters, filter_len, pool_layer, pool_len, conv_extra in conv_blocks:
    model.add(Convolution1D(filters=n_filters,
                            kernel_size=filter_len,
                            strides=2,
                            padding='same',
                            activation='relu',
                            kernel_regularizer=regularizers.l2(ker_reg),
                            **conv_extra))
    model.add(BatchNormalization())
    model.add(pool_layer(pool_size=pool_len))

## classifier head: flatten -> fully connected -> softmax
model.add(Flatten())
model.add(Dense(OUTPUT_SIZE))
model.add(Activation('softmax'))

In [9]:
## Compile with the optimizer configured above; track accuracy next to the loss.
model.compile(loss='categorical_crossentropy', optimizer=opt, metrics=['accuracy'])
## summary() prints the layer table itself and returns None, so wrapping it in
## print() only appends a stray "None" line.
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
conv1d_1 (Conv1D)            (None, 16000, 64)         5184      
_________________________________________________________________
batch_normalization_1 (Batch (None, 16000, 64)         256       
_________________________________________________________________
max_pooling1d_1 (MaxPooling1 (None, 4000, 64)          0         
_________________________________________________________________
conv1d_2 (Conv1D)            (None, 2000, 128)         24704     
_________________________________________________________________
batch_normalization_2 (Batch (None, 2000, 128)         512       
_________________________________________________________________
max_pooling1d_2 (MaxPooling1 (None, 500, 128)          0         
_________________________________________________________________
conv1d_3 (Conv1D)            (None, 250, 256)          98560     
__________

In [10]:
## experiment: load the cached fold<N>_x.npy / fold<N>_y.npy arrays for folds 1..10
audio_path = Path('../dataset/UrbanSound8K/audio')
get_int_arr = np.vectorize(int)  ## labels are stored as strings; convert element-wise
x_list = []
y_list = []
for fold_id in range(1, 11):
    fold_x = np.load(audio_path.joinpath('fold{}_x.npy'.format(fold_id)))
    fold_labels = np.load(audio_path.joinpath('fold{}_y.npy'.format(fold_id)))
    ## (a one-off conversion used to sit here, commented out: it vstack-ed each
    ##  fold into an (n, 32000) array and re-saved it with save_to_npy)
    x_list.append(fold_x)
    y_list.append(get_int_arr(fold_labels))

In [11]:
## Pack the per-fold arrays into 1-D object arrays, one slot per fold.
## np.array() on a list of folds with different sample counts is rejected by
## current NumPy releases, so the containers are allocated and filled explicitly.
x_folds = np.empty(len(x_list), dtype=object)
y_folds = np.empty(len(y_list), dtype=object)
for fold_idx in range(len(x_list)):
    x_folds[fold_idx] = x_list[fold_idx]
for fold_idx in range(len(y_list)):
    y_folds[fold_idx] = y_list[fold_idx]
x_list = x_folds
y_list = y_folds
## Join all folds along the sample axis.
train_list_x = combine_samples(x_list)
train_list_y = combine_samples(y_list)

combinde (0, 32000)
combinde (0,)


In [12]:
## Shuffle samples and labels with one shared, seeded permutation.
np.random.seed(SEED)
n_total = train_list_x.shape[0]
shuffle_idx = np.arange(n_total)
np.random.shuffle(shuffle_idx)
train_list_x, train_list_y = train_list_x[shuffle_idx], train_list_y[shuffle_idx]
## train val test split boundaries: [0, 70%) / [70%, 90%) / [90%, 100%)
val_start = int(n_total * 0.7)
val_end = int(n_total * 0.9)

In [13]:
## Cut the shuffled data into train / val / test at val_start and val_end.
## NOTE: train_list_x / train_list_y are rebound to the training slice (and the
## labels to one-hot rows), so this cell is not idempotent: run it exactly once
## after the shuffle cell.
train_list_x, val_list_x, test_list_x = (
    train_list_x[:val_start],
    train_list_x[val_start:val_end],
    train_list_x[val_end:],
)
train_list_y, val_list_y, test_list_y = (
    train_list_y[:val_start],
    train_list_y[val_start:val_end],
    train_list_y[val_end:],
)

## one-hot encode the integer labels
train_list_y, val_list_y, test_list_y = (
    np_utils.to_categorical(labels, num_classes=10)
    for labels in (train_list_y, val_list_y, test_list_y)
)

## append a trailing channel axis: (samples, length) -> (samples, length, 1)
train_list_x, val_list_x, test_list_x = (
    np.reshape(part, [part.shape[0], part.shape[1], 1])
    for part in (train_list_x, val_list_x, test_list_x)
)

In [14]:
## zero center: the mean comes from the training split only and is then
## subtracted from all three splits
x_train_mean = np.mean(train_list_x, axis=0)
train_list_x, val_list_x, test_list_x = (
    part - x_train_mean
    for part in (train_list_x, val_list_x, test_list_x)
)

In [15]:
## Train; the validation split is evaluated after every epoch and
## model_callback receives the training events.
fit_options = dict(
    epochs=epochs,
    verbose=2,
    validation_data=(val_list_x, val_list_y),
    callbacks=[model_callback],
)
model.fit(train_list_x, train_list_y, **fit_options)

Train on 4179 samples, validate on 1194 samples
Epoch 1/25
 - 15s - loss: 2.4273 - acc: 0.3759 - val_loss: 2.1744 - val_acc: 0.4958
Epoch 2/25
 - 14s - loss: 2.0426 - acc: 0.5130 - val_loss: 2.0164 - val_acc: 0.5209
Epoch 3/25
 - 14s - loss: 1.8697 - acc: 0.5765 - val_loss: 1.9353 - val_acc: 0.5645
Epoch 4/25
 - 14s - loss: 1.7564 - acc: 0.6126 - val_loss: 1.8447 - val_acc: 0.5955
Epoch 5/25
 - 14s - loss: 1.6592 - acc: 0.6466 - val_loss: 1.8643 - val_acc: 0.5737
Epoch 6/25
 - 15s - loss: 1.6031 - acc: 0.6688 - val_loss: 1.8195 - val_acc: 0.5946
Epoch 7/25
 - 15s - loss: 1.5219 - acc: 0.6937 - val_loss: 1.7433 - val_acc: 0.6005
Epoch 8/25
 - 15s - loss: 1.4594 - acc: 0.7172 - val_loss: 1.7012 - val_acc: 0.6298
Epoch 9/25
 - 16s - loss: 1.4075 - acc: 0.7320 - val_loss: 1.7399 - val_acc: 0.5980
Epoch 10/25
 - 15s - loss: 1.3517 - acc: 0.7523 - val_loss: 1.6519 - val_acc: 0.6482
Epoch 11/25
 - 15s - loss: 1.3028 - acc: 0.7679 - val_loss: 1.5777 - val_acc: 0.6792
Epoch 12/25
 - 15s - loss:

<keras.callbacks.History at 0x7fff61d4d0f0>

Train on 4179 samples, validate on 1194 samples
Epoch 1/25
 - 27s - loss: 2.3930 - acc: 0.3898 - val_loss: 2.2291 - val_acc: 0.4204
Epoch 2/25
 - 14s - loss: 2.0106 - acc: 0.5279 - val_loss: 2.0354 - val_acc: 0.5067
Epoch 3/25
 - 14s - loss: 1.8591 - acc: 0.5784 - val_loss: 1.9823 - val_acc: 0.4941
Epoch 4/25
 - 14s - loss: 1.7572 - acc: 0.6131 - val_loss: 1.9066 - val_acc: 0.5570
Epoch 5/25
 - 14s - loss: 1.6655 - acc: 0.6487 - val_loss: 1.8375 - val_acc: 0.5913
Epoch 6/25
 - 14s - loss: 1.5844 - acc: 0.6671 - val_loss: 1.7749 - val_acc: 0.6022
Epoch 7/25
 - 14s - loss: 1.5322 - acc: 0.6868 - val_loss: 1.7747 - val_acc: 0.5980
Epoch 8/25
 - 14s - loss: 1.4601 - acc: 0.7186 - val_loss: 1.7339 - val_acc: 0.6022
Epoch 9/25
 - 14s - loss: 1.4011 - acc: 0.7310 - val_loss: 1.7275 - val_acc: 0.6055
Epoch 10/25
 - 14s - loss: 1.3506 - acc: 0.7523 - val_loss: 1.6713 - val_acc: 0.6323
Epoch 11/25
 - 14s - loss: 1.3105 - acc: 0.7588 - val_loss: 1.6753 - val_acc: 0.6298
Epoch 12/25
 - 14s - loss: 1.2634 - acc: 0.7770 - val_loss: 1.6949 - val_acc: 0.6281
Epoch 13/25
 - 14s - loss: 1.2121 - acc: 0.8028 - val_loss: 1.6527 - val_acc: 0.6407
Epoch 14/25
 - 14s - loss: 1.1635 - acc: 0.8186 - val_loss: 1.5837 - val_acc: 0.6575
Epoch 15/25
 - 14s - loss: 1.1391 - acc: 0.8220 - val_loss: 1.6443 - val_acc: 0.6365
Epoch 16/25
 - 14s - loss: 1.1070 - acc: 0.8306 - val_loss: 1.6861 - val_acc: 0.6122
Epoch 17/25
 - 14s - loss: 1.0942 - acc: 0.8337 - val_loss: 1.5659 - val_acc: 0.6600
Epoch 18/25
 - 14s - loss: 1.0583 - acc: 0.8478 - val_loss: 1.6476 - val_acc: 0.6290
Epoch 19/25
 - 14s - loss: 1.0263 - acc: 0.8545 - val_loss: 1.5645 - val_acc: 0.6616
Epoch 20/25
 - 14s - loss: 0.9994 - acc: 0.8574 - val_loss: 1.6417 - val_acc: 0.6348
Epoch 21/25
 - 14s - loss: 0.9829 - acc: 0.8629 - val_loss: 1.6160 - val_acc: 0.6441
Epoch 22/25
 - 14s - loss: 0.9400 - acc: 0.8789 - val_loss: 1.6271 - val_acc: 0.6516
Epoch 23/25
 - 14s - loss: 0.9380 - acc: 0.8768 - val_loss: 1.6438 - val_acc: 0.6307
Epoch 24/25
 - 14s - loss: 0.9090 - acc: 0.8825 - val_loss: 1.5639 - val_acc: 0.6558
Epoch 25/25

In [26]:
## Score the held-out test split; evaluate() yields the loss followed by the
## compiled metric (accuracy).
## NOTE(review): the recorded test accuracy is far below the validation
## accuracy logged during training; re-check after a fresh Restart & Run All.
test_scores = model.evaluate(test_list_x, test_list_y)
loss, acc = test_scores
print("loss", loss, "acc", acc)

loss 3.5232881414420287 acc 0.3830645160222139
