In [None]:
%load_ext autoreload
%autoreload 2
%matplotlib inline

import os
from os import listdir
from os.path import isfile, join

import numpy as np
import matplotlib.pyplot as plt
import madmom

import sys
sys.path.append('../src')
from preprocessing import get_dataset
from models import OLSPatchRegressor
import visualize
from utils import cv
# Integer class labels; later cells compare the label array against them.
MUSIC, SPEECH = 1, 0

# Short alias for numpy's new-axis index.
na = np.newaxis

# Render figure text through LaTeX and use a serif font family.
plt.rcParams['text.usetex'] = True
plt.rcParams['font.family'] = 'serif'

In [None]:
import keras
from keras.models import Sequential
from keras.layers import Dense, Dropout, Flatten, Lambda
from keras.layers import Conv2D, MaxPooling2D
from keras import backend as K

In [None]:
# Input directories for the two classes; both are passed to get_dataset below.
music_dir  = '../data/music_speech/music_wav/'
speech_dir = '../data/music_speech/speech_wav/'

# Forwarded to get_dataset as num_samples.
# NOTE(review): -1 presumably means "no limit" -- confirm in preprocessing.get_dataset.
max_samples = -1

In [None]:
# Load the dataset arrays X (inputs) and Y (labels) from the two directories.
X, Y = get_dataset(
    music_dir,
    speech_dir,
    hpool=0,
    wpool=0,
    num_samples=max_samples,
    shuffle=True,
    reload=False,
    window=np.hanning,
    fps=100,
    num_bands=3,
    fmin=30,
    fmax=17000,
    filtered=False,
    fft_sizes=[1024, 2048, 4096],
)
# --------------------------------------------
# Axes 1, 2 and 3 of X describe a single example; axis 0 indexes the examples.
num_frequencies, num_timesteps, num_channels = X.shape[1:4]
# Width of the convolution kernel along the time axis.
filter_time_size = 3
input_shape = (num_frequencies, num_timesteps, num_channels)

In [None]:
print('Train Set Shape')
print(X.shape, Y.shape)

# Map the labels onto {0, 1} so they line up with SPEECH = 0 / MUSIC = 1 and
# with the sigmoid output trained with binary cross-entropy.
# NOTE(review): assumes get_dataset returns labels in {-1, +1} -- confirm.
# The guard makes this cell idempotent: Y is overwritten in place, so without
# it a second execution of the cell would map {0, 1} to {0.5, 1}.
if np.min(Y) < 0:
    Y = (Y + 1) / 2

# Train CNN using KERAS

We now use only a single convolution layer, so the model is linear apart from the sigmoid activation applied to the convolution output.

In [None]:
def reset_weights(model):
    """Re-initialise the kernel and bias variables of every layer in ``model``.

    The initializer op of each variable is run in the current Keras backend
    session, which draws fresh values in place; the model object, its graph
    and its compilation are left untouched.

    Both the kernel and the bias are reset. Resetting only the kernel would
    let a bias learned in a previous training run leak into the next one.

    Parameters
    ----------
    model : object exposing ``layers``
        Layers that have no ``kernel_initializer`` / ``bias_initializer``
        attribute (for example ``Lambda``) are skipped.
    """
    session = K.get_session()
    for layer in model.layers:
        for weight_name in ('kernel', 'bias'):
            if not hasattr(layer, weight_name + '_initializer'):
                continue
            variable = getattr(layer, weight_name, None)
            # A layer created with use_bias=False keeps ``bias`` set to None.
            if variable is not None:
                variable.initializer.run(session=session)

MODEL = None
def init_model(input_shape=(num_frequencies, num_timesteps, num_channels), reinit=False):
    """Return a compiled single-convolution classifier.

    The network is one ``Conv2D`` filter with a sigmoid activation whose kernel
    spans the whole first input axis and ``filter_time_size`` steps of the
    second, followed by a ``Lambda`` layer that averages the activation over
    axes 1 and 2. It is compiled with binary cross-entropy, Adadelta and the
    accuracy metric.

    Parameters
    ----------
    input_shape : tuple
        Shape of one example. The default is captured from the notebook
        globals at the moment this cell is executed. The kernel height is
        derived from ``input_shape[0]`` so that a non-default shape still gets
        a kernel covering the full first axis.
    reinit : bool
        Currently unused; kept so existing callers keep working.

    Returns
    -------
    The compiled Keras model.

    Notes
    -----
    ``MODEL`` is never assigned by this function, so every call builds a new,
    independently initialised model unless another cell sets ``MODEL``. In
    that case the stored model is returned after ``reset_weights`` has been
    applied to it (``input_shape`` is then ignored).
    NOTE(review): reusing one model this way would also carry the optimizer
    state over between training runs -- confirm before enabling the cache.
    """
    if MODEL is not None:
        reset_weights(MODEL)
        return MODEL

    # DEFINE MODEL
    model = Sequential()
    model.add(Conv2D(1,
                     kernel_size=(input_shape[0], filter_time_size),
                     activation='sigmoid',
                     input_shape=input_shape))

    # Collapse the activation map into a single score per example.
    model.add(Lambda(lambda x: K.mean(x, axis=[1, 2])))

    model.compile(loss=keras.losses.binary_crossentropy,
                  optimizer=keras.optimizers.Adadelta(),
                  metrics=['accuracy'])
    return model

def train_model(model, X, Y):
    """Fit ``model`` on ``(X, Y)`` without progress output.

    Returns whatever ``model.fit`` returns.

    ``batch_size`` and ``epochs`` are read from the notebook globals when the
    function is called.
    NOTE(review): neither name is assigned in the cells visible here -- make
    sure both are defined before the first call.
    """
    fit_options = dict(batch_size=batch_size, epochs=epochs, verbose=0)
    return model.fit(X, Y, **fit_options)

# DEFINE MODEL


In [None]:


# Evaluate with cross-validation (5 folds, 1 repetition) over the full X, Y arrays.
# cv receives the model factory and the training callable, not a fitted model.
# NOTE(review): the labels printed below assume cv returns (loss, accuracy) in
# that order -- confirm in utils.cv.
cvacc = cv(X, Y, init_model, train_model, nfolds=5, nrepetitions=1)
print('CV loss:', cvacc[0])
print('CV accuracy:', cvacc[1])

In [None]:
# Hold-out evaluation: the first `split` examples train the model, the
# remaining ones are used for testing.
split = 100
Xtrain, Ytrain, Xtest, Ytest = X[:split], Y[:split], X[split:], Y[split:]
# evaluate using train-test split
model = init_model()
train_model(model, Xtrain, Ytrain)
# The model is compiled with metrics=['accuracy'], so score[1] is the accuracy.
score = model.evaluate(Xtest, Ytest, verbose=0)

model_path = '../models/keras/'
os.makedirs(model_path, exist_ok=True)

# The file name records the temporal filter width and the hold-out accuracy.
# The width lives in `filter_time_size`; the previously used name
# `filter_size` is not defined anywhere in the notebook and raised NameError.
model.save(os.path.join(model_path, 'cnn_on_features_filter{}-{:2.2f}.h5'.format(filter_time_size, score[1])))

In [None]:
# Take one held-out example per class (the second music and the third speech
# example) and pass both, together with the trained model, to
# visualize.prediction_over_time.
# The hold-out arrays are named Xtest / Ytest in the split cell; the former
# XTest / YTest spelling referred to undefined names.
music = Xtest[Ytest == MUSIC][1]
speech = Xtest[Ytest == SPEECH][2]
visualize.prediction_over_time(music, speech, model)