In [None]:
%load_ext autoreload
%autoreload 2

import os
from os import listdir
from os.path import isfile, join

import numpy as np
import matplotlib.pyplot as plt

import madmom

import sys
sys.path.append('../src')
from preprocessing import load_rhythm_feature_db
from models import OLSPatchRegressor
import visualize
from utils import cv
# Integer class labels; used further down to select examples of each class.
SPEECH, MUSIC = 0, 1

# Shorthand for inserting a new axis when indexing numpy arrays.
na = np.newaxis

# Render all figure text through LaTeX, in a serif font.
plt.rcParams.update({'text.usetex': True, 'font.family': 'serif'})

In [None]:
import keras
from keras.models import Sequential
from keras.layers import Dense, Dropout, Flatten, Lambda
from keras.layers import Conv2D, MaxPooling2D
from keras import backend as K

In [None]:
music_dir  = '../data/music_speech/music_wav/'
speech_dir = '../data/music_speech/speech_wav/'

# Load features and labels for both classes.
# NOTE(review): num_samples=-1 presumably means "use every sample" -- confirm
# against preprocessing.load_rhythm_feature_db.
X, Y = load_rhythm_feature_db(music_dir, speech_dir, num_samples=-1)

# change -1, 1 labels to 0,1
Y = (Y + 1) / 2

# X is in (N,L,D) format; inserting a singleton axis at position 1 turns it
# into (N,1,L,D), which is the 4-D layout the Conv2D model is fed with.
X = X[:,na,:,:] # dont conv over the number of models

# Training hyper-parameters read by train_model.
batch_size = 8
num_classes = 2
epochs = 109

# --------------------------------------------
# Per-sample input dimensions, taken from the reshaped X.
num_frequencies = X.shape[1]  # the singleton axis inserted above
num_timesteps   = X.shape[2]
num_channels    = X.shape[3]
filter_time_size = 3
# Alias kept so that cells referring to `filter_size` resolve on a fresh
# kernel (previously only `filter_time_size` was defined -> NameError).
filter_size = filter_time_size
input_shape = num_frequencies, num_timesteps, num_channels

In [None]:
def reset_weights(model):
    """Re-initialise the trainable kernel and bias of every layer in `model`.

    Each variable's own initializer op is run in the current Keras backend
    session, so the layer gets fresh values drawn from the initializer it was
    built with. Layers that have no `kernel` / `bias` variable (pooling,
    Lambda, or a layer created with `use_bias=False`) are skipped.

    Only layer variables are touched; optimizer state is not reset here.

    Parameters
    ----------
    model : object exposing a `layers` iterable of Keras layers.
    """
    session = K.get_session()
    for layer in model.layers:
        # Reset the bias as well as the kernel: re-initialising only the
        # kernel would let trained bias values carry over into the next run.
        for attr in ('kernel', 'bias'):
            variable = getattr(layer, attr, None)
            if variable is not None:
                variable.initializer.run(session=session)

MODEL = None
def init_model(input_shape=(num_frequencies, num_timesteps, num_channels), reinit=False):
    """Return a compiled Keras model for the binary classification task.

    Architecture (built when the module-level `MODEL` is None):
      1. MaxPooling2D with pool size (1, 3) applied to the raw input,
      2. Conv2D with 32 ReLU filters whose kernel covers the whole first
         spatial axis and `filter_time_size` steps along the second,
      3. Conv2D with a single 1x1 sigmoid filter,
      4. Lambda layer averaging the sigmoid map over axes 1 and 2.
    The model is compiled with binary cross-entropy, Adadelta and the
    'accuracy' metric.

    Parameters
    ----------
    input_shape : tuple
        Shape of one sample, without the batch axis. Defaults to the
        dimensions computed from X when this cell is executed.
    reinit : bool
        Accepted for interface compatibility; currently not used.

    Returns
    -------
    The compiled model.

    Notes
    -----
    This function never assigns `MODEL` itself, so unless `MODEL` is set
    elsewhere every call builds a brand-new model. If `MODEL` has been set,
    that model is returned after `reset_weights` is applied to it and
    `input_shape` is ignored.
    """
    global MODEL
    if MODEL is None:

        # DEFINE MODEL
        model = Sequential()
        model.add(MaxPooling2D(pool_size=(1, 3), input_shape=input_shape))
        # Kernel height comes from the requested input_shape (not the
        # notebook-level num_frequencies) so a non-default shape stays
        # consistent; the pooling above leaves that axis unchanged.
        # `filter_time_size` is the name actually defined in the config cell.
        model.add(Conv2D(32, kernel_size=(input_shape[0], filter_time_size),
                         activation='relu'))

        model.add(Conv2D(1, kernel_size=(1, 1), activation='sigmoid'))
        # Collapse the remaining spatial axes into one score per sample.
        model.add(Lambda(lambda x: K.mean(x, axis=[1,2])))

        model.compile(loss=keras.losses.binary_crossentropy,
                      optimizer=keras.optimizers.Adadelta(),
                      metrics=['accuracy'])
        return model

    else:
        # Reuse path: only reached when MODEL was set outside this function.
        model = MODEL
        reset_weights(model)
        return model

def train_model(model, X, Y):
    """Fit `model` on (X, Y) silently and return the result of `model.fit`.

    Batch size and number of epochs are read from the notebook-level
    `batch_size` and `epochs` variables at call time.
    """
    fit_options = dict(batch_size=batch_size, epochs=epochs, verbose=0)
    return model.fit(X, Y, **fit_options)

In [None]:


# Cross-validate on the full data set: 5 folds, a single repetition.
# `cv` receives the model factory and the training routine defined above.
cvacc = cv(X, Y, init_model, train_model, nfolds=5, nrepetitions=1)

# The first two entries of the result are reported as loss and accuracy.
for _label, _idx in (('CV loss:', 0), ('CV accuracy:', 1)):
    print(_label, cvacc[_idx])

In [None]:
# Hold out everything after the first `split` samples as the test set.
# NOTE(review): the split is positional, with no shuffling in this cell;
# confirm that load_rhythm_feature_db returns the classes interleaved,
# otherwise the two partitions may have very different class balance.
split = 100
Xtrain, Ytrain, Xtest, Ytest = X[:split], Y[:split], X[split:], Y[split:]
# evaluate using train-test split
model = init_model()
train_model(model, Xtrain, Ytrain)
# The model is compiled with metrics=['accuracy'], so score = [loss, accuracy].
score = model.evaluate(Xtest, Ytest, verbose=0)

model_path = '../models/keras/'
os.makedirs(model_path, exist_ok=True)

# The file name records the temporal filter width and the test accuracy.
# `filter_time_size` is the variable the config cell defines; the previous
# `filter_size` was undefined on a fresh kernel.
model.save(os.path.join(model_path, 'cnn_on_features_filter{}-{:2.2f}.h5'.format(filter_time_size, score[1])))

In [None]:
# Pick one held-out example per class (second music clip, third speech clip)
# and plot the trained model's prediction for each.
# The split cell names the test partition `Xtest` / `Ytest`; the previous
# `XTest` / `YTest` spelling raised NameError.
music = Xtest[Ytest == MUSIC][1]
speech = Xtest[Ytest == SPEECH][2]
visualize.prediction_over_time(music, speech, model)