In [None]:
%load_ext autoreload
%autoreload 2

import os
from os import listdir
from os.path import isfile, join

import numpy as np
import matplotlib.pyplot as plt

import madmom

import sys
sys.path.append('../src')
from preprocessing import load_rhythm_feature_db
from models import OLSPatchRegressor
import visualize
# Integer class labels; compared against the label array when picking examples.
MUSIC, SPEECH = 1, 0

# Short alias for inserting a singleton axis while indexing.
na = np.newaxis

# Render all figure text through LaTeX with a serif font family.
plt.rcParams['text.usetex'] = True
plt.rcParams['font.family'] = 'serif'

In [None]:
import keras
from keras.models import Sequential
from keras.layers import Dense, Dropout, Flatten, Lambda
from keras.layers import Conv2D, MaxPooling2D
from keras import backend as K

In [None]:
music_dir  = '../data/music_speech/music_wav/'
speech_dir = '../data/music_speech/speech_wav/'

# Features and labels for both classes.
# NOTE(review): num_samples=-1 presumably means "use every file" - confirm in the loader.
X, Y = load_rhythm_feature_db(music_dir, speech_dir, num_samples=-1)

# Map the loader's {-1, 1} labels onto {0, 1}.
Y = (Y + 1) / 2

# X arrives as (N, L, D); insert a singleton axis at position 1 so the
# array becomes (N, 1, L, D) and the convolution does not slide over the
# number of models.
X = X[:, na, ...]

# Fixed split: the first t_ind samples train, the remainder test.
# NOTE(review): assumes the loader returns samples in mixed class order - confirm.
t_ind = 96
XTrain, XTest = X[:t_ind], X[t_ind:]
YTrain, YTest = Y[:t_ind], Y[t_ind:]

print(XTrain.shape, YTrain.shape)
print(XTest.shape, YTest.shape)

In [None]:
# Training hyper-parameters.
batch_size = 8
num_classes = 2
epochs = 109

# --------------------------------------------
# Per-sample layout, read from the data (axis 0 of X is the sample axis).
num_frequencies, num_timesteps, num_channels = X.shape[1:4]
filter_time_size = 3
input_shape = (num_frequencies, num_timesteps, num_channels)

# DEFINE MODEL
# A single sigmoid convolution filter that covers the whole of axis 1 and
# filter_time_size steps of axis 2; its responses are then averaged over
# axes 1 and 2.
model = Sequential([
    Conv2D(1,
           kernel_size=(num_frequencies, filter_time_size),
           activation='sigmoid',
           input_shape=input_shape),
    Lambda(lambda activations: K.mean(activations, axis=[1, 2])),
])

model.compile(
    loss=keras.losses.binary_crossentropy,
    optimizer=keras.optimizers.Adadelta(),
    metrics=['accuracy'],
)

In [None]:
# Train on the training split; the test split is passed as validation data
# so its metrics are reported during fitting.
model.fit(
    x=XTrain,
    y=YTrain,
    batch_size=batch_size,
    epochs=epochs,
    verbose=1,
    validation_data=(XTest, YTest),
)

# score holds [loss, accuracy] on the test split, matching the metrics
# given to compile().
score = model.evaluate(x=XTest, y=YTest, verbose=0)
test_loss, test_accuracy = score
print('Test loss:', test_loss)
print('Test accuracy:', test_accuracy)

In [None]:
# Persist the trained model; the output directory is created if missing.
model_path = '../models/keras/'
os.makedirs(model_path, exist_ok=True)

# The file name records the temporal filter width and the test accuracy
# (score[1]). The width variable defined with the model is
# `filter_time_size`; the previously used name `filter_size` is not defined
# anywhere in this notebook and raised a NameError on a fresh kernel.
model_file = 'linear_on_features_filter{}-{:2.2f}.h5'.format(filter_time_size, score[1])
model.save(os.path.join(model_path, model_file))

In [None]:

# Select one test example per class: the first music sample and the second
# speech sample, then plot the model's prediction over time for both.
music_mask = (YTest == MUSIC)
speech_mask = (YTest == SPEECH)
music = XTest[music_mask][0]
speech = XTest[speech_mask][1]
visualize.prediction_over_time(music, speech, model)