# FMA: A Dataset For Music Analysis

Michaël Defferrard, Kirell Benzi, Pierre Vandergheynst, Xavier Bresson, EPFL LTS2.

## Baselines

We explore three types of baselines:
1. simple algorithms,
2. state-of-the-art in genre recognition,
3. deep learning approaches,

using different input features:
1. raw audio,
2. echonest features,
3. audio features from librosa or [kapre](https://github.com/keunwoochoi/kapre).

We aim to show that, given sufficient data, deep learning (DL) approaches can outperform all the others without domain-specific or expert knowledge.

In [1]:
%load_ext autoreload
%autoreload 2
%matplotlib inline

import utils
from tqdm import tqdm_notebook
import keras
from keras.layers import Activation, Dense, Conv1D, Conv2D, MaxPooling1D, Flatten, Reshape
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import IPython.display as ipd
import time
import os
import ast

from sklearn.utils import shuffle
from sklearn.preprocessing import MultiLabelBinarizer, LabelEncoder, LabelBinarizer, StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC, LinearSVC
#from sklearn.gaussian_process import GaussianProcessClassifier
#from sklearn.gaussian_process.kernels import RBF
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis
from sklearn.multiclass import OneVsRestClassifier

Using TensorFlow backend.


In [2]:
DATA_DIR = os.environ.get('DATA_DIR')
if DATA_DIR is None:
    # Fail early: os.path.join(None, ...) would otherwise raise an opaque TypeError.
    raise RuntimeError('Set the DATA_DIR environment variable to the directory '
                       'containing tracks.csv, features.csv and echonest.csv.')

# The first column of every file is used as the row index.
# The 'genres' column of tracks.csv is parsed from text with ast.literal_eval.
tracks = pd.read_csv(os.path.join(DATA_DIR, 'tracks.csv'), index_col=0, converters={'genres': ast.literal_eval})
# features.csv and echonest.csv have three header rows, giving three-level column labels.
features = pd.read_csv(os.path.join(DATA_DIR, 'features.csv'), index_col=0, header=[0, 1, 2])
echonest = pd.read_csv(os.path.join(DATA_DIR, 'echonest.csv'), index_col=0, header=[0, 1, 2])

# Every feature column in one table (joined on the row index), columns sorted by label.
features_all = features.join(echonest).sort_index(axis=1)

## 1 Multiple classifiers and feature sets

Todo:
* Cross-validation for hyper-parameters.
* Dimensionality reduction?

### 1.1 Pre-processing

In [3]:
# Keep only the tracks whose Echonest audio and social features are all present (no NaN).
# TODO: fix dataset.
keep1 = echonest['echonest', 'audio_features'].notnull().all(axis=1)
keep2 = echonest['echonest', 'social_features'].notnull().all(axis=1)
keep = keep1 & keep2
# Apply the same row mask to every table so that they stay aligned.
echonest, tracks, features, features_all = (
    pd.DataFrame(table[keep]) for table in (echonest, tracks, features, features_all)
)

In [4]:
# Size of the train / test split; no validation set is carved out here.
n_train = (tracks['train'] == True).sum()
n_test = tracks.shape[0] - n_train
print('{} training examples, {} validation examples, {} testing examples'.format(n_train, 0, n_test))

# Distinct top-level genres, in order of first appearance.
genres = list(tracks['top_genre'].unique())
print('Top genres ({}): {}'.format(len(genres), genres))
# Every genre appearing in any track's genre list, as ordered by MultiLabelBinarizer.
genres = list(MultiLabelBinarizer().fit(tracks['genres']).classes_)
print('All genres ({}): {}'.format(len(genres), genres))

3078 training examples, 0 validation examples, 772 testing examples
Top genres (10): ['Hip-Hop', 'Folk', 'Jazz', 'Punk', 'Rock', 'Electronic', 'Psych-Rock', 'Indie-Rock', 'Pop', 'Old-Time / Historic']
All genres (106): ['20th Century Classical', 'African', 'Afrobeat', 'Alternative Hip-Hop', 'Americana', 'Asia-Far East', 'Balkan', 'Big Band/Swing', 'Bigbeat', 'Bluegrass', 'Brazilian', 'Breakbeat', 'Breakcore - Hard', 'British Folk', 'Chamber Music', 'Chill-out', 'Chip Music', 'Chiptune', 'Classical', 'Composed Music', 'Country', 'Country & Western', 'Cumbia', 'Dance', 'Disco', 'Downtempo', 'Drone', 'Dubstep', 'Easy Listening', 'Easy Listening: Vocal', 'Electro-Punk', 'Electroacoustic', 'Electronic', 'Europe', 'Flamenco', 'Folk', 'Freak-Folk', 'Free-Folk', 'Free-Jazz', 'French', 'Funk', 'Gospel', 'Goth', 'Hardcore', 'Hip-Hop', 'Hip-Hop Beats', 'Holiday', 'House', 'IDM', 'Improv', 'Indie-Rock', 'Industrial', 'Instrumental', 'Interview', 'Jazz', 'Jazz: Out', 'Jazz: Vocal', 'Klezmer', 'Krau

In [5]:
def pre_process(tracks, features, columns, multi_label=False, verbose=False):
    """Build shuffled, standardized train / test arrays for one feature set.

    Parameters
    ----------
    tracks : DataFrame
        Must provide the columns 'train' (split flag), 'top_genre' and 'genres'.
    features : DataFrame
        Feature table. Its rows must match those of `tracks`, in the same
        order, because the split mask is applied positionally.
    columns : label or list of labels
        Column selection, passed to ``features.loc[:, columns]``.
    multi_label : bool
        If False, targets are integer codes of 'top_genre' (LabelEncoder).
        If True, targets are an indicator matrix of 'genres'
        (MultiLabelBinarizer).
    verbose : bool
        Unused; kept so that existing calls remain valid.

    Returns
    -------
    y_train, y_val, y_test, X_train, X_val, X_test
        The validation arrays are empty placeholders. Features are
        standardized with statistics computed on the training set only.
    """
    if not multi_label:
        # Assign an integer value to each genre.
        enc = LabelEncoder()
        y = enc.fit_transform(tracks['top_genre'])
    else:
        # Create an indicator matrix.
        enc = MultiLabelBinarizer()
        y = enc.fit_transform(tracks['genres'])

    # `.values` instead of `as_matrix()`, which recent pandas no longer provides.
    X = features.loc[:, columns].values

    # Split in training, validation and testing sets.
    train = (tracks['train'] == True).values
    y_train, y_test = y[train], y[~train]
    X_train, X_test = X[train], X[~train]
    X_val, y_val = np.empty((0, X_train.shape[1])), np.empty((0, 0))

    X_train, y_train = shuffle(X_train, y_train, random_state=42)

    # Standardize features by removing the mean and scaling to unit variance.
    # Use the returned arrays: with copy=False the scaling is done in place
    # only when no dtype conversion is needed, so relying on the side effect
    # could silently leave the data unscaled.
    scaler = StandardScaler(copy=False)
    X_train = scaler.fit_transform(X_train)
    #X_val = scaler.transform(X_val)
    X_test = scaler.transform(X_test)

    return y_train, y_val, y_test, X_train, X_val, X_test

### 1.2 Single genre

When shuffling (with or without validation set) on `fma_small`:
* <36% for echonest_audio.
* <15% for echonest_social.
* <46% for echonest_temporal.
* <40% for mfcc.
* <42% for all except echonest.
* <44% for best non-echonest combination

In [6]:
def test_classifiers_features(classifiers, feature_sets, multi_label=False):
    """Fit and score every classifier on every feature set.

    Parameters
    ----------
    classifiers : dict
        Maps a short name to an estimator exposing ``fit`` and ``score``.
    feature_sets : dict
        Maps a short name to a column selection understood by `pre_process`.
    multi_label : bool
        Forwarded to `pre_process` to choose single- or multi-genre targets.

    Returns
    -------
    scores : DataFrame
        One row per feature set. Column 'dim' holds the number of features,
        the other columns hold ``clf.score`` on the test set.
    times : DataFrame
        CPU time in seconds (``time.process_time``) spent fitting and
        scoring each classifier.
    """
    clf_names = list(classifiers.keys())
    fset_names = list(feature_sets.keys())
    # list.insert() works in place and returns None, so build the column list explicitly.
    columns = ['dim'] + clf_names
    scores = pd.DataFrame(columns=columns, index=fset_names)
    times = pd.DataFrame(columns=clf_names, index=fset_names)
    for fset_name, fset in tqdm_notebook(feature_sets.items(), desc='features'):
        y_train, y_val, y_test, X_train, X_val, X_test = pre_process(tracks, features_all, fset, multi_label)
        scores.loc[fset_name, 'dim'] = X_train.shape[1]
        for clf_name, clf in classifiers.items():  # tqdm_notebook(classifiers.items(), desc='classifiers', leave=False):
            t = time.process_time()
            clf.fit(X_train, y_train)
            score = clf.score(X_test, y_test)
            scores.loc[fset_name, clf_name] = score
            times.loc[fset_name, clf_name] = time.process_time() - t
    return scores, times

def format_scores(scores):
    def highlight(s):
        is_max = s == max(s[1:])
        return ['background-color: yellow' if v else '' for v in is_max]
    scores = scores.style.apply(highlight, axis=1)
    return scores.format('{:.2%}', subset=pd.IndexSlice[:, scores.columns[1]:])

In [7]:
# Single-genre experiment: every classifier below is evaluated on every feature set.
# Keys are the short names used as column headers of the result tables.
classifiers = {
    'LR': LogisticRegression(),
    'kNN': KNeighborsClassifier(n_neighbors=200),
    'SVCrbf': SVC(kernel='rbf'),
    'SVCpoly1': SVC(kernel='poly', degree=1),
    'linSVC1': SVC(kernel="linear"),
    'linSVC2': LinearSVC(),
    #GaussianProcessClassifier(1.0 * RBF(1.0), warm_start=True),
    'DT': DecisionTreeClassifier(max_depth=5),
    'RF': RandomForestClassifier(max_depth=5, n_estimators=10, max_features=1),
    'AdaBoost': AdaBoostClassifier(n_estimators=10),
    'MLP1': MLPClassifier(hidden_layer_sizes=(100,), max_iter=2000),
    'MLP2': MLPClassifier(hidden_layer_sizes=(200, 50), max_iter=2000),
    'NB': GaussianNB(),
    'QDA': QuadraticDiscriminantAnalysis(),
}

# Feature sets: keys are row labels of the result tables, values are column
# selections into `features_all` (passed on to `pre_process`).
feature_sets = {
    'echonest_audio': ('echonest', 'audio_features'),
    'echonest_social': ('echonest', 'social_features'),
    'echonest_temporal': ('echonest', 'temporal_features'),
    'echonest_audio/social': ('echonest', ('audio_features', 'social_features')),
    'echonest_all': ('echonest', ('audio_features', 'social_features', 'temporal_features')),
}
# One feature set per top-level column group of `features`.
for name in features.columns.levels[0]:
    feature_sets[name] = name
# Combinations of several top-level column groups.
feature_sets.update({
    'mfcc/contrast': ['mfcc', 'spectral_contrast'],
    'mfcc/contrast/chroma': ['mfcc', 'spectral_contrast', 'chroma_cens'],
    'mfcc/contrast/centroid': ['mfcc', 'spectral_contrast', 'spectral_centroid'],
    'mfcc/contrast/chroma/centroid': ['mfcc', 'spectral_contrast', 'chroma_cens', 'spectral_centroid'],
    'mfcc/contrast/chroma/centroid/tonnetz': ['mfcc', 'spectral_contrast', 'chroma_cens', 'spectral_centroid', 'tonnetz'],
    'mfcc/contrast/chroma/centroid/zcr': ['mfcc', 'spectral_contrast', 'chroma_cens', 'spectral_centroid', 'zcr'],
    'all_non-echonest': list(features.columns.levels[0])
})

scores, times = test_classifiers_features(classifiers, feature_sets)

# Scores as highlighted percentages, timings with four decimals.
ipd.display(format_scores(scores))
ipd.display(times.style.format('{:.4f}'))






Unnamed: 0,dim,LR,kNN,SVCrbf,SVCpoly1,linSVC1,linSVC2,DT,RF,AdaBoost,MLP1,MLP2,NB,QDA
echonest_audio,8,31.48%,30.57%,35.75%,32.64%,32.77%,32.51%,31.99%,36.53%,30.96%,36.53%,31.74%,30.44%,28.89%
echonest_social,5,14.51%,13.08%,9.97%,14.64%,14.64%,13.99%,12.31%,14.25%,14.38%,15.03%,12.69%,13.86%,10.75%
echonest_temporal,224,44.04%,33.03%,45.47%,44.30%,41.19%,42.62%,29.92%,31.74%,27.07%,39.64%,40.80%,31.74%,29.53%
echonest_audio/social,13,31.22%,32.38%,31.99%,32.77%,30.44%,30.57%,32.77%,31.22%,28.50%,31.99%,32.12%,25.65%,24.09%
echonest_all,237,43.13%,35.23%,43.78%,42.36%,39.90%,41.32%,32.25%,31.99%,22.54%,40.03%,40.41%,31.61%,27.85%
chroma_cens,84,24.35%,19.82%,24.74%,21.89%,22.28%,24.48%,16.19%,21.11%,17.62%,19.30%,19.56%,19.69%,26.04%
chroma_cqt,84,26.30%,21.24%,28.24%,26.04%,26.42%,26.30%,18.52%,24.22%,19.56%,22.41%,25.65%,18.65%,18.65%
chroma_stft,84,27.72%,24.48%,29.53%,27.07%,29.15%,28.76%,20.98%,26.30%,20.08%,26.68%,23.83%,16.32%,15.41%
mfcc,140,38.60%,35.49%,42.23%,40.41%,34.72%,38.08%,29.53%,31.61%,19.17%,35.10%,34.97%,37.69%,32.64%
rmse,7,23.32%,19.43%,22.93%,22.41%,23.45%,24.22%,22.93%,20.98%,21.63%,23.06%,23.83%,17.75%,21.63%


Unnamed: 0,LR,kNN,SVCrbf,SVCpoly1,linSVC1,linSVC2,DT,RF,AdaBoost,MLP1,MLP2,NB,QDA
echonest_audio,0.0522,0.076,0.5118,0.263,0.3413,1.1647,0.013,0.0239,0.052,3.9326,89.2935,0.0517,0.0695
echonest_social,0.66,0.1104,0.5884,0.3084,0.3554,1.2628,0.0044,0.0195,0.0344,2.968,44.1389,0.0422,0.0489
echonest_temporal,3.9937,0.7578,2.9367,2.1461,7.2421,6.9626,0.2821,0.0234,0.6293,78.5366,63.2091,0.2038,3.1969
echonest_audio/social,0.8998,0.093,0.5551,0.2903,0.4246,1.5362,0.0167,0.0222,0.0596,4.111,91.3392,0.0489,0.0892
echonest_all,3.7732,0.7896,3.0307,2.2181,7.194,6.8907,0.2971,0.0238,0.6617,73.4493,60.3681,0.2199,3.4269
chroma_cens,1.5475,0.3289,1.8423,1.3392,5.2709,5.6451,0.1146,0.0236,0.283,105.4549,103.4454,0.1012,0.8171
chroma_cqt,1.7538,0.3207,1.7,1.2815,3.7431,5.5457,0.1042,0.0224,0.2587,92.1451,146.1972,0.1018,0.7535
chroma_stft,1.7768,0.3212,1.5827,1.1985,2.9353,5.2899,0.1055,0.0228,0.2563,83.5549,128.0645,0.1013,0.735
mfcc,2.1779,0.4887,2.0314,1.4613,6.301,5.5311,0.2077,0.0241,0.4695,95.0505,66.4384,0.1483,1.7129
rmse,0.8797,0.0469,0.5602,0.3,0.356,1.4157,0.0109,0.0233,0.0482,1.9471,36.0294,0.044,0.0572


### 1.3 Multiple genres

Maximum observed on `fma_small` (was 7.6% on `fma_medium`).
* <15% for echonest_audio.
* <22% for echonest_temporal.
* <16% for mfcc.
* <20% for best non-echonest combination

Todo:
* Eliminate rare genres. On small only the 10 selected genres are meaningful.

In [8]:
# Multi-genre experiment: targets are the indicator matrix of all genres
# (multi_label=True below), so LR and SVC are wrapped in one-vs-rest.
classifiers = {
    #LogisticRegression(),
    'LR': OneVsRestClassifier(LogisticRegression()),
    'SVC': OneVsRestClassifier(SVC()),
    'MLP': MLPClassifier(max_iter=700),
}

# Subset of the feature sets used in the single-genre experiment.
feature_sets = {
    'echonest_audio': ('echonest', 'audio_features'),
    'echonest_temporal': ('echonest', 'temporal_features'),
    'mfcc': 'mfcc',
    'mfcc/contrast/chroma/centroid/tonnetz': ['mfcc', 'spectral_contrast', 'chroma_cens', 'spectral_centroid', 'tonnetz'],
    'mfcc/contrast/chroma/centroid/zcr': ['mfcc', 'spectral_contrast', 'chroma_cens', 'spectral_centroid', 'zcr'],
}

scores, times = test_classifiers_features(classifiers, feature_sets, multi_label=True)

# Scores as highlighted percentages, timings with four decimals.
ipd.display(format_scores(scores))
ipd.display(times.style.format('{:.4f}'))

  str(classes[c]))
  str(classes[c]))
  str(classes[c]))
  str(classes[c]))
  str(classes[c]))
  str(classes[c]))
  str(classes[c]))
  str(classes[c]))
  str(classes[c]))
  str(classes[c]))
  str(classes[c]))
  str(classes[c]))
  str(classes[c]))
  str(classes[c]))
  str(classes[c]))
  str(classes[c]))
  str(classes[c]))
  str(classes[c]))
  str(classes[c]))
  str(classes[c]))
  str(classes[c]))
  str(classes[c]))
  str(classes[c]))
  str(classes[c]))
  str(classes[c]))
  str(classes[c]))
  str(classes[c]))
  str(classes[c]))
  str(classes[c]))
  str(classes[c]))
  str(classes[c]))
  str(classes[c]))
  str(classes[c]))
  str(classes[c]))
  str(classes[c]))
  str(classes[c]))
  str(classes[c]))
  str(classes[c]))
  str(classes[c]))
  str(classes[c]))





Unnamed: 0,dim,LR,SVC,MLP
echonest_audio,8,9.46%,11.14%,14.77%
echonest_temporal,224,21.24%,17.23%,17.49%
mfcc,140,16.58%,16.45%,13.73%
mfcc/contrast/chroma/centroid/tonnetz,322,19.17%,16.32%,17.62%
mfcc/contrast/chroma/centroid/zcr,287,18.52%,17.10%,17.88%


Unnamed: 0,LR,SVC,MLP
echonest_audio,0.5281,3.4284,425.2978
echonest_temporal,18.4404,26.2166,555.6063
mfcc,10.3437,17.7836,474.7788
mfcc/contrast/chroma/centroid/tonnetz,32.7827,40.9281,302.6684
mfcc/contrast/chroma/centroid/zcr,27.014,35.6666,348.5735


## 2 Deep learning on raw audio

In [9]:
# TODO: fix dataset.
# Clips with less than 1321967 samples because lower sampling rate, or mono.
# Each track ID appears once, in ascending order.
BAD_CLIPS = [12856, 16038, 16039, 16351, 16352, 16401, 16402, 16404, 16405, 16406, 16421, 16422, 16424,
             16425, 16426, 16427, 16428, 16429, 16431, 22590, 22591, 31375, 33702, 33708, 33709, 33714,
             33716]
BAD_CLIPS.extend([11665, 12899, 12916, 12917, 16353, 16398, 16400, 16423, 16430, 18689, 18691])

# errors='ignore': IDs that are not (or no longer) in `tracks` are skipped silently.
tracks = tracks.drop(BAD_CLIPS, errors='ignore')
path = utils.build_path(tracks, os.path.join(DATA_DIR, 'fma_small'))

In [10]:
# Indicator-matrix encoding of each track's top genre, one row per row of `tracks`.
labels_onehot = LabelBinarizer().fit_transform(tracks.top_genre)

# Positional (row-number) indices of the training and testing tracks.
# The masks are converted to plain NumPy arrays before np.argwhere so that the
# result does not depend on how pandas and NumPy interoperate.
train = np.argwhere((tracks['train'] == True).values).flatten()
test = np.argwhere((tracks['train'] == False).values).flatten()

Load audio samples in parallel using `multiprocessing` so as to maximize CPU usage when decoding MP3s and performing optional pre-processing. There are multiple ways to load a waveform from a compressed MP3:
* librosa uses audioread in the backend which can use many native libraries, e.g. ffmpeg
    * resampling is very slow
    * does not work with multi-processing, for keras `fit_generator()`
* pydub is a high-level interface for audio modification, uses ffmpeg to load
    * store a temporary `.wav`
* directly pipe ffmpeg output
    * fastest method
* [pyAV](https://github.com/mikeboers/PyAV) may be a faster alternative by linking to ffmpeg libraries

In [11]:
# Sanity check: load one clip and draw one batch before launching any training.
# Multiprocessing is tricky to debug.
utils.FfmpegLoader().load(path(0))
SampleLoader = utils.build_sample_loader(path, labels_onehot, utils.FfmpegLoader())
next(SampleLoader(train, batch_size=2))[0].shape

(2, 1321967)

In [12]:
# Keras parameters.
try:
    # Number of CPUs this process is allowed to run on.
    NB_WORKER = len(os.sched_getaffinity(0))
except AttributeError:
    # os.sched_getaffinity is not available on every platform; fall back to the CPU count.
    NB_WORKER = os.cpu_count() or 1
# Keyword arguments shared by the fit_generator() / evaluate_generator() calls.
params = {'pickle_safe': True, 'nb_worker': NB_WORKER, 'max_q_size': 10}

### 2.1 Fully connected neural network

* Two layers with 10 hidden units are no better than random (~11%).

Optimize data loading to be CPU / GPU bound, not IO bound. Larger batches mean reduced training time, so increase the batch size until memory is exhausted. Number of workers and queue size have no influence on speed.

CPU
* batch 4, worker 8, queue 1, 600s
* batch 20, worker 24, queue 5, 190s
* batch 20, worker 12, queue 10, 185s
* batch 40, worker 12, queue 10, 135s
* batch 64, worker 12, queue 10, 110s
* batch 128, worker 12, queue 10, 100s

GPU Tesla K40c
* batch 4, worker 12, queue 10, 250s
* batch 16, worker 12, queue 10, 100s
* batch 32, worker 12, queue 10, 90s
* batch 64, worker 12, queue 10, 70s
* batch 96-128 --> memory error

In [13]:
# Raw waveforms decoded at 2000 Hz to keep the input dimensionality manageable.
loader = utils.FfmpegLoader(sampling_rate=2000)
SampleLoader = utils.build_sample_loader(path, labels_onehot, loader)
print('Dimensionality: {}'.format(loader.shape))

keras.backend.clear_session()

# Fully connected network: 1000 -> 100 -> one unit per genre,
# ReLU on the hidden layers and softmax on the output.
model = keras.models.Sequential([
    Dense(output_dim=1000, input_shape=loader.shape),
    Activation("relu"),
    Dense(output_dim=100),
    Activation("relu"),
    Dense(output_dim=labels_onehot.shape[1]),
    Activation("softmax"),
])

optimizer = keras.optimizers.SGD(lr=0.1, momentum=0.9, nesterov=True)
model.compile(optimizer, loss='categorical_crossentropy', metrics=['accuracy'])

# Train for two passes over the training indices, then evaluate on the test indices.
train_batches = SampleLoader(train, batch_size=64)
model.fit_generator(train_batches, train.size, nb_epoch=2, **params)
test_batches = SampleLoader(test, batch_size=64)
loss = model.evaluate_generator(test_batches, test.size, **params)
#Y = model.predict_generator(SampleLoader(test, batch_size=64), test.size, **params);

# [loss, accuracy] on the test set.
loss

Dimensionality: (59953,)
Epoch 1/2
Epoch 2/2


[14.487223503624092, 0.10118265444126455]

### 2.2 Convolutional neural network

* Architecture from [End-to-end learning for music audio](http://www.mirlab.org/conference_papers/International_Conference/ICASSP%202014/papers/p7014-dieleman.pdf) by Sander Dieleman, Benjamin Schrauwen.
* Missing: track segmentation and majority voting
* Larger net: http://benanne.github.io/2014/08/05/spotify-cnns.html

In [14]:
# Raw waveforms decoded at 16 kHz.
loader = utils.FfmpegLoader(sampling_rate=16000)
#loader = utils.LibrosaLoader(sampling_rate=16000)
SampleLoader = utils.build_sample_loader(path, labels_onehot, loader)

keras.backend.clear_session()

# The output shape is printed after each stage to follow how the
# representation shrinks through the network.
model = keras.models.Sequential()
# Add a trailing channel axis of size 1, as expected by Conv1D.
model.add(Reshape((-1, 1), input_shape=loader.shape))
print(model.output_shape)

# First layer: 128 filters of length 512 with subsample_length=512 (Keras 1 stride argument).
model.add(Conv1D(128, 512, subsample_length=512))
print(model.output_shape)
model.add(Activation("relu"))

# Two blocks of: 32 filters of length 8, ReLU, max-pooling by 4.
model.add(Conv1D(32, 8))
print(model.output_shape)
model.add(Activation("relu"))
model.add(MaxPooling1D(4))

model.add(Conv1D(32, 8))
print(model.output_shape)
model.add(Activation("relu"))
model.add(MaxPooling1D(4))

print(model.output_shape)
#model.add(Dropout(0.25))
# Classifier head: flatten, 100 hidden units, softmax over the genres.
model.add(Flatten())
print(model.output_shape)
model.add(Dense(100))
model.add(Activation("relu"))
print(model.output_shape)
model.add(Dense(labels_onehot.shape[1]))
model.add(Activation("softmax"))
print(model.output_shape)

optimizer = keras.optimizers.SGD(lr=0.01, momentum=0.9, nesterov=True)
#optimizer = keras.optimizers.Adam()#lr=1e-5)#, momentum=0.9, nesterov=True)
model.compile(optimizer, loss='categorical_crossentropy', metrics=['accuracy'])

# Train for two passes over the training indices, then evaluate on the test indices.
model.fit_generator(SampleLoader(train, batch_size=10), train.size, nb_epoch=2, **params)
loss = model.evaluate_generator(SampleLoader(test, batch_size=10), test.size, **params)

# [loss, accuracy] on the test set.
loss

(None, 479625, 1)
(None, 936, 128)
(None, 929, 32)
(None, 225, 32)
(None, 56, 32)
(None, 1792)
(None, 100)
(None, 10)
Epoch 1/2
Epoch 2/2


[14.487224080089513, 0.10118265620355694]

### 2.3 Recurrent neural network

## 3 Deep learning on extracted audio features

Todo:
* Pre-processing in Keras: https://github.com/keunwoochoi/kapre
* Convolutional Recurrent Neural Networks for Music Classification: https://github.com/keunwoochoi/icassp_2017
* Music Auto-Tagger: https://github.com/keunwoochoi/music-auto_tagging-keras
* Pre-processor: https://github.com/bmcfee/pumpp

### 3.1 ConvNet on MFCC

* Architecture from [Automatic Musical Pattern Feature Extraction Using Convolutional Neural Network](http://www.iaeng.org/publication/IMECS2010/IMECS2010_pp546-550.pdf) by Tom LH. Li, Antoni B. Chan and Andy HW. Chun
* Missing: track segmentation and majority voting.
* Best seen: 17.6%

In [15]:
class MfccLoader(utils.Loader):
    """Loader that returns the MFCCs of a clip instead of its raw waveform."""

    # Single source of truth for the sampling rate, shared by the waveform
    # decoder and the MFCC computation.
    MFCC_SAMPLING_RATE = 22050
    raw_loader = utils.FfmpegLoader(sampling_rate=MFCC_SAMPLING_RATE)
    #shape = (13, 190)  # For segmented tracks.
    # (number of MFCCs, number of frames) for a full clip.
    shape = (13, 2582)

    def load(self, filename):
        """Decode `filename` with `raw_loader` and return its MFCC matrix."""
        # Imported inside the method, as in the original cell.
        import librosa
        x = self.raw_loader.load(filename)
        # Each MFCC frame spans 23ms on the audio signal with 50% overlap with the adjacent frames.
        # The audio is passed by keyword: recent librosa versions no longer accept it positionally.
        mfcc = librosa.feature.mfcc(y=x, sr=self.MFCC_SAMPLING_RATE, n_mfcc=13, n_fft=512, hop_length=256)
        return mfcc

loader = MfccLoader()
SampleLoader = utils.build_sample_loader(path, labels_onehot, loader)
# Sanity check: length of the first MFCC row, i.e. the number of frames.
loader.load(path(0))[0].shape

(2582,)

In [16]:
keras.backend.clear_session()

# The output shape is printed after each stage to follow how the
# representation shrinks through the network.
model = keras.models.Sequential()
# Add a trailing channel axis of size 1 to the (n_mfcc, n_frames) input.
model.add(Reshape((*loader.shape, 1),  input_shape=loader.shape))
print(model.output_shape)

# 3 filters of size 13x10 with subsample=(1, 4) (Keras 1 stride argument):
# the filter height equals the number of MFCCs, so that axis collapses to 1.
model.add(Conv2D(3, 13, 10, subsample=(1, 4)))
model.add(Activation("relu"))
print(model.output_shape)

# Two further convolutions along the frame axis only (1x10 filters, same subsampling).
model.add(Conv2D(15, 1, 10, subsample=(1, 4)))
model.add(Activation("relu"))
print(model.output_shape)

model.add(Conv2D(65, 1, 10, subsample=(1, 4)))
model.add(Activation("relu"))
print(model.output_shape)

# Classifier head: flatten, then softmax over the genres.
model.add(Flatten())
print(model.output_shape)
model.add(Dense(labels_onehot.shape[1]))
model.add(Activation("softmax"))
print(model.output_shape)

optimizer = keras.optimizers.SGD(1e-3)#lr=0.01, momentum=0.9, nesterov=True)
#optimizer = keras.optimizers.Adam()#lr=1e-5)#
model.compile(optimizer, loss='categorical_crossentropy', metrics=['accuracy'])

# Train for two passes over the training indices, then evaluate on the test indices.
model.fit_generator(SampleLoader(train, batch_size=16), train.size, nb_epoch=2, **params)
loss = model.evaluate_generator(SampleLoader(test, batch_size=16), test.size, **params)
#Y = model.predict_generator(loader, test.size, pickle_safe=True, nb_worker=NB_WORKER, max_q_size=5)

# [loss, accuracy] on the test set.
loss

(None, 13, 2582, 1)
(None, 1, 644, 3)
(None, 1, 159, 15)
(None, 1, 38, 65)
(None, 2470)
(None, 10)
Epoch 1/2
Epoch 2/2


[14.487223655259562, 0.10118265441189302]