# Neural Network Training Notebook

This notebook is for training the final model, after finding appropriate hyperparameters. Much of this code is copied from the dev notebook as needed.

In [1]:
import keras
import glob
import os
from PIL import Image
import pandas as pd
import numpy as np
from collections import defaultdict
from tqdm import tnrange, tqdm_notebook
from keras.models import Sequential, Model
from keras.layers import Input, Dense, Activation, Conv2D, MaxPooling2D, ZeroPadding2D, Flatten, Lambda, BatchNormalization
from keras.layers import Conv1D, ZeroPadding1D, MaxPooling1D, GlobalMaxPooling1D
from keras import backend as K
from keras.callbacks import ModelCheckpoint, TensorBoard, CSVLogger
from keras.optimizers import Adam
import matplotlib
from matplotlib import pyplot as plt
from sklearn.model_selection import train_test_split
from utils import load_json, make_logger
import logging
import itertools
import pickle
import difflib

# Project settings read from params.json through the local utils helper.
# NOTE(review): `params` is not referenced again in the visible part of this notebook.
params = load_json('params.json')
# Notebook-wide logger named 'train'; the second argument is presumably the log file
# path — confirm against utils.make_logger.
logger = make_logger('train', 'log/train.log')

Using TensorFlow backend.


## Load the training and test sets and images

Creates a dict mapping each artist MBID to a list of its spectrogram images, each stored as a 2-D array of floats scaled to [0, 1] (the network defined below takes 256x64 inputs). Also loads the previously saved database of artist relationships.

In [2]:
logger.info('Loading images')

# Maps artist MBID -> list of spectrogram arrays, one entry per image file found.
images = defaultdict(list)

# glob gives no ordering guarantee; sorting keeps the per-artist image order (and
# therefore the order of the generated training pairs) identical between runs.
for path in tqdm_notebook(sorted(glob.glob('tracks/**/*.png', recursive=True))):
    # Everything before the last '-' in the file name is used as the artist MBID.
    mbid = os.path.basename(path).rsplit('-', 1)[0]
    # The context manager releases the file handle as soon as the pixels are read.
    with Image.open(path) as spectrogram:
        # 8-bit greyscale scaled to [0, 1], then rotated by 90 degrees.
        img = np.rot90(np.array(spectrogram.convert('L')) / 255)
    images[mbid].append(img)

logger.info('Loaded images')

Loading images


HBox(children=(IntProgress(value=0, max=2791), HTML(value='')))

Loaded images





In [3]:
# Read the training similarity table from the 'artists' key of its HDF5 file.
logger.info('Loading training set')

train_set = pd.read_hdf('dataset/train_min.hd5', key='artists')

logger.info('Loaded training set')

# Read the test similarity table the same way.
logger.info('Loading test set')

test_set = pd.read_hdf('dataset/test_min.hd5', key='artists')

logger.info('Loaded test set')

Loading training set
Loaded training set
Loading test set
Loaded test set


In [4]:
# Sanity check: log how many artists have images and how many rows each table has.
logger.info('Artists in images: {}'.format(len(images)))
logger.info('Training set length: {}'.format(len(train_set)))
logger.info('Test set length: {}'.format(len(test_set)))

Artists in images: 975
Training set length: 877
Test set length: 975


## Define accuracy metric

For my accuracy metric, I get the top similar artists for each artist from the original dataset, then to calculate accuracy of the model, I use the model to calculate all pairwise similarity scores and get the top predicted similar artists. Then I compare the two using edit distance, and return the average value as an accuracy.

Calculating accuracy like this is slow because you pretty much have to go through a full epoch. It's probably only useful at the end of training.

There are two accuracy metrics I use, both of which mean slightly different things:

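`compare_accuracy_edit_dist` compares the top n artists in each dataframe with `difflib.SequenceMatcher` and returns its similarity ratio, i.e. twice the number of matching elements divided by the combined length of the two lists (1.0 for identical rankings, 0.0 when nothing matches).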

`compare_accuracy_unordered` compares how many of the top n artists are shared between both dataframes, disregarding order, as a proportion of n. 

In [5]:
def similarity_matrix_to_top(matrix):
    """Turn a similarity score matrix into a table of ranked column positions.

    Each row of `matrix` is sorted by descending score. The result has one
    column per row label of `matrix`; reading a column top to bottom gives
    the integer positions of that row's columns, best match first.
    """
    scores = matrix.values
    # Negating the scores makes the ascending argsort produce a descending ranking.
    ranked_positions = np.argsort(-scores, axis=1)
    ranking = pd.DataFrame(ranked_positions, index=matrix.index)
    return ranking.T

def eval_artist_similarity(artist_A, artist_B, model):
    """Predict the similarity of two artists as the mean over all image pairs.

    Every image of `artist_A` is paired with every image of `artist_B`, the
    model is evaluated on all pairs in a single `predict` call, and the mean
    of the predictions is returned.

    Args:
        artist_A: key into the module-level `images` dict.
        artist_B: key into the module-level `images` dict.
        model: object whose `predict` accepts a list of two stacked arrays.

    Returns:
        The mean of the model's predictions over all image pairs.

    Raises:
        ValueError: if either artist has no loaded images.
    """
    audio_A = images[artist_A]
    audio_B = images[artist_B]

    # Fail with a clear message instead of an opaque shape error from the model.
    if len(audio_A) == 0 or len(audio_B) == 0:
        raise ValueError('No images loaded for artist pair ({}, {})'.format(artist_A, artist_B))

    pairs = list(itertools.product(audio_A, audio_B))
    # Stack each side into one array explicitly rather than relying on the
    # framework to convert nested Python lists.
    left = np.asarray([img_A for img_A, _ in pairs])
    right = np.asarray([img_B for _, img_B in pairs])

    return np.mean(model.predict([left, right]))

def eval_artist_baseline(artist_A, artist_B):
    """Baseline similarity: negated mean norm of the pairwise image differences.

    Every image of `artist_A` is compared with every image of `artist_B`
    using `np.linalg.norm` of their difference; the mean is negated so that
    closer spectrograms give a larger (less negative) score.
    """
    tracks_A = images[artist_A]
    tracks_B = images[artist_B]

    pair_distances = [
        np.linalg.norm(img_A - img_B)
        for img_A, img_B in itertools.product(tracks_A, tracks_B)
    ]

    return -np.mean(np.array(pair_distances))

def create_similarity_matrix(artists, model, all_artists = None, baseline = False):
    """Build a table of similarity scores between artists.

    Args:
        artists: labels used for the rows of the result.
        model: model passed to `eval_artist_similarity`; unused when
            `baseline` is true.
        all_artists: labels used for the columns; defaults to `artists`.
        baseline: if true, score with `eval_artist_baseline` instead of the model.

    Returns:
        DataFrame indexed by `artists` with one column per entry of
        `all_artists`. Cells where the row and column label are equal stay 0.
    """
    if all_artists is None:
        all_artists = artists
    df = pd.DataFrame(np.zeros((len(artists), len(all_artists))), columns = all_artists, index = artists)
    for artist_A, artist_B in itertools.product(all_artists, artists):
        if artist_A != artist_B:
            if baseline:
                sim = eval_artist_baseline(artist_A, artist_B)
            else:
                sim = eval_artist_similarity(artist_A, artist_B, model)
            # Single .loc assignment (row = artist_B, column = artist_A). The chained
            # form df[artist_A][artist_B] = sim may write to a temporary copy
            # and leave df unchanged.
            df.loc[artist_B, artist_A] = sim
    return df

def compare_accuracy_edit_dist(df1, df2, n=10):
    """Compare two ranking tables column by column on their top `n` rows.

    For every column of `df1`, the first `n` entries are compared with the
    first `n` entries of the same column in `df2` using
    `difflib.SequenceMatcher(...).ratio()`.

    Args:
        df1: ranking table, one column per artist.
        df2: ranking table with at least the same columns as `df1`.
        n: number of top-ranked rows to compare.

    Returns:
        (mean ratio over all columns, dict mapping column -> ratio).
    """
    # Slice once instead of once per column.
    top1 = df1.head(n)
    top2 = df2.head(n)

    distances = dict()
    for column in df1:
        # SequenceMatcher indexes its inputs with integers 0..len-1. A pandas Series
        # would resolve those as index labels, so plain lists are passed to keep the
        # comparison positional whatever the frame's index is.
        distances[column] = difflib.SequenceMatcher(
            None, top1[column].tolist(), top2[column].tolist()).ratio()
    return np.mean(np.array(list(distances.values()))), distances

def compare_accuracy_unordered(df1, df2, n=10):
    """Compare two ranking tables by the overlap of their top `n` entries.

    For every column of `df1`, counts how many distinct values among its
    first `n` entries also occur among the first `n` entries of the same
    column in `df2` (set intersection, order ignored), divided by the number
    of rows compared.

    Returns:
        (mean overlap over all columns, dict mapping column -> overlap).
    """
    head1, head2 = df1.head(n), df2.head(n)

    overlap = {}
    for column in df1:
        shared = set(head1[column].values).intersection(head2[column].values)
        overlap[column] = len(shared) / len(head1[column])

    scores = np.array(list(overlap.values()))
    return np.mean(scores), overlap

## Create train/test datasets relating image references to values

Previously, we had the large raw dataset which is of the form `df[artistA][artistB]=similarity`. Here, we create X and Y arrays, with the X array being a list of two arrays, each containing references to one of the spectrogram images that was loaded for a given artist. The Y array is a list containing the similarity scores for the corresponding two artists.

In [19]:
logger.info('Re-formatting datasets for training')

# Keep only the strictly-lower triangle so each artist pair appears once and the
# diagonal is dropped. The builtin `bool` is used because the `np.bool` alias was
# removed from NumPy.
train_dedup = train_set.where(~np.triu(np.ones(train_set.shape)).astype(bool))
# Explicit dropna(): masked cells must not become NaN targets on pandas versions
# where stack() no longer drops them by default.
train_stacked = train_dedup.stack().dropna()

# Relabel the test rows: the column labels first, followed by the labels that occur
# in only one of columns/index.
# NOTE(review): this renames rows in place without reordering the data, so it assumes
# the rows are already stored in that order — confirm against the dataset builder.
test_set.index = test_set.columns.append(test_set.columns.append(test_set.index).drop_duplicates(keep=False))
test_dedup = test_set.where(~np.triu(np.ones(test_set.shape)).astype(bool))
test_stacked = test_dedup.stack().dropna()

def format_dataset(raw_dataset):
    """Expand a stacked similarity Series into model inputs and targets.

    Args:
        raw_dataset: Series whose index entries are (artist_A, artist_B) pairs
            and whose values are the similarity scores for those pairs.

    Returns:
        (X, Y) where X is a list of two equally long lists of images (one per
        side of the pair) and Y is an array holding the pair's score repeated
        for every image combination of the two artists.
    """
    X_1 = []
    X_2 = []
    Y = []

    # Series.items() replaces iteritems(), which was removed in pandas 2.0.
    for index, value in tqdm_notebook(raw_dataset.items(), total = len(raw_dataset)):
        artist_A = index[0]
        artist_B = index[1]
        audio_A = images[artist_A]
        audio_B = images[artist_B]

        # Every image of A is paired with every image of B; all pairs share the score.
        for pair in itertools.product(audio_A, audio_B):
            X_1.append(pair[0])
            X_2.append(pair[1])
            Y.append(value)

    Y = np.array(Y)
    X = [X_1, X_2]
    return X, Y

# Expand both stacked tables into paired image lists (X) and score arrays (Y).
X_train, Y_train = format_dataset(train_stacked)
X_test, Y_test = format_dataset(test_stacked)

logger.info('Datasets formatted')

Re-formatting datasets for training


HBox(children=(IntProgress(value=0, max=384126), HTML(value='')))




HBox(children=(IntProgress(value=0, max=46504), HTML(value='')))

Datasets formatted





## Define Keras model specification

This defines a siamese network: a single base network with shared weights is applied to both images. For each image, the base network ends in a fully-connected layer with 128 units. The element-wise absolute difference (L1 distance) between the two 128-length outputs is then fed into a final single-unit sigmoid layer.

In [7]:
def L1_distance(x):
    """Element-wise absolute difference of the first two entries of `x`."""
    left, right = x[0], x[1]
    return K.abs(left - right)

def L1_dist_output_shape(shapes):
    """Output shape for the L1-distance layer: the shape of the first input.

    `shapes` must hold exactly two shapes; the second is ignored.
    """
    first_shape, _ = shapes
    return first_shape

def create_base_network(input_shape):
    """Build the shared branch applied to each image of a pair.

    Three padded Conv1D + BatchNormalization stages (the first two followed by
    max pooling, the last by global max pooling) and a 128-unit dense layer.
    The shape comments assume a 256x64 input.
    """
    inputs = Input(shape = input_shape)

    # Layers are created and applied in the order listed.
    stack = [
        ZeroPadding1D(),                    # 258x64
        Conv1D(128,3,activation='relu'),    # 256x128
        BatchNormalization(),
        MaxPooling1D(4),                    # 64x128
        ZeroPadding1D(),                    # 66x128
        Conv1D(128,3,activation='relu'),    # 64x128
        BatchNormalization(),
        MaxPooling1D(),                     # 32x128
        ZeroPadding1D(),                    # 34x128
        Conv1D(128,3,activation='relu'),    # 32x128
        BatchNormalization(),
        GlobalMaxPooling1D(),               # 128
        Dense(128, activation='relu'),      # 128
    ]

    features = inputs
    for layer in stack:
        features = layer(features)

    return Model(inputs, features)
    
def model_spec(lr = 0.001, decay = 0.0, **kwargs):
    """Build and compile the siamese similarity model.

    Args:
        lr: learning rate passed to Adam.
        decay: learning-rate decay passed to Adam.
        **kwargs: ignored; lets callers pass a whole hyperparameter dict that
            also carries keys such as 'batch_size' or 'epochs'.

    Returns:
        A compiled two-input Keras model trained with mean squared error.
    """
    shape = (256,64)

    # One base network whose weights are shared by both branches.
    shared_tower = create_base_network(shape)

    left_in = Input(shape = shape)
    right_in = Input(shape = shape)

    left_vec = shared_tower(left_in)
    right_vec = shared_tower(right_in)

    # Element-wise |left - right| of the two embeddings.
    abs_diff = Lambda(L1_distance, output_shape = L1_dist_output_shape)([left_vec, right_vec])

    similarity = Dense(1, activation='sigmoid')(abs_diff)

    siamese = Model([left_in, right_in], similarity)

    optimizer = Adam(lr=lr, decay=decay)
    siamese.compile(loss = 'mean_squared_error', optimizer = optimizer)

    return siamese

# Builds a model with the default hyperparameters. train_model constructs its own
# fresh model from model_spec, so this instance is not the one that gets trained.
model = model_spec()

## Training functions

Defines a function for training the model and evaluating it, as well as a function for generating batches (to avoid memory issues).

In [8]:
def batch_loader(X_data, Y_data, batch_size):
    """Endlessly yield consecutive (inputs, targets) batches.

    `X_data` holds two parallel sequences and `Y_data` the targets. Each batch
    is `([left_array, right_array], target_array)` covering up to `batch_size`
    consecutive samples; the last batch of a pass may be shorter, after which
    iteration restarts from the first sample.
    """
    start = 0
    while True:
        total = len(Y_data)
        stop = min(start + batch_size, total)
        window = slice(start, stop)

        inputs = [np.asarray(side[window]) for side in (X_data[0], X_data[1])]
        targets = np.asarray(Y_data[window])
        yield (inputs, targets)

        # Wrap around once the end of the data has been reached.
        start = 0 if stop == total else stop

In [9]:
def train_model(hparams, model_spec, log=True, save=True, test_accs = [3, 5, 10, 25]):
    """Train a freshly built model on the module-level train/test arrays.

    Args:
        hparams: dict of hyperparameters. 'batch_size' (default 128) and
            'epochs' (default 25) are read here; the whole dict is also
            forwarded to `model_spec` as keyword arguments.
        model_spec: callable returning a compiled Keras model.
        log: currently unused.
        save: if true, save the final model after training.
        test_accs: currently unused (and never mutated).

    Returns:
        (history dict as recorded by Keras, trained model).

    Side effects:
        Writes per-epoch checkpoints, TensorBoard logs, a CSV of epoch results,
        a training log and a pickled history under models/full.
    """
    # All artefacts of the run go to one fixed directory.
    path = os.path.join('models', 'full')
    os.makedirs(path, exist_ok=True)

    # Dedicated logger named after the hyperparameters, writing into the run directory.
    tmp_logger = make_logger(str(hparams), os.path.join(path, 'training.log'))

    batch_size = hparams.get('batch_size', 128)
    epochs = hparams.get('epochs', 25)
    model = model_spec(**hparams)

    # Tensorboard support
    tensorboard = TensorBoard(log_dir=os.path.join(path, 'tensorboard'))

    # CSV Logging of epoch results
    csvlogger = CSVLogger(filename=os.path.join(path, 'epochs.csv'))

    tmp_logger.info('Training with params {}'.format(hparams))

    # Save the model at every epoch, not only when val_loss improves.
    filename = "model-{epoch:02d}-{val_loss:.3f}.hdf5"
    checkpoint = ModelCheckpoint(os.path.join(path, filename), monitor='val_loss', verbose=0,
                                 save_best_only=False, mode='auto', period=1)

    # Step counts must be whole numbers; np.ceil returns a float, so cast explicitly.
    train_steps = int(np.ceil(len(Y_train) / batch_size))
    val_steps = int(np.ceil(len(Y_test) / batch_size))

    history = model.fit_generator(batch_loader(X_train, Y_train, batch_size), epochs=epochs,
                                  validation_data=batch_loader(X_test, Y_test, batch_size),
                                  steps_per_epoch=train_steps,
                                  validation_steps=val_steps,
                                  callbacks=[checkpoint, tensorboard, csvlogger])

    tmp_logger.info('Finished training, final train loss = {:.5f}, test loss = {:.5f}'.format(
        history.history['loss'][-1], history.history['val_loss'][-1]))

    # The history is always persisted, independent of `save`.
    with open(os.path.join(path, 'history'), 'wb') as file:
        pickle.dump(history.history, file)

    if save:
        tmp_logger.info('Saving model')
        model.save(os.path.join(path, 'post_train_model.hd5'))
        tmp_logger.info('Saved model')

    return history.history, model
        

## Train the model

This is it! Train the model using the optimal parameters we found earlier, i.e. a batch size of 1024 and a learning rate of 0.001. We'll train for as many epochs as possible (up to 25), since we're saving the model at each epoch anyway.

In [None]:
# Hyperparameters for the final run. The whole dict is forwarded to model_spec by
# train_model, which also reads 'batch_size' and 'epochs' from it.
hparams = {
    'batch_size': 1024,
    'epochs': 25, 
    'lr': 0.001
}

logger.info('Training final model')
# Rebinds the notebook-level `model` to the trained model once training returns.
history, model = train_model(hparams, model_spec)

Training final model
Training with params {'batch_size': 1024, 'epochs': 25, 'lr': 0.001}


Epoch 1/25
Epoch 2/25
Epoch 3/25
Epoch 4/25
Epoch 5/25
Epoch 6/25

In [21]:
# Extra copy of the current `model`, written to the working directory.
# NOTE(review): `model` is only rebound to the trained network once the training cell
# returns; if that cell was interrupted, this saves whatever `model` was bound to
# before — confirm which instance is intended.
model.save('post_train_model_x2.hd5')