# Using a Neural Network to Predict Record Producer from Featurized Audio Data

## Background

Spotify's Audio Analysis contains a feature called `timbre`, which captures the qualities of a sound that are not found in its pitch. From Spotify's documentation:

>*Timbre is the quality of a musical note or sound that distinguishes different types of musical instruments, or voices. It is a complex notion also referred to as sound color, texture, or tone quality, and is derived from the shape of a segment’s spectro-temporal surface, independently of pitch and loudness. The timbre feature is a vector that includes 12 unbounded values roughly centered around 0. Those values are high level abstractions of the spectral surface, ordered by degree of importance.*

I believe that a producer's ***Signature Sound*** can be found in these timbre vectors.

I will use `tensorflow.keras` to create a Convolutional Neural Network that will categorically predict record producer from audio snippets.

In [1]:
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Builds the neural network model
"""

# Standard Imports
from matplotlib.pyplot import imread, imshow
import numpy as np
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import datasets, layers, models
import keras.backend as K
from tensorflow.keras.constraints import min_max_norm, non_neg
# import kernels
# from artist import CustomImage, ImageBundle
import pickle
import sys
import os
import pandas as pd

Using TensorFlow backend.


In [2]:
# Load MongoDB

from pymongo import MongoClient
# MongoClient() is called with no arguments, so pymongo's default connection settings are used
client = MongoClient()
# Access/Initiate Database
db = client['producer_db']
# Access/Initiate Table
tab = db['songs']
# NOTE(review): attribute access `db.tab` selects a collection literally named
# "tab"; it does not use the `tab` variable (the 'songs' collection) assigned
# on the previous line. Every later cell queries `collection`, so confirm which
# collection actually holds the song documents.
collection = db.tab

# Authorize Spotify API

import spotipy
from spotipy.oauth2 import SpotifyClientCredentials

# Credentials are read from environment variables; a missing variable raises KeyError
client_id = os.environ['SPOTIFY_CLIENT_ID']
client_secret = os.environ['SPOTIFY_CLIENT_SECRET']
client_credentials_manager = SpotifyClientCredentials(client_id=client_id, client_secret=client_secret)
# `sp` is the authenticated Spotify client object
sp = spotipy.Spotify(client_credentials_manager=client_credentials_manager)

In [3]:
# Baseline classifier: flatten the (12, 80, 1) timbre "image", pass it through
# one hidden layer and emit a softmax over the 7 producers.
# Commented-out entries are alternative layers tried during experimentation;
# uncomment them in place to rebuild those variants.
model = models.Sequential([
    # Optional convolutional front end (disabled):
    # layers.Conv2D(32, (3, 3), activation='relu', input_shape=(12, 80, 1)),
    # layers.MaxPooling2D((2, 2)),
    # layers.Conv2D(64, (3, 3), activation='relu'),
    # layers.MaxPooling2D((2, 2)),
    # layers.Conv2D(64, (3, 3), activation='relu'),

    layers.Flatten(input_shape=(12, 80, 1)),
    layers.Dense(64, activation='relu'),
    # Optional extra hidden layers (disabled):
    # layers.Dense(32, activation='relu'),
    # layers.Dense(12, activation='relu'),
    layers.Dense(7, activation='softmax'),
])

# For reference, the template this cell started from used input_shape=(28, 28, 1),
# three Conv2D layers (32, 64, 64 filters) with 2x2 max pooling, then
# Flatten -> Dense(64, relu) -> Dense(10, softmax).

Instructions for updating:
Colocations handled automatically by placer.


In [14]:
# Toy example: how np.concatenate and np.stack differ in the shapes they produce.
a = np.array([[value] * 3 for value in (1, 2, 3)])

b = np.concatenate((a, a, a), axis=0)  # grows along the rows
c = np.concatenate((a, a, a), axis=1)  # grows along the columns
d = np.stack((a, a, a), axis=2)        # adds a new trailing axis

a.shape, b.shape, c.shape, d.shape

((3, 3), (9, 3), (3, 9), (3, 3, 3))

In [18]:
# List the distinct values of the 'producer' field, i.e. the class labels
collection.distinct('producer')

['George Martin',
 'Dr. Dre',
 'Rick Rubin',
 'Brian Eno',
 'Stock Aitken Waterman',
 'Paul Epworth',
 'Pete Rock']

In [171]:
from bson.son import SON
# Aggregation: count the songs stored for each producer.
pipeline = [
    # Emit one document per producer value
    {"$unwind": "$producer"},
    # Group by producer and count the documents in each group
    {"$group": {"_id": "$producer", "count": {"$sum": 1}}},
    # Sort by count, then by producer name, both descending; SON keeps the key order
    {"$sort": SON([("count", -1), ("_id", -1)])}
]
import pprint
pprint.pprint(list(collection.aggregate(pipeline)))

[{'_id': 'Rick Rubin', 'count': 2039},
 {'_id': 'Dr. Dre', 'count': 1498},
 {'_id': 'George Martin', 'count': 1420},
 {'_id': 'Pete Rock', 'count': 1252},
 {'_id': 'Brian Eno', 'count': 924},
 {'_id': 'Paul Epworth', 'count': 478},
 {'_id': 'Stock Aitken Waterman', 'count': 436}]


In [173]:
# Total song count, and the share held by the largest class (2039 songs):
# the accuracy obtained by always guessing the majority producer.
# The numbers are copied by hand from the aggregation output above.
(2039+1498+1420+1252+924+478+436), 2039/(2039+1498+1420+1252+924+478+436)

(8047, 0.25338635516341496)

### Create timbre vectors and a target vector with 200 songs from each producer. Create test vectors with 100 songs from each producer.

In [80]:
def _song_timbre_vector(song, n_segments):
    """Return the flat timbre vector of a song's first ``n_segments`` segments.

    The timbre lists of the segments are concatenated in order, so the result
    has ``n_segments * len(timbre)`` values. Returns ``None`` when the song has
    fewer than ``n_segments`` segments.
    """
    segments = song['audio_analysis']['segments']
    if len(segments) < n_segments:
        return None
    return np.concatenate(
        [segment['timbre'] for segment in segments[:n_segments]], axis=0)


def make_timbre_train_test(collection, train_size, test_size):
    """
    THIS FUNCTION IS BAD BECAUSE IT DOES NOT RANDOMLY SELECT SAMPLES

    Build per-producer train/test arrays from the first 80 timbre segments of
    each song. For every producer, songs are taken in cursor order: the first
    ``train_size`` usable songs go to the training set and the next
    ``test_size`` usable songs go to the test set.

    Songs with fewer than 80 segments are skipped. (Previously such a song
    re-appended the timbre vector of the preceding song under its own label.)
    Songs whose document raises while being read are reported and skipped.

    Inputs:
        collection: Mongo DB collection (needs ``distinct`` and ``find``)
        train_size: maximum number of training songs per producer
        test_size: maximum number of test songs per producer

    Outputs:
        (timbre_train, timbre_test, target_train, target_test) as numpy arrays;
        each timbre row is the flat, segment-by-segment timbre vector.
    """
    n_segments = 80  # songs must have at least 80 segments to be in the analysis

    producers = collection.distinct('producer')
    print(producers)
    timbre_train = []
    timbre_test = []
    target_train = []
    target_test = []
    for producer in producers:
        print('Producer: {}'.format(producer))
        train_count = 0
        test_count = 0
        for song in collection.find({'producer': producer}):
            # Both quotas are full: nothing more to collect for this producer
            if train_count >= train_size and test_count >= test_size:
                break

            try:
                song_timbre_vector = _song_timbre_vector(song, n_segments)
                song_producer = song['producer']
            except Exception as ex:
                # Best effort: report malformed documents and keep going
                print(ex)
                continue

            if song_timbre_vector is None:
                continue

            # Fill the training quota first, then the test quota
            if train_count < train_size:
                timbre_train.append(song_timbre_vector)
                target_train.append(song_producer)
                train_count += 1
            else:
                timbre_test.append(song_timbre_vector)
                target_test.append(song_producer)
                test_count += 1

    timbre_train = np.array(timbre_train)
    timbre_test = np.array(timbre_test)
    target_train = np.array(target_train)
    target_test = np.array(target_test)
    return (timbre_train, timbre_test, target_train, target_test)
            

In [81]:
# Build the ordered (non-random) split: up to 200 training and 100 test songs per producer
timbre_train, timbre_test, target_train, target_test = make_timbre_train_test(collection, 200, 100)

['George Martin', 'Dr. Dre', 'Rick Rubin', 'Brian Eno', 'Stock Aitken Waterman', 'Paul Epworth', 'Pete Rock']
Producer: George Martin
Producer: Dr. Dre
Producer: Rick Rubin
Producer: Brian Eno
Producer: Stock Aitken Waterman
Producer: Paul Epworth
Producer: Pete Rock


In [82]:
# Sanity-check the shapes: each feature row is a flat vector of 80 segments x 12 timbre values = 960
timbre_train.shape, timbre_test.shape, target_train.shape, target_test.shape

((1400, 960), (700, 960), (1400,), (700,))

In [149]:
# Print the layer-by-layer architecture of whichever `model` is currently bound.
# NOTE(review): the saved output lists Conv2D/MaxPooling2D layers, so it was produced
# by a convolutional variant rather than the Flatten -> Dense model defined earlier.
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
conv2d_11 (Conv2D)           (None, 10, 78, 32)        320       
_________________________________________________________________
max_pooling2d_8 (MaxPooling2 (None, 5, 39, 32)         0         
_________________________________________________________________
conv2d_12 (Conv2D)           (None, 3, 37, 64)         18496     
_________________________________________________________________
max_pooling2d_9 (MaxPooling2 (None, 1, 18, 64)         0         
_________________________________________________________________
flatten_19 (Flatten)         (None, 1152)              0         
_________________________________________________________________
dense_42 (Dense)             (None, 64)                73792     
_________________________________________________________________
dense_43 (Dense)             (None, 7)                 455       
Total para

In [84]:
# One-hot encode the training targets once, so the label matrix and the
# column-to-producer mapping are guaranteed to come from the same frame.
train_dummies = pd.get_dummies(target_train)
train_labels = train_dummies.values       # (n_samples, n_producers) one-hot matrix
producer_vector = train_dummies.columns   # producer name for each one-hot column

producer_vector

Index(['Brian Eno', 'Dr. Dre', 'George Martin', 'Paul Epworth', 'Pete Rock',
       'Rick Rubin', 'Stock Aitken Waterman'],
      dtype='object')

In [140]:
# Check the 4-D shape fed to the network. -1 lets numpy infer the sample count,
# matching the reshape used in the fit/evaluate cells, instead of assuming 1400 songs.
timbre_train.reshape(-1, 12, 80, 1).shape

(1400, 12, 80, 1)

In [153]:
# Adam optimizer with categorical cross-entropy, which matches the one-hot targets
# and the softmax output layer; accuracy is reported during training.
model.compile(optimizer='adam',
              loss='categorical_crossentropy',
              metrics=['accuracy'])

# NOTE(review): each row of timbre_train is built by concatenating 80 segments of
# timbre values in segment order, so a plain reshape to (12, 80) does not place one
# timbre coefficient per row. Any change here must be mirrored in the evaluate cell.
model.fit(timbre_train.reshape(-1,12,80,1), train_labels, epochs=20)

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


<tensorflow.python.keras.callbacks.History at 0xb625c1550>

In [154]:
# One-hot encode the test targets against the training columns (`producer_vector`).
# Encoding them independently would shift or drop columns whenever a producer is
# missing from the test set, silently misaligning labels with the model outputs.
test_labels = pd.get_dummies(target_test).reindex(columns=producer_vector, fill_value=0).values
test_loss, test_acc = model.evaluate(timbre_test.reshape(-1,12,80,1), test_labels)



In [None]:
# Define a simple triangular kernel and kernel constraints
# NOTE: `kernels` is a project-local module whose import is commented out in the
# imports cell, so evaluating kernels.triangle_5() here raised NameError. The
# initializer is only referenced by the commented-out `kernel_initializer`
# argument below; re-enable both together with `import kernels`.
# kernel_tri = tf.constant_initializer(kernels.triangle_5())
kernel_const = min_max_norm(0.001, None, rate=1, axis=0)
kernel_nonneg = non_neg()

# NOTE(review): this cell rebinds the notebook-level name `model` to an image
# model (512x512x1 input, 5x5 output) unrelated to the timbre classifier.

# TensorFlow expects 4D tensors of shape (samples, rows, cols, channels)
# Note that the first index (the sample index out of the batch) is stripped
model = keras.Sequential([
        # Maxpool the image
        keras.layers.MaxPool2D(
            input_shape=(512, 512, 1),
            pool_size=2,
            padding='same',
            data_format='channels_last'),

        # Convolve the pooled image by the shape kernel(s)
        # ??? Use LocallyConnected2D instead?
        keras.layers.Conv2D(
            filters=5,
            kernel_size=(8, 8),
            strides=(8, 8),
            padding='same',
            data_format='channels_last',
            activation='sigmoid',
            use_bias=True),
            # ??? kernel_initializer=kernel_tri,
            # kernel_constraint=kernel_nonneg),
        # Second strided convolution with the same settings
        keras.layers.Conv2D(
            filters=5,
            kernel_size=(8, 8),
            strides=(8, 8),
            padding='same',
            data_format='channels_last',
            activation='sigmoid',
            use_bias=True),
        # Flatten
        keras.layers.Flatten(),

        # Basic Dense layer
        keras.layers.Dense(
            units=25,
            activation=None,
            # kernel_constraint=kernel_nonneg,
            use_bias=True),

        # Activation layer
        keras.layers.PReLU(),

        # Reshape & output: the 25 dense units become a 5x5 grid
        keras.layers.Reshape((5, 5))
        ])

# Define optimizer
optimizer = keras.optimizers.Adadelta()

# Compile the model
model.compile(
    optimizer=optimizer,
    loss='mean_squared_error',
    metrics=['mean_squared_error'])


if (__name__ == '__main__'):
    # NOTE(review): while assertions are enabled this check fires before the
    # try block, so the IndexError fallback below is only reachable when
    # Python runs with assertions disabled (-O).
    assert len(sys.argv) == 3, 'Pass me both the training and save filepaths!'
    # XXX Testing constants - Remove
    try:
        TRAINING_SET = sys.argv[1]
        SAVE_PATH = sys.argv[2]
    except IndexError:
        print('Pass me both the training set and save filepaths!')
        TRAINING_SET = '../data/train_set_01.pkl'
        SAVE_PATH = '../models/saved_model_01.h5'
#        sys.exit()

    # Load the training set from the pickled ImageBundle
    # The context manager closes the file handle (it was previously left open).
    # SECURITY: pickle.load executes arbitrary code from the file; only load
    # training sets from a trusted source.
    with open(TRAINING_SET, 'rb') as training_file:
        train_bundle = pickle.load(training_file)
    train_X = train_bundle.images
    train_y = train_bundle.tri_list

    # IN: (samples, rows, cols, channels)
    IN_SHAPE = train_X.shape
    # OUT: (samples, shape_idx, shape_attrs, channels)
    OUT_SHAPE = train_y.shape
    # Initialize the training set

    # Fit the model to the training ImageBundle
    # Only channel 0 of the targets is used
    model.fit(
        train_X,
        train_y[:, :, :, 0],
        epochs=50,
        verbose=1,
        batch_size=5)

    # Write model config to YAML
    # NOTE(review): Model.to_yaml() is not available in newer TensorFlow
    # releases - confirm against the installed version.
    model_yaml = model.to_yaml()
    with open('../models/model_config.yaml', 'w') as yaml_file:
        yaml_file.write(model_yaml)

    # Save model
    model.save(SAVE_PATH, overwrite=True, include_optimizer=True)
    print('\nModel saved at: %s' % SAVE_PATH)

# Some notes on Models:

### Model 1: Flatten -> 64 -> 7:

    Made a pretty good model. Train accuracy ~65%. Test accuracy 35%
    
### Model 2: Flatten -> 64 -> 32 -> 7

    Crazy overfitting. Training accuracy ~90% and highly variable. Test accuracy ~30%
    
### Model 3: Flatten -> 64 -> 32 -> 12 -> 7

    Training accuracy stays around 15-20% for some reason, about the same as randomly guessing among 7 producers. Test ~15%. A non-model.
    
### Model 4: Flatten -> 32 -> 7

    Train: 88%, Test: 38%  THIS ONE IS GOOD
    
### Model 5: Flatten -> 12 -> 7

    Train and Test: 15%
    
### Model 6: Flatten -> 32 -> 32 -> 7

    Overfit: Train 84%, Test 27%

### Simple Model: Flatten -> 7

    Train 57% Test 28%

In [4]:
from sklearn.model_selection import train_test_split

In [5]:
def better_timbre_train_test(collection, test_size = 0.25, random_state = 440):
    """
    Build a randomly shuffled train/test split of timbre "images" and one-hot
    producer labels from every usable song in the collection.

    Each song contributes the timbre vectors of its first 80 segments, arranged
    as a (12, 80, 1) array: one row per timbre coefficient, one column per
    segment. Songs with fewer than 80 segments, with missing fields, or with
    malformed timbre data are skipped.

    Changes from the earlier version:
      * a short song no longer re-appends the previous song's vector under its
        own label;
      * the per-song data is transposed to (12, 80) rather than reshaping the
        segment-by-segment flat vector, which mixed several segments into
        every row.

    Inputs:
        collection: Mongo DB collection
        test_size: between 0 and 1, fraction of data in test set.
        random_state: random state for sklearn train_test_split

    Outputs:
        X_train, X_test, y_train, y_test, y_columns
        y_columns is the labels associated with the columns of y
    """
    n_segments = 80  # songs must have at least 80 segments to be in the analysis
    n_timbre = 12    # timbre values per segment

    X = []
    y = []

    for song in collection.find():
        try:
            segments = song['audio_analysis']['segments']
            if len(segments) < n_segments:
                continue
            # (n_segments, n_timbre): one row per segment
            song_timbre = np.array(
                [segment['timbre'] for segment in segments[:n_segments]],
                dtype=float)
            producer = song['producer']
        except (KeyError, TypeError, ValueError):
            # Missing fields or malformed timbre data: leave this song out
            continue

        if song_timbre.shape != (n_segments, n_timbre):
            continue

        # Transpose so rows are timbre coefficients and columns are time steps
        X.append(song_timbre.T)
        y.append(producer)

    X = np.array(X).reshape(-1, n_timbre, n_segments, 1)
    y = np.array(y)

    # Encode once so the label matrix and its column names always agree
    y_dummies = pd.get_dummies(y)
    y_one_hot = y_dummies.values
    y_columns = y_dummies.columns

    X_train, X_test, y_train, y_test = train_test_split(X, y_one_hot, test_size = test_size, random_state = random_state)

    return X_train, X_test, y_train, y_test, y_columns

In [6]:
# Build the randomly shuffled split with 30% of the usable songs held out for testing
X_train, X_test, y_train, y_test, y_columns = better_timbre_train_test(collection, test_size = 0.3, random_state = 440 )

In [9]:
# Convolutional classifier over the (12, 80, 1) timbre "image".
# The output layer is sized from the one-hot label columns: the previous
# hard-coded Dense(10) did not match the 7-column targets, which
# categorical_crossentropy rejects.
n_classes = len(y_columns)

model = models.Sequential()

# Two conv + max-pool stages extract local timbre/time patterns
model.add(layers.Conv2D(32, (3, 3), activation='relu', input_shape=(12, 80, 1)))
model.add(layers.MaxPooling2D((2, 2)))
model.add(layers.Conv2D(64, (3, 3), activation='relu'))
model.add(layers.MaxPooling2D((2, 2)))
# model.add(layers.Conv2D(64, (3, 3), activation='relu'))

# input_shape is only meaningful on the first layer, so Flatten takes none here
model.add(layers.Flatten())
model.add(layers.Dense(64, activation='relu'))
model.add(layers.Dense(32, activation='relu'))
# model.add(layers.Dense(12, activation='relu'))
model.add(layers.Dense(n_classes, activation='softmax'))

In [10]:
# Adam optimizer with categorical cross-entropy, which matches the one-hot targets
# and the softmax output layer; accuracy is reported during training.
model.compile(optimizer='adam',
              loss='categorical_crossentropy',
              metrics=['accuracy'])

# 20 passes over the training split; no validation data is passed, so the
# training log reports training metrics only.
model.fit(X_train, y_train, epochs=20)

Instructions for updating:
Use tf.cast instead.
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


<tensorflow.python.keras.callbacks.History at 0x1a3fd81e80>

In [11]:
# Score the trained model on the held-out split: loss first, then the compiled accuracy metric
test_loss, test_acc = model.evaluate(X_test, y_test)



With the full dataset so far (~8000 songs, 7 producers), we get:

Flatten - 64 - 7:
    Train: 41%, Test: 36%
    
Flatten - 32 - 7:
    Train: 38%, Test: 32%
    
Hella convolutions:
    Train: 73%, Test: 34%
    
    
Note: Currently guessing majority class yields 25% accuracy.

In [177]:
# Softmax outputs (one probability per producer column) for the first five test samples
model.predict(X_test)[:5]

array([[0.0000000e+00, 0.0000000e+00, 0.0000000e+00, 0.0000000e+00,
        0.0000000e+00, 1.0000000e+00, 0.0000000e+00],
       [0.0000000e+00, 2.4521757e-24, 0.0000000e+00, 0.0000000e+00,
        1.0000000e+00, 3.4583623e-15, 0.0000000e+00],
       [0.0000000e+00, 0.0000000e+00, 0.0000000e+00, 0.0000000e+00,
        0.0000000e+00, 1.0000000e+00, 0.0000000e+00],
       [0.0000000e+00, 0.0000000e+00, 1.0000000e+00, 0.0000000e+00,
        0.0000000e+00, 0.0000000e+00, 0.0000000e+00],
       [0.0000000e+00, 0.0000000e+00, 0.0000000e+00, 0.0000000e+00,
        1.0000000e+00, 0.0000000e+00, 0.0000000e+00]], dtype=float32)