# Building Machine Learning Systems with Python - Chapter 11

In this chapter we will create a music genre classifier. While the ML algorithm itself (logistic regression) is nothing fancy by now, we will look into fancy features like Fast Fourier Transforms and Mel Frequency Cepstral Coefficients, use P/R and ROC curves to analyze what works best and then figure out which version to use.

This code is supporting material for the book `Building Machine Learning Systems with Python` by [Willi Richert](https://www.linkedin.com/in/willirichert/), [Luis Pedro Coelho](https://www.linkedin.com/in/luispedrocoelho/) and [Matthieu Brucher](https://www.linkedin.com/in/matthieubrucher/) published by PACKT Publishing. It is made available under the MIT License.

All code examples use Python in version...

In [None]:
import sys
sys.version

## Utilities we will need

In [None]:
import os
import glob 
from pathlib import Path

# Folder that receives the generated chart images (see save_png).
CHART_DIR = "charts"
# mkdir(exist_ok=True) is idempotent, so re-running the cell never fails and
# there is no gap between an "exists" check and the creation.
Path(CHART_DIR).mkdir(parents=True, exist_ok=True)

# Folder that receives the downloaded archive and the extracted audio files.
DATA_DIR = "data"
Path(DATA_DIR).mkdir(parents=True, exist_ok=True)

# Folder below DATA_DIR that is searched for the per-genre audio files.
GENRE_DIR = Path(DATA_DIR) / 'genres'

%matplotlib inline
import matplotlib.pyplot as plt
# Matplotlib 3.6 renamed the bundled seaborn styles to 'seaborn-v0_8*' and newer
# releases no longer accept the old name, so use the new name when it is listed
# and fall back to the historic one on older installations.
plt.style.use('seaborn-v0_8' if 'seaborn-v0_8' in plt.style.available else 'seaborn')

import numpy as np
import scipy

DPI = 100

import collections
import csv

def save_png(name):
    """Write the current matplotlib figure to ``CHART_DIR`` as a PNG file.

    The file name is ``B09124_11_<name>.png``; the fixed prefix only exists to
    help the publisher and can be ignored.
    """
    target = Path(CHART_DIR) / ('B09124_11_%s.png' % name)
    plt.savefig(str(target), bbox_inches="tight")
    
    
def plot_pr(auc_score, name, precision, recall, label=None, plot_nr=None):
    """Plot a precision/recall curve and store it via ``save_png``.

    auc_score -- area under the curve, shown in the title
    name      -- description used for the file name (spaces become underscores)
    precision, recall -- the curve's y and x values
    label     -- text appended to the title
    plot_nr   -- number used as prefix of the chart's file name
    """
    plt.figure(num=None, figsize=(5, 4), dpi=DPI)

    # shaded area first, then the curve itself on top
    plt.fill_between(recall, precision, alpha=0.5)
    plt.plot(recall, precision, lw=1)

    plt.grid(True)
    plt.xlabel('Recall')
    plt.ylabel('Precision')
    plt.xlim([0.0, 1.0])
    plt.ylim([0.0, 1.0])

    title_text = 'P/R curve (AUC=%0.2f) / %s' % (auc_score, label)
    plt.title(title_text)

    safe_name = name.replace(" ", "_")
    save_png("%s_pr_%s" % (plot_nr, safe_name))
    
def plot_roc(auc_score, name, tpr, fpr, label=None, plot_nr=None):
    """Plot a ROC curve and store it via ``save_png``.

    auc_score -- area under the ROC curve, shown in the title
    name      -- description used for the file name (spaces become
                 underscores, matching ``plot_pr``)
    tpr, fpr  -- true and false positive rates of the curve
    label     -- text appended to the title and used as legend entry
    plot_nr   -- number used as prefix of the chart's file name
    """
    plt.figure(num=None, figsize=(5, 4), dpi=DPI)
    plt.grid(True)
    # dashed diagonal as a visual reference line
    plt.plot([0, 1], [0, 1], 'k--')
    plt.plot(fpr, tpr, label=label)
    plt.fill_between(fpr, tpr, alpha=0.5)
    plt.xlim([0.0, 1.0])
    plt.ylim([0.0, 1.0])
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.title('ROC curve (AUC = %0.2f) / %s' % (auc_score, label), verticalalignment="bottom")
    # a legend without any labelled artist only produces a warning
    if label is not None:
        plt.legend(loc="lower right")
    filename = name.replace(" ", "_")
    # '%s' instead of '%i' so that the default plot_nr=None does not raise
    save_png('%s_auc_%s' % (plot_nr, filename))

## Preparing the data

In [None]:
import urllib.request

# Fetch the genre archive and store it as DATA_DIR/genres.tar.gz.
genre_fn = 'http://opihi.cs.uvic.ca/sound/genres.tar.gz'
archive_path = Path(DATA_DIR) / 'genres.tar.gz'
urllib.request.urlretrieve(genre_fn, archive_path)

In [None]:
import tarfile

# Unpack the downloaded archive inside DATA_DIR. Passing the target folder to
# extractall() avoids changing the working directory of the whole process, and
# the context manager closes the archive even if extraction fails.
# NOTE: the archive was fetched over plain HTTP; extractall() trusts the member
# paths stored in the archive, so only run this on an archive you trust.
with tarfile.open(Path(DATA_DIR) / 'genres.tar.gz', 'r:gz') as archive:
    archive.extractall(path=DATA_DIR)

Download and install SoX from https://sourceforge.net/projects/sox/files/sox/. For this notebook, we use SoX 14.4.2 to convert the downloaded genre files from `.au` into `.wav` format, which is easier to handle.

In [None]:
# Location of a SoX installation on Windows; adapt it to your machine if needed.
SOX_PATH = r'C:\Program Files (x86)\sox-14-4-2'
SOX = SOX_PATH + r'\sox.exe'
# This second assignment overrides the Windows path above, so the `sox`
# executable found on the PATH is used. Remove it to use SOX_PATH instead.
SOX = "sox"

In [None]:
import subprocess

# Convert every .au file below GENRE_DIR into a .wav file next to it.
# The arguments are passed as a list, so paths that contain spaces work and
# no shell is involved.
for au_fn in Path(GENRE_DIR).glob('**/*.au'):
    print(au_fn)
    result = subprocess.run([SOX, str(au_fn), str(au_fn.with_suffix('.wav'))])
    if result.returncode != 0:
        # keep converting the remaining files, but make the failure visible
        print('sox failed for %s (exit code %i)' % (au_fn, result.returncode))

# Looking at music

In [None]:
import scipy.io.wavfile

from matplotlib.ticker import EngFormatter

def plot_specgram(ax, fn):
    """Draw the spectrogram of the wav file ``fn`` onto the axes ``ax``.

    The x axis is always mapped to the range 0..30, independent of the actual
    length of the file.
    """
    rate, samples = scipy.io.wavfile.read(fn)
    specgram_options = dict(Fs=rate, xextent=(0, 30), cmap='hot')
    ax.specgram(samples, **specgram_options)

# Genres used throughout the notebook. The position of a genre in this list is
# its integer class label (the read_* functions enumerate the list).
GENRES = ["classical", "jazz", "country", "pop", "rock", "metal"]

def plot_specgrams():
    """
    Plot a bunch of spectrograms of wav files in different genres

    One row per entry of GENRES and up to three files per row. The files of a
    genre are taken in sorted order so that the chart is reproducible. The
    figure is stored via save_png.
    """
    num_files = 3
    # No plt.clf() up front: without a current figure it would only create an
    # additional, empty figure.
    _fig, axes = plt.subplots(len(GENRES), num_files, dpi=DPI, figsize=(6, 8))

    for genre_idx, genre in enumerate(GENRES):
        wav_files = sorted((Path(GENRE_DIR) / genre).glob('*.wav'))[:num_files]
        for idx, fn in enumerate(wav_files):
            axis = axes[genre_idx, idx]
            axis.tick_params(direction='out', length=0, width=1, labelsize=5)

            axis.yaxis.set_major_formatter(EngFormatter())
            axis.set_title("%s song %i" % (genre, idx + 1), fontsize=7)
            plot_specgram(axis, fn)

    plt.subplots_adjust(hspace=0.5)
    save_png("5_Spectrogram_Genres")
    
plot_specgrams()

# Playing with waves

In [None]:
# Let SoX synthesize a 0.2 second, 400 Hz sine wave (22050 Hz sample rate) into sine_a.wav.
!"{SOX}" --null -r 22050 sine_a.wav synth 0.2 sine 400

In [None]:
# Same as before, but a 3000 Hz sine wave written to sine_b.wav.
!"{SOX}" --null -r 22050 sine_b.wav synth 0.2 sine 3000

In [None]:
# Mix both files into sine_mix.wav: sine_b.wav at volume 1, sine_a.wav at volume 0.5.
!"{SOX}" --combine mix --volume 1 sine_b.wav --volume 0.5 sine_a.wav sine_mix.wav

Now we have three files `sine_a.wav`, `sine_b.wav`, `sine_mix.wav` in the current directory, which we can visualize.

In [None]:
import scipy

def plot_wav_fft(wav_filename, desc=None, plot=0):
    """Plot the start of a wav file and the magnitude of its FFT.

    wav_filename -- path of the wav file to read
    desc         -- optional description used in the titles; the file name is
                    used when it is missing or blank
    plot         -- number used as prefix of the chart's file name

    The upper panel shows at most the first 200 samples over time, the lower
    panel the FFT magnitude between 0 and 5000 Hz. The figure is stored via
    save_png and then shown.
    """
    sample_rate, X = scipy.io.wavfile.read(wav_filename)
    spectrum = np.fft.fft(X)
    freq = np.fft.fftfreq(len(X), 1.0 / sample_rate)

    # No plt.clf() before this call: it would only leave an extra empty figure.
    plt.figure(num=None, figsize=(6, 4), dpi=DPI)

    plt.subplot(211)
    num_samples = 200
    # slice first, so that x and y have equal length even for very short files
    head = X[:num_samples]
    plt.xlim(0, num_samples / sample_rate)
    plt.xlabel("time [s]")
    plt.title(desc or str(wav_filename))
    plt.plot(np.arange(len(head)) / sample_rate, head)
    plt.grid(True)

    plt.subplot(212)
    plt.xlim(0, 5000)
    plt.xlabel("frequency [Hz]")
    plt.xticks(np.arange(5) * 1000)
    # strip before testing, so that a blank description cannot raise IndexError
    stripped = desc.strip() if desc else ""
    if stripped:
        fft_desc = stripped[0].lower() + stripped[1:]
    else:
        fft_desc = wav_filename
    plt.title("FFT of %s" % fft_desc)
    plt.plot(freq, abs(spectrum), linewidth=2)
    plt.grid(True)

    plt.tight_layout()

    rel_filename = os.path.split(wav_filename)[1]
    save_png("%i_%s_wav_fft" % (plot, os.path.splitext(rel_filename)[0]))

    plt.show()


plot_wav_fft("sine_a.wav", "400Hz sine wave", 1)
plot_wav_fft("sine_b.wav", "3,000Hz sine wave", 2)
plot_wav_fft("sine_mix.wav", "Mixed sine wave", 3)

A "real" music file looks a bit noisier:

In [None]:
plot_wav_fft(Path(GENRE_DIR) / 'disco' / 'disco.00000.wav', "some sample song", 4)

# First classifier using FFT

## Generating FFT features

In [None]:
def create_fft(fn, num_components=1000):
    """Store the magnitudes of the first FFT components of a wav file.

    fn             -- path of the wav file
    num_components -- number of leading FFT components to keep (read_fft only
                      uses the first 1000)

    The features are written next to the wav file with the suffix
    ``.fft.npy`` (np.save appends ``.npy`` to the ``.fft`` name).
    """
    sample_rate, X = scipy.io.wavfile.read(fn)

    # np.fft.fft instead of scipy.fft(...): in current SciPy releases
    # `scipy.fft` is a module and cannot be called.
    fft_features = np.abs(np.fft.fft(X)[:num_components])
    np.save(Path(fn).with_suffix('.fft'), fft_features)
    
# Write an FFT feature file next to every wav file found below GENRE_DIR.
all_wav_files = Path(GENRE_DIR).glob('**/*.wav')
for wav_fn in all_wav_files:
    print('Converting %s ...' % wav_fn)
    create_fft(wav_fn)

In [None]:
def read_fft(genre_list, base_dir=GENRE_DIR):
    """Load the stored FFT features of all given genres.

    Every ``*.fft.npy`` file in ``base_dir/<genre>`` contributes one row with
    its first 1000 values; the row's label is the index of the genre in
    ``genre_list``.

    Returns the tuple (features, labels) as numpy arrays.
    """
    features = []
    labels = []
    root = Path(base_dir)

    for class_idx, genre_name in enumerate(genre_list):
        for npy_path in (root / genre_name).glob("*.fft.npy"):
            features.append(np.load(npy_path)[:1000])
            labels.append(class_idx)

    return np.array(features), np.array(labels)

## Training the FFT-based classifier

Creating the model...

In [None]:
# Import from the public package: `sklearn.linear_model.logistic` is a private
# module that newer scikit-learn releases no longer provide.
from sklearn.linear_model import LogisticRegression

def create_model():
    """Return a new, unfitted LogisticRegression with default parameters."""
    return LogisticRegression()

In [None]:
from collections import defaultdict

from sklearn.metrics import precision_recall_curve, roc_curve, confusion_matrix
from sklearn.metrics import auc
from sklearn.model_selection import ShuffleSplit

def train_model(clf_factory, X, Y, name, plot=False):
    """Train and evaluate a classifier on one shuffled 70/30 split.

    clf_factory -- callable without arguments returning a new classifier that
                   offers fit, score, predict and predict_proba
    X, Y        -- feature matrix and integer labels; a label is used both as
                   column index into predict_proba's result and as index into
                   GENRES, so labels are expected to be 0..n-1
    name        -- description used in the chart names
    plot        -- falsy for no charts; otherwise the number of the first
                   chart: the P/R charts get plot+idx and the ROC charts
                   plot+len(labels)+idx

    Prints mean and standard deviation of the test accuracy and of the
    per-label P/R AUC scores, tab separated.

    Returns (mean train error, mean test error, array of confusion matrices).
    """
    labels = np.unique(Y)

    cv = ShuffleSplit(n_splits=1, test_size=0.3, random_state=0)

    train_errors = []
    test_errors = []
    scores = []
    cms = []

    # per-label results; every list receives one entry per split
    pr_scores = defaultdict(list)
    precisions = defaultdict(list)
    recalls = defaultdict(list)
    roc_scores = defaultdict(list)
    tprs = defaultdict(list)
    fprs = defaultdict(list)

    for train, test in cv.split(X, Y):
        X_train, y_train = X[train], Y[train]
        X_test, y_test = X[test], Y[test]

        clf = clf_factory()
        clf.fit(X_train, y_train)

        train_score = clf.score(X_train, y_train)
        test_score = clf.score(X_test, y_test)
        scores.append(test_score)

        train_errors.append(1 - train_score)
        test_errors.append(1 - test_score)

        y_pred = clf.predict(X_test)
        # fixing the label set keeps the matrix shape identical for every
        # split, even if a class is missing from a test set
        cms.append(confusion_matrix(y_test, y_pred, labels=labels))

        # the probabilities do not depend on the label, so compute them once
        proba = clf.predict_proba(X_test)

        for label in labels:
            # one-vs-rest view: 1 for the current label, 0 for all others
            y_label_test = np.asarray(y_test == label, dtype=int)
            proba_label = proba[:, label]

            precision, recall, _ = precision_recall_curve(
                y_label_test, proba_label)
            pr_scores[label].append(auc(recall, precision))
            precisions[label].append(precision)
            recalls[label].append(recall)

            fpr, tpr, _ = roc_curve(y_label_test, proba_label)
            roc_scores[label].append(auc(fpr, tpr))
            tprs[label].append(tpr)
            fprs[label].append(fpr)

    if plot:
        for idx, label in enumerate(labels):
            print("Plotting %s" % GENRES[label])
            # chart the split whose ROC AUC is the median over all splits
            scores_to_sort = roc_scores[label]
            median = np.argsort(scores_to_sort)[len(scores_to_sort) // 2]

            desc = "%s %s" % (name, GENRES[label])
            plot_pr(pr_scores[label][median], desc, precisions[label][median],
                    recalls[label][median], label='%s vs rest' % GENRES[label], plot_nr=plot+idx)
            plot_roc(roc_scores[label][median], desc, tprs[label][median],
                     fprs[label][median], label='%s vs rest' % GENRES[label], plot_nr=plot+len(labels)+idx)

    all_pr_scores = np.asarray(list(pr_scores.values())).flatten()
    summary = (np.mean(scores), np.std(scores),
               np.mean(all_pr_scores), np.std(all_pr_scores))
    print("%.3f\t%.3f\t%.3f\t%.3f\t" % summary)

    return np.mean(train_errors), np.mean(test_errors), np.asarray(cms)

In [None]:
def plot_confusion_matrix(cm, genre_list, name, title, plot_nr=None):
    """Show a confusion matrix as a colored grid.

    cm         -- matrix to display; the color scale is fixed to 0..1
    genre_list -- tick labels for both axes
    name       -- suffix of the chart's file name
    title      -- title of the chart
    plot_nr    -- number used as prefix of the file name; the chart is only
                  saved when it is not None
    """
    plt.figure(num=None, figsize=(5, 4), dpi=DPI)
    plt.matshow(cm, fignum=False, cmap='Blues', vmin=0, vmax=1.0)
    # gca() returns the axes matshow just drew on; plt.axes() adds a new,
    # empty axes on top of it in current Matplotlib versions
    ax = plt.gca()
    ax.set_xticks(range(len(genre_list)))
    ax.set_xticklabels(genre_list)
    ax.xaxis.set_ticks_position("bottom")
    ax.set_yticks(range(len(genre_list)))
    ax.set_yticklabels(genre_list)
    # booleans instead of 'off': a non-empty string counts as True
    ax.tick_params(axis='both', which='both', bottom=False, left=False)
    plt.title(title)
    plt.colorbar()
    plt.grid(False)
    plt.xlabel('Predicted class')
    plt.ylabel('True class')
    if plot_nr is not None:
        save_png('%i_confusion_%s' % (plot_nr, name))

In [None]:
# Load the stored FFT features together with one integer label per genre.
X, Y = read_fft(GENRES)

# plot=5 switches the charts on and is the number of the first chart.
train_avg, test_avg, cms = train_model(create_model, X, Y, "Log Reg FFT", plot=5)

In [None]:
# Average the confusion matrices of all splits and scale every column so that
# it sums to one (columns are the predicted classes in the chart below).
# NOTE(review): scaling the rows (true classes) instead may be what is wanted - confirm.
cm_avg = cms.mean(axis=0)
column_totals = cm_avg.sum(axis=0)
cm_norm = cm_avg / column_totals

plot_confusion_matrix(cm_norm, GENRES, "fft", "Confusion matrix of an FFT based classifier", 17)

# Improving classification performance with Mel Frequency Cepstral Coefficients

In [None]:
from python_speech_features import mfcc

# Compute the MFCCs of one example song and show the shape of the result.
fn = Path(GENRE_DIR) / 'jazz' / 'jazz.00000.wav'
sample_rate, X = scipy.io.wavfile.read(fn)
# NOTE(review): sample_rate is read but not handed to mfcc() - confirm that
# mfcc's default sample rate is appropriate for these files.
ceps = mfcc(X)
print(ceps.shape)

In [None]:
# Mean of every coefficient over the middle 80% of the rows (the first and
# last 10% are left out).
num_ceps = len(ceps)
np.mean(ceps[int(num_ceps*0.1):int(num_ceps*0.9)], axis=0)

In [None]:
# Standard deviation of every coefficient over the same middle 80% of the rows.
np.std(ceps[int(num_ceps*0.1):int(num_ceps*0.9)], axis=0)

## Generating MFCC features

In [None]:
def create_ceps(fn):
    """Compute the MFCCs of the wav file ``fn`` and store them next to it.

    The result is written with the suffix ``.ceps.npy`` (np.save appends
    ``.npy`` to the ``.ceps`` name).
    """
    # the sample rate returned by the reader is not used here
    sample_rate, signal = scipy.io.wavfile.read(fn)
    target = Path(fn).with_suffix('.ceps')
    np.save(target, mfcc(signal))

# Write an MFCC feature file next to every wav file found below GENRE_DIR.
all_wav_files = Path(GENRE_DIR).glob('**/*.wav')
for wav_fn in all_wav_files:
    print('Converting %s ...' % wav_fn)
    create_ceps(wav_fn)

In [None]:
def read_ceps(genre_list, base_dir=GENRE_DIR):
    """Load the stored MFCC features of all given genres.

    Every ``*.ceps.npy`` file in ``base_dir/<genre>`` is reduced to one row:
    the mean of each coefficient over the middle 80% of the stored rows. The
    row's label is the index of the genre in ``genre_list``.

    Returns the tuple (features, labels) as numpy arrays.
    """
    features = []
    labels = []
    root = Path(base_dir)

    for class_idx, genre_name in enumerate(genre_list):
        for npy_path in (root / genre_name).glob("*.ceps.npy"):
            coefficients = np.load(npy_path)
            total = len(coefficients)
            # leave out the first and the last tenth of the rows
            first = int(total / 10)
            last = int(total * 9 / 10)
            features.append(np.mean(coefficients[first:last], axis=0))
            labels.append(class_idx)

    return np.array(features), np.array(labels)

## Training the MFCC-based classifier

In [None]:
# Load the averaged MFCC features together with one integer label per genre.
X, Y = read_ceps(GENRES)

# plot=18 switches the charts on and is the number of the first chart.
train_avg, test_avg, cms = train_model(create_model, X, Y, "Log Reg CEPS", plot=18)

In [None]:
# Average the confusion matrices of all splits and scale every column so that
# it sums to one (columns are the predicted classes in the chart below).
# NOTE(review): scaling the rows (true classes) instead may be what is wanted - confirm.
cm_avg = cms.mean(axis=0)
column_totals = cm_avg.sum(axis=0)
cm_norm = cm_avg / column_totals

plot_confusion_matrix(cm_norm, GENRES, "ceps","Confusion matrix of a CEPS based classifier", 19)

# Classification with Tensorflow

As the Fourier transform is a filter that defines our features, we can also use a CNN, which learns its own filters, for the same kind of classification.

In [None]:
import tensorflow as tf
import numpy as np

# Training settings; the Classifier defined below reads them as globals.
n_epochs = 50          # passes over the training data
learning_rate = 0.01   # handed to the Adam optimizer
batch_size = 128       # number of excerpts per training step
step = 32              # NOTE(review): not referenced by the code in this notebook - confirm it is needed
dropout_rate = 0.2     # rate of the dropout layer

# Every network input is an excerpt of signal_size samples with one channel.
signal_size = 1000
signal_shape = [signal_size,1]

With only 600 songs, we don't have enough data for a neural network. But we don't have to restrict ourselves to a single excerpt of 1000 samples per song: we can cut several such excerpts out of each song to "generate" more training samples.

In [None]:
def read_wav(genre_list, multiplicity=1, base_dir=GENRE_DIR):
    """Load raw audio excerpts of all given genres.

    From every ``*.wav`` file in ``base_dir/<genre>`` the first
    ``multiplicity`` consecutive chunks of ``signal_size`` samples are taken.
    Each chunk becomes one example whose label is the index of the genre in
    ``genre_list``.

    Returns (excerpts reshaped to (-1, signal_size, 1), labels) as numpy arrays.
    """
    excerpts = []
    labels = []
    root = Path(base_dir)

    for class_idx, genre_name in enumerate(genre_list):
        for wav_path in (root / genre_name).glob("*.wav"):
            sample_rate, samples = scipy.io.wavfile.read(wav_path)
            for chunk_nr in range(multiplicity):
                begin = chunk_nr * signal_size
                end = (chunk_nr + 1) * signal_size
                excerpts.append(samples[begin:end])
                labels.append(class_idx)

    return np.array(excerpts).reshape((-1, signal_size, 1)), np.array(labels)

So we will get 20 excerpts from each song.

In [None]:
from sklearn.model_selection import train_test_split

# 20 excerpts per song, each labelled with the index of its genre.
X, Y = read_wav(GENRES, 20)
classes = len(GENRES)
# Hold out one sixth of the excerpts for testing. No random_state is given,
# so the split differs from run to run.
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=(1. / 6.))

We modify our CNN class a little bit to use 1D convolution, and we also make the pool size configurable, as we expect that it needs to be bigger for this task.

In [None]:
class CNN():
    """Small 1D convolutional network built with the TensorFlow 1.x graph API.

    Layout (see discriminate): two blocks of conv1d + max pooling, a dense
    layer followed by dropout, and a final dense layer with one output per
    class.

    signal_shape -- shape of one input signal, e.g. [1000, 1]
    dim_W1       -- units of the first dense layer
    dim_W2       -- filters of the second convolution
    dim_W3       -- filters of the first convolution
    classes      -- number of output units
    kernel_size  -- kernel size of both convolutions
    pool_size    -- size and stride of both max pooling layers
    """
    def __init__(
            self,
            signal_shape=[1000,1],
            dim_W1=64,
            dim_W2=32,
            dim_W3=16,
            classes=6,
            kernel_size=5,
            pool_size=16
            ):

        # Store a copy: the default list object is shared by all calls, and a
        # caller's list must not be tied to this instance. list() also lets
        # callers hand in a tuple.
        self.signal_shape = list(signal_shape)

        self.dim_W1 = dim_W1
        self.dim_W2 = dim_W2
        self.dim_W3 = dim_W3
        self.classes = classes
        self.kernel_size = kernel_size
        self.pool_size = pool_size

    def build_model(self):
        """Create the placeholders, the network, the loss and the accuracy.

        Returns (signal placeholder, label placeholder, cost, accuracy,
        network output, training-flag placeholder).
        """
        signal = tf.placeholder(tf.float32, [None] + list(self.signal_shape), name="signal")
        Y = tf.placeholder(tf.int64, [None], name="label")
        training = tf.placeholder(tf.bool, name="is_training")

        # The network output is fed to the loss as logits; it is not a
        # normalized probability distribution.
        logits = self.discriminate(signal, training)
        cost = tf.reduce_mean(tf.nn.sparse_softmax_cross_entropy_with_logits(labels=Y, logits=logits))
        accuracy = tf.reduce_mean(tf.cast(tf.equal(tf.argmax(logits, axis=1), Y), tf.float32), name="accuracy")

        return signal, Y, cost, accuracy, logits, training

    def create_conv1d(self, input, filters, kernel_size, name):
        """Return a 'same'-padded conv1d layer with leaky ReLU activation."""
        layer = tf.layers.conv1d(
                    inputs=input,
                    filters=filters,
                    kernel_size=kernel_size,
                    activation=tf.nn.leaky_relu,
                    name="Conv1d_" + name,
                    padding="same")
        return layer

    def create_maxpool(self, input, name):
        """Return a max pooling layer whose size and stride are pool_size."""
        layer = tf.layers.max_pooling1d(
                    inputs=input,
                    pool_size=[self.pool_size],
                    strides=self.pool_size,
                    name="MaxPool_" + name)
        return layer

    def create_dropout(self, input, name, is_training):
        """Return a dropout layer using the module-level dropout_rate."""
        layer = tf.layers.dropout(
                    inputs=input,
                    rate=dropout_rate,
                    name="DropOut_" + name,
                    training=is_training)
        return layer

    def create_dense(self, input, units, name):
        """Return dense -> batch normalization -> leaky ReLU."""
        layer = tf.layers.dense(
                inputs=input,
                units=units,
                name="Dense" + name,
                )
        # NOTE(review): training=True is hard-coded, independent of the
        # is_training placeholder - confirm that this is intended.
        layer = tf.layers.batch_normalization(
                inputs=layer,
                momentum=0,
                epsilon=1e-8,
                training=True,
                name="BatchNorm_" + name,
        )
        layer = tf.nn.leaky_relu(layer, name="LeakyRELU_" + name)
        return layer

    def discriminate(self, signal, training):
        """Build the network on top of `signal` and return its output layer."""

        h1 = self.create_conv1d(signal, self.dim_W3, self.kernel_size, "Layer1")
        h1 = self.create_maxpool(h1, "Layer1")

        h2 = self.create_conv1d(h1, self.dim_W2, self.kernel_size, "Layer2")
        h2 = self.create_maxpool(h2, "Layer2")
        # flatten (length, filters) into one feature vector per example
        h2 = tf.reshape(h2, (-1, self.dim_W2 * h2.shape[1]))

        h3 = self.create_dense(h2, self.dim_W1, "Layer3")
        h3 = self.create_dropout(h3, "Layer3", training)

        # the output layer also passes through batch norm and leaky ReLU
        h4 = self.create_dense(h3, self.classes, "Layer4")
        return h4

Let's also create a BaseEstimator subclass to use GridSearchCV to find the optimal set of parameters.

In [None]:
tf.reset_default_graph()

from sklearn.base import BaseEstimator

class Classifier(BaseEstimator):
    """scikit-learn style wrapper around the CNN class.

    fit() trains a new network and stores it as the checkpoint ./classifier;
    predict() restores that checkpoint and returns the index of the largest
    output per example. The constructor parameters are passed on to CNN
    unchanged.
    """
    def __init__(self,
            signal_shape=[1000,1],
            dim_W1=64,
            dim_W2=32,
            dim_W3=16,
            classes=6,
            kernel_size=5,
            pool_size=16):
        # parameters are stored exactly as given
        self.signal_shape=signal_shape
        self.dim_W1=dim_W1
        self.dim_W2=dim_W2
        self.dim_W3=dim_W3
        self.classes=classes
        self.kernel_size=kernel_size
        self.pool_size=pool_size

    def fit(self, X, y):
        """Train the network on X / y and save it; returns self.

        Uses the module-level n_epochs, batch_size and learning_rate.
        """
        tf.reset_default_graph()

        print("Fitting (W1=%i) (W2=%i) (W3=%i) (kernel=%i) (pool=%i)"
              % (self.dim_W1, self.dim_W2, self.dim_W3, self.kernel_size, self.pool_size))

        # Train on the data handed to fit(), not on the global X_train /
        # Y_train: otherwise every cross-validation fold sees the same data.
        X = np.asarray(X)
        y = np.asarray(y)

        cnn_model = CNN(
                signal_shape=self.signal_shape,
                dim_W1=self.dim_W1,
                dim_W2=self.dim_W2,
                dim_W3=self.dim_W3,
                classes=self.classes,
                kernel_size=self.kernel_size,
                pool_size=self.pool_size
                )

        signal_tf, Y_tf, cost_tf, accuracy_tf, output_tf, training_tf = cnn_model.build_model()
        train_step = tf.train.AdamOptimizer(learning_rate, beta1=0.5).minimize(cost_tf)

        saver = tf.train.Saver()

        with tf.Session() as sess:
            sess.run(tf.global_variables_initializer())
            for epoch in range(n_epochs):
                # new random order of the examples in every epoch
                permut = np.random.permutation(len(X))
                for j in range(0, len(X), batch_size):
                    batch = permut[j:j+batch_size]
                    Xs = X[batch]
                    Ys = y[batch]

                    sess.run(train_step,
                            feed_dict={
                                training_tf: True,
                                Y_tf: Ys,
                                signal_tf: Xs
                                })
            saver.save(sess, './classifier')
        return self

    def predict(self, X):
        """Restore the saved network and return the predicted label per row of X."""
        tf.reset_default_graph()
        new_saver = tf.train.import_meta_graph("classifier.meta")
        with tf.Session() as sess:
            new_saver.restore(sess, tf.train.latest_checkpoint('./'))

            graph = tf.get_default_graph()
            training_tf = graph.get_tensor_by_name('is_training:0')
            signal_tf = graph.get_tensor_by_name('signal:0')
            # Output of the last layer's leaky ReLU.
            # NOTE(review): this tensor name depends on how tf.nn.leaky_relu
            # names its internal op - confirm for the TensorFlow version in use.
            output_tf = graph.get_tensor_by_name('LeakyRELU_Layer4/Maximum:0')

            predict = sess.run(output_tf,
                            feed_dict={
                                training_tf: False,
                                signal_tf: X
                                })
            return np.argmax(predict, axis=1)

We can now create a grid of parameters to explore. Be aware that this will take a lot of time!

In [None]:
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score, make_scorer

# Five parameters with three candidate values each: 3**5 = 243 combinations,
# every one of them evaluated with 6-fold cross-validation.
param_grid = {
    "dim_W1": [4, 8, 16],
    "dim_W2": [4, 8, 16],
    "dim_W3": [4, 8, 16],
    "kernel_size":[7, 11, 15],
    "pool_size":[8, 12, 16],
}

# Plain accuracy is used to compare the parameter combinations.
cv = GridSearchCV(Classifier(), param_grid, scoring=make_scorer(accuracy_score), cv=6)

cv.fit(X, Y)
print(cv.best_params_)

And now let's use these best parameters and check the results.

In [None]:
from sklearn.metrics import confusion_matrix

# Train one network with the parameters selected by the grid search.
clf = Classifier(**cv.best_params_)
clf.fit(X_train, Y_train)

# Confusion matrix on the training data; every column is scaled to sum to one.
Y_train_predict = clf.predict(X_train)
cm = confusion_matrix(Y_train, Y_train_predict)
plot_confusion_matrix(cm / cm.sum(axis=0), GENRES, "CNN","Confusion matrix of a CNN based classifier (train)", 20)

# The same for the held-out test data.
Y_test_predict = clf.predict(X_test)
cm = confusion_matrix(Y_test, Y_test_predict)
plot_confusion_matrix(cm / cm.sum(axis=0), GENRES, "CNN","Confusion matrix of a CNN based classifier (test)", 21)