## Music genre classifier with TensorFlow

The objective of this project is to classify 30 sec wav files by genre using a TensorFLow CNN model. The GTZAN dataset can be found here:

https://www.kaggle.com/andradaolteanu/gtzan-dataset-music-genre-classification

To classify audio samples, we will preprocess them by calculating their MFCC, which is a temporal representation of the energy for each perceived frequency band. In this case, we are choosing 13 bands.

In [13]:
import os
import json
import numpy as np
import librosa
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
import tensorflow as tf

In [14]:
# Dataset location
SOURCE_PATH = '/Users/msf/Datasets/GTZAN_Dataset/genres_original'

# Path to labels and processed data file, json format.
JSON_PATH = '/Users/msf/GitHub/TF_MusicGenre/data.json'

# Sampling rate.
sr = 22050

# Let's make sure all files have the same amount of samples, pick a duration under 30 seconds.
TOTAL_SAMPLES = 29 * sr

# The dataset contains 999 files. Lets make it bigger! 
# X amount of slices => X times more training examples.
NUM_SLICES = 10

SAMPLES_PER_SLICE = int(TOTAL_SAMPLES / NUM_SLICES)

In [15]:
def generate_mfcc(source_path, json_path):

    # Let's create a dictionary of labels and processed data.
    mydict = {
        "labels": [],
        "mfcc": []
        }

    # Let's browse each file, slice it and generate the mfcc for each slice.
    for i, (dirpath, dirnames, filenames) in enumerate(os.walk(source_path)):

        for file in filenames:
            song, sr = librosa.load(os.path.join(dirpath, file), duration=29)

            for s in range(NUM_SLICES):
                start_sample = SAMPLES_PER_SLICE * s
                end_sample = start_sample 
                + SAMPLES_PER_SLICE
                mfcc = librosa.feature.mfcc(y=song[start_sample:end_sample], sr=sr, n_mfcc=13)
                mfcc = mfcc.T
                mydict["labels"].append(i-1)
                mydict["mfcc"].append(mfcc.tolist())

    # Let's write the dictionary in a json file.
    
    # 9990 mfcc, 13 coefficients, 125 time frames per mfcc.
    print('The newly generated MFCCs have the following shape: {}'.fomat(inputs.shape))
    
    with open(json_path, 'w') as fp:
        json.dump(mydict, fp)
    fp.close()

In [21]:
def load_data(json_path):

    with open(json_path) as f:
        data = json.load(f)
    fp.close()

    # Let's load our data into numpy arrays for TensorFlow compatibility.
    inputs = np.array(data["mfcc"])
    targets = np.array(data["labels"])

    return inputs, targets

In [17]:
def prepare_datasets(inputs, targets, test_size, validation_size):
    
    inputs_train, inputs_val, targets_train, targets_val = train_test_split(inputs, targets, validation_size)
    inputs_train, inputs_test, targets_train, targets_test = train_test_split(inputs_train, targets_train, test_size)
    
    inputs_train = inputs_train[..., np.newaxis]
    inputs_val = inputs_val[..., np.newaxis]
    inputs_test = inputs_test[..., np.newaxis]
    
    return inputs_train, inputs_val, inputs_test, targets_train, targets_val, targets_test

In [18]:
def design_model(input_shape):

    # Let's design the model architecture.
    model = tf.keras.models.Sequential([
        
        tf.keras.layers.Conv2D(32, (3,3), activation='relu', input_shape=input_shape),
        tf.keras.layers.MaxPooling2D((3,3), strides=(2,2), padding='same'),
        tf.keras.layers.BatchNormalization(),
        
        tf.keras.layers.Conv2D(32, (3,3), activation='relu', input_shape=input_shape),
        tf.keras.layers.MaxPooling2D((3,3), strides=(2,2), padding='same'),
        tf.keras.layers.BatchNormalization(),
        
        tf.keras.layers.Conv2D(32, (2,2), activation='relu', input_shape=input_shape),
        tf.keras.layers.MaxPooling2D((3,3), strides=(2,2), padding='same'),
        tf.keras.layers.BatchNormalization(),
        
        tf.keras.layers.Flatten(),
        tf.keras.layers.Dense(64, activation='relu'), 
        tf.keras.layers.Dense(len(np.unique(targets)), activation='softmax')
    ])

    return model

In [9]:
def predict(model, X, y):
    
    

SyntaxError: unexpected EOF while parsing (<ipython-input-9-92386d1e0b51>, line 3)

In [19]:
def plot_performance(hist):
    
    acc = hist.history['acc']
    val_acc = hist.history['val_acc']
    loss = hist.history['loss']
    val_loss = hist.history['val_loss']

    epochs = range(len(acc))

    plt.plot(epochs, acc, 'r', label='Training accuracy')
    plt.plot(epochs, val_acc, 'b', label='Validation accuracy')
    plt.title('Training and validation accuracy')
    plt.legend()
    plt.figure()

    plt.plot(epochs, loss, 'r', label='Training Loss')
    plt.plot(epochs, val_loss, 'b', label='Validation Loss')
    plt.title('Training and validation loss')
    plt.legend()

    plt.show()

In [22]:

if __name__ == "__main__":

    inputs, targets = load_data(JSON_PATH)    

    Xtrain, Xval, Xtest, ytrain, yval, ytest = prepare_datasets(inputs, targets, 0.2, 0.2)

    model = design_model((Xtrain.shape[1], Xtrain.shape[2], 1))

    # Selection of the optimizer, loss type and metrics for performance evaluation.
    model.compile(optimizer = tf.keras.optimizers.RMSprop(lr=0.001),
                 loss='sparse_categorical_crossentropy',
                  metrics = ['acc']
                 )

    model.summary()
    
    #Training the model.
    history = model.fit(Xtrain, ytrain,
                        validation_data=(Xval, yval),
                        epochs=50,
                        batch_size=32
                        )
    
    plot_performance(history)
    
    

FileNotFoundError: [Errno 2] No such file or directory: '/Users/msf/GitHub/TF_MusicGenre/data.json'