## Music genre classifier with TensorFlow

The objective of this project is to classify 30-second WAV files by genre using a TensorFlow CNN model. The GTZAN dataset can be found here:

https://www.kaggle.com/andradaolteanu/gtzan-dataset-music-genre-classification

To classify audio samples, we first preprocess them by computing their Mel-frequency cepstral coefficients (MFCCs), a frame-by-frame (temporal) summary of how the signal's energy is distributed across perceptually spaced (mel) frequency bands. In this case, we keep 13 coefficients per frame.

In [64]:
import os
import json
import numpy as np
import librosa
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
import tensorflow as tf
import gradio as gr

In [65]:
# Root folder of the original audio files.
SOURCE_PATH = 'Dataset/genres_original/'

# JSON file that holds the labels together with the processed data.
JSON_PATH = 'data.json'

# Sampling rate, in Hz.
sr = 22050

# Keep slightly less than the nominal 30 seconds (29 s) so that every file
# contributes exactly the same number of samples.
TOTAL_SAMPLES = sr * 29

# The dataset has only 999 files, so each clip is cut into NUM_SLICES equal
# slices: NUM_SLICES slices => NUM_SLICES times more training examples.
NUM_SLICES = 10
SAMPLES_PER_SLICE = TOTAL_SAMPLES // NUM_SLICES

In [67]:
def design_model(input_shape, num_classes=None):
    """Build the (uncompiled) CNN that classifies one MFCC slice by genre.

    Args:
        input_shape: Shape of a single input example; it is handed to the
            first ``Conv2D`` layer.
        num_classes: Number of units in the softmax output layer. When left
            as ``None`` the value falls back to the number of distinct labels
            in the module-level ``targets`` array, which must then already
            be defined (a ``NameError`` is raised otherwise).

    Returns:
        A ``tf.keras.models.Sequential`` model: three Conv2D / MaxPooling2D /
        BatchNormalization stages, dropout, then two dense layers.
    """
    if num_classes is None:
        # Backward-compatible default: derive the class count from the
        # labels loaded at module level, as the original code did.
        num_classes = len(np.unique(targets))

    # Let's design the model architecture.
    model = tf.keras.models.Sequential([

        tf.keras.layers.Conv2D(32, (3,3), activation='relu', input_shape=input_shape),
        tf.keras.layers.MaxPooling2D((3,3), strides=(2,2), padding='same'),
        tf.keras.layers.BatchNormalization(),

        tf.keras.layers.Conv2D(32, (3,3), activation='relu'),
        tf.keras.layers.MaxPooling2D((3,3), strides=(2,2), padding='same'),
        tf.keras.layers.BatchNormalization(),

        tf.keras.layers.Conv2D(32, (2,2), activation='relu'),
        tf.keras.layers.MaxPooling2D((3,3), strides=(2,2), padding='same'),
        tf.keras.layers.BatchNormalization(),
        tf.keras.layers.Dropout(0.3),

        tf.keras.layers.Flatten(),
        tf.keras.layers.Dense(64, activation='relu'),
        # One output unit per class.
        tf.keras.layers.Dense(num_classes, activation='softmax')
    ])

    return model

In [68]:
def preprocess_song(audio):
    """Load an audio file and return the MFCCs of its last slice.

    Up to 29 seconds of audio are loaded and treated as ``NUM_SLICES``
    consecutive slices of ``SAMPLES_PER_SLICE`` samples each. Only the final
    slice is used, so only its MFCCs are computed.

    Args:
        audio: Path (or other source accepted by ``librosa.load``) of the
            audio file.

    Returns:
        The transposed output of ``librosa.feature.mfcc`` (13 coefficients)
        for the final slice.
    """
    # Use a distinct local name so the module-level ``sr`` is not shadowed.
    song, sample_rate = librosa.load(audio, duration=29)

    # Bounds of the final slice, i.e. slice index NUM_SLICES - 1.
    start_sample = SAMPLES_PER_SLICE * (NUM_SLICES - 1)
    end_sample = start_sample + SAMPLES_PER_SLICE

    mfcc = librosa.feature.mfcc(y=song[start_sample:end_sample], sr=sample_rate, n_mfcc=13)
    return mfcc.T

In [None]:
# Driver cell: load the processed data, rebuild the network, restore the
# previously saved weights and classify one held-out sample.
# The guard is required because the body below is indented; without it the
# cell fails with an IndentationError.
if __name__ == "__main__":

    # One-off preprocessing step (disabled).
    #preprocess_data(source_path=SOURCE_PATH, json_path=JSON_PATH)

    # load_data / prepare_datasets are not defined in this part of the file;
    # presumably they come from an earlier cell -- verify before running.
    inputs, targets = load_data(json_path=JSON_PATH)

    Xtrain, Xval, Xtest, ytrain, yval, ytest = prepare_datasets(inputs, targets, 0.2)

    # A trailing channel axis of size 1 is appended to the per-example shape.
    input_shape = (Xtrain.shape[1], Xtrain.shape[2], 1)
    model = design_model(input_shape)

    # Selection of the optimizer, loss type and metrics for performance evaluation.
    #model.compile(optimizer = tf.keras.optimizers.RMSprop(lr=0.001),
    #                 loss='sparse_categorical_crossentropy',
    #                 metrics = ['acc']
    #                 )

    #model.summary()

    # Training the model.
    #history = model.fit(Xtrain, ytrain,
    #                    validation_data=(Xval, yval),
    #                    epochs=3,
    #                    batch_size=32
    #                    )

    #model.save_weights('/Users/msf/GitHub/TensorFlow_MusicGenre_Classifier/MyModel.h5')

    # Training is disabled above; reuse the weights saved by an earlier run.
    model.load_weights('/Users/msf/GitHub/TensorFlow_MusicGenre_Classifier/MyModel.h5')

    #plot_performance(history)

    # Testing the model on never seen before data.
    #make_prediction(model, Xtest, ytest, 24)

    # Add a leading batch axis to the first test example.
    song = Xtest[0][np.newaxis, :, :]

    # make_pred() accepts a single uploaded-file argument, so it cannot be
    # called with (model, song). Classify the already-preprocessed sample
    # directly and print the index of the most probable class.
    print(np.argmax(model.predict(song)))

In [244]:
def make_pred(aud):
    """Predict the genre of an uploaded audio file.

    Relies on the module-level ``model`` for inference and on
    ``preprocess_song`` for feature extraction.

    Args:
        aud: Object whose ``name`` attribute is the path of the audio file.

    Returns:
        The genre name (a string) for the class with the highest score.
    """
    # Class index -> genre name.
    labels = (
        "pop",
        "metal",
        "disco",
        "blues",
        "reggae",
        "classical",
        "rock",
        "hiphop",
        "country",
        "jazz",
    )
    genre_dict = dict(enumerate(labels))

    features = preprocess_song(aud.name)

    # Add a leading batch axis and a trailing channel axis in one step.
    batch = features[np.newaxis, ..., np.newaxis]

    scores = model.predict(batch)
    return genre_dict[np.argmax(scores)]

In [252]:
# Texts shown on the demo page.
title = "Audio Classification"
description = "Gradio demo for Audio Classification. To use it, simply upload your audio, or click one of the examples to load them. Read more at the links below."

# The audio is handed to make_pred as a file object; the result is shown as text.
inputs = gr.inputs.Audio(type="file", label="Input Audio")
outputs = "text"

# Sample clip offered on the page.
examples = [
    ['/Users/msf/GitHub/TensorFlow_MusicGenre_Classifier/Dataset/genres_original/blues/blues.00007.wav'],
]

# Build the interface around make_pred and start serving it.
gr.Interface(
    fn=make_pred,
    inputs=inputs,
    outputs=outputs,
    title=title,
    description=description,
    examples=examples,
).launch()

Running on local URL:  http://127.0.0.1:7874/

To create a public link, set `share=True` in `launch()`.


(<fastapi.applications.FastAPI at 0x7ff626903ac0>,
 'http://127.0.0.1:7874/',
 None)