<a href="https://www.kaggle.com/code/paurodrguezinserte/music-genre-classification?scriptVersionId=106037968" target="_blank"><img align="left" alt="Kaggle" title="Open in Kaggle" src="https://kaggle.com/static/images/open-in-kaggle.svg"></a>

In [None]:
import os
import json
import librosa
import math
import numpy as np
from sklearn.model_selection import train_test_split
import tensorflow as tf
from tensorflow import keras
import matplotlib.pyplot as plt

In [None]:
# Root folder walked by save_mfcc; each sub-directory name is used as a genre label.
DATASET_PATH = "../input/gtzan-dataset-music-genre-classification/Data/genres_original"
# MFCC output path (not referenced by any of the cells visible here).
JSON_PATH = "./data.json"
# MFCC output path written by save_mfcc and read back by load_dataset.
JSON_EXT_PATH = "./data_extended.json"
# Sampling rate handed to librosa when loading audio and computing features.
SAMPLE_RATE = 22050
# Number of samples per track; used to derive the per-segment sample count.
SAMPLES_TRACK = SAMPLE_RATE * 30 # dataset has 30s tracks

# Dataset preparation

In [None]:
def save_mfcc(data_path, json_path, n_mfcc=13, n_fft=2048, hop_length=512, num_segments=5):
    """Extract per-segment MFCCs for every audio file under ``data_path`` and save them as JSON.

    Each sub-directory found by ``os.walk`` is treated as one genre. Every track
    is cut into ``num_segments`` equal slices of ``SAMPLES_TRACK / num_segments``
    samples and one MFCC matrix (frames x coefficients) is stored per slice.

    Args:
        data_path: Root directory whose sub-directories contain the audio files.
            Files located directly in ``data_path`` are ignored.
        json_path: Destination of the JSON file.
        n_mfcc: Number of MFCC coefficients per frame.
        n_fft: FFT window size passed to librosa.
        hop_length: Hop length passed to librosa.
        num_segments: Number of slices each track is divided into.

    The JSON object has three keys: ``"mapping"`` (genre names, indexed by
    label), ``"mfcc"`` (one nested list per kept segment) and ``"labels"``
    (integer label per kept segment). Labels follow the order in which
    ``os.walk`` yields the sub-directories.
    """
    # Store data
    data = {"mapping": [],
            "mfcc": [],
            "labels": []}

    num_samples_segment = int(SAMPLES_TRACK / num_segments)
    # Frame count of a full-length segment; shorter (truncated) segments are dropped.
    expected_frames = math.ceil(num_samples_segment / hop_length)

    # Loop genres
    for i, (dirpath, dirnames, filenames) in enumerate(os.walk(data_path)):
        # os.walk yields data_path itself first. It is not a genre, and recording
        # it would shift "mapping" by one relative to the labels (i - 1).
        if i == 0:
            continue

        semantic_label = os.path.basename(dirpath)
        data["mapping"].append(semantic_label)
        print(f"\nProcessing {semantic_label}")

        for f in filenames:
            file_path = os.path.join(dirpath, f)
            try:
                signal, _ = librosa.load(file_path, sr=SAMPLE_RATE)
            except Exception as err:
                # Skip unreadable files. Falling through would reuse the previous
                # track's samples under this file's label.
                print(f"Skipping {file_path}: {err}")
                continue

            for s in range(num_segments):
                start_sample = num_samples_segment * s
                end_sample = start_sample + num_samples_segment
                mfcc = librosa.feature.mfcc(y=signal[start_sample:end_sample],
                                            sr=SAMPLE_RATE,
                                            n_fft=n_fft,
                                            n_mfcc=n_mfcc,
                                            hop_length=hop_length)
                mfcc = mfcc.T

                # Ensure that num of mfccs is the expected
                if len(mfcc) == expected_frames:
                    data["mfcc"].append(mfcc.tolist())
                    data["labels"].append(i - 1)
                    print(f"{file_path}, {s}")

    # Write once after the walk instead of re-serialising the growing
    # dataset after every directory.
    with open(json_path, "w") as fp:
        json.dump(data, fp, indent=4)

In [None]:
# Build the 32-coefficient MFCC dataset and write it to JSON_EXT_PATH.
save_mfcc(DATASET_PATH, JSON_EXT_PATH, n_mfcc=32)

# CNN for music classification

In [None]:
def load_dataset(data_path):
    """Read the JSON file at ``data_path`` and return its features and labels.

    Returns:
        A tuple ``(X, y)`` of NumPy arrays built from the ``"mfcc"`` and
        ``"labels"`` entries of the file.
    """
    with open(data_path, "r") as handle:
        payload = json.load(handle)

    features, targets = (np.array(payload[key]) for key in ("mfcc", "labels"))
    return features, targets

def split_in_sets_cnn(X, y, train_size, val_split):
    """Split ``X``/``y`` into train, validation and test sets for a 2-D CNN.

    ``train_size`` is the share of the data kept for training; ``val_split`` is
    the share of the remainder used for validation (the rest is the test set).
    The shape of every subset is printed, then a trailing channel axis is
    appended to each feature array.

    Returns:
        ``(X_train, X_val, X_test, y_train, y_val, y_test)``.
    """
    print(f"X total = {X.shape}")
    print(f"y total = {y.shape}")

    # First cut: training data vs. held-out data.
    X_train, X_holdout, y_train, y_holdout = train_test_split(
        X, y, train_size=train_size
    )
    # Second cut: divide the held-out data into validation and test.
    X_val, X_test, y_val, y_test = train_test_split(
        X_holdout, y_holdout, train_size=val_split
    )

    print(f"X train = {X_train.shape}")
    print(f"y train = {y_train.shape}")
    print(f"X val = {X_val.shape}")
    print(f"y val = {y_val.shape}")
    print(f"X test = {X_test.shape}")
    print(f"y test = {y_test.shape}")

    # Conv2D layers need an explicit channel dimension as the last axis.
    X_train, X_val, X_test = (
        np.expand_dims(subset, axis=-1) for subset in (X_train, X_val, X_test)
    )

    return X_train, X_val, X_test, y_train, y_val, y_test

In [None]:
def build_alexnet(input_shape, num_classes):
    """Build a reduced AlexNet-style classifier.

    Two convolution / batch-normalisation / max-pooling stages are followed by
    two 4096-unit dense layers (dropout 0.7 each) and a softmax output.

    Args:
        input_shape: Shape of one input sample without the batch axis.
        num_classes: Number of units in the softmax output layer.

    Returns:
        An uncompiled ``keras.models.Sequential`` model.
    """
    model = keras.models.Sequential([
        # Declare the input explicitly so ``input_shape`` is actually used and
        # the model is built (and can be summarised) before training.
        keras.Input(shape=input_shape),

        keras.layers.Conv2D(filters=96, kernel_size=(5,5), strides=(1,1), activation='relu', padding="same"),
        keras.layers.BatchNormalization(),
        keras.layers.MaxPool2D(pool_size=(3,3), strides=(2,2)),

        keras.layers.Conv2D(filters=256, kernel_size=(3,3), strides=(1,1), activation='relu', padding="same"),
        keras.layers.BatchNormalization(),
        keras.layers.MaxPool2D(pool_size=(3,3), strides=(2,2)),

        keras.layers.Flatten(),
        keras.layers.Dense(4096, activation='relu'),
        keras.layers.Dropout(0.7),
        keras.layers.Dense(4096, activation='relu'),
        keras.layers.Dropout(0.7),
        keras.layers.Dense(num_classes, activation='softmax')
    ])

    return model

In [None]:
# Reload the MFCC features and labels, then split them 60% train and 20% each for validation and test
# (val_split=0.5 halves the 40% that is not used for training).
X, y = load_dataset(JSON_EXT_PATH)
X_train, X_val, X_test, y_train, y_val, y_test = split_in_sets_cnn(X, y, train_size=0.6, val_split=0.5)

In [None]:
# Per-sample input shape: every axis of X_train except the leading sample axis.
input_shape = (X_train.shape[1], X_train.shape[2], X_train.shape[3])
print(input_shape)
model = build_alexnet(input_shape, 10)

# Adam with learning rate 1e-4; the sparse loss takes the integer labels as they are.
optimizer = keras.optimizers.Adam(learning_rate=0.0001)
model.compile(optimizer = optimizer,
             loss="sparse_categorical_crossentropy",
             metrics=["accuracy"])

In [None]:
# Train for 30 epochs with batches of 64, validating on the validation split after each epoch.
model.fit(X_train, y_train, validation_data=(X_val, y_val), batch_size=64, epochs=30)

In [None]:
# Report the compiled loss and accuracy on the held-out test split.
model.evaluate(x=X_test, y=y_test)

# Convolutional Recurrent Neural Network (CRNN)

In [None]:
def load_dataset(data_path):
    """Load the MFCC dataset stored as JSON at ``data_path``.

    Returns:
        ``(X, y)``: NumPy arrays created from the file's ``"mfcc"`` and
        ``"labels"`` entries respectively.
    """
    with open(data_path, "r") as source:
        contents = json.load(source)

    return np.array(contents["mfcc"]), np.array(contents["labels"])

def split_in_sets_rnn(X, y, train_size, val_split):
    """Split ``X``/``y`` into train, validation and test sets for a recurrent model.

    ``train_size`` is the share of the data kept for training; ``val_split`` is
    the share of the remainder used for validation. The feature arrays are
    returned as they are, with no channel axis added.

    Returns:
        ``(X_train, X_val, X_test, y_train, y_val, y_test)``.
    """
    # Training data vs. everything that is held out.
    train_cut = train_test_split(X, y, train_size=train_size)
    X_train, X_holdout, y_train, y_holdout = train_cut

    # Held-out data divided into validation and test.
    holdout_cut = train_test_split(X_holdout, y_holdout, train_size=val_split)
    X_val, X_test, y_val, y_test = holdout_cut

    return X_train, X_val, X_test, y_train, y_val, y_test

In [None]:
def build_lstm(input_shape, num_classes):
    """Build a stacked bidirectional-LSTM classifier.

    Three bidirectional LSTM layers (512, 256 and 128 units; only the last one
    collapses the sequence) feed two dense layers with dropout and a softmax
    output.

    Args:
        input_shape: Shape of one input sample without the batch axis.
        num_classes: Number of units in the softmax output layer.

    Returns:
        An uncompiled ``keras.models.Sequential`` model.
    """
    model = keras.models.Sequential([
        # Declare the input on the model itself rather than as an argument of
        # the LSTM hidden inside the Bidirectional wrapper.
        keras.Input(shape=input_shape),

        keras.layers.Bidirectional(keras.layers.LSTM(512, return_sequences=True)),
        keras.layers.Bidirectional(keras.layers.LSTM(256, return_sequences=True)),
        keras.layers.Bidirectional(keras.layers.LSTM(128, return_sequences=False)),

        keras.layers.Dense(128, activation='relu'),
        keras.layers.Dropout(0.7),
        keras.layers.Dense(64, activation='relu'),
        keras.layers.Dropout(0.5),
        keras.layers.Dense(num_classes, activation='softmax')
    ])

    return model

In [None]:
from keras.layers import Conv2D, MaxPooling2D, BatchNormalization, Flatten, Dense, Dropout, Reshape, LSTM, Bidirectional

def build_crnn(input_shape, num_classes):
    """Build a convolutional-recurrent classifier.

    A VGG16-style convolutional front end (five stages of 3x3 convolutions,
    each closed by "same"-padded 2x2 max pooling) is reshaped into a sequence
    along the first input axis and fed to two bidirectional LSTM layers,
    followed by a dense classification head.

    Args:
        input_shape: ``(rows, cols, channels)`` of one input sample. The first
            axis is kept as the sequence axis for the recurrent layers.
        num_classes: Number of units in the softmax output layer.

    Returns:
        An uncompiled ``keras.models.Sequential`` model.
    """
    # (filters, number of conv layers) for each stage of the front end.
    stages = ((64, 2), (128, 2), (256, 3), (512, 3), (512, 3))

    layers = []
    seq_len, width = input_shape[0], input_shape[1]
    for filters, depth in stages:
        for _ in range(depth):
            conv_kwargs = dict(filters=filters, kernel_size=(3,3), padding="same", activation="relu")
            if not layers:
                # Only the very first layer carries the input shape.
                conv_kwargs["input_shape"] = input_shape
            layers.append(Conv2D(**conv_kwargs))
        layers.append(MaxPooling2D(pool_size=(2,2), strides=(2,2), padding="same"))
        # "same"-padded pooling with stride 2 maps n -> ceil(n / 2).
        seq_len = -(-seq_len // 2)
        width = -(-width // 2)

    last_filters = stages[-1][0]
    layers += [
        # Keep the pooled first axis as time steps and merge the remaining two
        # axes into the feature vector. For a (259, 32, 1) input this is the
        # previously hard-coded (9, 512).
        Reshape(target_shape=(seq_len, width * last_filters)),
        Dense(128, activation='relu'),

        Bidirectional(LSTM(256, return_sequences=True)),
        Dropout(0.2),
        Bidirectional(LSTM(128, return_sequences=False)),

        Dense(4096, activation='relu'),
        Dropout(0.7),
        Dense(2048, activation='relu'),
        Dropout(0.7),
        Dense(num_classes, activation='softmax'),
    ]

    return keras.models.Sequential(layers)

In [None]:
# Reload the MFCC dataset and draw a fresh 60% / 20% / 20% train / validation / test split.
X, y = load_dataset(JSON_EXT_PATH)
X_train, X_val, X_test, y_train, y_val, y_test = split_in_sets_cnn(X, y, train_size=0.6, val_split=0.5)

In [None]:
# Per-sample input shape: every axis of X_train except the leading sample axis.
input_shape = (X_train.shape[1], X_train.shape[2], X_train.shape[3])
print(input_shape)
# Build the CRNN with a 10-way softmax output and print its layer summary.
model = build_crnn(input_shape, 10)
model.summary()

In [None]:
# Adam with learning rate 1e-4; the sparse loss takes the integer labels as they are.
optimizer = keras.optimizers.Adam(learning_rate=0.0001)
model.compile(optimizer = optimizer,
             loss="sparse_categorical_crossentropy",
             metrics=["accuracy"])

# Train for 50 epochs with batches of 128, validating after each epoch.
model.fit(X_train, y_train, validation_data=(X_val, y_val), batch_size=128, epochs=50)

In [None]:
# Report the compiled loss and accuracy on the held-out test split.
model.evaluate(x=X_test, y=y_test)

# VGG

In [None]:
def load_dataset(data_path):
    """Return the MFCC features and labels saved in the JSON file ``data_path``.

    Returns:
        ``(X, y)``: NumPy arrays built from the ``"mfcc"`` and ``"labels"``
        entries of the file.
    """
    with open(data_path, "r") as json_file:
        stored = json.load(json_file)

    X, y = (np.array(stored[field]) for field in ("mfcc", "labels"))
    return X, y

def split_in_sets_cnn(X, y, train_size, val_split):
    """Produce train / validation / test subsets shaped for a 2-D CNN.

    ``train_size`` is the share of the data kept for training and ``val_split``
    the share of the remaining data used for validation. Subset shapes are
    printed before a trailing channel axis is added to the feature arrays.

    Returns:
        ``(X_train, X_val, X_test, y_train, y_val, y_test)``.
    """
    print(f"X total = {X.shape}")
    print(f"y total = {y.shape}")

    train_cut = train_test_split(X, y, train_size=train_size)
    X_train, X_rest, y_train, y_rest = train_cut
    rest_cut = train_test_split(X_rest, y_rest, train_size=val_split)
    X_val, X_test, y_val, y_test = rest_cut

    print(f"X train = {X_train.shape}")
    print(f"y train = {y_train.shape}")
    print(f"X val = {X_val.shape}")
    print(f"y val = {y_val.shape}")
    print(f"X test = {X_test.shape}")
    print(f"y test = {y_test.shape}")

    # Append the channel axis expected by Conv2D.
    X_train = np.expand_dims(X_train, axis=-1)
    X_val = np.expand_dims(X_val, axis=-1)
    X_test = np.expand_dims(X_test, axis=-1)

    return X_train, X_val, X_test, y_train, y_val, y_test

In [None]:
from keras.layers import Conv2D, MaxPooling2D, BatchNormalization, Flatten, Dense, Dropout

def build_vgg16(input_shape, num_classes):
    """Build a VGG16-style classifier.

    Five stages of "same"-padded 3x3 ReLU convolutions (2, 2, 3, 3 and 3 layers
    with 64, 128, 256, 512 and 512 filters), each closed by 2x2 max pooling,
    are followed by three L1/L2-regularised dense layers with dropout 0.7 and
    a softmax output of ``num_classes`` units.

    Args:
        input_shape: Shape of one input sample without the batch axis.
        num_classes: Number of units in the softmax output layer.

    Returns:
        An uncompiled ``keras.models.Sequential`` model.
    """
    conv_stages = ((64, 2), (128, 2), (256, 3), (512, 3), (512, 3))
    dense_units = (4096, 4096, 2048)

    stack = []
    for filters, depth in conv_stages:
        for _ in range(depth):
            conv_kwargs = dict(filters=filters, kernel_size=(3,3), padding="same", activation="relu")
            if not stack:
                # Only the very first layer carries the input shape.
                conv_kwargs["input_shape"] = input_shape
            stack.append(Conv2D(**conv_kwargs))
        stack.append(MaxPooling2D(pool_size=(2,2), strides=(2,2), padding="same"))

    stack.append(Flatten())
    for units in dense_units:
        penalty = keras.regularizers.L1L2(l1=1e-5, l2=1e-4)
        stack.append(Dense(units, activation='relu', kernel_regularizer=penalty))
        stack.append(Dropout(0.7))
    stack.append(Dense(num_classes, activation='softmax'))

    return keras.models.Sequential(stack)

In [None]:
# Feature extraction is left disabled here; the JSON file written earlier is reused.
#save_mfcc(DATASET_PATH, JSON_EXT_PATH, n_mfcc=32)
# Reload the MFCC dataset and draw a fresh 60% / 20% / 20% train / validation / test split.
X, y = load_dataset(JSON_EXT_PATH)
X_train, X_val, X_test, y_train, y_val, y_test = split_in_sets_cnn(X, y, train_size=0.6, val_split=0.5)

In [None]:
# Per-sample input shape: every axis of X_train except the leading sample axis.
input_shape = (X_train.shape[1], X_train.shape[2], X_train.shape[3])
# Build the VGG16-style network with a 10-way softmax output and print its layer summary.
model = build_vgg16(input_shape, 10)
model.summary()

In [None]:
# Adam with learning rate 1e-3; the sparse loss takes the integer labels as they are.
optimizer = keras.optimizers.Adam(learning_rate=0.001)
model.compile(optimizer = optimizer,
             loss="sparse_categorical_crossentropy",
             metrics=["accuracy"])

# Train for 50 epochs with batches of 128, validating after each epoch.
model.fit(X_train, y_train, validation_data=(X_val, y_val), batch_size=128, epochs=50)

In [None]:
# Report the compiled loss and accuracy on the held-out test split.
model.evaluate(x=X_test, y=y_test)

In [None]:
from sklearn.metrics import confusion_matrix, plot_confusion_matrix
import seaborn as sns

# Class names in label order.
# NOTE(review): this order is hard-coded and assumed to match the integer labels
# stored in the JSON file — confirm against the file's "mapping" entry.
labels = ["disco","metal","reggae","blues","rock","classical","jazz","hiphop","country","pop"]

# Class probabilities per test sample; argmax turns them into predicted labels.
y_pred = model.predict(X_test)
# Pin the label set so the matrix always has one row/column per name in
# `labels`, even if a class is missing from the test split or the predictions.
cfm = confusion_matrix(y_test, y_pred.argmax(axis=1), labels=list(range(len(labels))))

fig, ax = plt.subplots(figsize=(14,14))
# fmt="d" prints the raw integer counts (the default format abbreviates values
# of 100 and above); ax=ax draws on the axes created above.
ax = sns.heatmap(cfm, annot=True, fmt="d", cmap='Blues', ax=ax)

ax.set_title('Music Genre classification Conf. Matrix\n\n')
ax.set_xlabel('\nPredicted Values')
ax.set_ylabel('Actual Values ')

ax.xaxis.set_ticklabels(labels)
ax.yaxis.set_ticklabels(labels)

plt.show()

# Classification with Mel-Spectrogram

In [None]:
import os
import json
import librosa
import librosa.display
import math
import numpy as np
from sklearn.model_selection import train_test_split
import tensorflow as tf
from tensorflow import keras
import matplotlib.pyplot as plt

In [None]:
# Root folder walked by save_melspectrogram; sub-directory names are matched against genre names.
DATASET_PATH = "../input/gtzan-dataset-music-genre-classification/Data/genres_original"
# Output file for the log-mel spectrogram dataset.
MEL_JSON_PATH = "./data_mel.json"
# Sampling rate handed to librosa when loading audio and computing features.
SAMPLE_RATE = 22050
# Number of samples per track; used to derive the per-segment sample count.
SAMPLES_TRACK = SAMPLE_RATE * 30 # dataset has 30s tracks

In [None]:
def save_melspectrogram(data_path, json_path, n_fft=2048, n_mels=90, hop_length=512, num_segments=5,
                        segment_indices=(0, 1, 2)):
    """Extract log-mel spectrograms for a fixed subset of genres and save them as JSON.

    Only sub-directories of ``data_path`` whose name appears in the hard-coded
    ``"mapping"`` list are processed. Each track is divided into
    ``num_segments`` equal slices of ``SAMPLES_TRACK / num_segments`` samples,
    and a dB-scaled mel spectrogram is stored for the slices selected by
    ``segment_indices``.

    Args:
        data_path: Root directory whose sub-directories contain the audio files.
        json_path: Destination of the JSON file.
        n_fft: FFT window size passed to librosa.
        n_mels: Number of mel bands.
        hop_length: Hop length passed to librosa.
        num_segments: Number of slices each track is divided into; this sets
            the slice length.
        segment_indices: Which of those slices to keep. The default keeps the
            first three, as the original hard-coded list did.

    The JSON object has three keys: ``"mapping"`` (genre names), ``"mels"``
    (one nested list per kept slice) and ``"labels"`` (for each slice, the
    index of its genre in ``"mapping"``).
    """
    # Store data
    data = {"mapping": ["reggae", "rock", "classical", "hiphop", "disco"],
            "mels": [],
            "labels": []}

    num_samples_segment = int(SAMPLES_TRACK / num_segments)

    # Loop genres
    for dirpath, dirnames, filenames in os.walk(data_path):
        semantic_label = os.path.basename(dirpath)
        if semantic_label not in data["mapping"]:
            continue

        # Label by position in "mapping" so that mapping[label] names the genre.
        # The directory-walk index would not line up with the five-entry list.
        label = data["mapping"].index(semantic_label)
        print(f"\nProcessing {semantic_label}")

        for f in filenames:
            file_path = os.path.join(dirpath, f)
            try:
                signal, _ = librosa.load(file_path, sr=SAMPLE_RATE)
            except Exception as err:
                # Skip unreadable files. Falling through would reuse the
                # previous track's samples under this file's label.
                print(f"Skipping {file_path}: {err}")
                continue

            for s in segment_indices:
                start_sample = num_samples_segment * s
                end_sample = start_sample + num_samples_segment

                # MEL SPECTROGRAM
                mel_sp = librosa.feature.melspectrogram(y=signal[start_sample:end_sample],
                                                        sr=SAMPLE_RATE,
                                                        n_fft=n_fft,
                                                        hop_length=hop_length,
                                                        n_mels=n_mels)
                log_mel_sp = librosa.power_to_db(mel_sp)

                data["mels"].append(log_mel_sp.tolist())
                data["labels"].append(label)
                print(f"{file_path}, {s}")

    # Write once after the walk instead of re-serialising the growing
    # dataset after every genre.
    with open(json_path, "w") as fp:
        json.dump(data, fp, indent=4)

In [None]:
# Build the log-mel spectrogram dataset with the default settings and write it to MEL_JSON_PATH.
save_melspectrogram(DATASET_PATH, MEL_JSON_PATH)

In [None]:
def load_dataset(data_path):
    """Read the mel-spectrogram dataset stored as JSON at ``data_path``.

    Returns:
        ``(X, y)``: NumPy arrays built from the ``"mels"`` and ``"labels"``
        entries of the file.
    """
    with open(data_path, "r") as handle:
        record = json.load(handle)

    spectrograms = np.array(record["mels"])
    targets = np.array(record["labels"])
    return spectrograms, targets

def split_in_sets_cnn(X, y, train_size, val_split):
    """Split ``X``/``y`` into train, validation and test sets for a 2-D CNN.

    ``train_size`` is the share of the data kept for training; ``val_split`` is
    the share of the remainder used for validation. A trailing channel axis is
    appended to every feature array.

    Returns:
        ``(X_train, X_val, X_test, y_train, y_val, y_test)``.
    """
    # Training data vs. held-out data, then held-out into validation and test.
    X_train, X_holdout, y_train, y_holdout = train_test_split(
        X, y, train_size=train_size
    )
    X_val, X_test, y_val, y_test = train_test_split(
        X_holdout, y_holdout, train_size=val_split
    )

    # Conv2D layers need an explicit channel dimension as the last axis.
    X_train, X_val, X_test = (
        np.expand_dims(subset, axis=-1) for subset in (X_train, X_val, X_test)
    )

    return X_train, X_val, X_test, y_train, y_val, y_test

In [None]:
from keras.layers import Conv2D, MaxPooling2D, BatchNormalization, Flatten, Dense, Dropout

def build_vgg16(input_shape, num_classes):
    """Build a VGG16-style classifier.

    The convolutional part has five stages of "same"-padded 3x3 ReLU
    convolutions (2, 2, 3, 3 and 3 layers with 64, 128, 256, 512 and 512
    filters), each followed by 2x2 max pooling. The head has three
    L1/L2-regularised dense layers with dropout 0.7 and a softmax output of
    ``num_classes`` units.

    Args:
        input_shape: Shape of one input sample without the batch axis.
        num_classes: Number of units in the softmax output layer.

    Returns:
        An uncompiled ``keras.models.Sequential`` model.
    """
    layer_plan = ((64, 2), (128, 2), (256, 3), (512, 3), (512, 3))

    network = []
    for n_filters, n_convs in layer_plan:
        for _ in range(n_convs):
            options = dict(filters=n_filters, kernel_size=(3,3), padding="same", activation="relu")
            if not network:
                # Only the very first layer carries the input shape.
                options["input_shape"] = input_shape
            network.append(Conv2D(**options))
        network.append(MaxPooling2D(pool_size=(2,2), strides=(2,2), padding="same"))

    network.append(Flatten())
    for width in (4096, 4096, 2048):
        weight_penalty = keras.regularizers.L1L2(l1=1e-5, l2=1e-4)
        network.append(Dense(width, activation='relu', kernel_regularizer=weight_penalty))
        network.append(Dropout(0.7))
    network.append(Dense(num_classes, activation='softmax'))

    return keras.models.Sequential(network)

In [None]:
# Load the log-mel spectrograms and labels, then split them 60% train and 20% each for validation and test
# (val_split=0.5 halves the 40% that is not used for training).
X, y = load_dataset(MEL_JSON_PATH)
X_train, X_val, X_test, y_train, y_val, y_test = split_in_sets_cnn(X, y, train_size=0.6, val_split=0.5)

In [None]:
# Visual sanity check: draw the first stored spectrogram on a time / mel-frequency grid.
plt.figure(figsize=(25,10))
librosa.display.specshow(X[0], sr=SAMPLE_RATE, x_axis="time", y_axis="mel")
# Colour bar formatted as signed values with no decimals.
plt.colorbar(format="%+2.f")

In [None]:
# Per-sample input shape: every axis of X_train except the leading sample axis.
input_shape = (X_train.shape[1], X_train.shape[2], X_train.shape[3])
# NOTE(review): the output layer has 10 units although save_melspectrogram lists only
# five genres in its "mapping" — confirm the intended number of classes.
model = build_vgg16(input_shape, 10)
model.summary()

In [None]:
# Adam with learning rate 1e-3; the sparse loss takes the integer labels as they are.
optimizer = keras.optimizers.Adam(learning_rate=0.001)
model.compile(optimizer = optimizer,
             loss="sparse_categorical_crossentropy",
             metrics=["accuracy"])

# Train for 50 epochs with batches of 128, validating after each epoch.
model.fit(X_train, y_train, validation_data=(X_val, y_val), batch_size=128, epochs=50)