#Modeling the GTZAN Dataset for Music Genre Classification

The GTZAN dataset is a widely used collection of audio files used for music genre classification research. It consists of 1,000 audio tracks each 30 seconds long, sampled at 22,050 Hz and stored in the uncompressed WAV format. The dataset covers 10 genres, with 100 tracks per genre. The genres included are blues, classical, country, disco, hip-hop, jazz, metal, pop, reggae, and rock.

Import required libraries

In [None]:
import tensorflow as tf
from tensorflow.keras import layers, models
from tensorflow.keras.applications import MobileNetV2
from tensorflow.keras.preprocessing import image
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Dense, GlobalAveragePooling2D, Dropout, LSTM
from sklearn.model_selection import train_test_split
import librosa
import numpy as np
import os


The load_and_preprocess_image function loads an image from a given path, resizes it to a standard size (224x224 pixels), converts it to a numpy array, and then normalizes the pixel values to be between 0 and 1. This is essential for ensuring that the image data is in a suitable format for feeding into a neural network.

The load_and_preprocess_audio function loads an audio file from a given path, ensures it has the desired length (in samples), and then computes the Mel-frequency cepstral coefficients (MFCCs) from the audio data. MFCCs are a representation of the short-term power spectrum of sound, often used as features for audio processing tasks like speech recognition or music genre classification.

In [None]:
import numpy as np
import librosa
from keras.preprocessing import image

def load_and_preprocess_image(image_path):
    """
    Load and preprocess an image from the given path.

    Args:
    - image_path (str): Path to the image file.

    Returns:
    - img_array (numpy.ndarray): Preprocessed image array.
    """
    # Load the image and resize it to (224, 224) as required by some models
    img = image.load_img(image_path, target_size=(224, 224))
    # Convert the image to a numpy array
    img_array = image.img_to_array(img)
    # Normalize the image array to values between 0 and 1
    return img_array/255.0

def load_and_preprocess_audio(audio_path, max_audio_length):
    """
    Load and preprocess an audio file from the given path.

    Args:
    - audio_path (str): Path to the audio file.
    - max_audio_length (int): Maximum length of audio data (in samples).

    Returns:
    - mfccs (numpy.ndarray): Mel-frequency cepstral coefficients (MFCCs) of the audio.
    """
    # Load the audio data
    audio_data, _ = librosa.load(audio_path, sr=SAMPLE_RATE)

    # Ensure the audio has the desired length
    if len(audio_data) < max_audio_length:
        # If too short, pad with zeros
        audio_data = np.pad(audio_data, (0, max_audio_length - len(audio_data)))
    else:
        # If too long, truncate
        audio_data = audio_data[:max_audio_length]

    # Compute MFCCs (Mel-frequency cepstral coefficients) from the audio data
    mfccs = librosa.feature.mfcc(y=audio_data, sr=SAMPLE_RATE, n_mfcc=num_mfcc, n_fft=n_fft, hop_length=hop_length)

    return mfccs


In [None]:
import librosa.util

# Constants
SAMPLE_RATE = 22050  # Sample rate of the audio
TRACK_DURATION = 30  # Duration of each audio track in seconds
SAMPLES_PER_TRACK = SAMPLE_RATE * TRACK_DURATION  # Total samples in each track

num_mfcc = 13  # Number of MFCC coefficients to extract
n_fft = 2048  # Length of the FFT window
hop_length = 512  # Hop length for the STFT

num_segments = 15  # Number of segments to divide each audio track into

# Calculate the maximum audio length in samples based on the number of segments
max_audio_length = SAMPLES_PER_TRACK * num_segments  # Max samples for all segments


In [None]:
import random
import os

# Function to create combinations of image and audio data
def create_data_combinations(image_folder, audio_folder, max_audio_length):
    # Initialize empty lists to store images, labels, and audio data
    images, labels, voices = [], [], []

    # Mapping of class names to numeric labels
    class_mapping = {
        'disco': 0,
        'metal': 1,
        'reggae': 2,
        'blues': 3,
        'rock': 4,
        'classical': 5,
        'jazz': 6,
        'hiphop': 7,
        'country': 8,
        'pop': 9
    }

    # Iterate through each class folder in the image folder
    for class_folder in os.listdir(image_folder):
        # Path to the current class folder
        class_path = os.path.join(image_folder, class_folder)
        i = 0
        # Iterate through each image in the class folder
        for image_name in os.listdir(class_path):
            image_path = os.path.join(class_path, image_name)# Path to the image file
            # Path to the audio folder corresponding to the class
            audio_path = os.path.join(audio_folder, class_folder)
            # Load and preprocess image data
            img_data = load_and_preprocess_image(image_path)
            # Get a list of audio files in the audio folder
            audio_files = os.listdir(audio_path)
            # Select a random sample of audio files (1 or less)
            selected_audio_files = random.sample(audio_files, min(1, len(audio_files)))
            # Iterate through selected audio files
            for audio in selected_audio_files:
                try:
                    # Path to the audio file
                    data_path = os.path.join(audio_path, audio)
                    # Load and preprocess audio data
                    audio_data = load_and_preprocess_audio(data_path, max_audio_length)
                    # Append image, audio, and label to the respective lists
                    images.append(img_data)
                    voices.append(audio_data)
                    label = class_mapping[class_folder]
                    labels.append(label)
                except:
                    continue
            # Limit the number of images per class to 30
            if i == 30:
                break
            i += 1
        # Print the class name after processing images for that class
        print(class_folder)

    return images, voices, labels


In [None]:
import os

# Change the current working directory to the specified path
os.chdir("/kaggle/input/gtzan-dataset-music-genre-classification/Data")

# Create data combinations from image and audio folders
image_folder = 'images_original'
audio_folder = 'genres_original'
# Call the create_data_combinations function to generate image, audio, and label combinations
images, voices, labels = create_data_combinations(image_folder, audio_folder, max_audio_length)


disco
metal
reggae
blues
rock
classical
jazz
hiphop
country
pop


In [None]:
# Split data into features and labels
X_image = np.array(images)
X_audio = np.array(voices)
Y_labels = np.array(labels)

In [None]:
X_audio.shape,X_image.shape,Y_labels.shape

((310, 13, 19380), (310, 224, 224, 3), (310,))

In [None]:
print(np.unique(Y_labels))

[0 1 2 3 4 5 6 7 8 9]


In [None]:
# Convert the integer labels to categorical labels
from tensorflow.keras.utils import to_categorical
Y_labels = to_categorical(Y_labels, num_classes=10)

In [None]:
X_image_train, X_image_val, X_audio_train, X_audio_val, y_train, y_val = train_test_split(
    X_image, X_audio, Y_labels, test_size=0.2, random_state=42
)

In [None]:
X_image_train.shape,X_audio_train.shape,y_train.shape

((248, 224, 224, 3), (248, 13, 19380), (248, 10))

In [None]:
X_image_val.shape,X_audio_val.shape,y_val.shape

((62, 224, 224, 3), (62, 13, 19380), (62, 10))

In [None]:
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Dense, Conv2D, MaxPooling2D, Flatten, LSTM, concatenate
from tensorflow.keras.applications import VGG16

# Load pre-trained VGG16 model with weights trained on ImageNet
vgg_model = VGG16(weights='imagenet', include_top=False, input_shape=(224, 224, 3))

# Freeze the layers of VGG16
for layer in vgg_model.layers:
    layer.trainable = False



In [None]:
# Define the shape of the input data
audio_input_shape = (num_mfcc, X_audio.shape[2])  # (number of MFCC coefficients, audio length)
image_input_shape = X_image.shape[1:]  # (image height, image width, number of channels)

# Define the input layers
audio_input = Input(shape=audio_input_shape, name='audio_input')
image_input = Input(shape=image_input_shape, name='image_input')

In [None]:
# Audio processing
audio_lstm = LSTM(64)(audio_input)
audio_output = Dense(32, activation='relu')(audio_lstm)

# Image processing using VGG16
image_vgg = vgg_model(image_input)
image_flatten = Flatten()(image_vgg)
image_output = Dense(512, activation='relu')(image_flatten)

In [None]:
# Concatenate the outputs of audio and image processing
merged = concatenate([audio_output, image_output])
merged = Dense(512, activation='relu')(merged)

# Final output layer
output = Dense(10, activation='softmax')(merged)  # Assuming a binary classification task

# Create the model
model = Model(inputs=[image_input, audio_input], outputs=output)

In [None]:
# Compile the model
model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])

# Display the model summary
model.summary()

In [None]:
# Train the model
model.fit(
    [X_image_train, X_audio_train],
    y_train,
    epochs=100,
    validation_data=([X_image_val, X_audio_val], y_val),
    batch_size=2
)

Epoch 1/100
[1m124/124[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 21ms/step - accuracy: 0.7190 - loss: 0.7594 - val_accuracy: 0.4032 - val_loss: 1.8729
Epoch 2/100
[1m124/124[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 20ms/step - accuracy: 0.8363 - loss: 0.5748 - val_accuracy: 0.4516 - val_loss: 2.0532
Epoch 3/100
[1m124/124[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 21ms/step - accuracy: 0.8400 - loss: 0.5215 - val_accuracy: 0.4516 - val_loss: 2.0941
Epoch 4/100
[1m124/124[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 20ms/step - accuracy: 0.8830 - loss: 0.3686 - val_accuracy: 0.4677 - val_loss: 2.1275
Epoch 5/100
[1m124/124[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 20ms/step - accuracy: 0.9247 - loss: 0.3018 - val_accuracy: 0.3548 - val_loss: 2.4990
Epoch 6/100
[1m124/124[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 20ms/step - accuracy: 0.8880 - loss: 0.3479 - val_accuracy: 0.5000 - val_loss: 1.9349
Epoch 7/100
[1m

<keras.src.callbacks.history.History at 0x7dff4b7d2b30>

In [None]:
# Evaluate the model
test_loss, test_acc = model.evaluate([X_image_val, X_audio_val], y_val)
print(f"Test accuracy: {test_acc}")

[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 91ms/step - accuracy: 0.4264 - loss: 5.5020
Test accuracy: 0.4677419364452362


In [None]:
# Mapping of class names to numeric labels
class_mapping = {
    'disco': 0,
    'metal': 1,
    'reggae': 2,
    'blues': 3,
    'rock': 4,
    'classical': 5,
    'jazz': 6,
    'hiphop': 7,
    'country': 8,
    'pop': 9
    }

In [None]:
# Specify the paths to the new audio and image files
new_audio_path = '/kaggle/input/gtzan-dataset-music-genre-classification/Data/genres_original/blues/blues.00000.wav'
new_image_path = '/kaggle/input/gtzan-dataset-music-genre-classification/Data/images_original/blues/blues00000.png'

# Load and preprocess the single image and audio data
new_img_data = load_and_preprocess_image(new_image_path)
new_audio_data = load_and_preprocess_audio(new_audio_path, max_audio_length)

# Reshape the data to match the model input shape
new_img_data = np.expand_dims(new_img_data, axis=0)
new_audio_data = np.expand_dims(new_audio_data, axis=0)

# Make a prediction using the model
prediction = model.predict([new_img_data, new_audio_data])
# Get the predicted label
predicted_label = np.argmax(prediction)

print(f'The predicted label is: {predicted_label}')

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 26ms/step
The predicted label is: 3


In [None]:
from sklearn.metrics import confusion_matrix, classification_report
# Assuming you have trained your model and obtained predictions on the validation set
predictions = model.predict([X_image_val, X_audio_val])

predicted_labels = np.argmax(predictions, axis=1)
actual_labels = np.argmax(y_val, axis=1)
# Create a list of class labels
class_labels = list(class_mapping.keys())
print()
# Generate confusion matrix with class names
conf_matrix = confusion_matrix(actual_labels, predicted_labels)
print("Confusion Matrix:")
print(conf_matrix)

# Generate classification report with class names
class_report = classification_report(actual_labels, predicted_labels, target_names=class_labels)
print("Classification Report:")
print(class_report)

[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 77ms/step

Confusion Matrix:
[[0 0 1 5 0 0 0 1 0 0]
 [0 0 0 2 0 0 0 4 0 0]
 [0 2 3 1 0 0 1 1 0 0]
 [0 0 0 5 1 0 1 0 0 0]
 [0 2 0 0 0 0 0 0 0 0]
 [0 0 0 0 1 3 4 0 1 0]
 [0 0 1 2 0 0 2 0 1 0]
 [0 1 0 0 0 0 0 4 0 1]
 [0 0 0 0 0 1 2 0 0 0]
 [0 0 2 0 0 0 0 4 0 2]]
Classification Report:
              precision    recall  f1-score   support

       disco       0.00      0.00      0.00         7
       metal       0.00      0.00      0.00         6
      reggae       0.43      0.38      0.40         8
       blues       0.33      0.71      0.45         7
        rock       0.00      0.00      0.00         2
   classical       0.75      0.33      0.46         9
        jazz       0.20      0.33      0.25         6
      hiphop       0.29      0.67      0.40         6
     country       0.00      0.00      0.00         3
         pop       0.67      0.25      0.36         8

    accuracy                           0.31        62
   ma

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [None]:
#The model achieves an accuracy of 32% on the validation set->with 10 epochs
# model performs reasonably well for some classes, such as 'disco' and 'metal', it struggles with others, such as 'country'.