# Task 6: Music Genre Classification (GTZAN Dataset)


**1) Tabular approach (MFCC + traditional ML)**


In [None]:
import os
import numpy as np
import pandas as pd
import librosa
import librosa.display
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, accuracy_score
import tensorflow as tf
from tensorflow.keras import layers, models
from tensorflow.keras.applications import VGG16
from tensorflow.keras.preprocessing.image import ImageDataGenerator
from tensorflow.keras.callbacks import EarlyStopping, ReduceLROnPlateau
import warnings
warnings.filterwarnings("ignore")


In [None]:
# Step 1: Set dataset path
DATASET_PATH = "/kaggle/input/gtzan-dataset-music-genre-classification/Data/genres_original"
SPECTRO_PATH = "spectrograms"

In [None]:
# Step 2: Extract better MFCC features for tabular model
def extract_features_tabular(dataset_path):
    features = []
    labels = []
    for genre in os.listdir(dataset_path):
        genre_path = os.path.join(dataset_path, genre)
        if not os.path.isdir(genre_path):
            continue
        for file in os.listdir(genre_path):
            file_path = os.path.join(genre_path, file)
            try:
                y, sr = librosa.load(file_path, duration=30)
                
                # Core MFCCs
                mfcc = librosa.feature.mfcc(y=y, sr=sr, n_mfcc=20)
                mfcc_mean = np.mean(mfcc.T, axis=0)
                
                # Extra features for more info
                chroma = librosa.feature.chroma_stft(y=y, sr=sr)
                chroma_mean = np.mean(chroma.T, axis=0)
                
                contrast = librosa.feature.spectral_contrast(y=y, sr=sr)
                contrast_mean = np.mean(contrast.T, axis=0)
                
                tonnetz = librosa.feature.tonnetz(y=librosa.effects.harmonic(y), sr=sr)
                tonnetz_mean = np.mean(tonnetz.T, axis=0)
                
                # Combine all
                feature_vector = np.hstack([mfcc_mean, chroma_mean, contrast_mean, tonnetz_mean])
                features.append(feature_vector)
                labels.append(genre)
            except Exception as e:
                print(f"Error processing {file_path}: {e}")
    return np.array(features), np.array(labels)

print("Extracting MFCC + extra features for RandomForest...")
X_tabular, y_tabular = extract_features_tabular(DATASET_PATH)

# Encode labels
encoder = LabelEncoder()
y_tabular_encoded = encoder.fit_transform(y_tabular)

# Scale
scaler = StandardScaler()
X_tabular_scaled = scaler.fit_transform(X_tabular)

# Train/test split
X_train_tab, X_test_tab, y_train_tab, y_test_tab = train_test_split(
    X_tabular_scaled, y_tabular_encoded, test_size=0.2, random_state=42
)

In [9]:
# Step 3: RandomForest with tuning
print("\nTraining tuned RandomForest...")
rf = RandomForestClassifier(random_state=42)
param_grid = {
    'n_estimators': [200, 300],
    'max_depth': [None, 20, 30],
    'min_samples_split': [2, 4],
}
grid_search = GridSearchCV(rf, param_grid, cv=3, n_jobs=-1, verbose=1)
grid_search.fit(X_train_tab, y_train_tab)
best_rf = grid_search.best_estimator_

y_pred_tab = best_rf.predict(X_test_tab)
print("\n--- RandomForest Results ---")
print("Accuracy:", accuracy_score(y_test_tab, y_pred_tab))
print(classification_report(y_test_tab, y_pred_tab, target_names=encoder.classes_))



Training tuned RandomForest...
Fitting 3 folds for each of 12 candidates, totalling 36 fits

--- RandomForest Results ---
Accuracy: 0.71
              precision    recall  f1-score   support

       blues       0.73      0.73      0.73        22
   classical       0.90      0.93      0.91        28
     country       0.67      0.73      0.70        22
       disco       0.62      0.62      0.62        21
      hiphop       0.64      0.74      0.68        19
        jazz       0.71      0.71      0.71        17
       metal       0.80      1.00      0.89        12
         pop       0.75      0.75      0.75        20
      reggae       0.58      0.58      0.58        24
        rock       0.67      0.27      0.38        15

    accuracy                           0.71       200
   macro avg       0.71      0.70      0.69       200
weighted avg       0.71      0.71      0.70       200



**2) Image-based approach (Spectrograms Transfer Learning)**

In [None]:
# Step 4: Create spectrogram images
def create_clean_spectrograms(dataset_path, output_path, img_size=(128, 128)):
    os.makedirs(output_path, exist_ok=True)
    for genre in os.listdir(dataset_path):
        genre_path = os.path.join(dataset_path, genre)
        if not os.path.isdir(genre_path):
            continue

        genre_out_path = os.path.join(output_path, genre)
        os.makedirs(genre_out_path, exist_ok=True)

        for file in os.listdir(genre_path):
            if not file.lower().endswith(".wav"):
                continue

            file_path = os.path.join(genre_path, file)
            try:
                # Load first 10 seconds for consistency
                y, sr = librosa.load(file_path, duration=10)

                # Compute mel spectrogram
                melspec = librosa.feature.melspectrogram(
                    y=y, sr=sr, n_mels=128, fmax=8000
                )
                melspec_db = librosa.power_to_db(melspec, ref=np.max)

                # Create plot without axes
                fig = plt.figure(figsize=(img_size[0] / 100, img_size[1] / 100), dpi=100)
                ax = plt.Axes(fig, [0., 0., 1., 1.])
                ax.set_axis_off()
                fig.add_axes(ax)

                librosa.display.specshow(
                    melspec_db,
                    sr=sr,
                    cmap="magma",
                    vmin=-80, vmax=0  # fixed color scale
                )

                save_path = os.path.join(genre_out_path, file.replace(".wav", ".png"))
                plt.savefig(save_path, bbox_inches=None, pad_inches=0)
                plt.close(fig)

            except Exception as e:
                print(f"Error processing {file_path}: {e}")

print("\nCreating clean spectrograms...")
create_clean_spectrograms(DATASET_PATH, SPECTRO_PATH)


In [20]:
# Step 5: Image generators with augmentation
IMG_SIZE = (128, 128)
BATCH_SIZE = 32

train_gen = ImageDataGenerator(
    rescale=1./255,
    validation_split=0.2,
)

train_data = train_gen.flow_from_directory(
    SPECTRO_PATH,
    target_size=IMG_SIZE,
    batch_size=BATCH_SIZE,
    subset="training"
)
val_data = train_gen.flow_from_directory(
    SPECTRO_PATH,
    target_size=IMG_SIZE,
    batch_size=BATCH_SIZE,
    subset="validation"
)

print("\nTraining Transfer Learning model (VGG16)...")

# Load base model without top layers
base_model = VGG16(weights='imagenet', include_top=False,
                   input_shape=(IMG_SIZE[0], IMG_SIZE[1], 3))

# Unfreeze last 8 layers for fine-tuning
for layer in base_model.layers[:-8]:
    layer.trainable = False
for layer in base_model.layers[-8:]:
    layer.trainable = True

# Build transfer learning model
transfer_model = models.Sequential([
    base_model,
    layers.GlobalAveragePooling2D(),
    layers.Dense(256, activation='relu'),
    layers.BatchNormalization(),
    layers.Dropout(0.5),
    layers.Dense(len(train_data.class_indices), activation='softmax')
])

# Compile with Adam + weight decay for stability
transfer_model.compile(
    optimizer=tf.keras.optimizers.Adam(learning_rate=1e-5),
    loss='categorical_crossentropy',
    metrics=['accuracy']
)

# Callbacks
from tensorflow.keras.callbacks import EarlyStopping, ReduceLROnPlateau

callbacks = [
    ReduceLROnPlateau(monitor='val_loss', factor=0.5, patience=5, min_lr=1e-7)
]

# Train model
history = transfer_model.fit(
    train_data,
    validation_data=val_data,
    epochs=50,
    callbacks=callbacks
)

# Step 7:compare results

# Tabular model accuracy
tabular_acc = accuracy_score(y_test_tab, y_pred_tab)

# Transfer Learning accuracy
vgg_loss, vgg_acc = transfer_model.evaluate(val_data, verbose=0)

# Create comparison table
results_df = pd.DataFrame({
    "Model": ["RandomForest (MFCC+Extra Features)", "VGG16 Transfer Learning"],
    "Accuracy": [tabular_acc, vgg_acc]
})

print("\n--- Model Accuracy Comparison ---")
print(results_df.to_string(index=False))


Found 800 images belonging to 10 classes.
Found 199 images belonging to 10 classes.

Training Transfer Learning model (VGG16)...
Epoch 1/50
[1m25/25[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 183ms/step - accuracy: 0.1552 - loss: 2.6833 - val_accuracy: 0.1005 - val_loss: 2.2767 - learning_rate: 1.0000e-05
Epoch 2/50
[1m25/25[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 131ms/step - accuracy: 0.3850 - loss: 1.7893 - val_accuracy: 0.1005 - val_loss: 2.1929 - learning_rate: 1.0000e-05
Epoch 3/50
[1m25/25[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 134ms/step - accuracy: 0.4988 - loss: 1.5276 - val_accuracy: 0.2010 - val_loss: 2.1220 - learning_rate: 1.0000e-05
Epoch 4/50
[1m25/25[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 136ms/step - accuracy: 0.5449 - loss: 1.3322 - val_accuracy: 0.2513 - val_loss: 2.0558 - learning_rate: 1.0000e-05
Epoch 5/50
[1m25/25[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 138ms/step - accuracy: 0.5870 - los