Convolutional Neural Network + LSTM (CNN-LSTM) for Music Genre Classification

In [30]:
import os
import kagglehub
import librosa
import librosa.display
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from tqdm import tqdm
import torch
import torch.nn as nn
import torch.optim as optim
from torchvision import datasets, transforms
from torch.utils.data import DataLoader, random_split

In [22]:
# Fetch the GTZAN dataset through kagglehub; `path` is the local dataset root
# that the per-genre WAV paths below are built from.
path = kagglehub.dataset_download("andradaolteanu/gtzan-dataset-music-genre-classification")

genres = [
    "blues",
    "classical",
    "country",
    "disco",
    "hiphop",
    "jazz",
    "metal",
    "pop",
    "reggae",
    "rock",
]
# Target sample rate (Hz). It is passed to librosa.load below so every clip is
# resampled to this rate, and it is reused when computing spectrograms.
sr = 22050

# songs[genre] is a list of waveforms, one per track index 0..99.
songs = {}
for genre in genres:
    genre_dir = os.path.join(path, "Data", "genres_original", genre)
    songs[genre] = []
    for i in tqdm(range(0, 100)):
        # Request `sr` explicitly and discard the returned rate: previously the
        # return value rebound `sr` on every iteration, so the constant above
        # was never actually used.
        song, _loaded_sr = librosa.load(
            os.path.join(genre_dir, f"{genre}.000{i:02d}.wav"),
            sr=sr,
        )
        songs[genre].append(song)



100%|██████████| 100/100 [00:00<00:00, 391.72it/s]
100%|██████████| 100/100 [00:00<00:00, 404.74it/s]
100%|██████████| 100/100 [00:00<00:00, 403.78it/s]
100%|██████████| 100/100 [00:00<00:00, 367.74it/s]
100%|██████████| 100/100 [00:00<00:00, 353.87it/s]
100%|██████████| 100/100 [00:00<00:00, 351.00it/s]
100%|██████████| 100/100 [00:00<00:00, 239.34it/s]
100%|██████████| 100/100 [00:00<00:00, 326.47it/s]
100%|██████████| 100/100 [00:00<00:00, 315.32it/s]
100%|██████████| 100/100 [00:00<00:00, 361.39it/s]


In [24]:
# Compute a log-scaled mel spectrogram for every track, keep it in memory
# (spectrograms[genre][i]) and also render it to spectrograms/<genre>/*.png so
# it can be loaded later as an image dataset.
spectrograms = {}
for genre in genres:
    os.makedirs(f"spectrograms/{genre}", exist_ok=True)
    spectrograms[genre] = []
    for i in tqdm(range(0, 100)):
        spectrogram = librosa.feature.melspectrogram(
            y=songs[genre][i], sr=sr, n_fft=2048, hop_length=512, n_mels=128
        )
        # melspectrogram returns a *power* spectrogram by default (power=2.0),
        # so the matching dB conversion is power_to_db. amplitude_to_db would
        # square the values again, doubling the dB scale and clipping more of
        # the dynamic range.
        spectrogram_db = librosa.power_to_db(spectrogram, ref=np.max)
        spectrograms[genre].append(spectrogram_db)

        # Explicit figure/axes so the figure that is saved and closed is
        # unambiguous inside the loop.
        fig, ax = plt.subplots(figsize=(10, 4))
        librosa.display.specshow(
            spectrogram_db, sr=sr, hop_length=512, x_axis='time', y_axis='mel', ax=ax
        )
        ax.axis('off')  # image only: no ticks, labels or frame
        fig.tight_layout()
        fig.savefig(
            f"spectrograms/{genre}/{genre}.000{i:02d}.png",
            bbox_inches='tight',
            pad_inches=0,
            transparent=False,
        )
        # Close each figure to avoid accumulating open figures across 1000 tracks.
        plt.close(fig)

100%|██████████| 100/100 [00:11<00:00,  8.38it/s]
100%|██████████| 100/100 [00:12<00:00,  8.26it/s]
100%|██████████| 100/100 [00:11<00:00,  8.42it/s]
100%|██████████| 100/100 [00:12<00:00,  8.03it/s]
100%|██████████| 100/100 [00:12<00:00,  8.11it/s]
100%|██████████| 100/100 [00:13<00:00,  7.57it/s]
100%|██████████| 100/100 [00:13<00:00,  7.52it/s]
100%|██████████| 100/100 [00:13<00:00,  7.58it/s]
100%|██████████| 100/100 [00:13<00:00,  7.49it/s]
100%|██████████| 100/100 [00:13<00:00,  7.67it/s]


In [33]:
# 1. Hyperparameters and device
DATA_DIR = "spectrograms"   # your folder with subfolders for each genre
BATCH_SIZE = 16
NUM_EPOCHS = 50
LEARNING_RATE = 1e-3
IMG_SIZE = (128, 128)       # resize all spectrograms to this square
SEED = 42                   # fixes the train/test partition and the global torch RNG
DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Seed the global torch RNG so weight initialisation and DataLoader shuffling
# start from the same state on every run.
torch.manual_seed(SEED)

# 2. Prepare transforms and dataset
transform = transforms.Compose([
    transforms.Grayscale(num_output_channels=1),      # ensure single-channel
    transforms.Resize(IMG_SIZE),
    transforms.ToTensor(),
    transforms.Normalize(mean=[0.5], std=[0.5])       # maps [0, 1] pixels to [-1, 1]
])

# ImageFolder derives the class labels from the sub-folder names under DATA_DIR.
full_dataset = datasets.ImageFolder(root=DATA_DIR, transform=transform)
num_classes = len(full_dataset.classes)

# 3. Split into train/test
train_size = int(0.8 * len(full_dataset))
test_size  = len(full_dataset) - train_size
# A dedicated seeded generator keeps the split identical across runs; without it
# the reported test accuracy is measured on a different subset every time.
train_ds, test_ds = random_split(
    full_dataset,
    [train_size, test_size],
    generator=torch.Generator().manual_seed(SEED),
)

train_loader = DataLoader(train_ds, batch_size=BATCH_SIZE, shuffle=True)
test_loader  = DataLoader(test_ds,  batch_size=BATCH_SIZE)

# 4. Define a simple CNN LSTM
class CNN_LSTM(nn.Module):
    """CNN feature extractor followed by an LSTM over the image width.

    Three conv/ReLU/max-pool stages reduce a single-channel image by a factor
    of 8 in both height and width. Each remaining column (all channels x all
    rows) becomes one LSTM time step, and the LSTM output at the last step is
    mapped to class logits.

    Args:
        num_classes: Number of output classes (size of the logits vector).
        lstm_hidden: Hidden size of the LSTM.
        lstm_layers: Number of stacked LSTM layers.
        dropout: Dropout between LSTM layers; only applied when
            ``lstm_layers > 1``.
        input_height: Height in pixels of the images passed to ``forward``.
            Defaults to 128, which reproduces the original fixed
            ``64 * 16`` LSTM input size. The width is not fixed by the
            model; it only determines the sequence length.

    Raises:
        ValueError: If ``input_height`` is smaller than 8, which would leave
            no rows after the three 2x poolings.
    """

    def __init__(self, num_classes, lstm_hidden=128, lstm_layers=1, dropout=0.3,
                 input_height=128):
        super().__init__()
        if input_height < 8:
            raise ValueError(
                f"input_height must be at least 8 (three 2x poolings), got {input_height}"
            )
        # CNN feature extractor; shape comments assume a 1x128x128 input.
        self.cnn = nn.Sequential(
            nn.Conv2d(1, 16, kernel_size=3, padding=1),
            nn.ReLU(inplace=True),
            nn.MaxPool2d(2),       # 16×64×64
            nn.Conv2d(16, 32, kernel_size=3, padding=1),
            nn.ReLU(inplace=True),
            nn.MaxPool2d(2),       # 32×32×32
            nn.Conv2d(32, 64, kernel_size=3, padding=1),
            nn.ReLU(inplace=True),
            nn.MaxPool2d(2),       # 64×16×16
        )
        # Three floor-halvings are equivalent to a single floor division by 8.
        feat_height = input_height // 8
        # LSTM over the time dimension (we treat width as "time").
        # input_size = channels * height (64 * 16 = 1024 for the default).
        self.lstm = nn.LSTM(
            input_size=64 * feat_height,
            hidden_size=lstm_hidden,
            num_layers=lstm_layers,
            batch_first=True,
            dropout=dropout if lstm_layers > 1 else 0
        )
        # Final classifier on the last LSTM output.
        self.classifier = nn.Sequential(
            nn.Linear(lstm_hidden, num_classes)
        )

    def forward(self, x):
        """Return class logits of shape ``[B, num_classes]``.

        Args:
            x: Tensor of shape ``[B, 1, input_height, W]`` with ``W >= 8`` so
                that at least one time step remains after pooling.
        """
        feat = self.cnn(x)                           # -> [B, 64, H/8, W/8]
        # Move width to the sequence axis and merge channels with height.
        feat = feat.permute(0, 3, 1, 2).flatten(2)   # -> [B, W/8, 64 * H/8]
        lstm_out, _ = self.lstm(feat)                # -> [B, W/8, lstm_hidden]
        final_feat = lstm_out[:, -1, :]              # last time step -> [B, lstm_hidden]
        return self.classifier(final_feat)

# Model, loss and optimiser used by the training loop below.
model = CNN_LSTM(num_classes).to(DEVICE)
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=LEARNING_RATE)

# 5. Training loop
for epoch in range(NUM_EPOCHS):
    model.train()
    # Sum of per-sample losses for this epoch; divided by train_size afterwards.
    loss_sum = 0.0
    for batch_x, batch_y in train_loader:
        batch_x = batch_x.to(DEVICE)
        batch_y = batch_y.to(DEVICE)

        optimizer.zero_grad()
        loss = criterion(model(batch_x), batch_y)
        loss.backward()
        optimizer.step()

        # criterion returns the batch mean, so weight it by the batch size.
        loss_sum += loss.item() * batch_x.size(0)

    epoch_loss = loss_sum / train_size
    print(f"Epoch {epoch+1}/{NUM_EPOCHS} — Loss: {epoch_loss:.4f}")

# 6. Quick evaluation on the first test batch
model.eval()
with torch.no_grad():
    # grab one batch
    inputs, labels = next(iter(test_loader))
    inputs, labels = inputs.to(DEVICE), labels.to(DEVICE)
    outputs = model(inputs)
    _, preds = torch.max(outputs, 1)   # index of the highest logit per sample

    # map indices back to genre names
    idx_to_genre = {v: k for k, v in full_dataset.class_to_idx.items()}
    pred_names = [idx_to_genre[p.item()] for p in preds]
    true_names = [idx_to_genre[l.item()] for l in labels]

    print("\nSample predictions:")
    # Show at most 8 samples. Slicing instead of indexing range(8) avoids an
    # IndexError when the batch holds fewer than 8 items.
    for pred_name, true_name in zip(pred_names[:8], true_names[:8]):
        print(f"  Predicted: {pred_name:10s}   —   Actual: {true_name}")

# 7. Compute overall test accuracy
# Count correct top-1 predictions over every batch of the test loader.
correct = 0
total = 0
with torch.no_grad():
    for batch_x, batch_y in test_loader:
        batch_x = batch_x.to(DEVICE)
        batch_y = batch_y.to(DEVICE)
        preds = model(batch_x).argmax(dim=1)
        total += batch_y.size(0)
        correct += preds.eq(batch_y).sum().item()
print(f"\nTest Accuracy: {correct/total:.2%}")

Epoch 1/50 — Loss: 2.2490
Epoch 2/50 — Loss: 1.8709
Epoch 3/50 — Loss: 1.8058
Epoch 4/50 — Loss: 1.7115
Epoch 5/50 — Loss: 1.6516
Epoch 6/50 — Loss: 1.6230
Epoch 7/50 — Loss: 1.5706
Epoch 8/50 — Loss: 1.4990
Epoch 9/50 — Loss: 1.4297
Epoch 10/50 — Loss: 1.4089
Epoch 11/50 — Loss: 1.4074
Epoch 12/50 — Loss: 1.2861
Epoch 13/50 — Loss: 1.2162
Epoch 14/50 — Loss: 1.1879
Epoch 15/50 — Loss: 1.1600
Epoch 16/50 — Loss: 1.0168
Epoch 17/50 — Loss: 0.9492
Epoch 18/50 — Loss: 0.8927
Epoch 19/50 — Loss: 0.8476
Epoch 20/50 — Loss: 0.7849
Epoch 21/50 — Loss: 0.7338
Epoch 22/50 — Loss: 0.7126
Epoch 23/50 — Loss: 0.6115
Epoch 24/50 — Loss: 0.5305
Epoch 25/50 — Loss: 0.4358
Epoch 26/50 — Loss: 0.3548
Epoch 27/50 — Loss: 0.3851
Epoch 28/50 — Loss: 0.2972
Epoch 29/50 — Loss: 0.2520
Epoch 30/50 — Loss: 0.1965
Epoch 31/50 — Loss: 0.1543
Epoch 32/50 — Loss: 0.1432
Epoch 33/50 — Loss: 0.1550
Epoch 34/50 — Loss: 0.0988
Epoch 35/50 — Loss: 0.0874
Epoch 36/50 — Loss: 0.1136
Epoch 37/50 — Loss: 0.0736
Epoch 38/5