<a href="https://colab.research.google.com/github/omier/music-genre-classifier/blob/master/DL_Final_Project.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Init Notebook

In [1]:
!git clone https://github.com/omier/music-genre-classifier.git

Cloning into 'music-genre-classifier'...
remote: Enumerating objects: 12, done.[K
remote: Counting objects: 100% (12/12), done.[K
remote: Compressing objects: 100% (12/12), done.[K
remote: Total 2017 (delta 6), reused 0 (delta 0), pack-reused 2005[K
Receiving objects: 100% (2017/2017), 1.19 GiB | 36.96 MiB/s, done.
Resolving deltas: 100% (7/7), done.
Checking out files: 100% (2004/2004), done.


In [2]:
!pip3 install pytorch_lightning efficientnet_pytorch

Collecting pytorch_lightning
[?25l  Downloading https://files.pythonhosted.org/packages/e7/d4/d2751586c7961f238a6077a6dc6e4a9214445da3219f463aa44b29fe4b42/pytorch_lightning-1.1.8-py3-none-any.whl (696kB)
[K     |▌                               | 10kB 20.5MB/s eta 0:00:01[K     |█                               | 20kB 25.0MB/s eta 0:00:01[K     |█▍                              | 30kB 23.0MB/s eta 0:00:01[K     |█▉                              | 40kB 20.0MB/s eta 0:00:01[K     |██▍                             | 51kB 21.0MB/s eta 0:00:01[K     |██▉                             | 61kB 15.6MB/s eta 0:00:01[K     |███▎                            | 71kB 15.1MB/s eta 0:00:01[K     |███▊                            | 81kB 14.8MB/s eta 0:00:01[K     |████▎                           | 92kB 14.7MB/s eta 0:00:01[K     |████▊                           | 102kB 15.2MB/s eta 0:00:01[K     |█████▏                          | 112kB 15.2MB/s eta 0:00:01[K     |█████▋                  

In [3]:
import torch
import torch.nn as nn
import torchvision
import torchvision.transforms as transforms
import torch.nn.functional as F
import torch.optim as optim
import math
from pytorch_lightning import metrics
import plotly.express as px
import pandas as pd
import numpy as np

In [4]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
device

device(type='cuda')

In [5]:
img_data = 'music-genre-classifier/Data/images_original/'
dataset = torchvision.datasets.ImageFolder(
    root=img_data,
    transform=torchvision.transforms.ToTensor(),
)

In [6]:
NUM_CLASSES = len(dataset.classes)

In [7]:
# 60% train, 20% validate, 20% test
trainset_size=math.ceil(len(dataset)*0.6)
valset_size=math.ceil(len(dataset)*0.2)
testset_size=len(dataset) - trainset_size - valset_size

trainset, valset, testset = torch.utils.data.random_split(dataset, [trainset_size, valset_size, testset_size])

In [8]:
trainloader = torch.utils.data.DataLoader(trainset, batch_size=16,
                                          shuffle=True)

valloader = torch.utils.data.DataLoader(valset, batch_size=16,
                                         shuffle=False)

testloader = torch.utils.data.DataLoader(testset, batch_size=16,
                                         shuffle=False)

In [9]:
data_loaders = {'train': trainloader, 'validation': valloader, 'test': testloader}

# Helpers

In [10]:
def train(model, n_epochs, criterion, trainloader):
  optimizer = optim.Adam(model.parameters(), lr=0.0001)
  
  history = []
  for e in range(1, n_epochs + 1):
    for counter, data in enumerate(trainloader):
      inputs, labels = data

      predicted_labels = model(inputs.to(device=device))
      optimizer.zero_grad()
      loss = criterion(predicted_labels, labels.to(device=device))
      loss.backward()
      optimizer.step()

    print(f'Epoch {e}')
    history.append(evaluate(model, criterion))

  return history

In [11]:
def evaluate(model, criterion, sets=['train', 'validation']):
  with torch.no_grad():
    sets_metrics = dict()

    for set_name, dataloader in data_loaders.items():
      if set_name in sets:
        recall = metrics.Recall(num_classes=NUM_CLASSES, average='macro').to(device=device)
        precision = metrics.Precision(num_classes=NUM_CLASSES, average='macro').to(device=device)
        accuracy = metrics.Accuracy().to(device=device)
        loss = 0

        for inputs, labels in dataloader:
          predicted_labels = model(inputs.to(device=device))
          labels = labels.to(device=device)
          loss += criterion(predicted_labels, labels.to(device=device)).item()

          recall.update(predicted_labels, labels)
          precision.update(predicted_labels, labels)
          accuracy.update(predicted_labels, labels)

        sets_metrics[set_name] = { 'recall': recall.compute().item(),
                                  'precision': precision.compute().item(),
                                  'accuracy': accuracy.compute().item(),
                                  'loss': loss / len(dataloader.dataset)}

    return sets_metrics

In [12]:
def plot(history):
  metrics_map = dict()

  for e_sets in history:
    for set_name, set_metrics in e_sets.items():
      for metric_name, metric_value in set_metrics.items():

        if metric_name not in metrics_map:
          metrics_map[metric_name] = dict()
        if set_name not in metrics_map[metric_name]:
          metrics_map[metric_name][set_name] = []

        metrics_map[metric_name][set_name].append(metric_value)
      
  for metric_name, sets in metrics_map.items():
    df = None
    for set_name, set_metrics in sets.items():
      size = len(set_metrics)
      if df is None:
        df = pd.DataFrame({"epoch": np.linspace(1, size, size),
                      metric_name: set_metrics,
                      "set": [set_name] * size})
      else:
        df = df.append(pd.DataFrame({"epoch": np.linspace(1, size, size),
                      metric_name: set_metrics,
                      "set": [set_name] * size}), ignore_index=True)

    fig = px.line(df, x="epoch", y=metric_name, line_group="set", title=f"epoch {metric_name} per dataset", color="set", hover_name="set")
    fig.show()

# CNN 3 Conv 3 Linear
convolution layer 1 (convolution -> relu -> max pool 2X2)

convolution layer 2 (convolution -> relu -> max pool 2X2)

convolution layer 3 (convolution -> relu -> max pool 2X2)

3 fully connected linear layers with relu activation function

In [19]:
class CNNGTZAN(nn.Module):

    def __init__(self):
        super(CNNGTZAN, self).__init__()

        self.conv1 = nn.Conv2d(3, 16, 3)
        self.conv2 = nn.Conv2d(16, 32, 3)
        self.conv3 = nn.Conv2d(32, 64, 3)

        # 288, 432 ->(3X3) 286, 430 ->(max pool 2X2) 143, 215 
        # 143, 215 ->(3X3) 141, 213 ->(max pool 2X2) 70, 106 
        # 70, 106  ->(3X3) 68, 104  ->(max pool 2X2) 34, 52
        self.fc1 = nn.Linear(64 * 34 * 52, 128)
        self.fc2 = nn.Linear(128, 64)
        self.fc3 = nn.Linear(64, 10)

    def forward(self, x):
        # convolution layer 1 (convolution -> relu -> max pool 2X2)
        x = F.max_pool2d(F.relu(self.conv1(x)), 2)
        # convolution layer 2 (convolution -> relu -> max pool 2X2)
        x = F.max_pool2d(F.relu(self.conv2(x)), 2)
        # convolution layer 3 (convolution -> relu -> max pool 2X2)
        x = F.max_pool2d(F.relu(self.conv3(x)), 2)

        # flatten x to (batch_size, 64 * 34 * 52) matrix - per instance flatten
        x = torch.flatten(x, start_dim=1)

        # fully connected linear layers with relu activation function
        x = F.relu(self.fc1(x))
        x = F.relu(self.fc2(x))

        # last fc linear layer
        x = self.fc3(x)

        return x

In [20]:
gtzan = CNNGTZAN().to(device=device)

In [21]:
history = train(gtzan, 25, nn.CrossEntropyLoss(), trainloader)

Epoch 1
Epoch 2
Epoch 3
Epoch 4
Epoch 5
Epoch 6
Epoch 7
Epoch 8
Epoch 9
Epoch 10
Epoch 11
Epoch 12
Epoch 13
Epoch 14
Epoch 15
Epoch 16
Epoch 17
Epoch 18
Epoch 19
Epoch 20
Epoch 21
Epoch 22
Epoch 23
Epoch 24
Epoch 25
Epoch 26
Epoch 27
Epoch 28
Epoch 29
Epoch 30
Epoch 31
Epoch 32
Epoch 33
Epoch 34
Epoch 35
Epoch 36
Epoch 37
Epoch 38
Epoch 39
Epoch 40


In [22]:
plot(history)

# CNN 4 Layers

In [23]:
class BigCNN(nn.Module):
    def __init__(self):
        super(BigCNN, self).__init__()

        # 4 layers of convolution and max pooling
        self._extractor = nn.Sequential(
            nn.Conv2d(in_channels=3, out_channels=32, kernel_size=3, stride=1, padding=1),
            nn.BatchNorm2d(32),
            nn.ReLU(),
            nn.MaxPool2d(kernel_size=2),

            nn.Conv2d(in_channels=32, out_channels=64, kernel_size=3, stride=1, padding=1),
            nn.BatchNorm2d(64),
            nn.ReLU(),
            nn.MaxPool2d(kernel_size=2),

            nn.Conv2d(in_channels=64, out_channels=128, kernel_size=3, stride=1, padding=1),
            nn.BatchNorm2d(128),
            nn.ReLU(),
            nn.MaxPool2d(kernel_size=4),

            nn.Conv2d(in_channels=128, out_channels=256, kernel_size=3, stride=1, padding=1),
            nn.BatchNorm2d(256),
            nn.ReLU(),
            nn.MaxPool2d(kernel_size=8),
        )
        
       
        # some linear layers for classification
        self._classifier = nn.Sequential(nn.Dropout(0.2),
                                         nn.Linear(in_features=3*2*256, out_features=512),
                                         nn.ReLU(),
                                         nn.Linear(in_features=512, out_features=256),
                                         nn.ReLU(),
                                         nn.Linear(in_features=256, out_features=NUM_CLASSES))

    def forward(self, x):
        # torch.Size([16, 3, 288, 432])

        x = self._extractor(x)
        # BATCH_SIZE, CHANNELS, FREQUENCY, TIME
        # torch.Size([16, 256, 2, 3])
       
        x = x.view(x.size(0), -1)
        # BATCH_SIZE, 256 * 2 * 3
        # torch.Size([16, 1536])

        score = self._classifier(x)
        # torch.Size([16, 10])
        return score


In [24]:
bigcnn = BigCNN().to(device=device)

In [25]:
bigcnn_history = train(bigcnn, 25, nn.CrossEntropyLoss(), trainloader)

Epoch 1
Epoch 2
Epoch 3
Epoch 4
Epoch 5
Epoch 6
Epoch 7
Epoch 8
Epoch 9
Epoch 10
Epoch 11
Epoch 12
Epoch 13
Epoch 14
Epoch 15
Epoch 16
Epoch 17
Epoch 18
Epoch 19
Epoch 20
Epoch 21
Epoch 22
Epoch 23
Epoch 24
Epoch 25


In [26]:
plot(bigcnn_history)

# EfficientNet-b0 Transfer Learning
EfficientNet layer

AvgPool 2d

Linear layer

Softmax

In [13]:
from efficientnet_pytorch import EfficientNet

In [14]:
model = EfficientNet.from_pretrained('efficientnet-b0', num_classes=NUM_CLASSES, advprop=True)

Downloading: "https://github.com/lukemelas/EfficientNet-PyTorch/releases/download/1.0/adv-efficientnet-b0-b64d5a18.pth" to /root/.cache/torch/hub/checkpoints/adv-efficientnet-b0-b64d5a18.pth


HBox(children=(FloatProgress(value=0.0, max=21389172.0), HTML(value='')))


Loaded pretrained weights for efficientnet-b0


In [18]:
class MyEfficientNet(nn.Module):

  def __init__(self, efficientNetModel, use_GRU=False):
    super(MyEfficientNet, self).__init__()

    self.efficientNetModel = efficientNetModel
    # output size: torch.Size([batch_size, 1280, 9, 14])
    self.conv1 = nn.Sequential(
            nn.Conv2d(in_channels=1280, out_channels=256, kernel_size=3, stride=1, padding=1),
            nn.BatchNorm2d(256),
            nn.ReLU()
    )
   
    self.pool = nn.AdaptiveAvgPool2d(2)

    self._rnnModule = nn.GRU(512, 512, bidirectional=True, num_layers=3)

    self.use_GRU = use_GRU
    if self.use_GRU:
      lin_size = 512 * 2 * 2
    else:
      lin_size = 1280 * 2 * 2

    self.lin = nn.Linear(lin_size, NUM_CLASSES)
    

  def forward(self, x):
    x = self.efficientNetModel.extract_features(x)

    x = self.pool(x)
    

    if self.use_GRU:
      x = self.conv1(x)
      # 16, 256, 2, 2

      x = x.permute(0, 3, 1, 2)
      # BATCH_SIZE, TIME, CHANNELS, FREQUENCY
      # torch.Size([16, 2, 256, 2])

      x = x.view(x.size(0), x.size(1), -1)
      # BATCH_SIZE, TIME, CHANNELS*FREQUENCY
      # torch.Size([16, 2, 512])
        
      x, hn = self._rnnModule(x)
      # BATCH_SIZE, TIME, 512 * 2
      # torch.Size([16, 2, 1024])
        
    x = x.view(x.size(0), -1)
    # BATCH_SIZE, 512 * 2 * 2
    # torch.Size([16, 2048])

    
    x = self.lin(x)
    x = nn.Softmax()(x)
   
    return x


In [19]:
gtzan_EfficientNet = MyEfficientNet(model).to(device=device)

In [20]:
history_EfficientNet = train(gtzan_EfficientNet, 27, nn.CrossEntropyLoss(), trainloader)


Implicit dimension choice for softmax has been deprecated. Change the call to include dim=X as an argument.



Epoch 1
Epoch 2
Epoch 3
Epoch 4
Epoch 5
Epoch 6
Epoch 7
Epoch 8
Epoch 9
Epoch 10
Epoch 11
Epoch 12
Epoch 13
Epoch 14
Epoch 15
Epoch 16
Epoch 17
Epoch 18
Epoch 19
Epoch 20
Epoch 21
Epoch 22
Epoch 23
Epoch 24
Epoch 25
Epoch 26
Epoch 27


In [21]:
plot(history_EfficientNet)

In [22]:
gtzan_EfficientNet_with_GRU = MyEfficientNet(model, use_GRU=True).to(device=device)

In [None]:
history_EfficientNet_with_GRU = train(gtzan_EfficientNet_with_GRU, 27, nn.CrossEntropyLoss(), trainloader)


Implicit dimension choice for softmax has been deprecated. Change the call to include dim=X as an argument.



Epoch 1
Epoch 2
Epoch 3
Epoch 4
Epoch 5
Epoch 6
Epoch 7


In [None]:
plot(history_EfficientNet_with_GRU)

# VGG11 Transfer Learning With CNN

In [None]:
vgg11 = torchvision.models.vgg11(pretrained=True)
vgg11

Downloading: "https://download.pytorch.org/models/vgg11-bbd30ac9.pth" to /root/.cache/torch/hub/checkpoints/vgg11-bbd30ac9.pth


HBox(children=(FloatProgress(value=0.0, max=531456000.0), HTML(value='')))




VGG(
  (features): Sequential(
    (0): Conv2d(3, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (1): ReLU(inplace=True)
    (2): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
    (3): Conv2d(64, 128, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (4): ReLU(inplace=True)
    (5): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
    (6): Conv2d(128, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (7): ReLU(inplace=True)
    (8): Conv2d(256, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (9): ReLU(inplace=True)
    (10): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
    (11): Conv2d(256, 512, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (12): ReLU(inplace=True)
    (13): Conv2d(512, 512, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (14): ReLU(inplace=True)
    (15): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
 

In [None]:
class MyVGG11(nn.Module):
    def __init__(self, model):
        super(MyVGG11, self).__init__()
        self.features = model.features
        # self.avgpool = model.avgpool
        self.avgpool = nn.AvgPool2d(2)
        self.classifier = nn.Sequential(
            nn.Linear(4*6*512, 8192),
            nn.ReLU(True),
            nn.Dropout(),
            nn.Linear(8192, 4096),
            nn.ReLU(True),
            nn.Dropout(),
            nn.Linear(4096, 4096),
            nn.ReLU(True),
            nn.Dropout(),
            nn.Linear(4096, NUM_CLASSES),
        )

    def forward(self, x):
        x = self.features(x)
        x = self.avgpool(x)
        x = x.view(x.size(0), -1)
        x = self.classifier(x)
        return x

In [None]:
myVGG11Model = MyVGG11(vgg11).to(device=device)

In [None]:
myvgg11history = train(myVGG11Model, 11, nn.CrossEntropyLoss(), trainloader)

Epoch 1
Epoch 2
Epoch 3
Epoch 4
Epoch 5
Epoch 6
Epoch 7
Epoch 8
Epoch 9
Epoch 10
Epoch 11


In [None]:
plot(myvgg11history)

# 4L-2D CNN + GRU (fresh cnn with GRU)

In [27]:
class CNNGRU(nn.Module):
    def __init__(self):
        super(CNNGRU, self).__init__()

        # 4 layers of convolution and max pooling
        self._extractor = nn.Sequential(
            nn.Conv2d(in_channels=3, out_channels=32, kernel_size=3, stride=1, padding=1),
            nn.BatchNorm2d(32),
            nn.ReLU(),
            nn.MaxPool2d(kernel_size=2),

            nn.Conv2d(in_channels=32, out_channels=64, kernel_size=3, stride=1, padding=1),
            nn.BatchNorm2d(64),
            nn.ReLU(),
            nn.MaxPool2d(kernel_size=2),

            nn.Conv2d(in_channels=64, out_channels=128, kernel_size=3, stride=1, padding=1),
            nn.BatchNorm2d(128),
            nn.ReLU(),
            nn.MaxPool2d(kernel_size=4),

            nn.Conv2d(in_channels=128, out_channels=256, kernel_size=3, stride=1, padding=1),
            nn.BatchNorm2d(256),
            nn.ReLU(),
            nn.MaxPool2d(kernel_size=8),
        )
        
        # bidirectional GRU model with 3 hidden layers
        self._rnnModule = nn.GRU(512, 512, bidirectional=True, num_layers=3)
                                       

        # some linear layers for classification
        self._classifier = nn.Sequential(nn.Dropout(0.2),
                                         nn.Linear(in_features=3*2*512, out_features=512),
                                         nn.ReLU(),
                                         nn.Linear(in_features=512, out_features=256),
                                         nn.ReLU(),
                                         nn.Linear(in_features=256, out_features=NUM_CLASSES))

    def forward(self, x):
        # torch.Size([16, 3, 288, 432])

        x = self._extractor(x)
        # BATCH_SIZE, CHANNELS, FREQUENCY, TIME
        # torch.Size([16, 256, 2, 3])

        x = x.permute(0, 3, 1, 2)
        # BATCH_SIZE, TIME, CHANNELS, FREQUENCY
        # torch.Size([16, 3, 256, 2])

        x = x.view(x.size(0), x.size(1), -1)
        # BATCH_SIZE, TIME, CHANNELS*FREQUENCY
        # torch.Size([16, 3, 512])
      
        x, hn = self._rnnModule(x)
        # BATCH_SIZE, TIME, 512 * 2
        # torch.Size([16, 3, 1024])
       
        x = x.view(x.size(0), -1)
        # BATCH_SIZE, 512 * 2 * 3
        # torch.Size([16, 3072])

        score = self._classifier(x)
        # torch.Size([16, 10])
        return score


In [28]:
cnngru = CNNGRU().to(device=device)

In [29]:
cnngru_history = train(cnngru, 27, nn.CrossEntropyLoss(), trainloader)

Epoch 1
Epoch 2
Epoch 3
Epoch 4
Epoch 5
Epoch 6
Epoch 7
Epoch 8
Epoch 9
Epoch 10
Epoch 11
Epoch 12
Epoch 13
Epoch 14
Epoch 15
Epoch 16
Epoch 17
Epoch 18
Epoch 19
Epoch 20
Epoch 21
Epoch 22
Epoch 23
Epoch 24
Epoch 25
Epoch 26
Epoch 27


In [30]:
plot(cnngru_history)

# Models Compare (test set)

In [None]:
evaluate(cnngru, nn.CrossEntropyLoss(), sets=['test'])


add gru to efficient net and see its(gru) contribution.

test vgg19

time train

hyperparams expirements

evalute testset for each model