<a href="https://colab.research.google.com/github/omier/music-genre-classifier/blob/master/DL_Final_Project.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [5]:
#@title RUN Pre-Processing
run_preprocessing = False #@param {type:"boolean"}


# Init Notebook

In [1]:
!git clone https://github.com/omier/music-genre-classifier.git

Cloning into 'music-genre-classifier'...
remote: Enumerating objects: 10878, done.[K
remote: Counting objects: 100% (10878/10878), done.[K
remote: Compressing objects: 100% (10877/10877), done.[K
remote: Total 12883 (delta 13), reused 10857 (delta 1), pack-reused 2005[K
Receiving objects: 100% (12883/12883), 1.70 GiB | 41.48 MiB/s, done.
Resolving deltas: 100% (14/14), done.
Checking out files: 100% (12993/12993), done.


In [2]:
!pip3 install pytorch_lightning efficientnet_pytorch

Collecting pytorch_lightning
[?25l  Downloading https://files.pythonhosted.org/packages/e7/d4/d2751586c7961f238a6077a6dc6e4a9214445da3219f463aa44b29fe4b42/pytorch_lightning-1.1.8-py3-none-any.whl (696kB)
[K     |▌                               | 10kB 25.1MB/s eta 0:00:01[K     |█                               | 20kB 14.0MB/s eta 0:00:01[K     |█▍                              | 30kB 12.6MB/s eta 0:00:01[K     |█▉                              | 40kB 11.9MB/s eta 0:00:01[K     |██▍                             | 51kB 7.7MB/s eta 0:00:01[K     |██▉                             | 61kB 7.3MB/s eta 0:00:01[K     |███▎                            | 71kB 8.3MB/s eta 0:00:01[K     |███▊                            | 81kB 9.2MB/s eta 0:00:01[K     |████▎                           | 92kB 8.6MB/s eta 0:00:01[K     |████▊                           | 102kB 7.6MB/s eta 0:00:01[K     |█████▏                          | 112kB 7.6MB/s eta 0:00:01[K     |█████▋                         

In [3]:
import torch
import torch.nn as nn
import torchvision
import torchvision.transforms as transforms
import torch.nn.functional as F
import torch.optim as optim
import math
from pytorch_lightning import metrics
import plotly.express as px
import pandas as pd
import numpy as np
import pprint

In [4]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
device

device(type='cuda')

# Pre-Processing

In [6]:
import librosa
from librosa import display
import matplotlib.pyplot as plt
import glob
import os
from matplotlib.backends.backend_agg import FigureCanvasAgg as FigureCanvas

In [7]:
n_fft = 2048
hop_length = 512
n_mels = 288
song_length = 30
song_mini_batch_length = 3

def preprocess(filename, out):
  for offset in range(0, song_length, song_mini_batch_length):
    y, sr = librosa.load(filename, duration=song_mini_batch_length, sr=None, offset=offset)
    song, _ = librosa.effects.trim(y)
    
    S = librosa.feature.melspectrogram(song, sr=sr, n_fft=n_fft, hop_length=hop_length, n_mels=n_mels)
    S_DB = librosa.power_to_db(S, ref=np.max)

    fig = plt.Figure()
    canvas = FigureCanvas(fig)
    ax = fig.add_subplot(111)
    ax.set_axis_off()
    librosa.display.specshow(S_DB, ax=ax, y_axis='log', x_axis='time')
    fig.savefig(f'{out}_{offset}.png', transparent=True)

In [8]:
data_path = 'music-genre-classifier/Data/'
output_directory = 'melspectograms/'
genres = glob.glob(f'{data_path}genres_original/*')

def ensure_dir(dir):
  if not os.path.exists(dir):
    os.mkdir(dir)

if run_preprocessing:
  ensure_dir(f'{data_path}{output_directory}')
  for g in genres:
    waves = glob.glob(f'{g}/*')
    genre = g.split('/')[-1]
    for w in waves:
      filename = '.'.join(w.split('/')[-1].split('.')[:-1])
      ensure_dir(f'{data_path}{output_directory}{genre}')
      preprocess(w, f'{data_path}{output_directory}{genre}/{filename}')

# Load Data

In [9]:
img_data = 'music-genre-classifier/Data/melspectograms/'
dataset = torchvision.datasets.ImageFolder(
    root=img_data,
    transform=torchvision.transforms.ToTensor(),
)

In [10]:
len(dataset)

9990

In [11]:
NUM_CLASSES = len(dataset.classes)

In [12]:
# 60% train, 20% validate, 20% test
trainset_size=math.ceil(len(dataset)*0.6)
valset_size=math.ceil(len(dataset)*0.2)
testset_size=len(dataset) - trainset_size - valset_size

trainset, valset, testset = torch.utils.data.random_split(dataset, [trainset_size, valset_size, testset_size])

In [13]:
trainloader = torch.utils.data.DataLoader(trainset, batch_size=16,
                                          shuffle=True)

valloader = torch.utils.data.DataLoader(valset, batch_size=16,
                                         shuffle=False)

testloader = torch.utils.data.DataLoader(testset, batch_size=16,
                                         shuffle=False)

In [14]:
data_loaders = {'train': trainloader, 'validation': valloader, 'test': testloader}

# Helpers

In [15]:
def train(model, n_epochs, criterion, trainloader):
  optimizer = optim.Adam(model.parameters(), lr=0.0001)
  
  history = []
  for e in range(1, n_epochs + 1):
    for counter, data in enumerate(trainloader):
      inputs, labels = data

      predicted_labels = model(inputs.to(device=device))
      optimizer.zero_grad()
      loss = criterion(predicted_labels, labels.to(device=device))
      loss.backward()
      optimizer.step()
    
    current_metrics = evaluate(model, criterion)
    print(f'Epoch {e}\\{n_epochs} Metrics')
    pprint.pprint(current_metrics, indent=4)

    history.append(current_metrics)

  return history

In [16]:
def evaluate(model, criterion, sets=['train', 'validation']):
  with torch.no_grad():
    sets_metrics = dict()

    for set_name, dataloader in data_loaders.items():
      if set_name in sets:
        recall = metrics.Recall(num_classes=NUM_CLASSES, average='macro').to(device=device)
        precision = metrics.Precision(num_classes=NUM_CLASSES, average='macro').to(device=device)
        accuracy = metrics.Accuracy().to(device=device)
        loss = 0

        for inputs, labels in dataloader:
          predicted_labels = model(inputs.to(device=device))
          labels = labels.to(device=device)
          loss += criterion(predicted_labels, labels.to(device=device)).item()

          recall.update(predicted_labels, labels)
          precision.update(predicted_labels, labels)
          accuracy.update(predicted_labels, labels)

        sets_metrics[set_name] = { 'recall': recall.compute().item(),
                                  'precision': precision.compute().item(),
                                  'accuracy': accuracy.compute().item(),
                                  'loss': loss / len(dataloader.dataset)}

    return sets_metrics

In [17]:
def plot(history):
  metrics_map = dict()

  for e_sets in history:
    for set_name, set_metrics in e_sets.items():
      for metric_name, metric_value in set_metrics.items():

        if metric_name not in metrics_map:
          metrics_map[metric_name] = dict()
        if set_name not in metrics_map[metric_name]:
          metrics_map[metric_name][set_name] = []

        metrics_map[metric_name][set_name].append(metric_value)
      
  for metric_name, sets in metrics_map.items():
    df = None
    for set_name, set_metrics in sets.items():
      size = len(set_metrics)
      if df is None:
        df = pd.DataFrame({"epoch": np.linspace(1, size, size),
                      metric_name: set_metrics,
                      "set": [set_name] * size})
      else:
        df = df.append(pd.DataFrame({"epoch": np.linspace(1, size, size),
                      metric_name: set_metrics,
                      "set": [set_name] * size}), ignore_index=True)

    fig = px.line(df, x="epoch", y=metric_name, line_group="set", title=f"epoch {metric_name} per dataset", color="set", hover_name="set")
    fig.show()

# CNN 3 Conv 3 Linear
convolution layer 1 (convolution -> relu -> max pool 2X2)

convolution layer 2 (convolution -> relu -> max pool 2X2)

convolution layer 3 (convolution -> relu -> max pool 2X2)

3 fully connected linear layers with relu activation function

In [18]:
class CNNGTZAN(nn.Module):

    def __init__(self):
        super(CNNGTZAN, self).__init__()

        self.conv1 = nn.Conv2d(3, 16, 3)
        self.conv2 = nn.Conv2d(16, 32, 3)
        self.conv3 = nn.Conv2d(32, 64, 3)

        # 288, 432 ->(3X3) 286, 430 ->(max pool 2X2) 143, 215 
        # 143, 215 ->(3X3) 141, 213 ->(max pool 2X2) 70, 106 
        # 70, 106  ->(3X3) 68, 104  ->(max pool 2X2) 34, 52
        self.fc1 = nn.Linear(64 * 34 * 52, 128)
        self.fc2 = nn.Linear(128, 64)
        self.fc3 = nn.Linear(64, 10)

    def forward(self, x):
        # convolution layer 1 (convolution -> relu -> max pool 2X2)
        x = F.max_pool2d(F.relu(self.conv1(x)), 2)
        # convolution layer 2 (convolution -> relu -> max pool 2X2)
        x = F.max_pool2d(F.relu(self.conv2(x)), 2)
        # convolution layer 3 (convolution -> relu -> max pool 2X2)
        x = F.max_pool2d(F.relu(self.conv3(x)), 2)

        # flatten x to (batch_size, 64 * 34 * 52) matrix - per instance flatten
        x = torch.flatten(x, start_dim=1)

        # fully connected linear layers with relu activation function
        x = F.relu(self.fc1(x))
        x = F.relu(self.fc2(x))

        # last fc linear layer
        x = self.fc3(x)

        return x

In [19]:
baseline_cnn = CNNGTZAN().to(device=device)

In [20]:
%%time
baseline_cnn_history = train(baseline_cnn, 15, nn.CrossEntropyLoss(), trainloader)

Epoch 1\15 Metrics
{   'train': {   'accuracy': 0.391391396522522,
                 'loss': 0.09918620963672578,
                 'precision': 0.495524138212204,
                 'recall': 0.3912365138530731},
    'validation': {   'accuracy': 0.3723723590373993,
                      'loss': 0.09970472542731254,
                      'precision': 0.48823004961013794,
                      'recall': 0.37881818413734436}}
Epoch 2\15 Metrics
{   'train': {   'accuracy': 0.4998331665992737,
                 'loss': 0.08523058506461594,
                 'precision': 0.5677370429039001,
                 'recall': 0.49965712428092957},
    'validation': {   'accuracy': 0.49299299716949463,
                      'loss': 0.08724371690649886,
                      'precision': 0.5529664158821106,
                      'recall': 0.49710533022880554}}
Epoch 3\15 Metrics
{   'train': {   'accuracy': 0.5909242630004883,
                 'loss': 0.0724995744836939,
                 'precision': 0.61

In [21]:
plot(baseline_cnn_history)

# 4L-2D CNN



In [22]:
class BigCNN(nn.Module):
    def __init__(self):
        super(BigCNN, self).__init__()

        # 4 layers of convolution and max pooling
        self._extractor = nn.Sequential(
            nn.Conv2d(in_channels=3, out_channels=32, kernel_size=3, stride=1, padding=1),
            nn.BatchNorm2d(32),
            nn.ReLU(),
            nn.MaxPool2d(kernel_size=2),

            nn.Conv2d(in_channels=32, out_channels=64, kernel_size=3, stride=1, padding=1),
            nn.BatchNorm2d(64),
            nn.ReLU(),
            nn.MaxPool2d(kernel_size=2),

            nn.Conv2d(in_channels=64, out_channels=128, kernel_size=3, stride=1, padding=1),
            nn.BatchNorm2d(128),
            nn.ReLU(),
            nn.MaxPool2d(kernel_size=4),

            nn.Conv2d(in_channels=128, out_channels=256, kernel_size=3, stride=1, padding=1),
            nn.BatchNorm2d(256),
            nn.ReLU(),
            nn.MaxPool2d(kernel_size=8),
        )
        
       
        # some linear layers for classification
        self._classifier = nn.Sequential(nn.Dropout(0.2),
                                         nn.Linear(in_features=3*2*256, out_features=512),
                                         nn.ReLU(),
                                         nn.Linear(in_features=512, out_features=256),
                                         nn.ReLU(),
                                         nn.Linear(in_features=256, out_features=NUM_CLASSES))

    def forward(self, x):
        # torch.Size([16, 3, 288, 432])

        x = self._extractor(x)
        # BATCH_SIZE, CHANNELS, FREQUENCY, TIME
        # torch.Size([16, 256, 2, 3])
       
        x = x.view(x.size(0), -1)
        # BATCH_SIZE, 256 * 2 * 3
        # torch.Size([16, 1536])

        score = self._classifier(x)
        # torch.Size([16, 10])
        return score


In [23]:
big_cnn = BigCNN().to(device=device)

In [24]:
%%time
big_cnn_history = train(big_cnn, 25, nn.CrossEntropyLoss(), trainloader)

Epoch 1\25 Metrics
{   'train': {   'accuracy': 0.5807474255561829,
                 'loss': 0.07758965500601539,
                 'precision': 0.5993649363517761,
                 'recall': 0.5823022723197937},
    'validation': {   'accuracy': 0.5715715885162354,
                      'loss': 0.07880182744743111,
                      'precision': 0.5885558724403381,
                      'recall': 0.5779266357421875}}
Epoch 2\25 Metrics
{   'train': {   'accuracy': 0.6646646857261658,
                 'loss': 0.05975523150520083,
                 'precision': 0.6777054071426392,
                 'recall': 0.6663299798965454},
    'validation': {   'accuracy': 0.6581581830978394,
                      'loss': 0.06140456274823026,
                      'precision': 0.6702247858047485,
                      'recall': 0.6597409844398499}}
Epoch 3\25 Metrics
{   'train': {   'accuracy': 0.7262262105941772,
                 'loss': 0.04860351100181157,
                 'precision': 0.7548

In [25]:
plot(big_cnn_history)

# 4L-2D CNN + GRU (fresh cnn with GRU)

In [26]:
class CNNGRU(nn.Module):
    def __init__(self):
        super(CNNGRU, self).__init__()

        # 4 layers of convolution and max pooling
        self._extractor = nn.Sequential(
            nn.Conv2d(in_channels=3, out_channels=32, kernel_size=3, stride=1, padding=1),
            nn.BatchNorm2d(32),
            nn.ReLU(),
            nn.MaxPool2d(kernel_size=2),

            nn.Conv2d(in_channels=32, out_channels=64, kernel_size=3, stride=1, padding=1),
            nn.BatchNorm2d(64),
            nn.ReLU(),
            nn.MaxPool2d(kernel_size=2),

            nn.Conv2d(in_channels=64, out_channels=128, kernel_size=3, stride=1, padding=1),
            nn.BatchNorm2d(128),
            nn.ReLU(),
            nn.MaxPool2d(kernel_size=4),

            nn.Conv2d(in_channels=128, out_channels=256, kernel_size=3, stride=1, padding=1),
            nn.BatchNorm2d(256),
            nn.ReLU(),
            nn.MaxPool2d(kernel_size=8),
        )
        
        # bidirectional GRU model with 3 hidden layers
        self._rnnModule = nn.GRU(512, 512, bidirectional=True, num_layers=3)
                                       

        # some linear layers for classification
        self._classifier = nn.Sequential(nn.Dropout(0.2),
                                         nn.Linear(in_features=3*2*512, out_features=512),
                                         nn.ReLU(),
                                         nn.Linear(in_features=512, out_features=256),
                                         nn.ReLU(),
                                         nn.Linear(in_features=256, out_features=NUM_CLASSES))

    def forward(self, x):
        # torch.Size([16, 3, 288, 432])

        x = self._extractor(x)
        # BATCH_SIZE, CHANNELS, FREQUENCY, TIME
        # torch.Size([16, 256, 2, 3])

        x = x.permute(0, 3, 1, 2)
        # BATCH_SIZE, TIME, CHANNELS, FREQUENCY
        # torch.Size([16, 3, 256, 2])

        x = x.view(x.size(0), x.size(1), -1)
        # BATCH_SIZE, TIME, CHANNELS*FREQUENCY
        # torch.Size([16, 3, 512])
      
        x, hn = self._rnnModule(x)
        # BATCH_SIZE, TIME, 512 * 2
        # torch.Size([16, 3, 1024])
       
        x = x.view(x.size(0), -1)
        # BATCH_SIZE, 512 * 2 * 3
        # torch.Size([16, 3072])

        score = self._classifier(x)
        # torch.Size([16, 10])
        return score


In [27]:
big_cnn_gru = CNNGRU().to(device=device)

In [28]:
%%time
big_cnn_gru_history = train(big_cnn_gru, 17, nn.CrossEntropyLoss(), trainloader)

Epoch 1\17 Metrics
{   'train': {   'accuracy': 0.3862195611000061,
                 'loss': 0.10201333354304622,
                 'precision': 0.29280683398246765,
                 'recall': 0.3858812749385834},
    'validation': {   'accuracy': 0.379879891872406,
                      'loss': 0.10179701963583151,
                      'precision': 0.28312408924102783,
                      'recall': 0.3791367709636688}}
Epoch 2\17 Metrics
{   'train': {   'accuracy': 0.5148481726646423,
                 'loss': 0.07757408637025176,
                 'precision': 0.49339112639427185,
                 'recall': 0.5139613151550293},
    'validation': {   'accuracy': 0.4934934973716736,
                      'loss': 0.08084674568863602,
                      'precision': 0.48594292998313904,
                      'recall': 0.5021165013313293}}
Epoch 3\17 Metrics
{   'train': {   'accuracy': 0.6661661863327026,
                 'loss': 0.0575294197301766,
                 'precision': 0.65

In [29]:
plot(big_cnn_gru_history)

# EfficientNet-b0 Transfer Learning
EfficientNet layer

AvgPool 2d

Linear layer

Softmax

In [30]:
from efficientnet_pytorch import EfficientNet

In [31]:
model = EfficientNet.from_pretrained('efficientnet-b0', num_classes=NUM_CLASSES, advprop=True)

Downloading: "https://github.com/lukemelas/EfficientNet-PyTorch/releases/download/1.0/adv-efficientnet-b0-b64d5a18.pth" to /root/.cache/torch/hub/checkpoints/adv-efficientnet-b0-b64d5a18.pth


HBox(children=(FloatProgress(value=0.0, max=21389172.0), HTML(value='')))


Loaded pretrained weights for efficientnet-b0


In [32]:
class MyEfficientNet(nn.Module):

  def __init__(self, efficientNetModel, use_GRU=False):
    super(MyEfficientNet, self).__init__()

    self.efficientNetModel = efficientNetModel
    # output size: torch.Size([batch_size, 1280, 9, 14])
    self.conv1 = nn.Sequential(
            nn.Conv2d(in_channels=1280, out_channels=256, kernel_size=3, stride=1, padding=1),
            nn.BatchNorm2d(256),
            nn.ReLU()
    )
   
    self.pool = nn.AdaptiveAvgPool2d(2)

    self._rnnModule = nn.GRU(512, 512, bidirectional=True, num_layers=3)

    self.use_GRU = use_GRU
    if self.use_GRU:
      lin_size = 512 * 2 * 2
    else:
      lin_size = 1280 * 2 * 2

    self.lin = nn.Linear(lin_size, NUM_CLASSES)
    

  def forward(self, x):
    x = self.efficientNetModel.extract_features(x)

    x = self.pool(x)
    
    if self.use_GRU:
      x = self.conv1(x)
      # 16, 256, 2, 2

      x = x.permute(0, 3, 1, 2)
      # BATCH_SIZE, TIME, CHANNELS, FREQUENCY
      # torch.Size([16, 2, 256, 2])

      x = x.view(x.size(0), x.size(1), -1)
      # BATCH_SIZE, TIME, CHANNELS*FREQUENCY
      # torch.Size([16, 2, 512])
        
      x, hn = self._rnnModule(x)
      # BATCH_SIZE, TIME, 512 * 2
      # torch.Size([16, 2, 1024])
        
    x = x.view(x.size(0), -1)
    # BATCH_SIZE, 512 * 2 * 2
    # torch.Size([16, 2048])

    x = self.lin(x)
    x = nn.Softmax()(x)
   
    return x


In [33]:
gtzan_EfficientNet = MyEfficientNet(model).to(device=device)

In [34]:
%%time
history_EfficientNet = train(gtzan_EfficientNet, 27, nn.CrossEntropyLoss(), trainloader)


Implicit dimension choice for softmax has been deprecated. Change the call to include dim=X as an argument.



Epoch 1\27 Metrics
{   'train': {   'accuracy': 0.7615949511528015,
                 'loss': 0.10721098422049521,
                 'precision': 0.7631682753562927,
                 'recall': 0.7628800272941589},
    'validation': {   'accuracy': 0.7377377152442932,
                      'loss': 0.10849216511776975,
                      'precision': 0.7426643967628479,
                      'recall': 0.7431961894035339}}
Epoch 2\27 Metrics
{   'train': {   'accuracy': 0.8119786381721497,
                 'loss': 0.10365625781379702,
                 'precision': 0.8205214738845825,
                 'recall': 0.8130123019218445},
    'validation': {   'accuracy': 0.7912912964820862,
                      'loss': 0.10508542704033302,
                      'precision': 0.8011468052864075,
                      'recall': 0.7936258316040039}}
Epoch 3\27 Metrics
{   'train': {   'accuracy': 0.8526860475540161,
                 'loss': 0.10099517211223548,
                 'precision': 0.8578

In [35]:
plot(history_EfficientNet)

In [36]:
gtzan_EfficientNet_with_GRU = MyEfficientNet(model, use_GRU=True).to(device=device)

In [37]:
%%time
history_EfficientNet_with_GRU = train(gtzan_EfficientNet_with_GRU, 27, nn.CrossEntropyLoss(), trainloader)


Implicit dimension choice for softmax has been deprecated. Change the call to include dim=X as an argument.



Epoch 1\27 Metrics
{   'train': {   'accuracy': 0.9629629850387573,
                 'loss': 0.09388217124137077,
                 'precision': 0.9646709561347961,
                 'recall': 0.9628807306289673},
    'validation': {   'accuracy': 0.8963963985443115,
                      'loss': 0.09796579058344539,
                      'precision': 0.9026166200637817,
                      'recall': 0.8969852328300476}}
Epoch 2\27 Metrics
{   'train': {   'accuracy': 0.9753086566925049,
                 'loss': 0.09305165463938568,
                 'precision': 0.9756068587303162,
                 'recall': 0.9753562211990356},
    'validation': {   'accuracy': 0.9204204082489014,
                      'loss': 0.09663926922642552,
                      'precision': 0.9228349924087524,
                      'recall': 0.9208486676216125}}
Epoch 3\27 Metrics
{   'train': {   'accuracy': 0.9676343202590942,
                 'loss': 0.09348214824159105,
                 'precision': 0.9680

In [38]:
plot(history_EfficientNet_with_GRU)

# VGG11 Transfer Learning

In [39]:
vgg11 = torchvision.models.vgg11(pretrained=True)
vgg11

Downloading: "https://download.pytorch.org/models/vgg11-bbd30ac9.pth" to /root/.cache/torch/hub/checkpoints/vgg11-bbd30ac9.pth


HBox(children=(FloatProgress(value=0.0, max=531456000.0), HTML(value='')))




VGG(
  (features): Sequential(
    (0): Conv2d(3, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (1): ReLU(inplace=True)
    (2): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
    (3): Conv2d(64, 128, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (4): ReLU(inplace=True)
    (5): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
    (6): Conv2d(128, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (7): ReLU(inplace=True)
    (8): Conv2d(256, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (9): ReLU(inplace=True)
    (10): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
    (11): Conv2d(256, 512, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (12): ReLU(inplace=True)
    (13): Conv2d(512, 512, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (14): ReLU(inplace=True)
    (15): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
 

In [40]:
class MyVGG11(nn.Module):
    def __init__(self, model):
        super(MyVGG11, self).__init__()
        self.features = model.features
        # self.avgpool = model.avgpool
        self.avgpool = nn.AvgPool2d(2)
        self.classifier = nn.Sequential(
            nn.Linear(4*6*512, 8192),
            nn.ReLU(True),
            nn.Dropout(),
            nn.Linear(8192, 4096),
            nn.ReLU(True),
            nn.Dropout(),
            nn.Linear(4096, 4096),
            nn.ReLU(True),
            nn.Dropout(),
            nn.Linear(4096, NUM_CLASSES),
        )

    def forward(self, x):
        x = self.features(x)
        x = self.avgpool(x)
        x = x.view(x.size(0), -1)
        x = self.classifier(x)
        return x

In [41]:
myVGG11Model = MyVGG11(vgg11).to(device=device)

In [42]:
%%time
myvgg11history = train(myVGG11Model, 11, nn.CrossEntropyLoss(), trainloader)

Epoch 1\11 Metrics
{   'train': {   'accuracy': 0.6673340201377869,
                 'loss': 0.058528810505832,
                 'precision': 0.7315589189529419,
                 'recall': 0.6684112548828125},
    'validation': {   'accuracy': 0.6631631851196289,
                      'loss': 0.06066543329227436,
                      'precision': 0.7281692624092102,
                      'recall': 0.6684015393257141}}
Epoch 2\11 Metrics
{   'train': {   'accuracy': 0.8081414699554443,
                 'loss': 0.03362364163960979,
                 'precision': 0.8299400210380554,
                 'recall': 0.8094421625137329},
    'validation': {   'accuracy': 0.7842842936515808,
                      'loss': 0.03886444949441486,
                      'precision': 0.8113005757331848,
                      'recall': 0.7868120074272156}}
Epoch 3\11 Metrics
{   'train': {   'accuracy': 0.9227560758590698,
                 'loss': 0.015757101587640432,
                 'precision': 0.92729

In [43]:
plot(myvgg11history)

# Models Compare (test set)

In [44]:
models = {
    'baseline CNN': baseline_cnn,
    '4 layers CNN': big_cnn,
    '4 layers CNN with GRU': big_cnn_gru,
    'EfficientNet-b0': gtzan_EfficientNet,
    'EfficientNet-b0 with GRU': gtzan_EfficientNet_with_GRU,
    'VGG11': myVGG11Model,
}

for model_name, model in models.items():
  print(model_name)
  print(evaluate(model, nn.CrossEntropyLoss(), sets=['test']))

baseline CNN
{'test': {'recall': 0.6768531799316406, 'precision': 0.6968328952789307, 'accuracy': 0.673173189163208, 'loss': 0.062056113396201644}}
4 layers CNN
{'test': {'recall': 0.8708521723747253, 'precision': 0.8816421627998352, 'accuracy': 0.8738738894462585, 'loss': 0.026895192054634517}}
4 layers CNN with GRU
{'test': {'recall': 0.8556930422782898, 'precision': 0.8609041571617126, 'accuracy': 0.8548548817634583, 'loss': 0.03110140386141009}}
EfficientNet-b0



Implicit dimension choice for softmax has been deprecated. Change the call to include dim=X as an argument.



{'test': {'recall': 0.9090687036514282, 'precision': 0.9086891412734985, 'accuracy': 0.9089089035987854, 'loss': 0.09720377670274721}}
EfficientNet-b0 with GRU
{'test': {'recall': 0.9007700085639954, 'precision': 0.9011687636375427, 'accuracy': 0.9009009003639221, 'loss': 0.0975866328488599}}
VGG11
{'test': {'recall': 0.8727920651435852, 'precision': 0.883734405040741, 'accuracy': 0.8743743896484375, 'loss': 0.03073935941289607}}


# Audio features based model - 3 Fully Connected linear layers

In [None]:
extracted_features_path = 'music-genre-classifier/Data/features_30_sec.csv'
audio_features_df = pd.read_csv(extracted_features_path)

In [None]:
audio_features_df = audio_features_df.drop(columns=['filename','length'])

In [None]:
audio_features_df["label"] = audio_features_df["label"].astype('category').cat.codes

In [None]:
audio_features_df

Unnamed: 0,chroma_stft_mean,chroma_stft_var,rms_mean,rms_var,spectral_centroid_mean,spectral_centroid_var,spectral_bandwidth_mean,spectral_bandwidth_var,rolloff_mean,rolloff_var,zero_crossing_rate_mean,zero_crossing_rate_var,harmony_mean,harmony_var,perceptr_mean,perceptr_var,tempo,mfcc1_mean,mfcc1_var,mfcc2_mean,mfcc2_var,mfcc3_mean,mfcc3_var,mfcc4_mean,mfcc4_var,mfcc5_mean,mfcc5_var,mfcc6_mean,mfcc6_var,mfcc7_mean,mfcc7_var,mfcc8_mean,mfcc8_var,mfcc9_mean,mfcc9_var,mfcc10_mean,mfcc10_var,mfcc11_mean,mfcc11_var,mfcc12_mean,mfcc12_var,mfcc13_mean,mfcc13_var,mfcc14_mean,mfcc14_var,mfcc15_mean,mfcc15_var,mfcc16_mean,mfcc16_var,mfcc17_mean,mfcc17_var,mfcc18_mean,mfcc18_var,mfcc19_mean,mfcc19_var,mfcc20_mean,mfcc20_var,label
0,0.350088,0.088757,0.130228,0.002827,1784.165850,129774.064525,2002.449060,85882.761315,3805.839606,9.015054e+05,0.083045,0.000767,-4.529724e-05,0.008172,0.000008,0.005698,123.046875,-113.570648,2564.207520,121.571793,295.913818,-19.168142,235.574432,42.366421,151.106873,-6.364664,167.934799,18.623499,89.180840,-13.704891,67.660492,15.343150,68.932579,-12.274110,82.204201,10.976572,63.386311,-8.326573,61.773094,8.803792,51.244125,-3.672300,41.217415,5.747995,40.554478,-5.162882,49.775421,0.752740,52.420910,-1.690215,36.524071,-0.408979,41.597103,-2.303523,55.062923,1.221291,46.936035,0
1,0.340914,0.094980,0.095948,0.002373,1530.176679,375850.073649,2039.036516,213843.755497,3550.522098,2.977893e+06,0.056040,0.001448,1.395807e-04,0.005099,-0.000178,0.003063,67.999589,-207.501694,7764.555176,123.991264,560.259949,8.955127,572.810913,35.877647,264.506104,2.907320,279.932922,21.510466,156.477097,-8.560436,200.849182,23.370686,142.555954,-10.099661,166.108521,11.900497,104.358612,-5.555639,105.173630,5.376327,96.197212,-2.231760,64.914291,4.220140,73.152534,-6.012148,52.422142,0.927998,55.356403,-0.731125,60.314529,0.295073,48.120598,-0.283518,51.106190,0.531217,45.786282,0
2,0.363637,0.085275,0.175570,0.002746,1552.811865,156467.643368,1747.702312,76254.192257,3042.260232,7.840345e+05,0.076291,0.001007,2.105576e-06,0.016342,-0.000019,0.007458,161.499023,-90.722595,3319.044922,140.446304,508.765045,-29.093889,411.781219,31.684334,144.090317,-13.984504,155.493759,25.764742,74.548401,-13.664875,106.981827,11.639934,106.574875,-11.783643,65.447945,9.718760,67.908859,-13.133803,57.781425,5.791199,64.480209,-8.907628,60.385151,-1.077000,57.711136,-9.229274,36.580986,2.451690,40.598766,-7.729093,47.639427,-1.816407,52.382141,-3.439720,46.639660,-2.231258,30.573025,0
3,0.404785,0.093999,0.141093,0.006346,1070.106615,184355.942417,1596.412872,166441.494769,2184.745799,1.493194e+06,0.033309,0.000423,4.583644e-07,0.019054,-0.000014,0.002712,63.024009,-199.544205,5507.517090,150.090897,456.505402,5.662678,257.161163,26.859079,158.267303,1.771399,268.034393,14.234031,126.794128,-4.832006,155.912079,9.286494,81.273743,-0.759186,92.114090,8.137607,71.314079,-3.200653,110.236687,6.079319,48.251999,-2.480174,56.799400,-1.079305,62.289902,-2.870789,51.651592,0.780874,44.427753,-3.319597,50.206673,0.636965,37.319130,-0.619121,37.259739,-3.407448,31.949339,0
4,0.308526,0.087841,0.091529,0.002303,1835.004266,343399.939274,1748.172116,88445.209036,3579.757627,1.572978e+06,0.101461,0.001954,-1.756129e-05,0.004814,-0.000010,0.003094,135.999178,-160.337708,5195.291992,126.219635,853.784729,-35.587811,333.792938,22.148071,193.456100,-32.478600,336.276825,10.852294,134.831573,-23.352329,93.257095,0.498434,124.672127,-11.793437,130.073349,1.207256,99.675575,-13.088418,80.254066,-2.813867,86.430626,-6.933385,89.555443,-7.552725,70.943336,-9.164666,75.793404,-4.520576,86.099236,-5.454034,75.269707,-0.916874,53.613918,-4.404827,62.910812,-11.703234,55.195160,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
995,0.352063,0.080487,0.079486,0.000345,2008.149458,282174.689224,2106.541053,88609.749506,4253.557033,1.222421e+06,0.089227,0.001305,1.926835e-06,0.004594,-0.000012,0.000681,103.359375,-153.640961,2540.949463,109.864647,455.579956,-23.065695,189.883865,59.050125,132.334518,-7.475873,92.553497,19.726795,58.462181,-16.484838,89.278427,18.066439,43.789928,-22.202005,77.724525,15.409106,61.443748,-12.372051,42.666080,12.348828,43.414345,-12.297895,49.383522,5.777902,56.457893,-10.273881,36.433578,1.789867,45.050526,-13.289984,41.754955,2.484145,36.778877,-6.713265,54.866825,-1.193787,49.950665,9
996,0.398687,0.075086,0.076458,0.000588,2006.843354,182114.709510,2068.942009,82426.016726,4149.338328,1.046621e+06,0.097664,0.000868,-1.406142e-06,0.003981,-0.000012,0.000818,117.453835,-142.392029,3282.358887,116.189629,345.518890,-32.147167,191.464813,49.117840,66.674255,-8.373376,81.370674,22.882061,44.545662,-18.315985,63.946007,20.098146,43.887966,-22.093319,37.119415,10.798820,53.151684,-12.979385,44.253551,14.080058,38.195637,-16.392994,34.824745,4.154395,29.224157,-4.364872,43.484493,3.739020,33.851742,-10.848309,39.395096,1.881229,32.010040,-7.461491,39.196327,-2.795338,31.773624,9
997,0.432142,0.075268,0.081651,0.000322,2077.526598,231657.968040,1927.293153,74717.124394,4031.405321,8.042154e+05,0.121823,0.001697,-4.881450e-07,0.003825,-0.000014,0.001124,129.199219,-124.952271,1681.638794,115.177155,475.088074,-47.975151,290.302795,52.814674,113.682693,-13.484810,77.294281,21.742352,81.096153,-12.407492,59.990017,20.122042,41.712540,-18.363756,44.554043,10.120382,82.069191,-16.037611,43.337452,10.474113,42.966385,-17.947060,44.368690,1.212127,37.248077,-11.519417,39.505585,1.838090,33.597008,-12.845291,36.367264,3.440978,36.001110,-12.588070,42.502201,-2.106337,29.865515,9
998,0.362485,0.091506,0.083860,0.001211,1398.699344,240318.731073,1818.450280,109090.207161,3015.631004,1.332712e+06,0.048724,0.000808,7.986756e-07,0.006077,-0.000013,0.000653,73.828125,-225.007751,10766.367188,123.646751,492.819122,-9.724174,605.487488,56.605164,189.945770,10.436500,156.834641,20.622280,211.036163,-10.253696,107.069740,15.561657,103.171165,-8.227290,69.988968,12.800152,92.819435,-9.001975,63.858929,7.682845,75.528023,-10.070732,56.776089,-0.256386,42.620800,-5.275370,46.763134,-2.812176,46.324894,-4.416050,43.583942,1.556207,34.331261,-5.041897,47.227180,-3.590644,41.299088,9


In [None]:
msk = np.random.rand(len(audio_features_df)) <= 0.8
train_audio_features_df = audio_features_df[msk]
validate_audio_features_df = audio_features_df[~msk]
print(len(train_audio_features_df))
print(len(validate_audio_features_df))

805
195


In [None]:
def seperate_xy(df, label_column_name='label'):
  return df.drop(columns=[label_column_name]), df[label_column_name]

def subset_to_tensor(df, label_column_name='label'):
    x, y = seperate_xy(df, label_column_name)
    return torch.from_numpy(x.values).float().to(device), torch.from_numpy(y.values).long().to(device)

train_features, train_labels = subset_to_tensor(train_audio_features_df)
validate_features, validate_labels = subset_to_tensor(validate_audio_features_df)

print(train_features.shape)
print(train_labels.shape)
print(validate_features.shape)
print(validate_labels.shape)

torch.Size([805, 57])
torch.Size([805])
torch.Size([195, 57])
torch.Size([195])


In [None]:
class MyDataSet():
  def __init__(self, x, y):
    self.x = x
    self.y = y

  def __len__(self): 
    return len(self.y)

  def __getitem__(self, idx):
    return [self.x[idx], self.y[idx]]

In [None]:
csv_trainloader = torch.utils.data.DataLoader(MyDataSet(train_features, train_labels), batch_size=16, shuffle=True)               
csv_valloader = torch.utils.data.DataLoader(MyDataSet(validate_features, validate_labels), batch_size=16, shuffle=False)

data_loaders = {'train': csv_trainloader, 'validation': csv_valloader, 'test': None}

In [None]:
class FeedForwardNet(nn.Module):
    def __init__(self):
        super(FeedForwardNet, self).__init__()
       
        self.classifier = nn.Sequential(
            nn.Linear(57, 40),
            nn.Sigmoid(),
            nn.Linear(40, 15),
            nn.Sigmoid(),
            nn.Linear(15, NUM_CLASSES),
            nn.Sigmoid()
        )

    def forward(self, x):
        x = self.classifier(x)
        return x

In [None]:
feedforward_net = FeedForwardNet().to(device=device)

In [None]:
%%time
feedforward_net_history = train(feedforward_net, 10, nn.CrossEntropyLoss(), csv_trainloader)

In [None]:
plot(feedforward_net_history)