In [1]:
from tqdm import tqdm
from collections import OrderedDict

import os, pickle
import argparse
import json
import numpy as np
import pandas as pd

import librosa
import scipy
import torch
import torch.nn as nn
from torch.utils import data
import torch.nn.functional as F

from argparse import Namespace
import glob, json, argparse

MAGIC_NUMBER = 59049 # previously 66150
np.random.seed(MAGIC_NUMBER)
MAGIC_NUMBER = 66150
BATCH_SIZE = 125
EPOCHS = 500

In [2]:
label_map = pd.read_csv(f'/CSC413/labels.csv', index_col=0, squeeze=True)
data_dir = f'/CSC413/fma/fma_npy'
all_files = np.array(glob.glob(data_dir+'/*.npy'))
n = len(all_files)
subset_indices = np.random.choice(n, n//10, replace=False)

In [3]:
# Train test split
train_set = all_files[np.setdiff1d(np.arange(n), subset_indices)]
val_set = all_files[subset_indices]

In [4]:
class FMADataset(torch.utils.data.Dataset):
    def __init__(self, audio_list):
        self.audio_list = audio_list
        self.n = len(self.audio_list)

    def __len__(self):
        return self.n * 19

    def __getitem__(self, idx):
        audio_fn = self.audio_list[idx % self.n]
        audio = np.load(audio_fn)
        offset = (idx // self.n) * ((audio.shape[0]-MAGIC_NUMBER) // 19)
        label = label_map[int(audio_fn.split('/')[-1][:6])]
        # audio,_ = load(audio_fn, sr=22050, res_type='kaiser_fast', offset = 0, duration = 3.0)
        # audio,_ = librosa.load(audio_fn, sr=22050, res_type='kaiser_fast')
        audio = np.abs(librosa.stft(audio[offset:offset+MAGIC_NUMBER], n_fft=1024, window=scipy.signal.hanning, hop_length=512))[:,:128]
        return audio, label

In [5]:
trainset = FMADataset(train_set)
trainset_gen = data.DataLoader(
    trainset,
    batch_size = BATCH_SIZE,
    shuffle = True,
    drop_last = True,
    num_workers = 4)

valset = FMADataset(val_set)
valset_gen = data.DataLoader(
    valset,
    batch_size = BATCH_SIZE,
    shuffle = True,
    drop_last = True,
    num_workers = 4)

In [8]:
class ConvNet(nn.Module):
    def __init__(self, num_class):
        super().__init__()

        self.cnn1 = nn.Sequential(
          nn.BatchNorm2d(1),
          nn.Conv2d(1, 256, kernel_size=(4, 513)),
          nn.ReLU()
        )
        self.cnn2 = nn.Sequential(
          nn.Conv2d(256, 256, kernel_size=(4,1)),
          nn.BatchNorm2d(256),
          nn.ReLU()
        )
        self.cnn3 = nn.Sequential(
          nn.Conv2d(256, 256, kernel_size=(4,1)),
          nn.BatchNorm2d(256),
          nn.ReLU()
        )
        self.cnn4 = nn.Sequential(
          nn.Conv2d(256, 256, kernel_size=(4,1)),
          nn.BatchNorm2d(256),
          nn.ReLU()
        )
        self.cnn5 = nn.Sequential(
          nn.Conv2d(256, 256, kernel_size=(4,1)),
          nn.BatchNorm2d(256),
          nn.ReLU()
        )
        self.pool1 = nn.MaxPool2d((125, 1))
        self.pool2 = nn.AvgPool2d((125, 1))

        self.drop = nn.Dropout(0.2)

        self.dense1 = nn.Sequential(
          nn.Linear(1024, 480),
          nn.ReLU(),
          nn.Dropout(0.2)
        )
        self.dense2 = nn.Sequential(
          nn.Linear(480, 240),
          nn.ReLU(),
          nn.Dropout(0.2)
        )

        self.classify = nn.Sequential(
          nn.Linear(240, num_class),
          nn.Softmax(dim=1)
        )

    def forward(self, x):
        layer1 = self.cnn1(x)
        
        layer2 = self.cnn2(F.pad(layer1, (0,0,2,1)))
        layer3 = self.cnn3(F.pad(layer2, (0,0,2,1)))
        layer4 = self.cnn4(F.pad(layer3, (0,0,2,1)))
        layer5 = self.cnn5(F.pad(layer4, (0,0,2,1)))

        layer6 = self.pool1(torch.cat((layer5, layer1), dim=1))
        layer7 = self.pool2(torch.cat((layer5, layer1), dim=1))

        layer8 = self.drop(torch.cat((layer6, layer7), dim=1)).view(-1, 1024)
        layer9 = self.dense1(layer8)
        layer10 = self.dense2(layer9)

        output = self.classify(layer10)

        return output

### Evaluation

In [9]:
device = 'cuda'
model = MyModel4(num_class=16).cuda()
model.load_state_dict(torch.load('models/model4fma_1.pt'))
model.cuda();

In [9]:
def evaluate(model):
    global valset, valset_gen
    corr = 0
    y_true = torch.empty(0)
    y_pred = torch.empty(0)
    n = len(valset)
    with tqdm(total = len(valset_gen), leave=True) as pbar:
        for m_batch, m_label in valset_gen:
            m_batch, m_label = m_batch.cuda(), m_label.cuda()
            output = model(m_batch.transpose(2,1).unsqueeze(1))
            _, pred = torch.max(output, 1)
            y_true = torch.cat((y_true, m_label))
            y_pred = torch.cat((y_pred, pred))
            corr += (m_label == pred).sum().item()
            pbar.set_description(f'epoch: {epoch+1}, cce: {cce_loss:.3f}')
            pbar.update(1)
    print(f'accuracy: {corr*100/n:.2f}, f1: {f1_loss()}')

In [11]:
# setup y_true y_pred for f1, precision, and recall calculations
y_true = torch.empty(0).cuda()
y_pred = torch.empty(0).cuda()
for m_batch, m_label in tqdm(valset_gen):
    m_batch, m_label = m_batch.cuda(), m_label.cuda()
    output = model(m_batch.transpose(2,1).unsqueeze(1))
    _, pred = torch.max(output, 1)
    y_true = torch.cat((y_true, m_label))
    y_pred = torch.cat((y_pred, pred))

100%|██████████| 379/379 [03:02<00:00,  2.07it/s]


In [48]:
# Validation accuracy
evaluate(model)

accuracy: 45.14


In [13]:
F1 = 0
epsilon = 1e-7
for c in range(16):
    count = (y_true==c).sum()
    tp = (y_pred[y_true==c]==c).sum()
    fp = (y_pred[y_true!=c]==c).sum()
    fn = (y_pred[y_true==c]!=c).sum()
    p = tp/(tp+fp+epsilon)
    r = tp/(tp+fn+epsilon)
    f1 = 2*(p*r)/(p+r+epsilon)
    print(f'f1: {f1.cpu().item():.2f} r: {r.cpu().item():.2f} p: {p.cpu().item():.2f}')
    F1 += f1*count
F1/y_true.shape[0]

f1: 0.00 r: 0.00 p: 0.00
f1: 0.00 r: 0.00 p: 0.06
f1: 0.00 r: 0.00 p: 0.00
f1: 0.00 r: 0.00 p: 0.00
f1: 0.60 r: 0.71 p: 0.52
f1: 0.14 r: 0.11 p: 0.21
f1: 0.00 r: 0.00 p: 0.00
f1: 0.00 r: 0.00 p: 0.00
f1: 0.00 r: 0.00 p: 0.00
f1: 0.00 r: 0.00 p: 0.00
f1: 0.00 r: 0.00 p: 0.00
f1: 0.00 r: 0.00 p: 0.00
f1: 0.00 r: 0.00 p: 0.00
f1: 0.59 r: 0.94 p: 0.43
f1: 0.00 r: 0.00 p: 0.00
f1: 0.00 r: 0.00 p: 0.00


tensor(0.3281, device='cuda:0')

## Train Loop

In [7]:
#set save directory
save_dir = 'models2/'
os.makedirs(save_dir, exist_ok=True)

In [10]:
device = 'cuda'
model = ConvNet(num_class=16)
criterion = nn.CrossEntropyLoss()
# optimizer = torch.optim.Adam(model.parameters(), lr=0.001)
optimizer = torch.optim.Adadelta(model.parameters(), lr=0.001, rho=0.95, eps=1e-7)
model.cuda()
for epoch in range(EPOCHS):
    model.train()
    corr = 0
    with tqdm(total = len(trainset_gen)+1, leave=True) as pbar:
        epoch_loss = 0
        for m_batch, m_label in trainset_gen:
            m_batch, m_label = m_batch.to(device), m_label.to(device)
            output = model(m_batch.transpose(2,1).unsqueeze(1))
            loss = criterion(output, m_label)

            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
            
            epoch_loss += loss.item()
            _, pred = torch.max(output, 1)
            _corr = (m_label == pred).sum().item()
            corr += _corr
            
            pbar.set_description(f'epoch: {epoch+1}, cce: {loss:.3f}, acc: {_corr*100/BATCH_SIZE}')
            pbar.update(1)
                    
        epoch_loss /= len(trainset_gen)
        pbar.set_description(f'epoch: {epoch+1}, avg loss: {epoch_loss:.3f}, acc: {corr*100/len(trainset):.2f}')
        pbar.update(1)
                    
    if (epoch+1) % 1 == 0:
        torch.save(model.state_dict(), save_dir +  f'model4fma_{epoch+1}.pt')

epoch: 1, cce: 2.729, acc: 26.4:  70%|██████▉   | 2383/3418 [22:59<09:58,  1.73it/s]

KeyboardInterrupt

