# Paper 4: Very Deep Convolutional Neural Networks for Raw Waveforms
* Reference: https://colab.research.google.com/github/pytorch/tutorials/blob/gh-pages/_downloads/audio_classifier_tutorial.ipynb
* Paper: https://arxiv.org/pdf/1610.00087.pdf

### Imports

In [1]:
%matplotlib inline

### Imports ###
import json
from random import randint
import pandas as pd
import numpy as np

# Pytorch
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torchvision import datasets, transforms
from torch.utils.data import Dataset
import torchaudio

# Audio Player
import IPython.display as ipd

### Location of Dataset

In [2]:
# Directory that holds the JSON descriptions of the dataset splits.
DATA_PATH = "/nfs/students/summer-term-2020/project-4/data/dataset1/finalDataset/"
# Split files; index 0 is loaded as the training split and index 1 as the
# validation split further below.
DATA_FILES = ["training.json", "validation.json", "testing.json"]

In [3]:
# Run on the GPU when CUDA is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(device)

cuda


### Load Dataset JSON

In [4]:
def getJSON(path):
    """Load and return the parsed contents of the JSON file at ``path``.

    Args:
        path: location of the JSON file to read.

    Returns:
        The decoded JSON value, exactly as produced by ``json.load``.

    Raises:
        OSError: if the file cannot be opened.
        ValueError: if the contents are not valid JSON
            (``json.JSONDecodeError`` is a ``ValueError`` subclass).
    """
    # JSON text is UTF-8; do not depend on the platform's locale default.
    with open(path, encoding="utf-8") as f:
        return json.load(f)
    
# Load the training and validation metadata (one record per audio clip).
dTrain, dVal = getJSON(DATA_PATH + DATA_FILES[0]), getJSON(DATA_PATH + DATA_FILES[1])

N = len(dTrain)
# randint() includes both endpoints, so the upper bound has to be N - 1 to
# stay inside the list.
i_random = randint(0, N - 1)
print("found " + str(N) + " samples.")
print(dTrain[0])
print("Sample " + str(i_random) + " | " + str(dTrain[i_random]["binary_class"]) + " | " + str(dTrain[i_random]["label_names"]))
# Last expression of the cell: renders an audio player for the random sample.
ipd.Audio(dTrain[i_random]["path"])

found 5060 samples.
{'youtube_id': '9kHMnPosPzw', 'start_seconds': 21.0, 'end_seconds': 31.0, 'binary_class': 'negative', 'labels': ['/m/07yv9', '/m/0k4j', '/t/dd00066'], 'label_names': ['Vehicle', 'Car', 'Medium engine (mid frequency)'], 'source': 'training_unbalanced', 'path': '/nfs/students/summer-term-2020/project-4/data/dataset1/download/training_unbalanced/negative/9kHMnPosPzw.wav'}
Sample 4109 | negative | ['Vehicle', 'Car', 'Race car, auto racing', 'Inside, large room or hall']


In [5]:
class EmergencyDataset(Dataset):
    """Audio clips as fixed-length mono waveforms with binary labels.

    Every item is a ``(waveform, label)`` pair: ``waveform`` is a tensor of
    shape ``(1, 32000)`` and ``label`` is ``1`` for records whose
    ``"binary_class"`` equals ``"positive"`` and ``0`` for everything else.
    """

    def __init__(self, jsonData):
        """Collect the file path and label of every record.

        Args:
            jsonData: iterable of dicts, each providing a ``"path"`` (audio
                file location) and a ``"binary_class"`` entry.
        """
        self.paths = []
        self.labels = []

        for sample in jsonData:
            self.paths.append(sample["path"])
            self.labels.append(1 if sample["binary_class"] == "positive" else 0)

    def __getitem__(self, index):
        """Load clip ``index`` and return ``(waveform, label)``.

        The audio is mixed down to one channel, cut or zero-padded to
        160000 samples and then thinned to every fifth sample.

        Args:
            index: position of the clip in the dataset.

        Returns:
            Tuple of a ``(1, 32000)`` tensor and the integer label.
        """
        path = self.paths[index]
        # HACK: this one training file is replaced by another negative clip
        # (presumably the original file cannot be loaded -- TODO confirm).
        if path == "/nfs/students/summer-term-2020/project-4/data/dataset1/download/training_unbalanced/negative/wM5Qf5xXT8w.wav":
            path = "/nfs/students/summer-term-2020/project-4/data/dataset1/download/training_unbalanced/negative/LrRe3G30fYM.wav"

        # load() returns the sample tensor first, followed by the sampling rate.
        sound = torchaudio.load(path, out = None, normalization = True)

        # Average over the first axis to mix all channels down to mono, then
        # reshape to a column vector.
        soundData = torch.mean(sound[0], dim=0).view(-1, 1)

        # Fixed-length buffer: longer clips are truncated, shorter clips keep
        # trailing zeros.
        tempData = torch.zeros([160000, 1])
        kept = min(soundData.numel(), 160000)
        tempData[:kept] = soundData[:kept]

        # Keep every fifth sample (160000 -> 32000). No low-pass filter is
        # applied, so the effective rate is the source rate divided by five.
        soundFormatted = torch.zeros([32000, 1])
        soundFormatted[:32000] = tempData[::5]
        # (samples, 1) -> (1, samples): channel-first layout for Conv1d.
        soundFormatted = soundFormatted.permute(1, 0)
        return soundFormatted, self.labels[index]

    def __len__(self):
        """Return the number of clips in the dataset."""
        return len(self.labels)

    def getPath(self, index):
        """Return the file path recorded for clip ``index``."""
        return self.paths[index]

    
# Wrap the training and validation records in datasets.
train_set = EmergencyDataset(dTrain)
val_set = EmergencyDataset(dVal)

# Loader options for GPU runs. `device` is a torch.device, and comparing it
# with the plain string 'cuda' does not evaluate to True, so test its `type`
# instead; otherwise these options are silently dropped.
kwargs = {'num_workers': 1, 'pin_memory': True} if device.type == 'cuda' else {}

train_loader = torch.utils.data.DataLoader(train_set, batch_size = 128, shuffle = True, **kwargs)
val_loader = torch.utils.data.DataLoader(val_set, batch_size = 128, shuffle = True, **kwargs)

In [6]:
# M5 (4 Conv Layers)
class NetM5(nn.Module):
    """M5 network: four Conv1d/BatchNorm/ReLU/MaxPool stages and a classifier.

    Takes waveforms of shape ``(batch, 1, samples)`` and returns
    log-probabilities of shape ``(batch, 1, 2)``. The fixed average-pool
    size matches inputs of 32000 samples.
    """

    def __init__(self):
        # Zero-argument super() binds to this class; naming another class
        # here makes instantiation fail.
        super().__init__()
        self.conv1 = nn.Conv1d(1, 128, 80, 4)
        self.bn1 = nn.BatchNorm1d(128)
        self.pool1 = nn.MaxPool1d(4)
        self.conv2 = nn.Conv1d(128, 128, 3)
        self.bn2 = nn.BatchNorm1d(128)
        self.pool2 = nn.MaxPool1d(4)
        self.conv3 = nn.Conv1d(128, 256, 3)
        self.bn3 = nn.BatchNorm1d(256)
        self.pool3 = nn.MaxPool1d(4)
        self.conv4 = nn.Conv1d(256, 512, 3)
        self.bn4 = nn.BatchNorm1d(512)
        self.pool4 = nn.MaxPool1d(4)
        # For a 32000-sample input the feature map here is 512x30, so this
        # pooling collapses it to 512x1.
        self.avgPool = nn.AvgPool1d(30)
        self.fc1 = nn.Linear(512, 2)

    def forward(self, x):
        """Return class log-probabilities for a batch of waveforms.

        Args:
            x: tensor of shape ``(batch, 1, samples)``.

        Returns:
            Tensor of shape ``(batch, 1, 2)`` holding log-probabilities.
        """
        x = self.conv1(x)
        x = F.relu(self.bn1(x))
        x = self.pool1(x)
        x = self.conv2(x)
        x = F.relu(self.bn2(x))
        x = self.pool2(x)
        x = self.conv3(x)
        x = F.relu(self.bn3(x))
        x = self.pool3(x)
        x = self.conv4(x)
        x = F.relu(self.bn4(x))
        x = self.pool4(x)
        x = self.avgPool(x)
        x = x.permute(0, 2, 1)  # (batch, 512, 1) -> (batch, 1, 512)
        x = self.fc1(x)
        return F.log_softmax(x, dim = 2)
    

    
    
# M11-style network. NOTE: as written it has 8 convolutional layers
# (conv1, conv2a/b, conv3a/b, conv4a/b, conv5) followed by one linear layer.
class Net(nn.Module):
    """Deep 1-D CNN over raw waveforms with a two-class log-softmax output.

    Takes tensors of shape ``(batch, 1, samples)`` and returns
    log-probabilities of shape ``(batch, 1, 2)``. The fixed average-pool
    size matches inputs of 32000 samples.
    """

    def __init__(self):
        super().__init__()
        # Stem: wide, strided convolution applied directly to the waveform.
        self.conv1 = nn.Conv1d(1, 64, kernel_size=80, stride=4)
        self.bn1 = nn.BatchNorm1d(64)
        self.pool1 = nn.MaxPool1d(4)

        # Stage 2: two 64-channel convolutions.
        self.conv2a = nn.Conv1d(64, 64, kernel_size=3)
        self.bn2a = nn.BatchNorm1d(64)
        self.conv2b = nn.Conv1d(64, 64, kernel_size=3)
        self.bn2b = nn.BatchNorm1d(64)
        self.pool2 = nn.MaxPool1d(4)

        # Stage 3: two 128-channel convolutions.
        self.conv3a = nn.Conv1d(64, 128, kernel_size=3)
        self.bn3a = nn.BatchNorm1d(128)
        self.conv3b = nn.Conv1d(128, 128, kernel_size=3)
        self.bn3b = nn.BatchNorm1d(128)
        self.pool3 = nn.MaxPool1d(4)

        # Stage 4: two 256-channel convolutions.
        self.conv4a = nn.Conv1d(128, 256, kernel_size=3)
        self.bn4a = nn.BatchNorm1d(256)
        self.conv4b = nn.Conv1d(256, 256, kernel_size=3)
        self.bn4b = nn.BatchNorm1d(256)
        self.pool4 = nn.MaxPool1d(4)

        # Stage 5: single 512-channel convolution, no max-pooling afterwards.
        self.conv5 = nn.Conv1d(256, 512, kernel_size=3)
        self.bn5 = nn.BatchNorm1d(512)

        # For a 32000-sample input the feature map here is 512x27, so this
        # pooling collapses it to 512x1.
        self.avgPool = nn.AvgPool1d(27)
        # Two output units, turned into log-probabilities in forward().
        self.fc1 = nn.Linear(512, 2)

    def forward(self, out):
        """Return class log-probabilities for a batch of waveforms.

        Args:
            out: tensor of shape ``(batch, 1, samples)``.

        Returns:
            Tensor of shape ``(batch, 1, 2)`` holding log-probabilities.
        """
        out = self.pool1(F.relu(self.bn1(self.conv1(out))))

        out = F.relu(self.bn2a(self.conv2a(out)))
        out = self.pool2(F.relu(self.bn2b(self.conv2b(out))))

        out = F.relu(self.bn3a(self.conv3a(out)))
        out = self.pool3(F.relu(self.bn3b(self.conv3b(out))))

        out = F.relu(self.bn4a(self.conv4a(out)))
        out = self.pool4(F.relu(self.bn4b(self.conv4b(out))))

        out = F.relu(self.bn5(self.conv5(out)))

        # (batch, 512, 1) -> (batch, 1, 512) so the linear layer acts on the
        # channel dimension.
        pooled = self.avgPool(out).permute(0, 2, 1)
        return F.log_softmax(self.fc1(pooled), dim=2)

# Build the network, move it to the selected device, and show its layers.
model = Net().to(device)
print(model)

Net(
  (conv1): Conv1d(1, 64, kernel_size=(80,), stride=(4,))
  (bn1): BatchNorm1d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (pool1): MaxPool1d(kernel_size=4, stride=4, padding=0, dilation=1, ceil_mode=False)
  (conv2a): Conv1d(64, 64, kernel_size=(3,), stride=(1,))
  (bn2a): BatchNorm1d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (conv2b): Conv1d(64, 64, kernel_size=(3,), stride=(1,))
  (bn2b): BatchNorm1d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (pool2): MaxPool1d(kernel_size=4, stride=4, padding=0, dilation=1, ceil_mode=False)
  (conv3a): Conv1d(64, 128, kernel_size=(3,), stride=(1,))
  (bn3a): BatchNorm1d(128, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (conv3b): Conv1d(128, 128, kernel_size=(3,), stride=(1,))
  (bn3b): BatchNorm1d(128, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (pool3): MaxPool1d(kernel_size=4, stride=4, padding=0, dilation=1, ceil_mode=F

In [7]:
# Adam optimizer; the scheduler multiplies the learning rate by 0.99 on every
# scheduler step.
optimizer = optim.Adam(params=model.parameters(), lr=1e-3)
scheduler = optim.lr_scheduler.ExponentialLR(optimizer, gamma=0.99)

In [8]:
def train(model, epoch):
    """Run one training epoch over ``train_loader`` and print progress.

    Reads the module-level ``train_loader``, ``optimizer``, ``device`` and
    ``log_interval``.

    Args:
        model: network expected to return log-probabilities of shape
            ``(batch, 1, classes)``.
        epoch: epoch number, used only in the progress messages.
    """
    model.train()
    correct = 0
    for batch_idx, (data, target) in enumerate(train_loader):
        optimizer.zero_grad()
        data = data.to(device)
        target = target.to(device)
        # Gradients are only needed for the parameters, so the input batch
        # is not marked as requiring grad.
        output = model(data)
        # (batch, 1, classes) -> (1, batch, classes), so output[0] is the
        # (batch, classes) matrix that nll_loss expects.
        output = output.permute(1, 0, 2)
        pred = output.max(2)[1]  # index of the max log-probability
        correct += pred.eq(target).cpu().sum().item()
        loss = F.nll_loss(output[0], target)
        loss.backward()
        optimizer.step()
        if batch_idx % log_interval == 0:  # periodic training stats
            print('Train Epoch: {} [{}/{} ({:.0f}%)]\tLoss: {:.6f}'.format(epoch, batch_idx * len(data), len(train_loader.dataset), 100. * batch_idx / len(train_loader), loss.item()))
    print('           TRAIN-ACC: {}/{} ({:.0f}%)'.format(correct, len(train_loader.dataset),100. * correct / len(train_loader.dataset)))
            
def test(model, epoch):
    """Print the model's accuracy on ``val_loader``.

    Reads the module-level ``val_loader`` and ``device``.

    Args:
        model: network expected to return log-probabilities of shape
            ``(batch, 1, classes)``.
        epoch: accepted for symmetry with ``train``; not used.
    """
    model.eval()
    correct = 0
    # Evaluation never calls backward(), so skip building the autograd graph.
    with torch.no_grad():
        for data, target in val_loader:
            data = data.to(device)
            target = target.to(device)
            output = model(data)
            output = output.permute(1, 0, 2)
            pred = output.max(2)[1]  # index of the max log-probability
            correct += pred.eq(target).cpu().sum().item()
    print('\nVAL-ACC: {}/{} ({:.0f}%)\n'.format(
        correct, len(val_loader.dataset),
        100. * correct / len(val_loader.dataset)))

In [None]:
# Print training statistics every `log_interval` batches.
log_interval = 20
# NOTE: range(1, 20) runs epochs 1..19, i.e. 19 epochs.
for epoch in range(1, 20):
    train(model, epoch)
    test(model, epoch)
    # Decay the learning rate only after this epoch's optimizer steps;
    # stepping the scheduler first would skip the initial learning rate.
    scheduler.step()



           TRAIN-ACC: 3768/5060 (74%)

VAL-ACC: 1211/1687 (72%)

           TRAIN-ACC: 3899/5060 (77%)

VAL-ACC: 1351/1687 (80%)

           TRAIN-ACC: 3992/5060 (79%)

VAL-ACC: 1278/1687 (76%)

           TRAIN-ACC: 4035/5060 (80%)

VAL-ACC: 1359/1687 (81%)

           TRAIN-ACC: 4075/5060 (81%)

VAL-ACC: 1329/1687 (79%)

           TRAIN-ACC: 4079/5060 (81%)

VAL-ACC: 1277/1687 (76%)

           TRAIN-ACC: 4125/5060 (82%)

VAL-ACC: 1297/1687 (77%)

           TRAIN-ACC: 4141/5060 (82%)

VAL-ACC: 1349/1687 (80%)

           TRAIN-ACC: 4137/5060 (82%)

VAL-ACC: 1372/1687 (81%)

           TRAIN-ACC: 4141/5060 (82%)

VAL-ACC: 1380/1687 (82%)

           TRAIN-ACC: 4179/5060 (83%)

VAL-ACC: 1362/1687 (81%)

           TRAIN-ACC: 4199/5060 (83%)

VAL-ACC: 1371/1687 (81%)

           TRAIN-ACC: 4209/5060 (83%)

VAL-ACC: 1323/1687 (78%)

           TRAIN-ACC: 4182/5060 (83%)

VAL-ACC: 1192/1687 (71%)

           TRAIN-ACC: 4214/5060 (83%)

VAL-ACC: 1357/1687 (80%)

           TRAIN-ACC: 421

# Results    

In [None]:
# Persist only the model's state_dict (not the whole module) to "./model".
torch.save(model.state_dict(), "./model")

In [None]:
def computePerformanceMetrics(model):
    """Print precision, recall, F1 and accuracy of ``model`` on ``val_loader``.

    Class ``1`` is treated as the positive class. Reads the module-level
    ``val_loader`` and ``device``. A metric whose denominator is zero is
    printed as ``nan``.

    Args:
        model: network expected to return log-probabilities of shape
            ``(batch, 1, 2)``.
    """
    def ratio(numerator, denominator):
        # Undefined ratios become NaN instead of raising or warning.
        return numerator / denominator if denominator else float("nan")

    tp = fp = tn = fn = 0
    correct = 0

    model.eval()
    # Evaluation never calls backward(), so skip building the autograd graph.
    with torch.no_grad():
        for data, target in val_loader:
            data = data.to(device)
            target = target.to(device)
            output = model(data)
            output = output.permute(1, 0, 2)
            pred = output.max(2)[1]  # index of the max log-probability
            correct += pred.eq(target).sum().item()

            predicted_pos = pred == 1
            actual_pos = target == 1
            # Accumulate plain Python ints so the counters stay usable even
            # when the loader yields no batches.
            tp += (predicted_pos & actual_pos).sum().item()
            tn += (~predicted_pos & ~actual_pos).sum().item()
            fp += (predicted_pos & ~actual_pos).sum().item()
            fn += (~predicted_pos & actual_pos).sum().item()

    prec = ratio(tp, tp + fp)
    rec = ratio(tp, tp + fn)
    f1 = ratio(2 * (prec * rec), prec + rec)

    print('Prec={:.2f}'.format(prec))
    print('Rec={:.2f}'.format(rec))
    print('F1={:.2f}'.format(f1))

    print('\nVAL-ACC: {}/{} ({:.0f}%)\n'.format(correct, len(val_loader.dataset),
        100. * correct / len(val_loader.dataset)))
    
# Print precision, recall, F1 and accuracy of the model on the validation loader.
computePerformanceMetrics(model)

### Analyzing wrong predictions

In [None]:
correct = 0

# Clips are scored one at a time, so batch norm must use its running
# statistics (eval mode); no gradients are needed either.
model.eval()
with torch.no_grad():
    for _ in range(100):
        # Sampled with replacement: the same clip may be drawn more than once.
        randomIndex = randint(0, len(val_set)-1)
        x, y = val_set[randomIndex]
        x = x.unsqueeze(0).to(device)  # add the batch dimension
        output = model(x)
        pred = output.max(2)[1].item()  # predicted class index as a plain int
        if pred == y:
            correct += 1
        else:
            # Show the misclassified clip together with an audio player.
            path = val_set.getPath(randomIndex)
            print(path)
            print("Predicted: " + ("EM" if pred else "NON-EM") + " - Label:" + ("EM" if y else "NON-EM"))
            ipd.display(ipd.Audio(path))

print("Correct classified: " + str(correct))

In [21]:
%autosave 1

Autosaving every 1 seconds
