# Preamble

In [1]:
# json
import json

# Pytorch
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import torchaudio

import matplotlib.pyplot as plt
import librosa

import librosa.display
import IPython.display as ipd

import numpy as np
import pickle
import random

In [2]:
import torch
import librosa
from torch_specinv import griffin_lim
from torch_specinv.metrics import spectral_convergence as SC

from __future__ import print_function
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torchvision import datasets, transforms
import numpy as np
import matplotlib.pyplot as plt

In [3]:
%load_ext autoreload

In [4]:
from tqdm import tqdm
from sklearn import metrics

In [5]:
# Define what device we are using
print("CUDA Available: ",torch.cuda.is_available())
use_cuda=True
device = torch.device("cuda" if (use_cuda and torch.cuda.is_available()) else "cpu")

CUDA Available:  False


In [6]:
import os
import sys
module_path = os.path.abspath(os.path.join('../../../src/'))
if module_path not in sys.path:
    sys.path.append(module_path)

# Load data

In [6]:
import torch
from datasets.audioset import Audioset
from torchvision import transforms
from torch.utils.data import DataLoader

audio_set_tr = Audioset(split_mode='training', fixed_padding=True)
audio_set_val = Audioset(split_mode='validation', fixed_padding=True)

ImportError: cannot import name 'Audioset' from 'datasets.audioset' (/nfs/homedirs/scholten/ml-lab-git/src/datasets/audioset.py)

# Validating

In [98]:
def reportScore(y_true, y_pred):
    print("\tAccuracy:\t" + str(metrics.accuracy_score(y_true,y_pred)))
    print("\tPrecision:\t" + str(metrics.precision_score(y_true,y_pred)))
    print("\tRecall:   \t" + str(metrics.recall_score(y_true,y_pred)))
    print("\tF1-score:\t" + str(metrics.f1_score(y_true,y_pred)))
    #tn, fp, fn, tp = metrics.confusion_matrix(y_true,y_pred).ravel()

In [135]:
def validate(model):
    valid_loader = DataLoader(audio_set_val, batch_size=1)
    
    model.eval()

    y_true = []
    y_pred = []
    with torch.no_grad():
        for data in tqdm(valid_loader):
            data = [item.cuda() for item in data] # move to gpu
            inputs, labels = data[:2], data[-1]
            outputs = model(inputs)

            y_pred.append(torch.max(outputs.data, 1)[1].item())
            y_true.append(labels.item())
            
    reportScore(y_true, y_pred)

# Model definition

In [45]:
cfg = {'name': 'Urban Testing',
 'data': {'type': 'YanDataManager',
  'path': '/nfs/students/summer-term-2020/project-4/data/dataset1/dataset_resampled',
  'format': 'audio',
  'loader': {'shuffle': True,
   'batch_size': 24,
   'num_workers': 4,
   'drop_last': True}},
 'transforms': {'type': 'AudioTransforms',
  'args': {'channels': 'avg', 'noise': [0.3, 0.001], 'crop': [0.0, 0.25]}},
 'optimizer': {'type': 'Adam',
  'args': {'lr': 0.002, 'weight_decay': 0.01, 'amsgrad': True}},
 'lr_scheduler': {'type': 'StepLR', 'args': {'step_size': 10, 'gamma': 0.5}},
 'model': {'type': 'AudioCRNN'},
 'train': {'loss': 'nll_loss',
  'epochs': 100,
  'save_dir': 'saved_cv/',
  'save_p': 1,
  'verbosity': 2,
  'monitor': 'min val_loss',
  'early_stop': 8,
  'tbX': True},
 'metrics': 'classification_metrics',
 'net_mode': 'init',
 'cfg': '[convs_module]\n    [conv2d]\n        out_channels=32\n        kernel_size=3\n        stride=1\n        padding=valid\n    [batchnorm2d]\n    [elu]\n    [maxpool2d]\n        kernel_size=3\n        stride=3\n    [dropout]\n        p=0.1\n    \n    REPEATx2\n        [conv2d]\n            out_channels=64\n            kernel_size=3\n            stride=1\n            padding=valid\n        [batchnorm2d]\n        [elu]\n        [maxpool2d]\n            kernel_size=4\n            stride=4\n        [dropout]\n            p=0.1\n    END\n\n[moddims]\n    permute=[2,1,0]\n    collapse=[1,2]\n\n[recur_module]\n    [lstm]\n        hidden_size = 256\n        num_layers = 2\n        bidirectional=True\n\n[moddims]\n    permute=[1]\n\n[dense_module]\n    [dropout]\n        p=0.2  \n    [batchnorm1d]\n    [linear]\n        out_features = 2\n'}

In [6]:
# audio.py
import numpy as np
import torch
import torch.nn as nn

from torchaudio.transforms import Spectrogram, MelSpectrogram , ComplexNorm
from torchaudio.transforms import TimeStretch, AmplitudeToDB 
from torch.distributions import Uniform

def _num_stft_bins(lengths, fft_length, hop_length, pad):
    return (lengths + 2 * pad - fft_length + hop_length) // hop_length

class MelspectrogramStretch(MelSpectrogram):

    def __init__(self, hop_length=None, 
                       sample_rate=48000, 
                       num_mels=128, 
                       fft_length=2048, 
                       norm='whiten', 
                       stretch_param=[0.4, 0.4]):

        super(MelspectrogramStretch, self).__init__(sample_rate=sample_rate, 
                                                    n_fft=fft_length, 
                                                    hop_length=hop_length, 
                                                    n_mels=num_mels)

        self.stft = Spectrogram(n_fft=self.n_fft, win_length=self.win_length,
                                       hop_length=self.hop_length, pad=self.pad, 
                                       power=None, normalized=False)

        # Augmentation
        self.prob = stretch_param[0]
        self.random_stretch = RandomTimeStretch(stretch_param[1], 
                                                self.hop_length, 
                                                self.n_fft//2+1, 
                                                fixed_rate=None)
        
        # Normalization (pot spec processing)
        self.complex_norm = ComplexNorm(power=2.)
        self.norm = SpecNormalization(norm)

    def forward(self, x, lengths=None):
        x = self.stft(x)

        if lengths is not None:
            lengths = _num_stft_bins(lengths, self.n_fft, self.hop_length, self.n_fft//2)
            lengths = lengths.long()
        
        if torch.rand(1)[0] <= self.prob and self.training:
            # Stretch spectrogram in time using Phase Vocoder
            x, rate = self.random_stretch(x)
            # Modify the rate accordingly
            lengths = (lengths.float()/rate).long()+1
        
        x = self.complex_norm(x)
        x = self.mel_scale(x)

        # Normalize melspectrogram
        x = self.norm(x)

        if lengths is not None:
            return x, lengths        
        return x

    def __repr__(self):
        return self.__class__.__name__ + '()'


class RandomTimeStretch(TimeStretch):

    def __init__(self, max_perc, hop_length=None, n_freq=201, fixed_rate=None):

        super(RandomTimeStretch, self).__init__(hop_length, n_freq, fixed_rate)
        self._dist = Uniform(1.-max_perc, 1+max_perc)

    def forward(self, x):
        rate = self._dist.sample().item()
        return super(RandomTimeStretch, self).forward(x, rate), rate


class SpecNormalization(nn.Module):

    def __init__(self, norm_type, top_db=80.0):

        super(SpecNormalization, self).__init__()

        if 'db' == norm_type:
            self._norm = AmplitudeToDB(stype='power', top_db=top_db)
        elif 'whiten' == norm_type:
            self._norm = lambda x: self.z_transform(x)
        else:
            self._norm = lambda x: x
        
    
    def z_transform(self, x):
        # Independent mean, std per batch
        non_batch_inds = [1, 2, 3]
        mean = x.mean(non_batch_inds, keepdim=True)
        std = x.std(non_batch_inds, keepdim=True)
        x = (x - mean)/std 
        return x

    def forward(self, x):
        return self._norm(x)

In [54]:
# model.py
import torch
import torch.nn as nn
import torch.nn.functional as F
import logging
import torch.nn as nn
import numpy as np

from torch.utils.data import DataLoader

# F.max_pool2d needs kernel_size and stride. If only one argument is passed, 
# then kernel_size = stride

from torchparse import parse_cfg

# Architecture inspiration from: https://github.com/keunwoochoi/music-auto_tagging-keras
class AudioCRNN(nn.Module):
    def __init__(self, config={}, state_dict=None):
        super(AudioCRNN, self).__init__()
        self.datasets = {}
        in_chan = 2 if config['transforms']['args']['channels'] == 'stereo' else 1

        self.logger = logging.getLogger(self.__class__.__name__)
        self.config = config
        
        self.classes = ['negative', 'positive']
        self.lstm_units = 64
        self.lstm_layers = 2
        self.spec = MelspectrogramStretch(hop_length=None, 
                                num_mels=128, 
                                fft_length=2048, 
                                norm='whiten', 
                                stretch_param=[0.4, 0.4])
        self.net = parse_cfg(config['cfg'], in_shape=[in_chan, self.spec.n_mels, 400])

    def _many_to_one(self, t, lengths):
        return t[torch.arange(t.size(0)), lengths - 1]

    def modify_lengths(self, lengths):
        def safe_param(elem):
            return elem if isinstance(elem, int) else elem[0]
        
        for name, layer in self.net['convs'].named_children():
            #if name.startswith(('conv2d','maxpool2d')):
            if isinstance(layer, (nn.Conv2d, nn.MaxPool2d)):
                p, k, s = map(safe_param, [layer.padding, layer.kernel_size,layer.stride]) 
                lengths = ((lengths + 2*p - k)//s + 1).long()

        return torch.where(lengths > 0, lengths, torch.tensor(1, device=lengths.device))

    def forward(self, batch):    
        x, lengths= batch['audio'], batch['lengths'] # unpacking seqs, lengths and srs
        # x-> (batch, time, channel)
        x = x.unsqueeze(2) # add channel dim
        # x-> (batch, channel, time)
        xt = x.float().transpose(1,2)
        # xt -> (batch, channel, freq, time)
        xt, lengths = self.spec(xt, lengths)                

        # (batch, channel, freq, time)
        xt = self.net['convs'](xt)
        lengths = self.modify_lengths(lengths)

        # xt -> (batch, time, freq, channel)
        x = xt.transpose(1, -1)

        # xt -> (batch, time, channel*freq)
        batch, time = x.size()[:2]
        x = x.reshape(batch, time, -1)
        x_pack = torch.nn.utils.rnn.pack_padded_sequence(x, lengths, batch_first=True)
    
        # x -> (batch, time, lstm_out)
        x_pack, hidden = self.net['recur'](x_pack)
        x, _ = torch.nn.utils.rnn.pad_packed_sequence(x_pack, batch_first=True)
        
        # (batch, lstm_out)
        x = self._many_to_one(x, lengths)
        # (batch, classes)
        x = self.net['dense'](x)

        x = F.log_softmax(x, dim=1)

        return x

    def predict(self, x):
        with torch.no_grad():
            out_raw = self.forward( x )
            out = torch.exp(out_raw)
            max_ind = out.argmax().item()        
            return self.classes[max_ind], out[:,max_ind].item()

    def getDatasetInfo(self):
        dataset_type = {"sample_rate": 48000}
        dataset_params = {"fixed_padding": False}
        return dataset_type, dataset_params
    
    def setDataset(self, split_mode, dataset):
        self.datasets[split_mode] = dataset
        
    def getDataLoader(self, split_mode, **params):
        dataset = self.datasets[split_mode]
        return DataLoader(dataset, collate_fn=dataset.pad_seq, **params)

In [55]:
audioCRNN = AudioCRNN(config=cfg)
audioCRNN

AudioCRNN(
  (spec): MelspectrogramStretch()
  (net): ModuleDict(
    (convs): Sequential(
      (conv2d_0): Conv2d(1, 32, kernel_size=(3, 3), stride=(1, 1), padding=[0, 0])
      (batchnorm2d_0): BatchNorm2d(32, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (elu_0): ELU(alpha=1.0)
      (maxpool2d_0): MaxPool2d(kernel_size=3, stride=3, padding=0, dilation=1, ceil_mode=False)
      (dropout_0): Dropout(p=0.1, inplace=False)
      (conv2d_1): Conv2d(32, 64, kernel_size=(3, 3), stride=(1, 1), padding=[0, 0])
      (batchnorm2d_1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (elu_1): ELU(alpha=1.0)
      (maxpool2d_1): MaxPool2d(kernel_size=4, stride=4, padding=0, dilation=1, ceil_mode=False)
      (dropout_1): Dropout(p=0.1, inplace=False)
      (conv2d_2): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=[0, 0])
      (batchnorm2d_2): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)

# Improving Dataset

In [21]:
from datasets.datasethandler import DatasetHandler 
datasetHandler = DatasetHandler()

In [23]:
from classification.models.M5 import M5PLModule
import config

hparams = {
    "batch_size": 2,
    "learning_rate": 0.001,
    "weight_decay": 0,
    "lr_decay": 1
}

model = M5PLModule(hparams)

datasetHandler.load_datasets(model, dataset_id=config.DATASET_CONTROL)

Loading cached training data of dataset 1 from /nfs/students/summer-term-2020/project-4/data/dataset2/dataset_8k/
Loading cached validation data of dataset 1 from /nfs/students/summer-term-2020/project-4/data/dataset2/dataset_8k/
Loading cached testing data of dataset 1 from /nfs/students/summer-term-2020/project-4/data/dataset2/dataset_8k/


In [24]:
model.get_dataloader("training")
model.get_dataloader("validation")
model.get_dataloader("testing")

<torch.utils.data.dataloader.DataLoader at 0x7f88557121f0>

# Training

In [136]:
torch.cuda.empty_cache()

model = SpectrogramCNN().cuda()
#print(model)
model.float()
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)

train_loader = DataLoader(audio_set_tr, batch_size=32)

model.train()

for epoch in range(5):  # loop over the dataset multiple times

    running_loss = 0.0
    for i, data in enumerate(train_loader, 0):
        data = [item.cuda() for item in data] # move to gpu
        x, labels = data[:2], data[-1]
        
        # zero the parameter gradients
        optimizer.zero_grad()

        # forward + backward + optimize
        outputs = model(x)

        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()
        
        # track statistics
        running_loss += loss.item()
    #print(outputs[:10])
    
    print('[%d] loss: %.3f' % (epoch + 1, running_loss / (i+1)))
    validate(model)
    
print('Finished Training')

print("Validate:")
validate(model)

  1%|          | 19/1687 [00:00<00:08, 189.95it/s]

[1] loss: 0.456


100%|██████████| 1687/1687 [00:06<00:00, 261.88it/s]


	Accuracy:	0.8494368701837581
	Precision:	0.8341121495327103
	Recall:   	0.864406779661017
	F1-score:	0.8489892984542212


  2%|▏         | 29/1687 [00:00<00:05, 285.82it/s]

[2] loss: 0.405


100%|██████████| 1687/1687 [00:05<00:00, 288.66it/s]


	Accuracy:	0.8429164196799052
	Precision:	0.8058887677208287
	Recall:   	0.8946731234866828
	F1-score:	0.8479632816982214


  2%|▏         | 32/1687 [00:00<00:05, 314.71it/s]

[3] loss: 0.376


100%|██████████| 1687/1687 [00:06<00:00, 279.74it/s]


	Accuracy:	0.8506224066390041
	Precision:	0.8072805139186295
	Recall:   	0.9128329297820823
	F1-score:	0.8568181818181818


  2%|▏         | 27/1687 [00:00<00:06, 269.82it/s]

[4] loss: 0.351


100%|██████████| 1687/1687 [00:05<00:00, 285.65it/s]


	Accuracy:	0.8761114404267931
	Precision:	0.8642266824085005
	Recall:   	0.8861985472154964
	F1-score:	0.8750747160789002


  2%|▏         | 32/1687 [00:00<00:05, 312.78it/s]

[5] loss: 0.328


100%|██████████| 1687/1687 [00:06<00:00, 278.85it/s]
  2%|▏         | 27/1687 [00:00<00:06, 267.39it/s]

	Accuracy:	0.8790752815649081
	Precision:	0.8676122931442081
	Recall:   	0.8886198547215496
	F1-score:	0.8779904306220097
Finished Training
Validate:


100%|██████████| 1687/1687 [00:05<00:00, 285.77it/s]

	Accuracy:	0.8790752815649081
	Precision:	0.8676122931442081
	Recall:   	0.8886198547215496
	F1-score:	0.8779904306220097





In [139]:
validate(model)

100%|██████████| 1687/1687 [00:05<00:00, 303.93it/s]

	Accuracy:	0.8790752815649081
	Precision:	0.8676122931442081
	Recall:   	0.8886198547215496
	F1-score:	0.8779904306220097





In [140]:
model_state_dict_path = "/nfs/students/summer-term-2020/project-4/data/models/SpectrogramBasedCNN.pt"
torch.save(model.state_dict(), model_state_dict_path)