Mount Google Drive

In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


Change directory to current directory

In [2]:
cd /content/drive/My Drive/Colab Notebooks/disc_baseline

/content/drive/My Drive/Colab Notebooks/disc_baseline


In [3]:
pwd -f

'/content/drive/My Drive/Colab Notebooks/disc_baseline'

Install `torchaudio`

In [4]:
!pip install torchaudio

Collecting torchaudio
[?25l  Downloading https://files.pythonhosted.org/packages/96/34/c651430dea231e382ddf2eb5773239bf4885d9528f640a4ef39b12894cb8/torchaudio-0.6.0-cp36-cp36m-manylinux1_x86_64.whl (6.7MB)
[K     |████████████████████████████████| 6.7MB 2.6MB/s 
Installing collected packages: torchaudio
Successfully installed torchaudio-0.6.0


Note that with Google Colab, only absolute paths of the form:

`/path/to/file`

work correctly.

In [1]:
import os

path = '/content/drive/My Drive/data/data/other/6366c230_1.wav'
os.path.isfile(path)

True

Import relevant packages

In [6]:
import torch
import torchaudio
from sklearn.metrics import confusion_matrix

from Disc import Disc
from AudioDataset import AudioDataset

from net_train import train
from net_validate import validate
from net_test import test

from pprint import pprint
import copy
import time

Make different runs reproducible

In [7]:
torch.manual_seed(42)

<torch._C.Generator at 0x7ff4d9865708>

Check if GPU is available and create a `device` object to assign tensors to GPU

In [8]:
use_cuda = torch.cuda.is_available()
use_cuda

False

In [9]:
device = torch.device('cuda' if use_cuda else 'cpu')
device

device(type='cpu')

Initialize the discriminator network and put it onto the GPU

In [10]:
net = Disc().to(device)
net

Disc(
  (feature_extractor): FENet(
    (stage1): Sequential(
      (0): Conv2d(1, 16, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
      (1): BatchNorm2d(16, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (2): ReLU()
      (3): Conv2d(16, 16, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
      (4): BatchNorm2d(16, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (5): ReLU()
      (6): MaxPool2d(kernel_size=(2, 2), stride=(2, 2), padding=0, dilation=1, ceil_mode=False)
    )
    (stage2): Sequential(
      (0): Conv2d(16, 32, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
      (1): BatchNorm2d(32, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (2): ReLU()
      (3): Conv2d(32, 32, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
      (4): BatchNorm2d(32, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (5): ReLU()
      (6): MaxPool2d(kernel_size=(2, 2), stride=(2, 2), padding=0, 

Where the data splits are stored

In [11]:
data_split_dir = 'data_split'
data_split_dir

'data_split'

What sampling rate to resample audio to

In [12]:
sample_rate = 16000
sample_rate

16000

Store datasets and dataloaders in dictionaries

In [13]:
datasets = {}
datasets

{}

In [14]:
dataloaders = {}
dataloaders

{}

Put loaded data directly onto GPU and increase number of workers if possible

In [15]:
dl_config = {'num_workers': 1, 'pin_memory': True} if use_cuda else {}
dl_config

{}

Batch sizes for training, validation, and testing data

In [16]:
train_batch_size = 64
val_batch_size = 64
test_batch_size = 64

Initialize dataloaders

In [17]:
for dataset,batch_size in [('train',train_batch_size),
                           ('val',val_batch_size),
                           ('test',test_batch_size)]:
    
    disc_dataset = AudioDataset(net_type='disc',
                                data_split_dir=data_split_dir,
                                sample_rate=sample_rate,
                                mode=dataset)
    recon_dataset = AudioDataset(net_type='disc',
                                 data_split_dir=data_split_dir,
                                 sample_rate=sample_rate,
                                 mode=dataset)
    datasets[dataset] = torch.utils.data.ConcatDataset([disc_dataset,
                                                        recon_dataset])
    dataloaders[dataset] = torch.utils.data.DataLoader(
                               dataset = datasets[dataset],
                               batch_size = batch_size,
                               shuffle = True,
                               **dl_config)
    pprint(vars(dataloaders[dataset]))

{'_DataLoader__initialized': True,
 '_DataLoader__multiprocessing_context': None,
 '_IterableDataset_len_called': None,
 '_dataset_kind': 0,
 'batch_sampler': <torch.utils.data.sampler.BatchSampler object at 0x7ff482a1e518>,
 'batch_size': 64,
 'collate_fn': <function default_collate at 0x7ff48e279158>,
 'dataset': <torch.utils.data.dataset.ConcatDataset object at 0x7ff482a1e4a8>,
 'drop_last': False,
 'generator': None,
 'num_workers': 0,
 'pin_memory': False,
 'sampler': <torch.utils.data.sampler.RandomSampler object at 0x7ff482a1e470>,
 'timeout': 0,
 'worker_init_fn': None}
{'_DataLoader__initialized': True,
 '_DataLoader__multiprocessing_context': None,
 '_IterableDataset_len_called': None,
 '_dataset_kind': 0,
 'batch_sampler': <torch.utils.data.sampler.BatchSampler object at 0x7ff482a1e860>,
 'batch_size': 64,
 'collate_fn': <function default_collate at 0x7ff48e279158>,
 'dataset': <torch.utils.data.dataset.ConcatDataset object at 0x7ff482a1e7f0>,
 'drop_last': False,
 'generato

Initialize loss function. Note that losses are summed and not averaged.

In [18]:
loss_func = torch.nn.BCEWithLogitsLoss(reduction='sum')
loss_func

BCEWithLogitsLoss()

Initialize optimizer. Note that net parameters must already be in the GPU before this step.

In [19]:
optimizer = torch.optim.Adam(params = net.parameters(), lr = 0.0003)
optimizer

Adam (
Parameter Group 0
    amsgrad: False
    betas: (0.9, 0.999)
    eps: 1e-08
    lr: 0.0003
    weight_decay: 0
)

Optionally, initialize a learning rate scheduler

In [20]:
"""
scheduler = torch.optim.lr_scheduler.StepLR(optimizer = optimizer,
                                            step_size = 3,
                                            gamma = 0.5,
                                            last_epoch = -1)
"""

'\nscheduler = torch.optim.lr_scheduler.StepLR(optimizer = optimizer,\n                                            step_size = 3,\n                                            gamma = 0.5,\n                                            last_epoch = -1)\n'

Number of epochs to train and validate for

In [21]:
num_epochs = 20
num_epochs

20

To store the best validation accuracy

In [22]:
best_val_acc = 0
best_val_acc

0

Store the starting time

In [23]:
start = time.time()
start

1602178972.6945453

Main training and validation loop

In [24]:
for epoch in range(num_epochs):
    
    # record the epoch start time
    
    epoch_start = time.time()
    
    # training #####################################################################
    
    # put net in training mode
    
    net.train()
    print('Training...')
    
    # record the number of correct predictions to compute
    # training accuracy over entire epoch
    
    num_true_pred = 0
    
    # to compute total training loss over entire epoch
    
    total_loss = 0
    
    # show number of epochs elapsed
    
    print('Epoch {}/{}'.format(epoch+1, num_epochs))
    
    for i,(signals,labels) in enumerate(dataloaders['train']):
        
        # track progress
        
        print('Progress: {:.2f}%'.format(i*dataloaders['train'].batch_size/len(dataloaders['train'])),
              end='\r',flush=True)
        
        # move to GPU
        
        signals = signals.to(device)#.unsqueeze(dim=1)
        #print('Signal batch shape: {}'.format(signals.shape))
        #print('Signal batch dtype: {}'.format(signals.dtype))
        labels = labels.to(device).type_as(signals) # needed for BCE loss
        #print('Labels batch shape: {}'.format(labels.shape))
        #print('Labels batch dtype: {}'.format(labels.dtype))
        
        # compute log Mel spectrogram
        
        mel_spec = torchaudio.transforms.MelSpectrogram(sample_rate = 16000,
                                                        n_fft = 1024,
                                                        n_mels = 256,
                                                        hop_length = 63).to(device)
        to_dB = torchaudio.transforms.AmplitudeToDB().to(device)
        images = to_dB(mel_spec(signals)).unsqueeze(dim=1) # add grayscale image channel
        
        # zero the accumulated parameter gradients
        
        optimizer.zero_grad()
        
        # outputs of net for batch input
        
        outputs = net(images).squeeze() # needed for loss_func
        #print('Output batch shape: {}'.format(outputs.shape))
        
        # compute (mean) loss
        
        loss = loss_func(outputs,labels)
        #print('Lossses shape: {}'.format(loss.shape))
        
        # compute loss gradients with respect to parameters
        
        loss.backward()
        
        # update parameters according to optimizer
        
        optimizer.step()
        
        # record running statistics
        
        # since sigmoid(0) = 0.5, then negative values correspond to class 0
        # and positive values correspond to class 1
        
        class_preds = outputs > 0
        num_true_pred = num_true_pred + torch.sum(class_preds == labels)
        
        # loss is not mean-reduced
        
        total_loss += loss
    
    train_loss = total_loss.item() / len(dataloaders['train'])
    
    train_acc = num_true_pred.item() / len(dataloaders['train'])
    
    print('Training Loss: {:.4f}'.format(train_loss))
    print('Training Accuracy: {:.2f}%'.format(train_acc*100))
    
    # validation #####################################################################
    
    # put net in testing mode
              
    net.eval()
    print('\nValidating...\n')
    
    # record the number of correct predictions to compute
    # validation accuracy over entire epoch
    
    num_true_pred = 0
    
    # to compute total validation loss over entire epoch
    
    total_loss = 0
    
    # show number of epochs elapsed
    
    print('Epoch {}/{}'.format(epoch+1, num_epochs))
    
    for i,(signals,labels) in enumerate(dataloaders['val']):
        
        # track progress
        
        print('Progress: {:.2f}%'.format(i*dataloaders['val'].batch_size/len(dataloaders['val'])),
              end='\r',flush=True)
        
        # move to GPU
        
        signals = signals.to(device)#.unsqueeze(dim=1)
        #print('Signal batch shape: {}'.format(signals.shape))
        #print('Signal batch dtype: {}'.format(signals.dtype))
        labels = labels.to(device).type_as(signals) # needed for BCE loss
        #print('Labels batch shape: {}'.format(labels.shape))
        #print('Labels batch dtype: {}'.format(labels.dtype))
        
        # compute log Mel spectrogram
        
        mel_spec = torchaudio.transforms.MelSpectrogram(sample_rate = 16000,
                                                        n_fft = 1024,
                                                        n_mels = 256,
                                                        hop_length = 63).to(device)
        to_dB = torchaudio.transforms.AmplitudeToDB().to(device)
        images = to_dB(mel_spec(signals)).unsqueeze(dim=1) # add grayscale image channel
        
        with torch.no_grad():
            
            # outputs of net for batch input

            outputs = net(images).squeeze()
            #print('Output batch shape: {}'.format(outputs.shape))

            # compute (mean) loss

            loss = loss_func(outputs,labels)
            #print('Lossses shape: {}'.format(loss.shape))
        
        # record running statistics
        
        # since sigmoid(0) = 0.5, then negative values correspond to class 0
        # and positive values correspond to class 1
        
        class_preds = outputs > 0
        num_true_pred = num_true_pred + torch.sum(class_preds == labels)
        
        # loss is not mean-reduced
        
        total_loss += loss
    
    val_loss = total_loss.item() / len(dataloaders['val'])
    
    val_acc = num_true_pred.item() / len(dataloaders['val'])
    
    print('Validation Loss: {:.4f}'.format(val_loss))
    print('Validation Accuracy: {:.2f}%'.format(val_acc*100)) 
    
    # scheduler.step()
    
    epoch_end = time.time()
    
    epoch_time = time.strftime("%H:%M:%S",time.gmtime(epoch_end-epoch_start))
    
    print('\nEpoch Elapsed Time (HH:MM:SS): ' + epoch_time)
    
    # save the weights for the best validation accuracy
        
    if val_acc > best_val_acc:
        
        print('Saving checkpoint...')
        
        best_val_acc = val_acc
        
        # deepcopy needed because a dict is a mutable object
        
        best_parameters = copy.deepcopy(net.state_dict())
        
        torch.save(net.state_dict(),
                   'best_param.pt')

end = time.time()
total_time = time.strftime("%H:%M:%S",time.gmtime(end-start))
print('\nTotal Time Elapsed (HH:MM:SS): ' + total_time)
print('Best Validation Accuracy: {:.2f}%'.format(best_val_acc*100))

Training...
Epoch 1/20


OSError: ignored

Testing

In [None]:
# put net in testing mode

net.eval()

print('\nTesting...\n')

# store class predictions and the true labels

class_preds = []
true_labels = []

for i,(signals,labels) in enumerate(dataloaders['test']):

    # track progress
    
    print('Progress: {:.2f}%'.format(i*dataloaders['val'].batch_size/len(dataloaders['val'])),
           end='\r',flush=True)

    # move to GPU

    signals = signals.to(device)
    #print('Signal batch shape: {}'.format(signals.shape))
    #print('Signal batch dtype: {}'.format(signals.dtype))
    
    # store labels
    
    true_labels.extend(labels.tolist())

    # compute log Mel spectrogram

    mel_spec = torchaudio.transforms.MelSpectrogram(sample_rate = 16000,
                                                    n_fft = 1024,
                                                    n_mels = 256,
                                                    hop_length = 63).to(device)
    to_dB = torchaudio.transforms.AmplitudeToDB().to(device)
    images = to_dB(mel_spec(signals)).unsqueeze(dim=1) # add grayscale image channel

    with torch.no_grad():

        # outputs of net for batch input

        outputs = net(images)#.squeeze()
        #print('Output batch shape: {}'.format(outputs.shape))

    # record running statistics

    # since sigmoid(0) = 0.5, then negative values correspond to class 0
    # and positive values correspond to class 1

    class_preds.extend((outputs > 0).squeeze().tolist())

CM = confusion_matrix(true_labels,class_preds,labels=[0,1])
    
TP = CM[1,1]

TN = CM[0,0]

FP = CM[0,1]

FN = CM[1,0]

sensitivity = TP/(TP+FN) # true positive rate (TPR)

specificity = TN/(TN+FP) # true negative rate (TNR)

accuracy = (TP+TN)/(TP+TN+FP+FN)

balanced_accuracy = (sensitivity+specificity)/2

# Matthews correlation coefficient

MCC = (TP*TN - FP*FN)/(((TP+FP)*(TP+FN)*(TN+FP)*(TN+FN))**0.5)

# positive predictive value (or precision)

PPV = TP/(TP+FP)

# negative predictive value

NPV = TN/(TN+FN)

metrics = {'CM':CM,
           'sens':sensitivity,
           'spec':specificity,
           'acc':accuracy,
           'bal_acc':balanced_accuracy,
           'MCC':MCC,
           'PPV':PPV,
           'NPV':NPV}

print('\nConfusion Matrix:\n{}\n'.format(metrics['CM']))
print('Sensitivity/Recall: {:.3f}'.format(metrics['sens']))
print('Specificity: {:.3f}'.format(metrics['spec']))
print('Accuracy: {:.3f}'.format(metrics['acc']))
print('Balanced Accuracy: {:.3f}'.format(metrics['bal_acc']))
print('Matthews correlation coefficient: {:.3f}'.format(metrics['MCC']))
print('Precision/PPV: {:.3f}'.format(metrics['PPV']))
print('NPV: {:.3f}'.format(metrics['NPV']))