# DAD for Wave by Triplet Loss

## Settings

In [1]:
from pathlib import Path
import sys

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

import librosa
import librosa.display

import IPython.display as ipd

In [2]:
import torch
import torch.nn as nn
import torch.nn.functional as F
from torchvision import transforms

In [3]:
sys.path.append('../src')

import utils

from trainer import fit

In [4]:
no_cuda = False
seed = 0

In [None]:
data_dir = Path().cwd().parent/'data'
raw_data_dir = data_dir/'external'/'ESC-50'
processed_data_dir = data_dir/'processed'/'ESC-50'

model_path = Path().cwd().parent/'models'/'model.pth'
embeddingnet_path = Path().cwd().parent/'models'/'embeddingnet.pth'

In [6]:
metadata_path = raw_data_dir/'meta'/'esc50.csv'
audio_dir = raw_data_dir/'audio'
spectrogram_dir = processed_data_dir/'spectrogram'

In [7]:
train_metadata_path = processed_data_dir/'metadata_train.csv'
valid_metadata_path = processed_data_dir/'metadata_valid.csv'
test_metadata_path = processed_data_dir/'metadata_test.csv'

In [8]:
use_cuda = torch.cuda.is_available() and (not no_cuda)
device = 'cuda' if use_cuda else 'cpu'
torch.manual_seed(seed)

print('device: {}'.format(device))

device: cuda


## Dataloader

In [9]:
def calculate_sp(x, n_fft=512, hop_length=256):
    # change wave data to stft
    stft = librosa.stft(x, n_fft=n_fft, hop_length=hop_length)
    sp = librosa.amplitude_to_db(np.abs(stft))
    return sp

In [10]:
class ESC50Dataset(torch.utils.data.Dataset):
    def __init__(self, metadata_path, audio_dir, spectrogram_dir, transform=None,):
        self.transform = transform
        
        self.metadata = pd.read_csv(str(metadata_path))
        self.audio_dir = Path(audio_dir)
        self.spectrogram_dir = Path(spectrogram_dir)
        
        self.label_data = None
        self.labels = set()
        self.label2indices = {}

        self.build()

    def __len__(self):
        return len(self.metadata)

    def __getitem__(self, index):
        return self.load_audio(index)

    def load_audio(self, index):
        fname = self.metadata.at[index, 'filename']
        fpath = self.audio_dir/fname
        x, fs = librosa.load(str(fpath))
        return x, fs
    
    def load_spectrogram(self, index):
        fname = self.metadata.at[index, 'fname']+'.npy'
        fpath = self.spectrogram_dir/fname
        x = np.load(str(fpath))
        return x
    
    def load_label(self, index):
        label = self.metadata.at[index, 'target']
        return label
    
    def build_label_data(self):
        label_data = self.metadata.loc[:, ['target', 'category']].drop_duplicates()
        label_data = label_data.sort_values(by=['target'], ascending=True)
        label_data = label_data.reset_index(drop=True)

        label_data['number'] = 0
        label2indices = {}
        for i, target in enumerate(label_data['target']):
            data = self.metadata.query('target == @target')
            label_data.loc[i, 'number'] = len(data)
            label2indices[target] = data.index.values.tolist()

        self.labels = set(label2indices.keys())
        self.label_data = label_data
        self.label2indices = label2indices
        return label_data, label2indices

    def build_spectrogram(self):
        self.metadata['fname'] = ''
        for index in range(len(self.metadata)):
            audio_file_path = self.audio_dir/self.metadata.at[index, 'filename']
            fname = audio_file_path.stem
            self.metadata.at[index, 'fname'] = fname

            fname += '.npy'
            spec_file_path = self.spectrogram_dir/fname
            if not spec_file_path.exists():
                x, fs = self.load_audio(index)
                spec = calculate_sp(x, n_fft=512, hop_length=256)
                np.save(str(spec_file_path), spec)
        return

    def build(self):
        self.build_label_data()
        self.build_spectrogram()

In [11]:
class ESC50DatasetTriplet(ESC50Dataset):
    def __init__(self, metadata_path, audio_dir, spectrogram_dir, transform=None,):
        super(ESC50DatasetTriplet, self).__init__(metadata_path, audio_dir, spectrogram_dir, transform)
    
    def __len__(self):
        return len(self.metadata)

    def __getitem__(self, index):
        spec_anc, label_anc = self.load_spectrogram(index), self.load_label(index)
        
        # positive sampling
        indices_pos = self.label2indices[label_anc]
        index_pos = index
        if len(indices_pos) > 1:
            while index_pos == index:
                index_pos = np.random.choice(indices_pos)
        spec_pos = self.load_spectrogram(index_pos)

        # negative sampling
        labels_neg = list(self.labels - set([label_anc]))
        label_neg = np.random.choice(labels_neg)
        index_neg = np.random.choice(self.label2indices[label_neg])
        spec_neg = self.load_spectrogram(index_neg)
        
        if self.transform is not None:
            spec_anc = self.transform(spec_anc)
            spec_pos = self.transform(spec_pos)
            spec_neg = self.transform(spec_neg)

        return (spec_anc, spec_anc, spec_neg), []

## Models

In [12]:
def accuracy(pos_samples, neg_samples, device):
    """ pos_samples: Distance between positive pair
        neg_samples: Distance between negative pair
    """
    margin = 0
    pred = (pos_samples - neg_samples + margin).cpu().data
    acc = (pred > 0).sum()*1.0 / pos_samples.size()[0]
    acc = torch.from_numpy(np.array([acc], np.float32))
    acc = acc.to(device)
    return Variable(acc)

In [13]:
class TripletLoss(nn.Module):
    """
    Triplet loss
    Takes embeddings of an anchor sample, a positive sample and a negative sample
    """

    def __init__(self, margin):
        super(TripletLoss, self).__init__()
        self.margin = margin

    def forward(self, anchor, positive, negative, size_average=True):
        distance_positive = (anchor - positive).pow(2).sum(1)  # .pow(.5)
        distance_negative = (anchor - negative).pow(2).sum(1)  # .pow(.5)
        losses = F.relu(distance_positive - distance_negative + self.margin)
        return losses.mean() if size_average else losses.sum()

In [14]:
def calc_padding_size(i, o, k, s=1, d=1):
    p = int(((o-1)*s + k + (k-1)*(d-1) - i) / 2)
    return p

In [15]:
class EmbeddingNet(nn.Module):
    def __init__(self, input_size, output_size):
        super(EmbeddingNet, self).__init__()
        self.input_size = input_size
        self.output_size = output_size

        self.convnet_1 = self._make_conv_net(4)
        self.convnet_2 = self._make_conv_net(8)
        self.convnet_3 = self._make_conv_net(16)
        self.convnet_4 = self._make_conv_net(32)

        p = calc_padding_size
        i0, i1 = input_size
        self.convnet = nn.Sequential(
            nn.Conv2d(
                in_channels=64, out_channels=128,
                kernel_size=(1, 8), stride=(1, 8),
                padding=(p(i0, i0, 1, 1), p(i1, i1, 8, 2))
                ),
            nn.BatchNorm2d(128),
            nn.ReLU(),
            nn.Conv2d(
                in_channels=128, out_channels=128,
                kernel_size=(8, 1), stride=(2, 1),
                padding=(p(i0, i0, 8, 2), p(i1, i1, 1, 1))
                ),
            nn.BatchNorm2d(128),
            nn.ReLU(),
            #nn.AvgPool2d(6, ceil_mode=True),
            )

        # global average pooling
        self.pooling = lambda x: F.avg_pool2d(x, kernel_size=x.size()[2:])

        
        #self.fc = nn.Sequential(
        #    nn.Dropout(0.5),
        #    nn.Linear(128, output_size),
        #    )
    
    def _make_conv_net(self, filter_size):        
        p = calc_padding_size
        i0, i1 = input_size

        convnet = nn.Sequential(
            nn.Conv2d(
                in_channels=1, out_channels=32,
                kernel_size=(1, filter_size), stride=(1, 2),
                padding=(p(i0, i0, 1, 1), p(i1, i1, filter_size, 2))
                ),
            nn.BatchNorm2d(32),
            nn.ReLU(),
            nn.Conv2d(
                in_channels=32, out_channels=32,
                kernel_size=(filter_size, 1), stride=(2, 1),
                padding=(p(i0, i0, filter_size, 2), p(i1, i1, 1, 1))
                ),
            nn.BatchNorm2d(32),
            nn.ReLU(),
            nn.Conv2d(
                in_channels=32, out_channels=64,
                kernel_size=(1, filter_size), stride=(1, 2),
                padding=(p(i0, i0, 1, 1), p(i1, i1, filter_size, 2))
                ),
            nn.BatchNorm2d(64),
            nn.ReLU(),
            nn.Conv2d(
                in_channels=64, out_channels=64,
                kernel_size=(filter_size, 1), stride=(2, 1),
                padding=(p(i0, i0, filter_size, 2), p(i1, i1, 1, 1))
                ),
            nn.BatchNorm2d(64),
            nn.ReLU(),
            )
        return convnet
        
    def forward(self, x):
        embedding = self.convnet_1(x)
        embedding += self.convnet_2(x)
        embedding += self.convnet_3(x)
        embedding += self.convnet_4(x)
        embedding = self.convnet(embedding)
        embedding = self.pooling(embedding)
        embedding = embedding.view(embedding.size()[0], -1)
        #embedding = self.fc(embedding)
        #embedding /= embedding.pow(2).sum(1, keepdim=True).sqrt()  # normalize
        return embedding

In [16]:
class TripletNet(nn.Module):
    def __init__(self, embedding_net):
        super(TripletNet, self).__init__()
        self.embedding_net = embedding_net

    def forward(self, anchor, positive, negative):
        embed_anc = self.embedding_net(anchor)
        embed_pos = self.embedding_net(positive)
        embed_neg = self.embedding_net(negative)
        return embed_anc, embed_pos, embed_neg

## Train

### Hyperparameters

In [17]:
input_size = (257, 431)
output_size = 128

margin = 1.

lr = 1e-5
weight_decay = 1e-6

batch_size = 2
n_epochs = 2

log_interval = 100

### Dataloaders

In [18]:
kwargs = {'num_workers': 1, 'pin_memory': True} if use_cuda else {}

In [19]:
transform = transforms.Compose([
    transforms.ToTensor(),
    ])

In [20]:
dataset_train = ESC50DatasetTriplet(
    train_metadata_path, audio_dir, spectrogram_dir, transform,
    )

In [21]:
dataset_valid = ESC50DatasetTriplet(
    valid_metadata_path, audio_dir, spectrogram_dir, transform,
    )

In [22]:
dataloader_train = torch.utils.data.DataLoader(dataset_train, batch_size=batch_size, shuffle=True, **kwargs)

In [23]:
dataloader_valid = torch.utils.data.DataLoader(dataset_valid, batch_size=batch_size, shuffle=False, **kwargs)

### Model

In [None]:
embedding_net = EmbeddingNet(input_size, output_size).to(device)
model = TripletNet(embedding_net).to(device)

In [None]:
loss_function = TripletLoss(margin)
optimizer = torch.optim.Adam(model.parameters(), lr=lr, weight_decay=weight_decay)
scheduler = torch.optim.lr_scheduler.StepLR(optimizer, 8, gamma=0.1, last_epoch=-1)

### Train

In [None]:
torch.backends.cudnn.benchmark = True

In [None]:
fit(
    dataloader_train,
    dataloader_valid,
    model, loss_function, optimizer, scheduler,
    n_epochs, use_cuda, log_interval,
    )

Epoch: 1/2. Train set: Average loss: 0.5609
Epoch: 1/2. Validation set: Average loss: 0.8099


In [None]:
torch.save(model.state_dict(), str(model_path))
torch.save(embedding_net.state_dict(), str(embeddingnet_path))

### Test

In [None]:
model.load_state_dict(torch.load(str(model_path)))