In [1]:
import os
import time
import copy
import random
import numpy as np
from skimage.color import gray2rgb

import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.autograd import Variable
import torch.optim as optim
from torch.optim import lr_scheduler
from torchvision import models
from torchsummary import summary

from config import models_folder, output_data_folder
from config import n_mels

In [2]:
# Restrict the CUDA devices visible to this process to the one with index "1".
# NOTE(review): presumably this has to run before CUDA is first initialised to take effect — confirm.
os.environ["CUDA_VISIBLE_DEVICES"]="1"

In [3]:
# Side length used for the square model inputs below; taken from `n_mels` in `config`
# (presumably the number of mel bands of the stored spectrograms — confirm in config).
IMG_HEIGHT = n_mels

In [4]:
# Use the GPU when CUDA is available, otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(device)
# Last expression of the cell: its value is displayed as the cell output.
torch.cuda.is_available()

cuda


True

In [5]:
# Build a MobileNetV2 without pretrained weights (pretrained=False) and print its
# layer-by-layer summary for a 3 x IMG_HEIGHT x IMG_HEIGHT input, evaluated on the CPU.
model = models.mobilenet_v2(pretrained=False)
summary(model, input_size=(3, IMG_HEIGHT, IMG_HEIGHT), device='cpu')

----------------------------------------------------------------
        Layer (type)               Output Shape         Param #
            Conv2d-1           [-1, 32, 64, 64]             864
       BatchNorm2d-2           [-1, 32, 64, 64]              64
             ReLU6-3           [-1, 32, 64, 64]               0
            Conv2d-4           [-1, 32, 64, 64]             288
       BatchNorm2d-5           [-1, 32, 64, 64]              64
             ReLU6-6           [-1, 32, 64, 64]               0
            Conv2d-7           [-1, 16, 64, 64]             512
       BatchNorm2d-8           [-1, 16, 64, 64]              32
  InvertedResidual-9           [-1, 16, 64, 64]               0
           Conv2d-10           [-1, 96, 64, 64]           1,536
      BatchNorm2d-11           [-1, 96, 64, 64]             192
            ReLU6-12           [-1, 96, 64, 64]               0
           Conv2d-13           [-1, 96, 32, 32]             864
      BatchNorm2d-14           [-1, 96,

     BatchNorm2d-125            [-1, 160, 4, 4]             320
InvertedResidual-126            [-1, 160, 4, 4]               0
          Conv2d-127            [-1, 960, 4, 4]         153,600
     BatchNorm2d-128            [-1, 960, 4, 4]           1,920
           ReLU6-129            [-1, 960, 4, 4]               0
          Conv2d-130            [-1, 960, 4, 4]           8,640
     BatchNorm2d-131            [-1, 960, 4, 4]           1,920
           ReLU6-132            [-1, 960, 4, 4]               0
          Conv2d-133            [-1, 160, 4, 4]         153,600
     BatchNorm2d-134            [-1, 160, 4, 4]             320
InvertedResidual-135            [-1, 160, 4, 4]               0
          Conv2d-136            [-1, 960, 4, 4]         153,600
     BatchNorm2d-137            [-1, 960, 4, 4]           1,920
           ReLU6-138            [-1, 960, 4, 4]               0
          Conv2d-139            [-1, 960, 4, 4]           8,640
     BatchNorm2d-140            [-1, 960

In [6]:
# Display MobileNetV2's stock classifier head (the part that EncoderNet replaces further down).
model.classifier

Sequential(
  (0): Dropout(p=0.2, inplace=False)
  (1): Linear(in_features=1280, out_features=1000, bias=True)
)

In [7]:
class EncoderNet(nn.Module):
    """MobileNetV2 backbone whose classifier head is replaced by a 512-unit embedding head.

    The backbone is built with ``pretrained=False``, so no pretrained weights are loaded.
    The replacement head is: Flatten -> Linear(1280, 512) -> Linear(512, 512) -> ReLU
    -> Linear(512, 512).
    """

    def __init__(self):
        super().__init__()
        backbone = models.mobilenet_v2(pretrained=False)
        head_layers = [
            nn.Flatten(),
            nn.Linear(1280, 512),   # encoding layer; 1280 matches the stock head's in_features
            nn.Linear(512, 512),    # projection layer
            nn.ReLU(),
            nn.Linear(512, 512),    # projection layer
        ]
        backbone.classifier = nn.Sequential(*head_layers)
        self.encoder = backbone

    def forward(self, x):
        """Pass ``x`` through the backbone followed by the embedding head."""
        return self.encoder(x)
    
# Print the layer summary of a fresh EncoderNet for a 3 x IMG_HEIGHT x IMG_HEIGHT input (on CPU).
summary(EncoderNet(), input_size=(3, IMG_HEIGHT, IMG_HEIGHT), device='cpu')

----------------------------------------------------------------
        Layer (type)               Output Shape         Param #
            Conv2d-1           [-1, 32, 64, 64]             864
       BatchNorm2d-2           [-1, 32, 64, 64]              64
             ReLU6-3           [-1, 32, 64, 64]               0
            Conv2d-4           [-1, 32, 64, 64]             288
       BatchNorm2d-5           [-1, 32, 64, 64]              64
             ReLU6-6           [-1, 32, 64, 64]               0
            Conv2d-7           [-1, 16, 64, 64]             512
       BatchNorm2d-8           [-1, 16, 64, 64]              32
  InvertedResidual-9           [-1, 16, 64, 64]               0
           Conv2d-10           [-1, 96, 64, 64]           1,536
      BatchNorm2d-11           [-1, 96, 64, 64]             192
            ReLU6-12           [-1, 96, 64, 64]               0
           Conv2d-13           [-1, 96, 32, 32]             864
      BatchNorm2d-14           [-1, 96,

     BatchNorm2d-125            [-1, 160, 4, 4]             320
InvertedResidual-126            [-1, 160, 4, 4]               0
          Conv2d-127            [-1, 960, 4, 4]         153,600
     BatchNorm2d-128            [-1, 960, 4, 4]           1,920
           ReLU6-129            [-1, 960, 4, 4]               0
          Conv2d-130            [-1, 960, 4, 4]           8,640
     BatchNorm2d-131            [-1, 960, 4, 4]           1,920
           ReLU6-132            [-1, 960, 4, 4]               0
          Conv2d-133            [-1, 160, 4, 4]         153,600
     BatchNorm2d-134            [-1, 160, 4, 4]             320
InvertedResidual-135            [-1, 160, 4, 4]               0
          Conv2d-136            [-1, 960, 4, 4]         153,600
     BatchNorm2d-137            [-1, 960, 4, 4]           1,920
           ReLU6-138            [-1, 960, 4, 4]               0
          Conv2d-139            [-1, 960, 4, 4]           8,640
     BatchNorm2d-140            [-1, 960

In [8]:
class MultiSiameseEncoderNet(nn.Module):
    """Compares one query image against several candidate images using a single shared encoder."""

    def __init__(self):
        super().__init__()
        self.encoder = EncoderNet()

    def encode(self, x):
        """Return the shared encoder's output for ``x``."""
        return self.encoder(x)

    def forward(self, input_imgs):
        """Return the cosine similarity between the query and every candidate.

        ``input_imgs`` is an indexable sequence: element 0 is the query, elements 1..n are
        the candidates. Each element is encoded separately with the same encoder, and the
        per-candidate similarities are stacked along dim 1.
        (Assumes the encoder returns (batch, features), giving a (batch, n) result — confirm.)
        """
        query_embedding = self.encode(input_imgs[0])   # 1st entry is the query
        similarities = [
            F.cosine_similarity(query_embedding, self.encode(input_imgs[position]))
            for position in range(1, len(input_imgs))   # remaining entries are candidates
        ]
        return torch.stack(similarities, dim=1)
      

In [9]:
class DataGenerator:
    """Endless source of (query, candidates) image batches cut from stored spectrograms.

    For every query one spectrogram is the positive; the query image and the positive
    candidate image are two independently drawn random windows of that spectrogram, the
    remaining candidates are windows of other spectrograms. The label of a query is the
    position of its positive among the shuffled candidates.
    """

    def __init__(self, spectrogram_samples_files, candidate_size, batch_size, num_batches, num_sub_samples, img_height):
        self.spectrogram_samples_files = spectrogram_samples_files   # list of filepaths
        self.candidate_size = candidate_size   # candidates per query: 1 positive, n-1 negatives
        self.batch_size = batch_size   # queries per batch
        self.num_batches = num_batches   # num batches per epoch
        self.num_sub_samples = num_sub_samples   # num spectrogram files loaded per epoch
        self.img_height = img_height   # height of square img to be generated in the batches
        self.sub_samples = []   # spectrogram arrays loaded for the current epoch

    def generate_batches(self):
        """Yield ``(input_imgs, labels)`` forever, reloading the sub-samples every ``num_batches``.

        ``input_imgs`` is a float32 tensor of shape
        ``(candidate_size + 1, batch_size, depth, height, height)`` whose index 0 along the
        first axis holds the query images and indices 1.. hold the candidates.
        ``labels`` holds, per query, the index of the positive among its candidates.

        Raises:
            ValueError: if ``num_batches`` is smaller than 1 (the loop below would otherwise
                reload files forever without ever yielding).
        """
        if self.num_batches < 1:
            raise ValueError("num_batches must be at least 1, got {}".format(self.num_batches))
        while True:
            self.create_sub_samples()
            # batches per epoch
            for _ in range(self.num_batches):
                labels = []
                query_and_candidate_imgs = [[] for _ in range(self.candidate_size + 1)]
                for _ in range(self.batch_size):
                    candidate_indices = random.sample(range(self.num_sub_samples), self.candidate_size)   # sample candidates
                    pos_idx = candidate_indices[0]   # positive sample
                    # Query image: a random window of the positive spectrogram
                    query_img = self.get_sliding_img_slice_from_spectrogram(self.sub_samples[pos_idx])
                    # Candidate images, in shuffled order so the positive's position varies
                    random.shuffle(candidate_indices)
                    candidate_imgs = [
                        self.get_sliding_img_slice_from_spectrogram(self.sub_samples[idx])
                        for idx in candidate_indices
                    ]
                    # class label = position of the positive among the candidates
                    labels.append(candidate_indices.index(pos_idx))
                    for slot, img in enumerate([query_img, *candidate_imgs]):
                        query_and_candidate_imgs[slot].append(self._normalize(img))
                labels = torch.tensor(labels)
                # Build one contiguous array first: torch.tensor() on nested lists of
                # ndarrays converts element by element and is very slow.
                input_imgs = torch.from_numpy(np.asarray(query_and_candidate_imgs, dtype=np.float32))
                yield (input_imgs, labels)

    @staticmethod
    def _normalize(img):
        """Scale ``img`` by its largest absolute value so it lies in [-1, 1].

        An all-zero image is returned unchanged; dividing it would produce NaNs.
        """
        peak = np.amax(np.absolute(img))
        if peak == 0:
            return img
        return img / peak

    def create_sub_samples(self):
        """Load ``num_sub_samples`` randomly chosen spectrogram files into ``self.sub_samples``.

        Raises:
            ValueError: if fewer files are available than ``num_sub_samples``.
            AssertionError: if a loaded spectrogram's first dimension differs from ``img_height``.
        """
        if self.num_sub_samples > len(self.spectrogram_samples_files):
            raise ValueError(
                "num_sub_samples ({}) exceeds the number of available spectrogram files ({})".format(
                    self.num_sub_samples, len(self.spectrogram_samples_files)))
        files = random.sample(self.spectrogram_samples_files, self.num_sub_samples)   # sampling without replacement
        loaded = []
        for file in files:
            spectrogram = np.load(file)
            assert spectrogram.shape[0] == self.img_height, "Input spectrogram height does not match img height"
            loaded.append(spectrogram)
        self.sub_samples = loaded   # replace the previous epoch's samples in one step

    @classmethod
    def get_sliding_img_slice_from_spectrogram(cls, spectrogram, depth=3, sliding_ratio=2):
        """Cut a random square window out of ``spectrogram`` and return it as a ``depth``-channel image.

        The window spans all rows and ``height`` consecutive columns, where ``height`` is the
        number of rows. NOTE: sliding between channels is currently disabled, so every channel
        holds the same window; ``sliding_ratio`` only influences the range the start column is
        drawn from.

        Returns:
            float32 array of shape ``(depth, height, height)`` (channel-first).

        Raises:
            ValueError: if the spectrogram has too few columns for the requested window.
        """
        height = spectrogram.shape[0]
        width = spectrogram.shape[1]
        slide_step = height // sliding_ratio
        # Original start range, additionally capped so a full-width window always fits
        # (the uncapped bound is only sufficient when sliding_ratio == 2).
        max_start = min(width - (slide_step * (depth + 1)) - 1, width - height)
        if max_start < 0:
            raise ValueError(
                "Spectrogram with {} columns is too short for height={}, depth={}, sliding_ratio={}".format(
                    width, height, depth, sliding_ratio))
        slice_start = random.randint(0, max_start)
        window = spectrogram[:, slice_start:slice_start + height]
        img_slice = np.zeros((depth, height, height))   # channel-first (pytorch style)
        for i in range(depth):
            img_slice[i, :, :] = window
        return img_slice.astype("float32")

    @classmethod
    def spectrogram_to_RGB(cls, spectrogram):
        """Convert a 2D spectrogram to an RGB image via ``gray2rgb``."""
        assert len(spectrogram.shape) == 2, "Spectrogram input should be a 2D array"
        spectrogram_rgb = gray2rgb(spectrogram)
        return spectrogram_rgb
        

In [10]:
# Folder with the stored training spectrograms (each file is loaded with np.load by DataGenerator).
training_folder = os.path.join(output_data_folder, "training_dataset_full_spectrogram/vox1_dev_wav")
# os.listdir returns entries in arbitrary order; sort so the file list (and therefore any
# seeded random sampling from it) is stable across runs and machines.
spectrogram_samples_files = sorted(os.path.join(training_folder, file) for file in os.listdir(training_folder))
candidate_size = 5   # candidates per query: 1 positive + 4 negatives
batch_size = 15   # queries per batch
num_batches = 1000 // batch_size   # batches per epoch
num_sub_samples = 100   # spectrogram files loaded per epoch
training_data_generator = DataGenerator(spectrogram_samples_files, candidate_size, batch_size, num_batches, num_sub_samples, IMG_HEIGHT)

In [11]:
def train_model(model, criterion, optimizer, scheduler, num_epochs, num_batches, training_data_generator,
                val_data_generator=None):
    """Train ``model`` and return it loaded with the weights of the best validation epoch.

    Every epoch runs a training phase of ``num_batches`` batches followed by a validation
    phase of ``num_batches // 20`` batches (at least one whenever ``num_batches > 0``).
    Batches are pulled from a fresh ``generate_batches()`` iterator in every phase.

    Args:
        model: module called with a list of image tensors; returns per-candidate scores.
        criterion: loss called as ``criterion(outputs, labels)``.
        optimizer: optimizer stepped after every training batch.
        scheduler: LR scheduler stepped once after every training phase.
        num_epochs: number of epochs to run.
        num_batches: number of training batches per epoch.
        training_data_generator: object whose ``generate_batches()`` yields ``(input_imgs, labels)``.
        val_data_generator: optional generator with the same interface used for the
            validation phase. Defaults to ``training_data_generator``, in which case the
            reported "val" figures are computed on samples drawn from the training data.

    Returns:
        ``model`` with the state dict of the epoch that reached the best validation accuracy.
    """
    since = time.time()

    # Move data to wherever the model lives instead of relying on a notebook-level global.
    first_param = next(model.parameters(), None)
    model_device = first_param.device if first_param is not None else torch.device('cpu')

    if val_data_generator is None:
        val_data_generator = training_data_generator

    best_model_wts = copy.deepcopy(model.state_dict())
    best_acc = 0.0

    t1 = time.time()
    for epoch in range(num_epochs):
        print('Epoch {}/{}'.format(epoch, num_epochs - 1))
        print('-' * 10)

        # Each epoch has a training and validation phase
        for phase in ['train', 'val']:
            if phase == 'train':
                model.train()  # Set model to training mode
                data_generator = training_data_generator
                phase_batches = num_batches
            else:
                model.eval()   # Set model to evaluate mode
                data_generator = val_data_generator
                # Never validate on zero batches when there is training data:
                # num_batches // 20 is 0 for num_batches < 20.
                phase_batches = min(num_batches, max(1, num_batches // 20))

            running_loss = 0.0
            running_corrects = 0
            num_samples = 0

            # range() comes first in zip so the generator is not advanced past the last batch
            for _, (input_imgs, labels) in zip(range(phase_batches), data_generator.generate_batches()):
                inputs = [img.to(model_device) for img in input_imgs]
                labels = labels.to(model_device)
                batch_len = inputs[0].size(0)

                # zero the parameter gradients
                optimizer.zero_grad()

                # forward; track history only in train
                with torch.set_grad_enabled(phase == 'train'):
                    outputs = model(inputs)
                    _, preds = torch.max(outputs, 1)
                    loss = criterion(outputs, labels)

                    # backward + optimize only if in training phase
                    if phase == 'train':
                        loss.backward()
                        optimizer.step()

                # statistics (kept as Python numbers, weighted by the actual batch size)
                running_loss += loss.item() * batch_len
                running_corrects += torch.sum(preds == labels).item()
                num_samples += batch_len

            if phase == 'train':
                scheduler.step()

            if num_samples:
                epoch_loss = running_loss / num_samples
                epoch_acc = running_corrects / num_samples
            else:
                # nothing was processed in this phase (num_batches == 0)
                epoch_loss = float('nan')
                epoch_acc = 0.0

            print('{} Loss: {:.4f} Acc: {:.4f}'.format(
                phase, epoch_loss, epoch_acc))

            # deep copy the model
            if phase == 'val' and epoch_acc > best_acc:
                best_acc = epoch_acc
                best_model_wts = copy.deepcopy(model.state_dict())

        # end of epoch
        print("Time taken is {} seconds".format(int(time.time()-t1)))
        t1 = time.time()
        print()

    time_elapsed = time.time() - since
    print('Training complete in {:.0f}m {:.0f}s'.format(
        time_elapsed // 60, time_elapsed % 60))
    print('Best val Acc: {:.4f}'.format(best_acc))

    # load best model weights
    model.load_state_dict(best_model_wts)
    return model

In [12]:
### Train

epochs = 50

# Seed every RNG involved so that re-running this cell starts from the same state:
# Python's `random` drives the DataGenerator sampling, torch drives the weight initialisation.
SEED = 0
random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)

model_ft = MultiSiameseEncoderNet().to(device)
# NOTE(review): the model outputs cosine similarities, which lie in [-1, 1]; fed directly to
# CrossEntropyLoss they act as bounded logits, so the loss has a non-zero floor. A temperature
# scale may be worth trying — confirm whether this is intended.
criterion = nn.CrossEntropyLoss()

# Observe that all parameters are being optimized
optimizer_ft = optim.Adam(model_ft.parameters(), lr = 0.001)

# Decay LR by a factor of 0.1 every 7 epochs
exp_lr_scheduler = lr_scheduler.StepLR(optimizer_ft, step_size=7, gamma=0.1)

### Train 

model_ft = train_model(model_ft, criterion, optimizer_ft, exp_lr_scheduler, epochs, num_batches, training_data_generator)


Epoch 0/49
----------
train Loss: 1.6006 Acc: 0.2596
val Loss: 1.6049 Acc: 0.2444
Time taken is 58 seconds

Epoch 1/49
----------
train Loss: 1.5878 Acc: 0.2424
val Loss: 1.6089 Acc: 0.2444
Time taken is 59 seconds

Epoch 2/49
----------
train Loss: 1.5816 Acc: 0.2747
val Loss: 1.5857 Acc: 0.2889
Time taken is 59 seconds

Epoch 3/49
----------
train Loss: 1.5255 Acc: 0.3162
val Loss: 1.4342 Acc: 0.3778
Time taken is 60 seconds

Epoch 4/49
----------
train Loss: 1.4523 Acc: 0.3424
val Loss: 1.4670 Acc: 0.3556
Time taken is 60 seconds

Epoch 5/49
----------
train Loss: 1.4388 Acc: 0.3808
val Loss: 1.2987 Acc: 0.5556
Time taken is 59 seconds

Epoch 6/49
----------
train Loss: 1.3957 Acc: 0.3677
val Loss: 1.3008 Acc: 0.3111
Time taken is 60 seconds

Epoch 7/49
----------
train Loss: 1.4104 Acc: 0.3596
val Loss: 1.3340 Acc: 0.5333
Time taken is 60 seconds

Epoch 8/49
----------
train Loss: 1.3913 Acc: 0.3414
val Loss: 1.3603 Acc: 0.3778
Time taken is 59 seconds

Epoch 9/49
----------
train 

### TODO: Overall
* Contrastive classifier: separate train and validate methods
* DataGenerator: generate an actual batch instead of just one (batches of candidates)
* Model saving / checkpointing
* Build binary classifier