In [1]:
import copy
import inspect
import os
import pickle
import random
import time

import numpy as np
from skimage.color import gray2rgb

import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.autograd import Variable
import torch.optim as optim
from torch.optim import lr_scheduler
from torchvision import models
from torchsummary import summary

from config import models_folder, output_data_folder
from config import n_mels

from model_definitions import VerificationBinaryClassifierNet
from data_generators import VerificationDataGenerator
from project_utils import ModelSaveAndLogHandler, load_module_from_file

In [2]:
# Restrict CUDA in this process to the GPU with index 1.
# NOTE(review): this only takes effect if it runs before CUDA is first initialised — confirm.
os.environ["CUDA_VISIBLE_DEVICES"] = "1"

In [3]:
# Side length of the model input images, taken from the configured n_mels.
# It is used for both height and width wherever input shapes are built in this notebook.
IMG_HEIGHT = n_mels

In [4]:
# Folder of the contrastive-training run whose encoder is loaded by this notebook.
encoder_model_folder = os.path.join(
    models_folder,
    "contrastive_encoder",
    "good_models",
    "2020-03-20_03-25-22",
)

In [5]:
# Use the GPU when CUDA is usable, otherwise fall back to the CPU.
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
print(device)
torch.cuda.is_available()  # last expression: echoed as the cell's output

cuda


True

In [6]:
# load model from contrastive training
def load_pretrained_encoder_model():
    """Rebuild the contrastive-training network and return only its encoder.

    The model definition file and the saved weights are both read from the
    notebook-level ``encoder_model_folder``. Weights are mapped to the CPU
    when loaded.

    Returns:
        The ``encoder`` attribute of the restored
        ``MultiSiameseContrastiveClassifierNet`` instance.
    """
    definitions_path = os.path.join(encoder_model_folder, "model_definitions.py")
    weights_path = os.path.join(encoder_model_folder, "best_model_MultiSiameseContrastiveClassifierNet.pt")

    # Use the model definition stored next to the weights rather than the one imported by this notebook.
    definitions = load_module_from_file(definitions_path, "MultiSiameseContrastiveClassifierNet")
    contrastive_net = definitions.MultiSiameseContrastiveClassifierNet()
    contrastive_net.load_state_dict(torch.load(weights_path, map_location="cpu"))

    # Only the encoder part is handed back to the caller.
    return contrastive_net.encoder

In [7]:
# Encoder model
encoder_model = load_pretrained_encoder_model()

# Freeze the encoder layers: their weights are excluded from gradient updates.
for encoder_param in encoder_model.parameters():
    encoder_param.requires_grad = False

# summary() pushes random dummy input through the network. Switch to eval mode first so
# that pass cannot overwrite the pretrained BatchNorm running statistics (requires_grad=False
# does not protect them). train_model() calls model.train() itself before each training phase.
encoder_model.eval()
summary(encoder_model, input_size=(3, IMG_HEIGHT, IMG_HEIGHT), device='cpu')

----------------------------------------------------------------
        Layer (type)               Output Shape         Param #
            Conv2d-1           [-1, 32, 64, 64]             864
       BatchNorm2d-2           [-1, 32, 64, 64]              64
             ReLU6-3           [-1, 32, 64, 64]               0
            Conv2d-4           [-1, 32, 64, 64]             288
       BatchNorm2d-5           [-1, 32, 64, 64]              64
             ReLU6-6           [-1, 32, 64, 64]               0
            Conv2d-7           [-1, 16, 64, 64]             512
       BatchNorm2d-8           [-1, 16, 64, 64]              32
  InvertedResidual-9           [-1, 16, 64, 64]               0
           Conv2d-10           [-1, 96, 64, 64]           1,536
      BatchNorm2d-11           [-1, 96, 64, 64]             192
            ReLU6-12           [-1, 96, 64, 64]               0
           Conv2d-13           [-1, 96, 32, 32]             864
      BatchNorm2d-14           [-1, 96,

     BatchNorm2d-125            [-1, 160, 4, 4]             320
InvertedResidual-126            [-1, 160, 4, 4]               0
          Conv2d-127            [-1, 960, 4, 4]         153,600
     BatchNorm2d-128            [-1, 960, 4, 4]           1,920
           ReLU6-129            [-1, 960, 4, 4]               0
          Conv2d-130            [-1, 960, 4, 4]           8,640
     BatchNorm2d-131            [-1, 960, 4, 4]           1,920
           ReLU6-132            [-1, 960, 4, 4]               0
          Conv2d-133            [-1, 160, 4, 4]         153,600
     BatchNorm2d-134            [-1, 160, 4, 4]             320
InvertedResidual-135            [-1, 160, 4, 4]               0
          Conv2d-136            [-1, 960, 4, 4]         153,600
     BatchNorm2d-137            [-1, 960, 4, 4]           1,920
           ReLU6-138            [-1, 960, 4, 4]               0
          Conv2d-139            [-1, 960, 4, 4]           8,640
     BatchNorm2d-140            [-1, 960

In [8]:
# Verification Binary classifier
# Layer-by-layer printout for a temporary classifier instance (it shares `encoder_model`).
summary(
    VerificationBinaryClassifierNet(encoder_model),
    input_size=(2, 3, IMG_HEIGHT, IMG_HEIGHT),
    device='cpu',
)

----------------------------------------------------------------
        Layer (type)               Output Shape         Param #
            Conv2d-1           [-1, 32, 64, 64]             864
       BatchNorm2d-2           [-1, 32, 64, 64]              64
             ReLU6-3           [-1, 32, 64, 64]               0
            Conv2d-4           [-1, 32, 64, 64]             288
       BatchNorm2d-5           [-1, 32, 64, 64]              64
             ReLU6-6           [-1, 32, 64, 64]               0
            Conv2d-7           [-1, 16, 64, 64]             512
       BatchNorm2d-8           [-1, 16, 64, 64]              32
  InvertedResidual-9           [-1, 16, 64, 64]               0
           Conv2d-10           [-1, 96, 64, 64]           1,536
      BatchNorm2d-11           [-1, 96, 64, 64]             192
            ReLU6-12           [-1, 96, 64, 64]               0
           Conv2d-13           [-1, 96, 32, 32]             864
      BatchNorm2d-14           [-1, 96,

     BatchNorm2d-125            [-1, 160, 4, 4]             320
InvertedResidual-126            [-1, 160, 4, 4]               0
          Conv2d-127            [-1, 960, 4, 4]         153,600
     BatchNorm2d-128            [-1, 960, 4, 4]           1,920
           ReLU6-129            [-1, 960, 4, 4]               0
          Conv2d-130            [-1, 960, 4, 4]           8,640
     BatchNorm2d-131            [-1, 960, 4, 4]           1,920
           ReLU6-132            [-1, 960, 4, 4]               0
          Conv2d-133            [-1, 160, 4, 4]         153,600
     BatchNorm2d-134            [-1, 160, 4, 4]             320
InvertedResidual-135            [-1, 160, 4, 4]               0
          Conv2d-136            [-1, 960, 4, 4]         153,600
     BatchNorm2d-137            [-1, 960, 4, 4]           1,920
           ReLU6-138            [-1, 960, 4, 4]               0
          Conv2d-139            [-1, 960, 4, 4]           8,640
     BatchNorm2d-140            [-1, 960

           ReLU6-252            [-1, 384, 8, 8]               0
          Conv2d-253            [-1, 384, 8, 8]           3,456
     BatchNorm2d-254            [-1, 384, 8, 8]             768
           ReLU6-255            [-1, 384, 8, 8]               0
          Conv2d-256             [-1, 96, 8, 8]          36,864
     BatchNorm2d-257             [-1, 96, 8, 8]             192
InvertedResidual-258             [-1, 96, 8, 8]               0
          Conv2d-259            [-1, 576, 8, 8]          55,296
     BatchNorm2d-260            [-1, 576, 8, 8]           1,152
           ReLU6-261            [-1, 576, 8, 8]               0
          Conv2d-262            [-1, 576, 8, 8]           5,184
     BatchNorm2d-263            [-1, 576, 8, 8]           1,152
           ReLU6-264            [-1, 576, 8, 8]               0
          Conv2d-265             [-1, 96, 8, 8]          55,296
     BatchNorm2d-266             [-1, 96, 8, 8]             192
InvertedResidual-267             [-1, 96

In [9]:
### Training data
# Generator settings. Other values tried earlier: num_batches of 1000 // batch_size or
# 400 // batch_size, and num_sub_samples of 100 or 20.
batch_size = 160
num_batches = 2000 // batch_size
num_sub_samples = 200

# Every entry of the training folder is handed to the generator as a spectrogram sample file.
training_folder = os.path.join(output_data_folder, "training_dataset_full_spectrogram/vox1_dev_wav")
spectrogram_samples_files = [
    os.path.join(training_folder, entry)
    for entry in os.listdir(training_folder)
]

training_data_generator = VerificationDataGenerator(
    spectrogram_samples_files,
    batch_size,
    num_batches,
    num_sub_samples,
    IMG_HEIGHT,
)

In [10]:
### Validation data
# The pickled validation set is iterated once per epoch by train_model(), which unpacks each
# item as (input_imgs, labels).
# NOTE: pickle.load can execute arbitrary code; only load validation sets produced locally.
validation_set_file = os.path.join(
    output_data_folder,
    "validation_sets",
    "verification_validation_set.pickle",
)
with open(validation_set_file, 'rb') as pickle_file:
    validation_data = pickle.load(pickle_file)

In [11]:
def train_model(model, criterion, optimizer, scheduler, num_epochs, training_data_generator, validation_data, log_handler):
    """Run the train/validation loop and save the model whenever validation accuracy improves.

    Every epoch runs a training pass over ``training_data_generator.generate_batches()``
    followed by an evaluation pass over ``validation_data``. Loss and accuracy are
    averaged over the number of samples actually processed in the phase, so a
    smaller final batch does not skew the metrics.

    Relies on the notebook-level globals ``device`` (where batches are moved) and
    ``IMG_HEIGHT`` (size of the dummy input used for the TorchScript export).

    Args:
        model: network called as ``model(inputs)`` with a list of tensors; dimension 1
            of its output is treated as the class axis.
        criterion: loss called as ``criterion(outputs, labels)``.
        optimizer: optimizer, stepped after every training batch.
        scheduler: learning-rate scheduler, stepped once per epoch after the training phase.
        num_epochs: number of epochs to run.
        training_data_generator: object whose ``generate_batches()`` yields
            ``(input_imgs, labels)`` pairs.
        validation_data: iterable of ``(input_imgs, labels)`` pairs; it is iterated
            once per epoch, so it must be re-iterable.
        log_handler: object providing ``print``, ``save_pytorch_model`` and
            ``save_pytorch_model_as_torchscript``.

    Returns:
        None. Progress is reported and models are saved through ``log_handler``.
    """
    since = time.time()
    best_acc = 0.0
    count_more_than_70 = 0

    t1 = time.time()
    for epoch in range(num_epochs):
        log_handler.print('Epoch {}/{}'.format(epoch, num_epochs - 1))
        log_handler.print('-' * 10)

        # Each epoch has a training and validation phase
        for phase in ['train', 'val']:
            is_training = phase == 'train'
            if is_training:
                # NOTE(review): train() also puts frozen sub-modules (e.g. BatchNorm layers of a
                # frozen encoder) into training mode, so their running statistics keep
                # updating — confirm this is intended.
                model.train()
            else:
                model.eval()

            running_loss = 0.0
            running_corrects = 0
            samples_seen = 0   # samples processed in this phase; denominator for the metrics

            data_generator = training_data_generator.generate_batches() if is_training else validation_data
            for input_imgs, labels in data_generator:
                inputs = [img.to(device) for img in input_imgs]
                labels = labels.to(device)
                batch_len = inputs[0].size(0)

                # zero the parameter gradients
                optimizer.zero_grad()

                # forward; gradients are tracked only in the training phase
                with torch.set_grad_enabled(is_training):
                    outputs = model(inputs)
                    _, preds = torch.max(outputs, 1)
                    loss = criterion(outputs, labels)

                    # backward + optimize only if in training phase
                    if is_training:
                        loss.backward()
                        optimizer.step()

                # statistics: the loss is weighted by batch size so the epoch mean is per sample
                running_loss += loss.item() * batch_len
                running_corrects += torch.sum(preds == labels).item()
                samples_seen += batch_len

            if samples_seen == 0:
                # Nothing was processed: there are no metrics to report and nothing to step or save.
                log_handler.print('{} phase yielded no batches; metrics skipped'.format(phase))
                continue

            if is_training:
                # NOTE(review): stepped once per epoch. Schedulers documented as per-batch
                # (e.g. the CyclicLR passed in by the training cell) therefore advance far
                # more slowly than their parameters suggest — confirm this is intended.
                scheduler.step()

            epoch_loss = running_loss / samples_seen
            epoch_acc = running_corrects / samples_seen
            log_handler.print('{} Loss: {:.4f} Acc: {:.4f}'.format(
                phase, epoch_loss, epoch_acc))

            # Save the weights and a TorchScript export whenever validation accuracy improves
            if phase == 'val' and epoch_acc > best_acc:
                best_acc = epoch_acc
                log_handler.save_pytorch_model(model, "best_model_{}.pt".format(model.__class__.__name__))
                example = [torch.rand(1, 3, IMG_HEIGHT, IMG_HEIGHT), torch.rand(1, 3, IMG_HEIGHT, IMG_HEIGHT)]
                log_handler.save_pytorch_model_as_torchscript(model, "mobile_model.pt", (example,))
            # Track val acc >= 70%
            if phase == 'val' and epoch_acc >= 0.70:
                count_more_than_70 += 1

        # end of epoch
        log_handler.print("Time taken is {} seconds".format(int(time.time()-t1)))
        t1 = time.time()
        log_handler.print()

    time_elapsed = time.time() - since
    log_handler.print('Training complete in {:.0f}m {:.0f}s'.format(
        time_elapsed // 60, time_elapsed % 60))
    # '{:.4f}' (precision) replaces the original '{:4f}' (width), matching the per-epoch log lines.
    log_handler.print('Best val Acc: {:.4f}'.format(best_acc))
    log_handler.print('Val Acc >= 0.70: {}'.format(count_more_than_70))
    

In [12]:
### Train

epochs = 50   # 70 was also tried

# Wrap the pretrained encoder (frozen in an earlier cell) in the verification classifier.
model_ft = VerificationBinaryClassifierNet(encoder_model).to(device)
criterion = nn.CrossEntropyLoss()

# All parameters are handed to Adam, but the encoder's have requires_grad=False,
# so they receive no gradients and are left untouched by the optimizer.
optimizer_ft = optim.Adam(model_ft.parameters(), lr = 0.0001)

# Cyclic learning rate between base_lr and max_lr
# (StepLR with step_size=10, gamma=0.1 was tried before).
# NOTE(review): CyclicLR is documented as being stepped after every batch and defaults to
# step_size_up=2000, while train_model() steps the scheduler once per epoch. Over 50 epochs
# the rate therefore only climbs a small part of the way towards max_lr — confirm intended.
# learning_rate_scheduler = lr_scheduler.StepLR(optimizer_ft, step_size=10, gamma=0.1)
learning_rate_scheduler = lr_scheduler.CyclicLR(optimizer_ft, base_lr=0.0001, max_lr=0.01, cycle_momentum=False)   # 0.01 seems better

# Logger
model_save_folder = os.path.join(models_folder, "verification_classifier")
log_handler = ModelSaveAndLogHandler(model_save_folder, enable_model_saving=True, enable_logging=True)   # init
# Archive the model definition file that was actually imported, located through the class
# itself instead of a machine-specific absolute path.
model_def_src_file_path = inspect.getsourcefile(VerificationBinaryClassifierNet)
log_handler.save_model_definition_file(model_def_src_file_path)   # copy model def file
print(log_handler.folder)

# Description
log_handler.print("Encoder: {}".format(encoder_model_folder))

# Train
train_model(model_ft, criterion, optimizer_ft, learning_rate_scheduler, epochs, training_data_generator, validation_data, log_handler)

D:\Desktop\projects\speaker_recognition_voxceleb1\output_data\models\verification_classifier\2020-03-20_12-20-57
Encoder: D:\Desktop\projects\speaker_recognition_voxceleb1\output_data\models\contrastive_encoder\good_models\2020-03-20_03-25-22
Epoch 0/49
----------
train Loss: 0.6906 Acc: 0.5000
val Loss: 0.6881 Acc: 0.5000
MODEL SAVED
MODEL SAVED (MOBILE)
Time taken is 96 seconds

Epoch 1/49
----------
train Loss: 0.6865 Acc: 0.5000
val Loss: 0.6831 Acc: 0.5000
Time taken is 78 seconds

Epoch 2/49
----------
train Loss: 0.6807 Acc: 0.5036
val Loss: 0.6769 Acc: 0.5143
MODEL SAVED
MODEL SAVED (MOBILE)
Time taken is 100 seconds

Epoch 3/49
----------
train Loss: 0.6748 Acc: 0.5310
val Loss: 0.6681 Acc: 0.5602
MODEL SAVED
MODEL SAVED (MOBILE)
Time taken is 78 seconds

Epoch 4/49
----------
train Loss: 0.6650 Acc: 0.5716
val Loss: 0.6556 Acc: 0.6018
MODEL SAVED
MODEL SAVED (MOBILE)
Time taken is 109 seconds

Epoch 5/49
----------
train Loss: 0.6504 Acc: 0.6232
val Loss: 0.6373 Acc: 0.6529
M

In [13]:
# log_handler.print("Encoder weights not frozen")
# Reference accuracy of 1 / 2 (presumably chance level for the two-class verification task).
random_acc = 0.5
random_acc

0.5

### TODO: Overall
* **Change spectrogram size?**
* **Contrastive classifier**
    * Implement intraclass variance reduction
* **Verification binary classifier**
    * Implement EER metric
    * Build 2nd validation set? (accent dataset)