# Train Network from Scratch for Speaker Identification

##Mount Google Drive as needed

In [0]:
from google.colab import drive
drive.mount('/content/drive')

Go to this URL in a browser: https://accounts.google.com/o/oauth2/auth?client_id=947318989803-6bn6qk8qdgf4n4g3pfee6491hc0brc4i.apps.googleusercontent.com&redirect_uri=urn%3Aietf%3Awg%3Aoauth%3A2.0%3Aoob&scope=email%20https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fdocs.test%20https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fdrive%20https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fdrive.photos.readonly%20https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fpeopleapi.readonly&response_type=code

Enter your authorization code:
··········
Mounted at /content/drive


##Download and Extract VoxCeleb1

In [0]:
! wget --user voxceleb1902 --password nx0bl2v2 http://www.robots.ox.ac.uk/~vgg/data/voxceleb/vox1a/vox1_dev_wav_partaa
! wget --user voxceleb1902 --password nx0bl2v2 http://www.robots.ox.ac.uk/~vgg/data/voxceleb/vox1a/vox1_dev_wav_partab
! wget --user voxceleb1902 --password nx0bl2v2 http://www.robots.ox.ac.uk/~vgg/data/voxceleb/vox1a/vox1_dev_wav_partac
! wget --user voxceleb1902 --password nx0bl2v2 http://www.robots.ox.ac.uk/~vgg/data/voxceleb/vox1a/vox1_dev_wav_partad

! wget --user voxceleb1902 --password nx0bl2v2 http://www.robots.ox.ac.uk/~vgg/data/voxceleb/vox1a/vox1_test_wav.zip

! wget http://www.robots.ox.ac.uk/~vgg/data/voxceleb/data/vox1_dev_txt.zip  
! wget http://www.robots.ox.ac.uk/~vgg/data/voxceleb/data/vox1_test_txt.zip

! wget http://www.robots.ox.ac.uk/~vgg/data/voxceleb/meta/iden_split.txt
! wget http://www.robots.ox.ac.uk/~vgg/data/voxceleb/meta/veri_test.txt
  
! cat vox1_dev* > vox1_dev_wav.zip
! rm vox1_dev_wav_partaa vox1_dev_wav_partab vox1_dev_wav_partac vox1_dev_wav_partad
! mkdir -p voxceleb1
! mv *.zip voxceleb1
! mv *.txt voxceleb1

--2019-07-24 20:29:59--  http://www.robots.ox.ac.uk/~vgg/data/voxceleb/vox1a/vox1_dev_wav_partaa
Resolving www.robots.ox.ac.uk (www.robots.ox.ac.uk)... 129.67.94.2
Connecting to www.robots.ox.ac.uk (www.robots.ox.ac.uk)|129.67.94.2|:80... connected.
HTTP request sent, awaiting response... 401 Unauthorized
Authentication selected: Basic realm="VoxCeleb1"
Reusing existing connection to www.robots.ox.ac.uk:80.
HTTP request sent, awaiting response... 200 OK
Length: 10737418240 (10G)
Saving to: ‘vox1_dev_wav_partaa’

vox1_dev_wav_partaa   0%[                    ]  56.41M  9.14MB/s    eta 15m 46s

In [0]:
import os
import zipfile

DATA_PATH = 'voxceleb1/'

print('Starting to unpack vox1_dev_wav.zip')
zip = zipfile.ZipFile(os.path.join(DATA_PATH, 'vox1_dev_wav.zip'), 'r')
zip.extractall(DATA_PATH)
zip.close()
print('Done. Starting to unpack vox1_test_wav.zip')
zip = zipfile.ZipFile(os.path.join(DATA_PATH, 'vox1_test_wav.zip'), 'r')
zip.extractall(DATA_PATH)
zip.close()
print('Done. Starting to unpack vox1_dev_txt.zip')
zip = zipfile.ZipFile(os.path.join(DATA_PATH, 'vox1_dev_txt.zip'), 'r')
zip.extractall(DATA_PATH)
zip.close()
print('Done. Starting to unpack vox1_test_txt.zip')
zip = zipfile.ZipFile(os.path.join(DATA_PATH, 'vox1_test_txt.zip'), 'r')
zip.extractall(DATA_PATH)
zip.close()
print('Done.')

os.remove(os.path.join(DATA_PATH, 'vox1_dev_wav.zip'))
os.remove(os.path.join(DATA_PATH, 'vox1_test_wav.zip'))
os.remove(os.path.join(DATA_PATH, 'vox1_dev_txt.zip'))
os.remove(os.path.join(DATA_PATH, 'vox1_test_txt.zip'))


##VGG-M with PyTorch

###Import Requirements

In [0]:
import os

import numpy as np
import pandas as pd
from scipy.io import wavfile
from scipy import signal

import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from torch.utils.data import Dataset
from torchvision.transforms import Compose

# ! pip install tensorboardX
# import tensorboardX
from tqdm import tqdm

import matplotlib.pyplot as plt

import warnings
warnings.filterwarnings('ignore') # scipy throws future warnings on fft (known bug)

DATA_PATH = 'voxceleb1/'

###Read wav files and calculate spectrogram

In [0]:
class IdentificationDataset(Dataset):
    
    def __init__(self, path, train, transform=None):
        iden_split_path = os.path.join(path, 'iden_split.txt')
        split = pd.read_table(iden_split_path, sep=' ', header=None, names=['phase', 'path'])
        
        if train=='train':
            phases = [1]
        elif train=='valid':
            phases = [2]
        else:
            phases = [3]
        
        mask = split['phase'].isin(phases)
        self.dataset = split['path'][mask].reset_index(drop=True)
        self.dataset = self.dataset[:int(len(self.dataset)/2)]
        self.path = path
        self.train = train
        self.transform = transform
    
    def __len__(self):
        return len(self.dataset)
    
    def __getitem__(self, idx):
        # path
        track_path = self.dataset[idx]
        audio_path = os.path.join(self.path, 'wav', track_path)

        # read .wav
        rate, samples = wavfile.read(audio_path)
        # extract label from path like id10003/L9_sh8msGV59/00001.txt
        # subtracting 1 because PyTorch assumes that C_i in [0, 1251-1]
        label = int(track_path.split('/')[0].replace('id1', '')) - 1

        ## parameters
        window = 'hamming'
        # window width and step size
        Tw = 25 # ms
        Ts = 10 # ms
        # frame duration (samples)
        Nw = int(rate * Tw * 1e-3)
        Ns = int(rate * (Tw - Ts) * 1e-3)
        # overlapped duration (samples)
        # 2 ** to the next pow of 2 of (Nw - 1)
        nfft = 2 ** (Nw - 1).bit_length()
        pre_emphasis = 0.97
        
        # preemphasis filter
        samples = np.append(samples[0], samples[1:] - pre_emphasis * samples[:-1])
        
        # removes DC component of the signal and add a small dither
        samples = signal.lfilter([1, -1], [1, -0.99], samples)
        dither = np.random.uniform(-1, 1, samples.shape)
        spow = np.std(samples)
        samples = samples + 1e-6 * spow * dither
        
        if self.train == 'train':
            # segment selection
            segment_len = 3 # sec
            upper_bound = len(samples) - segment_len * rate
            start = np.random.randint(0, upper_bound)
            end = start + segment_len * rate
            samples = samples[start:end]
        
        # spectogram
        _, _, spec = signal.spectrogram(samples, rate, window, Nw, Ns, nfft, 
                                        mode='magnitude', return_onesided=False)
        
        # just multiplying it by 1600 makes spectrograms in the paper and here "the same"
        spec *= rate / 10
        
        if self.transform:
            spec = self.transform(spec)
            
         
        
        
        
#         _, _, spec_phase = signal.spectrogram(samples, rate, window, Nw, Ns, nfft, 
#                                                 mode='phase', return_onesided=False)                
#         spec_phase[1:,:] = np.diff(spec_phase, axis=0)
#         spec_phase = spec_phase.reshape(1, spec_phase.shape[0], spec_phase.shape[1])
#         spec_phase = spec_phase.astype(np.float32)
#         spec_phase = torch.from_numpy(spec_phase)
#         spec_ = np.concatenate((spec, spec_phase), axis=0)    
        
        

        return label, spec

###mean and variance normalization

In [0]:
class Normalize(object):
    """Normalizes voice spectrogram (mean-varience)"""
    
    def __call__(self, spec):
        
        # (Freq, Time)
        # mean-variance normalization for every spectrogram (not batch-wise)
        mu = spec.mean(axis=1).reshape(512, 1)
        sigma = spec.std(axis=1).reshape(512, 1)
        spec = (spec - mu) / sigma

        return spec

class ToTensor(object):
    """Convert spectogram to Tensor."""
    
    def __call__(self, spec):
        F, T = spec.shape
        
        # now specs are of size (Freq, Time) and 2D but has to be 3D (channel dim)
        spec = spec.reshape(1, F, T)
        
        # make the ndarray to be of a proper type (was float64)
        spec = spec.astype(np.float32)
        
        return torch.from_numpy(spec)

###create model class

In [0]:
class VoiceNet(nn.Module):

    def __init__(self, num_classes=2):
        super(VoiceNet, self).__init__()
        
        self.conv1 = nn.Conv2d(in_channels=1, out_channels=96, kernel_size=7, stride=2, padding=1)
        self.conv2 = nn.Conv2d(in_channels=96, out_channels=256, kernel_size=5, stride=2, padding=1)
        self.conv3 = nn.Conv2d(in_channels=256, out_channels=256, kernel_size=3, padding=1)
        self.conv4 = nn.Conv2d(in_channels=256, out_channels=256, kernel_size=3, padding=1)
        self.conv5 = nn.Conv2d(in_channels=256, out_channels=256, kernel_size=3, padding=1)
        
        self.bn1 = nn.BatchNorm2d(num_features=96)
        self.bn2 = nn.BatchNorm2d(num_features=256)
        self.bn3 = nn.BatchNorm2d(num_features=256)
        self.bn4 = nn.BatchNorm2d(num_features=256)
        self.bn5 = nn.BatchNorm2d(num_features=256)
        self.bn6 = nn.BatchNorm2d(num_features=2048)
        self.bn7 = nn.BatchNorm1d(num_features=1024)
        
        self.relu = nn.ReLU()
        self.softmax = nn.Softmax()
        
        self.mpool1 = nn.MaxPool2d(kernel_size=3, stride=2)
        self.mpool2 = nn.MaxPool2d(kernel_size=3, stride=2)
        self.mpool5 = nn.MaxPool2d(kernel_size=(5, 3), stride=(3, 2))
        
        # Conv2d with weights of size (H, 1) is identical to FC with H weights
        self.fc6 = nn.Conv2d(in_channels=256, out_channels=2048, kernel_size=(9, 1))
        self.fc7 = nn.Linear(in_features=2048, out_features=1024)
        self.fc8 = nn.Linear(in_features=1024, out_features=num_classes)
        
    def forward(self, x):
        B, C, H, W = x.size()
        x = self.relu(self.bn1(self.conv1(x)))
        x = self.mpool1(x)
        x = self.relu(self.bn2(self.conv2(x)))
        x = self.mpool2(x) 
        x = self.relu(self.bn3(self.conv3(x)))
        x = self.relu(self.bn4(self.conv4(x)))
        x = self.relu(self.bn5(self.conv5(x)))
        x = self.mpool5(x)
        x = self.relu(self.bn6(self.fc6(x)))
        
        _, _, _, W = x.size()
        self.apool6 = nn.AvgPool2d(kernel_size=(1, W))
        x = self.apool6(x)
        
        x = x.view(x.size(0), -1)
        x = self.relu(self.bn7(self.fc7(x)))
        x = self.fc8(x)
        
        # during training, there's no need for SoftMax because CELoss calculates it
        if self.training:
            return x
        
        else:
            return self.softmax(x)

###set hyper-parameters

In [0]:
LOG_PATH = 'logs/VoxCeleb/rm_dc_n_dither'
! mkdir -p logs/VoxCeleb/rm_dc_n_dither
EPOCH_NUM = 10 #30

# in shared code B = 100 but PyTorch throws CUDA out of memory at B = 97 
# though B=96 takes only 90.6% of the GPU Mem (bug?):
# https://discuss.pytorch.org/t/lesser-memory-consumption-with-a-larger-batch-in-multi-gpu-setup/29087
# B = 96
# but when 
torch.backends.cudnn.deterministic = True
# I can set B = 100
B = 100

WEIGHT_DECAY = 5e-4
LR_INIT = 1e-2
LR_LAST = 1e-4
# lr scheduler parameter
gamma = 10 ** (np.log10(LR_LAST / LR_INIT) / (EPOCH_NUM - 1))
MOMENTUM = 0.9
DEVICE = 'cuda:0'
NUM_WORKERS = 4
# TBoard = tensorboardX.SummaryWriter(log_dir=LOG_PATH)

###create model and data generator

In [0]:
net = VoiceNet(num_classes=1251)
net.to(DEVICE)

transforms = Compose([
    Normalize(),
    ToTensor()
])

trainset = IdentificationDataset(DATA_PATH, train='train', transform=transforms)
trainsetloader = torch.utils.data.DataLoader(trainset, batch_size=B, num_workers=NUM_WORKERS, shuffle=True)

testset = IdentificationDataset(DATA_PATH, train='test', transform=transforms)
testsetloader = torch.utils.data.DataLoader(testset, batch_size=1, num_workers=NUM_WORKERS*2)

validset = IdentificationDataset(DATA_PATH, train='valid', transform=transforms)
validsetloader = torch.utils.data.DataLoader(validset, batch_size=1, num_workers=NUM_WORKERS*2)

criterion = nn.CrossEntropyLoss()
optimizer = optim.SGD(net.parameters(), LR_INIT, MOMENTUM, weight_decay=WEIGHT_DECAY)
lr_scheduler = optim.lr_scheduler.StepLR(optimizer, step_size=1, gamma=gamma)

###model info

In [0]:
from torchsummary import summary
summary(net, input_size=(1, 512, 300))

----------------------------------------------------------------
        Layer (type)               Output Shape         Param #
            Conv2d-1         [-1, 96, 254, 148]           4,800
       BatchNorm2d-2         [-1, 96, 254, 148]             192
              ReLU-3         [-1, 96, 254, 148]               0
         MaxPool2d-4          [-1, 96, 126, 73]               0
            Conv2d-5          [-1, 256, 62, 36]         614,656
       BatchNorm2d-6          [-1, 256, 62, 36]             512
              ReLU-7          [-1, 256, 62, 36]               0
         MaxPool2d-8          [-1, 256, 30, 17]               0
            Conv2d-9          [-1, 256, 30, 17]         590,080
      BatchNorm2d-10          [-1, 256, 30, 17]             512
             ReLU-11          [-1, 256, 30, 17]               0
           Conv2d-12          [-1, 256, 30, 17]         590,080
      BatchNorm2d-13          [-1, 256, 30, 17]             512
             ReLU-14          [-1, 256,

###Train

In [0]:
for epoch_num in range(EPOCH_NUM):
    lr_scheduler.step()
    
    # train
    net.train()
    
    for iter_num, (labels, specs) in tqdm(enumerate(trainsetloader)):
        optimizer.zero_grad()
        labels, specs = labels.to(DEVICE), specs.to(DEVICE)
        scores = net(specs)
        loss = criterion(scores, labels)
        loss.backward()
        optimizer.step()
        
    
    # validation
    net.eval()
    
    top5_accuracy = 0
    top1_accuracy = 0

    for _, (label, spec) in tqdm(enumerate(validsetloader)):
        label, spec = label.to(DEVICE), spec.to(DEVICE)
        probs = net(spec)

        # calculate Top-5 and Top-1 accuracy
        pred_top5 = probs.topk(5)[1].view(5)

        if label in pred_top5:
            # increment top-5 accuracy
            top5_accuracy += 1

            if label == pred_top5[0]:
                # increment top-1 accuracy
                top1_accuracy += 1

    top5_accuracy /= len(validsetloader)
    top1_accuracy /= len(validsetloader)

    
    print('\nvalidation_top5 =', round(100 * top5_accuracy, 2), '%')
    print('validation_top1 =', round(100 * top1_accuracy, 2), '%')
    
    torch.save(net, 'vggm_torch.pt')
        
# when the training is finished save the model
# torch.save(net.state_dict(), os.path.join(LOG_PATH, 'model_snapshot.txt'))
os.system('cp vggm_torch.pt drive/My\ Drive/models/vggm_torch.pt')


692it [19:23,  1.66s/it]
3452it [01:49, 31.40it/s]


validation_top5 = 13.3 %
validation_top1 = 4.32 %



692it [19:20,  1.66s/it]
3452it [01:47, 32.04it/s]


validation_top5 = 24.13 %
validation_top1 = 9.04 %



692it [19:21,  1.66s/it]
3452it [01:48, 31.81it/s]


validation_top5 = 34.13 %
validation_top1 = 16.02 %



692it [19:23,  1.67s/it]
3452it [01:49, 31.42it/s]


validation_top5 = 41.57 %
validation_top1 = 20.83 %



692it [19:24,  1.66s/it]
3452it [01:48, 31.94it/s]


validation_top5 = 43.8 %
validation_top1 = 22.54 %



692it [19:22,  1.66s/it]
3452it [01:48, 31.71it/s]


validation_top5 = 45.6 %
validation_top1 = 25.49 %



692it [19:23,  1.67s/it]
3452it [01:49, 31.63it/s]


validation_top5 = 47.54 %
validation_top1 = 26.39 %



692it [19:21,  1.66s/it]
3452it [01:48, 31.92it/s]


validation_top5 = 48.2 %
validation_top1 = 27.26 %



692it [19:22,  1.67s/it]
3452it [01:49, 31.53it/s]


validation_top5 = 48.9 %
validation_top1 = 27.78 %



692it [19:24,  1.66s/it]
3452it [01:49, 31.55it/s]



validation_top5 = 49.25 %
validation_top1 = 28.39 %


0

###Test

In [0]:
net.eval()

top5_accuracy = 0
top1_accuracy = 0

for _, (label, spec) in tqdm(enumerate(testsetloader)):
    label, spec = label.to(DEVICE), spec.to(DEVICE)
    probs = net(spec)

    # calculate Top-5 and Top-1 accuracy
    pred_top5 = probs.topk(5)[1].view(5)

    if label in pred_top5:
        # increment top-5 accuracy
        top5_accuracy += 1

        if label == pred_top5[0]:
            # increment top-1 accuracy
            top1_accuracy += 1

top5_accuracy /= len(testsetloader)
top1_accuracy /= len(testsetloader)

print('\ntest_top5 =', round(100 * top5_accuracy, 2), '%')
print('test_top1 =', round(100 * top1_accuracy, 2), '%')

4125it [02:13, 30.83it/s]


test_top5 = 48.75 %
test_top1 = 27.18 %





##VGG-M and ResNet50 with Keras

###Import Requirements

In [0]:
import os

import numpy as np
import pandas as pd
from scipy.io import wavfile
from scipy import signal
import pandas as pd

import warnings
warnings.filterwarnings('ignore') # scipy throws future warnings on fft (known bug)

DATA_PATH = 'voxceleb1/'

###Data Generator (function)

In [0]:
import keras

class DataGenerator(keras.utils.Sequence):
    'Generates data for Keras'
    def __init__(self, list_IDs, labels, train=True, batch_size=32, dim=(512,300), n_channels=1,
                 n_classes=1251, shuffle=True):
        'Initialization'
        self.train = train
        self.dim = dim
        self.batch_size = batch_size
        self.labels = labels
        self.list_IDs = list_IDs
        self.n_channels = n_channels
        self.n_classes = n_classes
        self.shuffle = shuffle
        self.on_epoch_end()

    def __len__(self):
        'Denotes the number of batches per epoch'
        return int(np.floor(len(self.list_IDs) / self.batch_size))

    def __getitem__(self, index):
        'Generate one batch of data'
        # Generate indexes of the batch
        indexes = self.indexes[index*self.batch_size:(index+1)*self.batch_size]

        # Find list of IDs
        list_IDs_temp = [self.list_IDs[k] for k in indexes]
        labels_temp = [self.labels[k] for k in indexes]

        # Generate data
        X, y = self.__data_generation(list_IDs_temp, labels_temp)

        return X, y

    def on_epoch_end(self):
        'Updates indexes after each epoch'
        self.indexes = np.arange(len(self.list_IDs))
        if self.shuffle == True:
            np.random.shuffle(self.indexes)

    def __data_generation(self, list_IDs_temp, labels_temp):
        'Generates data containing batch_size samples' # X : (n_samples, *dim, n_channels)
        # Initialization
        if self.train:
          X = np.empty((self.batch_size, *self.dim, self.n_channels))
        y = np.empty((self.batch_size), dtype=int)

        # Generate data
        for i, audio_path in enumerate(list_IDs_temp):          
          # Store class
          y[i] = labels_temp[i]

          # read .wav
          rate, samples = wavfile.read(audio_path)
          
          ## parameters
          window = 'hamming'
          # window width and step size
          Tw = 25 # ms
          Ts = 10 # ms
          # frame duration (samples)
          Nw = int(rate * Tw * 1e-3)
          Ns = int(rate * (Tw - Ts) * 1e-3)
          # overlapped duration (samples)
          # 2 ** to the next pow of 2 of (Nw - 1)
          nfft = 2 ** (Nw - 1).bit_length()
          pre_emphasis = 0.97

          # preemphasis filter
          samples = np.append(samples[0], samples[1:] - pre_emphasis * samples[:-1])

          # removes DC component of the signal and add a small dither
          samples = signal.lfilter([1, -1], [1, -0.99], samples)
          dither = np.random.uniform(-1, 1, samples.shape)
          spow = np.std(samples)
          samples = samples + 1e-6 * spow * dither
          
          
          
          
          for _ in range(2):
            samples = np.append(samples, samples)
         
        
        
          if self.train:
          # segment selection
            segment_len = 3 # sec
            upper_bound = len(samples) - segment_len * rate
            start = np.random.randint(0, upper_bound)
            end = start + segment_len * rate
            samples = samples[start:end]  

          # spectogram
          _, _, spec = signal.spectrogram(samples, rate, window, Nw, Ns, nfft, 
                                          mode='magnitude', return_onesided=False)
          
          if not self.train:
            X = np.empty((self.batch_size, spec.shape[0], spec.shape[1], self.n_channels))

          # just multiplying it by 1600 makes spectrograms in the paper and here "the same"
          spec *= rate / 10
          
          spec = Normalize(spec)
          spec = ToTensor(spec)
          
          # Store sample
          X[i,] = spec
        
        
        return X, keras.utils.to_categorical(y, num_classes=self.n_classes)

###Normalization (function)

In [0]:
def Normalize(spec):
  """Normalizes voice spectrogram (mean-varience)"""
  # (Freq, Time)
  # mean-variance normalization for every spectrogram (not batch-wise)
  mu = spec.mean(axis=1).reshape(spec.shape[0], 1)
  sigma = spec.std(axis=1).reshape(spec.shape[0], 1)
  spec = (spec - mu) / sigma

  return spec

def ToTensor(spec):
  """Convert spectogram to Tensor."""
  F, T = spec.shape

  # now specs are of size (Freq, Time) and 2D but has to be 3D (channel dim)
  spec = spec.reshape(F, T, 1)

  # make the ndarray to be of a proper type (was float64)
  spec = spec.astype(np.float32)

  return spec

###Split Data (function)

In [0]:
def split_data(phase):
  iden_split_path = os.path.join(DATA_PATH, 'iden_split.txt')
  split = pd.read_table(iden_split_path, sep=' ', header=None, names=['phase', 'path'])

  if phase == 'train':
    phases = [1,2]
#   elif phase == 'vaid':
#     phases = [2]
  else:
    phases = [3]

  mask = split['phase'].isin(phases)

  dataset = split['path'][mask].reset_index(drop=True)
  path = DATA_PATH

  list_IDs = [os.path.join(DATA_PATH, 'wav', track_path) for track_path in dataset]
  labels = [int(track_path.split('/')[0].replace('id1', '')) - 1 for track_path in dataset]
  
  return list_IDs[:int(len(list_IDs)/2)], labels[:int(len(list_IDs)/2)]

###Vgg-M

####Model Definition - VGG-M  (function)

In [8]:
from keras import Model
from keras.layers import Conv2D, Dense, Flatten, Activation, Input, Reshape
from keras.layers import MaxPooling2D, AveragePooling2D, ZeroPadding2D, GlobalAveragePooling2D
from keras.layers.normalization import BatchNormalization

def vgg_m_builder(num_classes, num_channel):
  
  inputs = Input(shape=(None,None,num_channel), name='input')

  x = Conv2D(96, 7, strides=2, padding='same', name='conv1')(inputs)
  x = BatchNormalization(trainable=False, name='batch1')(x)
  x = Activation('relu', name='act1')(x)
  x = MaxPooling2D(3, 2, name='mpool1')(x)
  
  x = Conv2D(256, 5, strides=2, padding='same', name='conv2')(x)
  x = BatchNormalization(name='batch2')(x)
  x = Activation('relu', name='act2')(x)
  x = MaxPooling2D(3, 2, name='mpool2')(x)
  
  x = Conv2D(384, 3, strides=1, padding='same', name='conv3')(x)
  x = BatchNormalization(name='batch3')(x)
  x = Activation('relu', name='act3')(x)
  
  x = Conv2D(256, 3, strides=1, padding='same', name='conv4')(x)
  x = BatchNormalization(name='batch4')(x)
  x = Activation('relu', name='act4')(x)
  
  x = Conv2D(256, 3, strides=1, padding='same', name='conv5')(x)
  x = BatchNormalization(name='batch5')(x)
  x = Activation('relu', name='act5')(x)
  x = MaxPooling2D(pool_size=(5,3), strides=(3,2), name='mpool5')(x)
  
  x = Conv2D(2048, (9,1), strides=1, padding='valid', name='fc6')(x)
  x = BatchNormalization(name='batch6')(x)
  x = Activation('relu', name='act6')(x)
#   x = AveragePooling2D(pool_size=(1,int(x.shape[2])), strides=1, name='apool6')(x)
#   x = Flatten(name='flat1')(x)

  x = GlobalAveragePooling2D()(x)
  
#   x = Reshape((1,1,4096))(x)
  
  x = Dense(1024, name='fc7')(x)
  x = BatchNormalization(name='batch7')(x)
  x = Activation('relu', name='act7')(x)    
  
  predictions = Dense(num_classes, activation='softmax', name='fc8')(x)

  model = Model(inputs=inputs, outputs=predictions)

  return model

Using TensorFlow backend.


####Build and Summary

In [0]:
model = vgg_m_builder(num_classes=1251, num_channel=1)

model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input (InputLayer)           (None, None, None, 1)     0         
_________________________________________________________________
conv1 (Conv2D)               (None, None, None, 96)    4800      
_________________________________________________________________
batch1 (BatchNormalization)  (None, None, None, 96)    384       
_________________________________________________________________
act1 (Activation)            (None, None, None, 96)    0         
_________________________________________________________________
mpool1 (MaxPooling2D)        (None, None, None, 96)    0         
_________________________________________________________________
conv2 (Conv2D)               (None, None, None, 256)   614656    
_________________________________________________________________
batch2 (BatchNormalization)  (None, None, None, 256)   1024      
__________

####compile and Train

In [0]:
from keras.callbacks import ModelCheckpoint
from keras import optimizers

# Parameters
params = {'dim': (512,298),
          'batch_size': 100,
          'n_classes': 1251,
          'n_channels': 1,
          'shuffle': True}

# Datasets
partition, labels = split_data('train') # IDs & Labels
# Generators
training_generator = DataGenerator(partition, labels, train=True, **params)

partition, labels = split_data('valid') # IDs & Labels
validation_generator = DataGenerator(partition, labels, **params)


# optimizer = 'sgd'
model.compile(optimizer = optimizers.SGD(lr=0.1, decay=1e-6, momentum=0.9),
              loss = 'categorical_crossentropy',
              metrics=['accuracy'])


# filepath = "drive/My\ Drive/model_vggm.h5"
filepath = "model_vggm.h5"
checkpoint = ModelCheckpoint(filepath, monitor='loss', verbose=1, save_best_only=True, mode='min')
callbacks_list = [checkpoint]

if os.path.isdir('model_vggm.h5'):
  model = Models.load_model('model_vggm.h5')
  
# validation_data=validation_generator,
model.fit_generator(generator=training_generator,
                    validation_data=validation_generator,
                    epochs=10,
                    use_multiprocessing=True,
                    workers=6,
                    callbacks=callbacks_list)

Epoch 1/10

Epoch 00001: loss improved from inf to 4.26980, saving model to model_vggm.h5
Epoch 2/10

Epoch 00002: loss improved from 4.26980 to 3.70760, saving model to model_vggm.h5
Epoch 3/10

Epoch 00003: loss improved from 3.70760 to 3.26047, saving model to model_vggm.h5
Epoch 4/10
131/726 [====>.........................] - ETA: 15:06 - loss: 3.0143 - acc: 0.4177

Process ForkPoolWorker-110:
Process ForkPoolWorker-111:
Process ForkPoolWorker-114:
Process ForkPoolWorker-112:
Process ForkPoolWorker-113:
Traceback (most recent call last):
  File "/usr/lib/python3.6/multiprocessing/process.py", line 258, in _bootstrap
    self.run()
Traceback (most recent call last):
Traceback (most recent call last):
Traceback (most recent call last):
  File "/usr/lib/python3.6/multiprocessing/process.py", line 258, in _bootstrap
    self.run()
  File "/usr/lib/python3.6/multiprocessing/process.py", line 258, in _bootstrap
    self.run()
Traceback (most recent call last):
  File "/usr/lib/python3.6/multiprocessing/process.py", line 258, in _bootstrap
    self.run()
  File "/usr/lib/python3.6/multiprocessing/process.py", line 93, in run
    self._target(*self._args, **self._kwargs)
  File "/usr/lib/python3.6/multiprocessing/process.py", line 93, in run
    self._target(*self._args, **self._kwargs)
  File "/usr/lib/python3.6/multiprocessing/process.py", line 258, in 

KeyboardInterrupt: ignored

Process ForkPoolWorker-109:
Traceback (most recent call last):
  File "/usr/lib/python3.6/multiprocessing/process.py", line 258, in _bootstrap
    self.run()
  File "/usr/lib/python3.6/multiprocessing/process.py", line 93, in run
    self._target(*self._args, **self._kwargs)
  File "/usr/lib/python3.6/multiprocessing/pool.py", line 125, in worker
    put((job, i, result))
  File "/usr/lib/python3.6/multiprocessing/queues.py", line 341, in put
    obj = _ForkingPickler.dumps(obj)
  File "/usr/lib/python3.6/multiprocessing/reduction.py", line 51, in dumps
    cls(buf, protocol).dump(obj)
KeyboardInterrupt


In [0]:
params = {'dim': (512,298),
          'batch_size': 1,
          'n_classes': 1251,
          'n_channels': 1,
          'shuffle': True}

partition, labels = split_data('test') # IDs & Labels
test_generator = DataGenerator(partition, labels, train=False, **params)

loss, accuracy = model.evaluate_generator(test_generator,
                                          use_multiprocessing=True,
                                          workers=6)
print(loss)
print("%.2f" % (accuracy * 100), '%')

8.941980450312297
0.53 %


####save model to drive

In [0]:
os.system('cp model_vggm.h5 drive/My\ Drive/model_vggm.h5')

###ResNet50

####Model Definition - ResNet50

In [0]:
from __future__ import division

import six
from keras.models import Model
from keras.layers import (
    Input,
    Activation,
    Dense,
    Flatten,
    GlobalAveragePooling2D
)
from keras.layers.convolutional import (
    Conv2D,
    MaxPooling2D,
    AveragePooling2D
)
from keras.layers.merge import add
from keras.layers.normalization import BatchNormalization
from keras.regularizers import l2
from keras import backend as K


def _bn_relu(input):
    """Helper to build a BN -> relu block
    """
    norm = BatchNormalization(axis=CHANNEL_AXIS)(input)
    return Activation("relu")(norm)


def _conv_bn_relu(**conv_params):
    """Helper to build a conv -> BN -> relu block
    """
    filters = conv_params["filters"]
    kernel_size = conv_params["kernel_size"]
    strides = conv_params.setdefault("strides", (1, 1))
    kernel_initializer = conv_params.setdefault("kernel_initializer", "he_normal")
    padding = conv_params.setdefault("padding", "same")
    kernel_regularizer = conv_params.setdefault("kernel_regularizer", l2(1.e-4))

    def f(input):
        conv = Conv2D(filters=filters, kernel_size=kernel_size,
                      strides=strides, padding=padding,
                      kernel_initializer=kernel_initializer,
                      kernel_regularizer=kernel_regularizer)(input)
        return _bn_relu(conv)

    return f


def _bn_relu_conv(**conv_params):
    """Helper to build a BN -> relu -> conv block.
    This is an improved scheme proposed in http://arxiv.org/pdf/1603.05027v2.pdf
    """
    filters = conv_params["filters"]
    kernel_size = conv_params["kernel_size"]
    strides = conv_params.setdefault("strides", (1, 1))
    kernel_initializer = conv_params.setdefault("kernel_initializer", "he_normal")
    padding = conv_params.setdefault("padding", "same")
    kernel_regularizer = conv_params.setdefault("kernel_regularizer", l2(1.e-4))

    def f(input):
        activation = _bn_relu(input)
        return Conv2D(filters=filters, kernel_size=kernel_size,
                      strides=strides, padding=padding,
                      kernel_initializer=kernel_initializer,
                      kernel_regularizer=kernel_regularizer)(activation)

    return f


def _shortcut(input, residual):
    """Adds a shortcut between input and residual block and merges them with "sum"
    """
    # Expand channels of shortcut to match residual.
    # Stride appropriately to match residual (width, height)
    # Should be int if network architecture is correctly configured.
    input_shape = K.int_shape(input)
    residual_shape = K.int_shape(residual)
    stride_width = int(round(input_shape[ROW_AXIS] / residual_shape[ROW_AXIS]))
    stride_height = int(round(input_shape[COL_AXIS] / residual_shape[COL_AXIS]))
    equal_channels = input_shape[CHANNEL_AXIS] == residual_shape[CHANNEL_AXIS]

    shortcut = input
    # 1 X 1 conv if shape is different. Else identity.
    if stride_width > 1 or stride_height > 1 or not equal_channels:
        shortcut = Conv2D(filters=residual_shape[CHANNEL_AXIS],
                          kernel_size=(1, 1),
                          strides=(stride_width, stride_height),
                          padding="valid",
                          kernel_initializer="he_normal",
                          kernel_regularizer=l2(0.0001))(input)

    return add([shortcut, residual])


def _residual_block(block_function, filters, repetitions, is_first_layer=False):
    """Builds a residual block with repeating bottleneck blocks.
    """
    def f(input):
        for i in range(repetitions):
            init_strides = (1, 1)
            if i == 0 and not is_first_layer:
                init_strides = (2, 2)
            input = block_function(filters=filters, init_strides=init_strides,
                                   is_first_block_of_first_layer=(is_first_layer and i == 0))(input)
        return input

    return f


def basic_block(filters, init_strides=(1, 1), is_first_block_of_first_layer=False):
    """Basic 3 X 3 convolution blocks for use on resnets with layers <= 34.
    Follows improved proposed scheme in http://arxiv.org/pdf/1603.05027v2.pdf
    """
    def f(input):

        if is_first_block_of_first_layer:
            # don't repeat bn->relu since we just did bn->relu->maxpool
            conv1 = Conv2D(filters=filters, kernel_size=(3, 3),
                           strides=init_strides,
                           padding="same",
                           kernel_initializer="he_normal",
                           kernel_regularizer=l2(1e-4))(input)
        else:
            conv1 = _bn_relu_conv(filters=filters, kernel_size=(3, 3),
                                  strides=init_strides)(input)

        residual = _bn_relu_conv(filters=filters, kernel_size=(3, 3))(conv1)
        return _shortcut(input, residual)

    return f


def bottleneck(filters, init_strides=(1, 1), is_first_block_of_first_layer=False):
    """Bottleneck architecture for > 34 layer resnet.
    Follows improved proposed scheme in http://arxiv.org/pdf/1603.05027v2.pdf
    Returns:
        A final conv layer of filters * 4
    """
    def f(input):

        if is_first_block_of_first_layer:
            # don't repeat bn->relu since we just did bn->relu->maxpool
            conv_1_1 = Conv2D(filters=filters, kernel_size=(1, 1),
                              strides=init_strides,
                              padding="same",
                              kernel_initializer="he_normal",
                              kernel_regularizer=l2(1e-4))(input)
        else:
            conv_1_1 = _bn_relu_conv(filters=filters, kernel_size=(1, 1),
                                     strides=init_strides)(input)

        conv_3_3 = _bn_relu_conv(filters=filters, kernel_size=(3, 3))(conv_1_1)
        residual = _bn_relu_conv(filters=filters * 4, kernel_size=(1, 1))(conv_3_3)
        return _shortcut(input, residual)

    return f


def _handle_dim_ordering():
    global ROW_AXIS
    global COL_AXIS
    global CHANNEL_AXIS
    if K.image_dim_ordering() == 'tf':
        ROW_AXIS = 1
        COL_AXIS = 2
        CHANNEL_AXIS = 3
    else:
        CHANNEL_AXIS = 1
        ROW_AXIS = 2
        COL_AXIS = 3


def _get_block(identifier):
    if isinstance(identifier, six.string_types):
        res = globals().get(identifier)
        if not res:
            raise ValueError('Invalid {}'.format(identifier))
        return res
    return identifier


class ResnetBuilder(object):
    @staticmethod
    def build(input_shape, num_outputs, block_fn, repetitions):
        """Builds a custom ResNet like architecture.
        Args:
            input_shape: The input shape in the form (nb_channels, nb_rows, nb_cols)
            num_outputs: The number of outputs at final softmax layer
            block_fn: The block function to use. This is either `basic_block` or `bottleneck`.
                The original paper used basic_block for layers < 50
            repetitions: Number of repetitions of various block units.
                At each block unit, the number of filters are doubled and the input size is halved
        Returns:
            The keras `Model`.
        """
        _handle_dim_ordering()
        if len(input_shape) != 3:
            raise Exception("Input shape should be a tuple (nb_channels, nb_rows, nb_cols)")

        # Permute dimension order if necessary
        if K.image_dim_ordering() == 'tf':
            input_shape = (input_shape[1], input_shape[2], input_shape[0])

        # Load function from str if needed.
        block_fn = _get_block(block_fn)

        input = Input(shape=input_shape)
#         conv1 = _conv_bn_relu(filters=64, kernel_size=(7, 7), strides=(2, 2))(input)
        conv1 = _conv_bn_relu(filters=32, kernel_size=(7, 7), strides=(2, 2))(input)
        pool1 = MaxPooling2D(pool_size=(3, 3), strides=(2, 2), padding="same")(conv1)

        block = pool1
        filters = 32 #64
        for i, r in enumerate(repetitions):
            block = _residual_block(block_fn, filters=filters, repetitions=r, is_first_layer=(i == 0))(block)
            filters *= 2

        # Last activation
        block = _bn_relu(block)

        # Classifier block
        block_shape = K.int_shape(block)
#         pool2 = AveragePooling2D(pool_size=(block_shape[ROW_AXIS], block_shape[COL_AXIS]),
#                                  strides=(1, 1))(block)
        
        fc1 = Conv2D(filters=1024, kernel_size=[block_shape[ROW_AXIS],1], strides=[1,1], padding='valid', name='fc1')(block)
    
#         pool2 = AveragePooling2D(pool_size=[1,block_shape[COL_AXIS]], name='avgpool')(fc1)                
#         flatten1 = Flatten()(pool2)
        
        flatten1 = GlobalAveragePooling2D(name='avgpool')(fc1)
        dense = Dense(units=num_outputs, kernel_initializer="he_normal",
                      activation="softmax")(flatten1)

        model = Model(inputs=input, outputs=dense)
        return model

    @staticmethod
    def build_resnet_18(input_shape, num_outputs):
        return ResnetBuilder.build(input_shape, num_outputs, basic_block, [2, 2, 2, 2])

    @staticmethod
    def build_resnet_34(input_shape, num_outputs):
        return ResnetBuilder.build(input_shape, num_outputs, basic_block, [3, 4, 6, 3])

    @staticmethod
    def build_resnet_50(input_shape, num_outputs):
        return ResnetBuilder.build(input_shape, num_outputs, bottleneck, [3, 4, 6, 3])

    @staticmethod
    def build_resnet_101(input_shape, num_outputs):
        return ResnetBuilder.build(input_shape, num_outputs, bottleneck, [3, 4, 23, 3])

    @staticmethod
    def build_resnet_152(input_shape, num_outputs):
        return ResnetBuilder.build(input_shape, num_outputs, bottleneck, [3, 8, 36, 3])

####Build and Summary

In [0]:
resnet = ResnetBuilder()
model = resnet.build_resnet_50((1,512,298),1251)

model.summary()

__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_2 (InputLayer)            (None, 512, 298, 1)  0                                            
__________________________________________________________________________________________________
conv2d_54 (Conv2D)              (None, 256, 149, 32) 1600        input_2[0][0]                    
__________________________________________________________________________________________________
batch_normalization_50 (BatchNo (None, 256, 149, 32) 128         conv2d_54[0][0]                  
__________________________________________________________________________________________________
activation_50 (Activation)      (None, 256, 149, 32) 0           batch_normalization_50[0][0]     
__________________________________________________________________________________________________
max_poolin

####compile and Train

In [0]:
from keras.callbacks import ModelCheckpoint

# Parameters
params = {'dim': (512,298),
          'batch_size': 64,
          'n_classes': 1251,
          'n_channels': 1,
          'shuffle': True}

# Datasets
partition, labels = split_data('train') # IDs & Labels
# Generators
training_generator = DataGenerator(partition, labels, train=True, **params)

partition, labels = split_data('valid') # IDs & Labels
validation_generator = DataGenerator(partition, labels, **params)


# optimizer = 'sgd'
model.compile(optimizer = 'sgd',
              loss = 'categorical_crossentropy',
              metrics=['accuracy'])


# filepath = "drive/My\ Drive/model_resnet50.h5"
filepath = "model_resnet50.h5"
checkpoint = ModelCheckpoint(filepath, monitor='loss', verbose=1, save_best_only=True, mode='min')
callbacks_list = [checkpoint]

# validation_data=validation_generator,
model.fit_generator(generator=training_generator,
                    validation_data=validation_generator,
                    epochs=10,
                    use_multiprocessing=True,
                    workers=6,
                    callbacks=callbacks_list)

Epoch 1/10

Epoch 00001: loss improved from inf to 8.71992, saving model to model_resnet50.h5
Epoch 2/10
   7/1080 [..............................] - ETA: 49:31 - loss: 8.0270 - acc: 0.0580

Exception in thread Thread-50:
Traceback (most recent call last):
  File "/usr/lib/python3.6/threading.py", line 916, in _bootstrap_inner
    self.run()
  File "/usr/lib/python3.6/threading.py", line 864, in run
    self._target(*self._args, **self._kwargs)
  File "/usr/lib/python3.6/multiprocessing/pool.py", line 405, in _handle_workers
    pool._maintain_pool()
  File "/usr/lib/python3.6/multiprocessing/pool.py", line 246, in _maintain_pool
    self._repopulate_pool()
  File "/usr/lib/python3.6/multiprocessing/pool.py", line 239, in _repopulate_pool
    w.start()
  File "/usr/lib/python3.6/multiprocessing/process.py", line 105, in start
    self._popen = self._Popen(self)
  File "/usr/lib/python3.6/multiprocessing/context.py", line 277, in _Popen
    return Popen(process_obj)
  File "/usr/lib/python3.6/multiprocessing/popen_fork.py", line 19, in __init__
    self._launch(process_obj)
  File "/usr/lib/python3.6/multiprocessing/popen_fork.py", line 66, in _launch
    self.pid = os.fork

   8/1080 [..............................] - ETA: 49:17 - loss: 8.0251 - acc: 0.0605

Exception in thread Thread-53:
Traceback (most recent call last):
  File "/usr/lib/python3.6/threading.py", line 916, in _bootstrap_inner
    self.run()
  File "/usr/lib/python3.6/threading.py", line 864, in run
    self._target(*self._args, **self._kwargs)
  File "/usr/lib/python3.6/multiprocessing/pool.py", line 405, in _handle_workers
    pool._maintain_pool()
  File "/usr/lib/python3.6/multiprocessing/pool.py", line 246, in _maintain_pool
    self._repopulate_pool()
  File "/usr/lib/python3.6/multiprocessing/pool.py", line 239, in _repopulate_pool
    w.start()
  File "/usr/lib/python3.6/multiprocessing/process.py", line 105, in start
    self._popen = self._Popen(self)
  File "/usr/lib/python3.6/multiprocessing/context.py", line 277, in _Popen
    return Popen(process_obj)
  File "/usr/lib/python3.6/multiprocessing/popen_fork.py", line 19, in __init__
    self._launch(process_obj)
  File "/usr/lib/python3.6/multiprocessing/popen_fork.py", line 66, in _launch
    self.pid = os.fork

KeyboardInterrupt: ignored

##Add Speech Signal Phase to Inputs

###Data Generator (magnitude, phase)

In [0]:
import keras

class DataGenerator_phase(keras.utils.Sequence):
    'Generates data for Keras'
    def __init__(self, list_IDs, labels, train=True, batch_size=32, dim=(512,300), n_channels=1,
                 n_classes=1251, shuffle=True):
        'Initialization'
        self.train = train
        self.dim = dim
        self.batch_size = batch_size
        self.labels = labels
        self.list_IDs = list_IDs[:int(len(list_IDs)/2)]
        self.n_channels = n_channels
        self.n_classes = n_classes
        self.shuffle = shuffle
        self.on_epoch_end()

    def __len__(self):
        'Denotes the number of batches per epoch'
        return int(np.floor(len(self.list_IDs) / self.batch_size))

    def __getitem__(self, index):
        'Generate one batch of data'
        # Generate indexes of the batch
        indexes = self.indexes[index*self.batch_size:(index+1)*self.batch_size]

        # Find list of IDs
        list_IDs_temp = [self.list_IDs[k] for k in indexes]
        labels_temp = [self.labels[k] for k in indexes]

        # Generate data
        X, y = self.__data_generation(list_IDs_temp, labels_temp)

        return X, y

    def on_epoch_end(self):
        'Updates indexes after each epoch'
        self.indexes = np.arange(len(self.list_IDs))
        if self.shuffle == True:
            np.random.shuffle(self.indexes)

    def __data_generation(self, list_IDs_temp, labels_temp):
        'Generates data containing batch_size samples' # X : (n_samples, *dim, n_channels)
        # Initialization
        X = np.empty((self.batch_size, *self.dim, self.n_channels))
        y = np.empty((self.batch_size), dtype=int)

        # Generate data
        for i, audio_path in enumerate(list_IDs_temp):          
          # Store class
          y[i] = labels_temp[i]

          # read .wav
          rate, samples = wavfile.read(audio_path)
          
          ## parameters
          window = 'hamming'
          # window width and step size
          Tw = 25 # ms
          Ts = 10 # ms
          # frame duration (samples)
          Nw = int(rate * Tw * 1e-3)
          Ns = int(rate * (Tw - Ts) * 1e-3)
          # overlapped duration (samples)
          # 2 ** to the next pow of 2 of (Nw - 1)
          nfft = 2 ** (Nw - 1).bit_length()
          pre_emphasis = 0.97

          # preemphasis filter
          samples = np.append(samples[0], samples[1:] - pre_emphasis * samples[:-1])

          # removes DC component of the signal and add a small dither
          samples = signal.lfilter([1, -1], [1, -0.99], samples)
          dither = np.random.uniform(-1, 1, samples.shape)
          spow = np.std(samples)
          samples = samples + 1e-6 * spow * dither
          
          
          
          
          for _ in range(2):
            samples = np.append(samples, samples)
         
        
        
#           if self.train:
          # segment selection
          segment_len = 3 # sec
          upper_bound = len(samples) - segment_len * rate
          start = np.random.randint(0, upper_bound)
          end = start + segment_len * rate
          samples = samples[start:end]

          # spectogram
          _, _, spec_mag = signal.spectrogram(samples, rate, window, Nw, Ns, nfft, 
                                          mode='magnitude', return_onesided=False)

          # just multiplying it by 1600 makes spectrograms in the paper and here "the same"
          spec_mag *= rate / 10
          
          spec_mag = Normalize(spec_mag)
          spec_mag = ToTensor(spec_mag)
          
          spec_mag.reshape(spec_mag.shape[0], spec_mag.shape[1], 1)
          
          # phase
          _, _, spec_phase = signal.spectrogram(samples, rate, window, Nw, Ns, nfft, 
                                          mode='phase', return_onesided=False)
          spec_phase[1:,:] = np.diff(spec_phase, axis=0)
          spec_phase *= rate / 10
          spec_phase = Normalize(spec_phase)
          spec_phase = ToTensor(spec_phase)

          spec_phase.reshape(spec_phase.shape[0], spec_phase.shape[1], 1)
                    
          spec = np.concatenate((spec_mag, spec_phase), axis=2)
          
          # Store sample
          X[i,] = spec
        

        return X, keras.utils.to_categorical(y, num_classes=self.n_classes)

###Train

In [0]:
# Parameters
params = {'dim': (512,298),
          'batch_size': 100,
          'n_classes': 1251,
          'n_channels': 2,
          'shuffle': True}

# Datasets
partition, labels = split_data('train') # IDs & Labels

# Generators
training_generator_phase = DataGenerator_phase(partition, labels, train=True, **params)
# validation_generator = DataGenerator(partition['validation'], labels, **params)


model = vgg_m_builder(num_classes=1251, num_channel=2)

# optimizer = 'sgd'
model.compile(optimizer = 'sgd',
              loss = 'categorical_crossentropy',
              metrics=['accuracy'])

# validation_data=validation_generator,
model.fit_generator(generator=training_generator_phase,
                    epochs=10,
                    use_multiprocessing=True,
                    workers=6)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10

ResourceExhaustedError: ignored

In [0]:
# Parameters
params = {'dim': (512,298),
          'batch_size': 100,
          'n_classes': 1251,
          'n_channels': 2,
          'shuffle': True}

# Datasets
partition, labels = split_data('test') # IDs & Labels

# Generators
test_generator = DataGenerator_phase(partition, labels, train=False, **params)

loss, accuracy = model.evaluate_generator(test_generator,
                                          use_multiprocessing=True,
                                          workers=6)
print(loss)
print("%.2f" % (accuracy * 100), '%')


10.273668382225967
0.17 %


# Pre-Trained Model (Keras)

###Mount Google Drive

In [2]:
from google.colab import drive
drive.mount('/content/drive')

Go to this URL in a browser: https://accounts.google.com/o/oauth2/auth?client_id=947318989803-6bn6qk8qdgf4n4g3pfee6491hc0brc4i.apps.googleusercontent.com&redirect_uri=urn%3Aietf%3Awg%3Aoauth%3A2.0%3Aoob&scope=email%20https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fdocs.test%20https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fdrive%20https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fdrive.photos.readonly%20https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fpeopleapi.readonly&response_type=code

Enter your authorization code:
··········
Mounted at /content/drive


###Download Mozilla Cmmon Voice Persian dataset and save to google drive

In [0]:
! wget https://voice-prod-bundler-ee1969a6ce8178826482b88e843c335139bd3fb4.s3.amazonaws.com/cv-corpus-3/fa.tar.gz
! cp fa.tar.gz drive/My\ Drive/datasets/fa.tar.gz

### or Import saved dataset from google drive

In [0]:
! cp drive/My\ Drive/datasets/fa.tar.gz fa.tar.gz

In [0]:
! mkdir common_voice
! tar -C common_voice -xf fa.tar.gz

##as Feature Extractor

In [0]:
! git clone https://github.com/mohsenoon/iust-dl97-project.git
! apt install libasound2-dev portaudio19-dev libportaudio2 libportaudiocpp0
! pip install pyaudio

###for 1194 speakers

In [0]:
import os
import collections

with open('common_voice/validated.tsv', 'r') as val:
  lines = val.readlines()
  
clients_id = []
files_name = []
mp3_name = []
for x in lines[1:]:
  clients_id.append(x.split()[0])
  files_name.append(x.split()[1].replace('mp3','wav'))
  mp3_name.append(x.split()[1])
  
import collections
sample_per_speaker = 2
spk_id = [item for item, count in collections.Counter(clients_id).items() if count >= sample_per_speaker]


spk_index = []
file_index = []
mp3_index = []
for i, sid in enumerate(spk_id):
  idx = clients_id.index(sid)
  [spk_index.append(i) for f in clients_id[idx : idx+sps]]
  [file_index.append(os.path.join(DATA_PATH, f)) for f in files_name[idx : idx+sps]]
  [mp3_index.append(os.path.join('common_voice/clips', f)) for f in mp3_name[idx : idx+sps]]

In [0]:
import os
# mp3 to wav
if not os.path.isdir('common_voice/wav'):
  os.mkdir('common_voice/wav')
for i, wav in enumerate(file_index):
  os.system('ffmpeg -i {} -ar 16000 {}'.format(mp3_index[i], wav))

In [0]:
import csv

with open('iust-dl97-project/cfg/enroll_list.csv', mode='w') as csv_file:
    fieldnames = ['filename', 'speaker']
    writer = csv.DictWriter(csv_file, fieldnames=fieldnames)
    writer.writeheader()
    for i in range(0, len(file_index), 2):
      writer.writerow({'filename': '../'+file_index[i], 'speaker': spk_index[i]})

with open('iust-dl97-project/cfg/test_list.csv', mode='w') as csv_file:
    fieldnames = ['filename', 'speaker']
    writer = csv.DictWriter(csv_file, fieldnames=fieldnames)
    writer.writeheader()
    for i in range(1, len(file_index), 2):
      writer.writerow({'filename': '../'+file_index[i], 'speaker': spk_index[i]})


In [0]:
% cd iust-dl97-project
! python scoring.py
% cd ..

/content/iust-dl97-project
Using TensorFlow backend.
Loading model weights from [model/weights.h5]....
W0721 14:41:17.353217 140413200598912 deprecation_wrapper.py:119] From /usr/local/lib/python3.6/dist-packages/keras/backend/tensorflow_backend.py:517: The name tf.placeholder is deprecated. Please use tf.compat.v1.placeholder instead.

W0721 14:41:17.370271 140413200598912 deprecation_wrapper.py:119] From /usr/local/lib/python3.6/dist-packages/keras/backend/tensorflow_backend.py:4138: The name tf.random_uniform is deprecated. Please use tf.random.uniform instead.

W0721 14:41:17.397982 140413200598912 deprecation_wrapper.py:119] From /usr/local/lib/python3.6/dist-packages/keras/backend/tensorflow_backend.py:245: The name tf.get_default_graph is deprecated. Please use tf.compat.v1.get_default_graph instead.

W0721 14:41:17.398170 140413200598912 deprecation_wrapper.py:119] From /usr/local/lib/python3.6/dist-packages/keras/backend/tensorflow_backend.py:174: The name tf.get_default_sessi

####Accuracy Calculation

In [0]:
import pandas as pd

df = pd.read_csv('iust-dl97-project/res/results.csv')

accuracy = sum(df['correct']) / len(df['correct'])

print('Accuracy =', accuracy * 100, '%')

Accuracy = 30.569514237855945 %


###for 196 speakers

In [0]:
import os
import collections

with open('common_voice/validated.tsv', 'r') as val:
  lines = val.readlines()
  
clients_id = []
files_name = []
mp3_name = []
for x in lines[1:]:
  clients_id.append(x.split()[0])
  files_name.append(x.split()[1].replace('mp3','wav'))
  mp3_name.append(x.split()[1])
  
import collections
sample_per_speaker = 40
spk_id = [item for item, count in collections.Counter(clients_id).items() if count >= sample_per_speaker]

print('number of speakers =', len(spk_id))

spk_index = []
file_index = []
mp3_index = []
for i, sid in enumerate(spk_id):
  idx = clients_id.index(sid)
  [spk_index.append(i) for f in clients_id[idx : idx+sps]]
  [file_index.append(os.path.join(DATA_PATH, f)) for f in files_name[idx : idx+sps]]
  [mp3_index.append(os.path.join('common_voice/clips', f)) for f in mp3_name[idx : idx+sps]]

number of speakers = 196


In [0]:
import csv

with open('iust-dl97-project/cfg/enroll_list.csv', mode='w') as csv_file:
    fieldnames = ['filename', 'speaker']
    writer = csv.DictWriter(csv_file, fieldnames=fieldnames)
    writer.writeheader()
    for i in range(0, len(file_index), 2):
      writer.writerow({'filename': '../'+file_index[i], 'speaker': spk_index[i]})

with open('iust-dl97-project/cfg/test_list.csv', mode='w') as csv_file:
    fieldnames = ['filename', 'speaker']
    writer = csv.DictWriter(csv_file, fieldnames=fieldnames)
    writer.writeheader()
    for i in range(1, len(file_index), 2):
      writer.writerow({'filename': '../'+file_index[i], 'speaker': spk_index[i]})

In [0]:
% cd iust-dl97-project
! python scoring.py
% cd ..

/content/iust-dl97-project
Using TensorFlow backend.
Loading model weights from [model/weights.h5]....
W0721 14:33:04.827537 140481328191360 deprecation_wrapper.py:119] From /usr/local/lib/python3.6/dist-packages/keras/backend/tensorflow_backend.py:517: The name tf.placeholder is deprecated. Please use tf.compat.v1.placeholder instead.

W0721 14:33:04.844577 140481328191360 deprecation_wrapper.py:119] From /usr/local/lib/python3.6/dist-packages/keras/backend/tensorflow_backend.py:4138: The name tf.random_uniform is deprecated. Please use tf.random.uniform instead.

W0721 14:33:04.872748 140481328191360 deprecation_wrapper.py:119] From /usr/local/lib/python3.6/dist-packages/keras/backend/tensorflow_backend.py:245: The name tf.get_default_graph is deprecated. Please use tf.compat.v1.get_default_graph instead.

W0721 14:33:04.872946 140481328191360 deprecation_wrapper.py:119] From /usr/local/lib/python3.6/dist-packages/keras/backend/tensorflow_backend.py:174: The name tf.get_default_sessi

####Accuracy Calculation

In [0]:
import pandas as pd

df = pd.read_csv('iust-dl97-project/res/results.csv')

accuracy = sum(df['correct']) / len(df['correct'])

print('Accuracy =', accuracy * 100, '%')

Accuracy = 50.0 %


##FineTuning

In [5]:
! wget http://www.robots.ox.ac.uk/~vgg/data/voxceleb/models/vggvox_ident_net.mat

--2019-07-25 06:18:00--  http://www.robots.ox.ac.uk/~vgg/data/voxceleb/models/vggvox_ident_net.mat
Resolving www.robots.ox.ac.uk (www.robots.ox.ac.uk)... 129.67.94.2
Connecting to www.robots.ox.ac.uk (www.robots.ox.ac.uk)|129.67.94.2|:80... connected.
HTTP request sent, awaiting response... 200 OK
Length: 66806091 (64M)
Saving to: ‘vggvox_ident_net.mat’


2019-07-25 06:18:16 (4.17 MB/s) - ‘vggvox_ident_net.mat’ saved [66806091/66806091]



In [0]:
from scipy.io import loadmat
import numpy as np

net = loadmat('vggvox_ident_net.mat',
                matlab_compatible=False,
                struct_as_record=False)
net = net['net'][0,0]
layers = net.layers[0]

layers_dict = {}
for layer in layers:
  layers_dict[layer[0,0].name[0]] = layer[0,0]

In [0]:
with open('common_voice/validated.tsv', 'r') as val:
  lines = val.readlines()
  
clients_id = []
files_name = []
for x in lines[1:]:
  clients_id.append(x.split()[0])
  files_name.append(x.split()[1].replace('mp3','wav'))
  
import collections
sps = 40
spk_id = [item for item, count in collections.Counter(clients_id).items() if count >= sps]

In [0]:
import os

DATA_PATH = 'common_voice/wav'

spk_index = []
file_index = []
for i, sid in enumerate(spk_id):
  idx = clients_id.index(sid)
  [spk_index.append(i) for f in clients_id[idx : idx+sps]]
  [file_index.append(os.path.join(DATA_PATH, f)) for f in files_name[idx : idx+sps]]


###convert mp3 to wav

In [0]:
import os
# mp3 to wav
mp3_path = 'common_voice/clips'
wav_path = 'common_voice/wav'
for wav in file_index:
  os.system('ffmpeg -i {}.mp3 -ar 16000 {}'.format(os.path.join(mp3_path, os.path.splitext(wav)[0]), os.path.join(wav_path, wav)))


In [0]:
from keras import Model
from keras.layers import Conv2D, Dense, Flatten, Activation, Input, Reshape
from keras.layers import MaxPooling2D, AveragePooling2D, ZeroPadding2D, GlobalAveragePooling2D
from keras.layers.normalization import BatchNormalization

def vgg_m_builder_4096(num_classes, num_channel):
  
  inputs = Input(shape=(None,None,num_channel), name='input')

  x = Conv2D(96, 7, strides=2, padding='same', name='conv1')(inputs)
  x = BatchNormalization(trainable=False, name='batch1')(x)
  x = Activation('relu', name='act1')(x)
  x = MaxPooling2D(3, 2, name='mpool1')(x)
  
  x = Conv2D(256, 5, strides=2, padding='same', name='conv2')(x)
  x = BatchNormalization(name='batch2')(x)
  x = Activation('relu', name='act2')(x)
  x = MaxPooling2D(3, 2, name='mpool2')(x)
  
  x = Conv2D(384, 3, strides=1, padding='same', name='conv3')(x)
  x = BatchNormalization(name='batch3')(x)
  x = Activation('relu', name='act3')(x)
  
  x = Conv2D(256, 3, strides=1, padding='same', name='conv4')(x)
  x = BatchNormalization(name='batch4')(x)
  x = Activation('relu', name='act4')(x)
  
  x = Conv2D(256, 3, strides=1, padding='same', name='conv5')(x)
  x = BatchNormalization(name='batch5')(x)
  x = Activation('relu', name='act5')(x)
  x = MaxPooling2D(pool_size=(5,3), strides=(3,2), name='mpool5')(x)
  
  x = Conv2D(4096, (9,1), strides=1, padding='valid', name='fc6')(x)
  x = BatchNormalization(name='batch6')(x)
  x = Activation('relu', name='act6')(x)
#   x = AveragePooling2D(pool_size=(1,int(x.shape[2])), strides=1, name='apool6')(x)
#   x = Flatten(name='flat1')(x)

  x = GlobalAveragePooling2D()(x)
  
#   x = Reshape((1,1,4096))(x)
  
  x = Dense(1024, name='fc7')(x)
  x = BatchNormalization(name='batch7')(x)
  x = Activation('relu', name='act7')(x)    
  
  predictions = Dense(num_classes, activation='softmax', name='fc8')(x)

  model = Model(inputs=inputs, outputs=predictions)

  return model

In [0]:
model = vgg_m_builder_4096(1251, 1)

In [16]:
layers_name = ['conv1', 'conv2', 'conv3', 'conv4', 'conv5', 'fc6', 'fc7', 'fc8']
for name in layers_name:
  weights =[]
  weights.append(layers_dict[name].weights[0,0])
  weights.append(np.asarray([b[0] for b in layers_dict[name].weights[0,1]]))
  model.get_layer(name).set_weights(weights)

ValueError: ignored