## Download and Extract VoxCeleb1

In [0]:
! wget --user voxceleb1902 --password nx0bl2v2 http://www.robots.ox.ac.uk/~vgg/data/voxceleb/vox1a/vox1_dev_wav_partaa
! wget --user voxceleb1902 --password nx0bl2v2 http://www.robots.ox.ac.uk/~vgg/data/voxceleb/vox1a/vox1_dev_wav_partab
! wget --user voxceleb1902 --password nx0bl2v2 http://www.robots.ox.ac.uk/~vgg/data/voxceleb/vox1a/vox1_dev_wav_partac
! wget --user voxceleb1902 --password nx0bl2v2 http://www.robots.ox.ac.uk/~vgg/data/voxceleb/vox1a/vox1_dev_wav_partad

! wget --user voxceleb1902 --password nx0bl2v2 http://www.robots.ox.ac.uk/~vgg/data/voxceleb/vox1a/vox1_test_wav.zip

! wget http://www.robots.ox.ac.uk/~vgg/data/voxceleb/data/vox1_dev_txt.zip  
! wget http://www.robots.ox.ac.uk/~vgg/data/voxceleb/data/vox1_test_txt.zip

! wget http://www.robots.ox.ac.uk/~vgg/data/voxceleb/meta/iden_split.txt
! wget http://www.robots.ox.ac.uk/~vgg/data/voxceleb/meta/veri_test.txt
  
! cat vox1_dev* > vox1_dev_wav.zip
! rm vox1_dev_wav_partaa vox1_dev_wav_partab vox1_dev_wav_partac vox1_dev_wav_partad
! mkdir -p voxceleb1
! mv *.zip voxceleb1
! mv *.txt voxceleb1

In [0]:
import os
import zipfile

DATA_PATH = 'voxceleb1/'

# Archives to unpack into DATA_PATH, in the order they are processed.
ARCHIVE_NAMES = ('vox1_dev_wav.zip',
                 'vox1_test_wav.zip',
                 'vox1_dev_txt.zip',
                 'vox1_test_txt.zip')

for position, archive_name in enumerate(ARCHIVE_NAMES):
  # Every message after the first also reports that the previous archive finished.
  print(('Done. ' if position else '') + 'Starting to unpack ' + archive_name)
  # `with` closes the archive even if extraction fails; the handle is not named
  # `zip`, so the builtin stays usable in later cells.
  with zipfile.ZipFile(os.path.join(DATA_PATH, archive_name), 'r') as archive:
    archive.extractall(DATA_PATH)
print('Done.')

# Delete the archives only after all of them were extracted successfully.
for archive_name in ARCHIVE_NAMES:
  os.remove(os.path.join(DATA_PATH, archive_name))


## With Keras

In [0]:
import os

import numpy as np
import pandas as pd
from scipy.io import wavfile
from scipy import signal

import warnings
warnings.filterwarnings('ignore') # scipy throws future warnings on fft (known bug)

In [0]:
def Normalize(spec):
  """Mean-variance normalize a spectrogram, row by row.

  Each row (axis 0 entry) is shifted by its own mean and divided by its own
  standard deviation, both taken along axis 1. The statistics are computed per
  spectrogram, not across a batch.

  Args:
    spec: array laid out as (Freq, Time).

  Returns:
    Array of the same shape as `spec`. Rows whose standard deviation is zero
    come back as zeros instead of NaN/inf.
  """
  mu = spec.mean(axis=1, keepdims=True)
  sigma = spec.std(axis=1, keepdims=True)
  # A constant row has sigma == 0; dividing by it would inject NaN/inf into the
  # batch. Its centered values are all zero, so dividing by 1 keeps them zero.
  sigma[sigma == 0] = 1

  return (spec - mu) / sigma

def ToTensor(spec):
  """Return a 2-D spectrogram as a float32 array with a trailing channel axis.

  Args:
    spec: 2-D array laid out as (Freq, Time).

  Returns:
    float32 array of shape (Freq, Time, 1).
  """
  # Unpacking the shape also enforces that the input is exactly two-dimensional.
  n_freq, n_time = spec.shape

  # Cast to the dtype expected downstream, then append the channel dimension.
  as_float32 = spec.astype(np.float32)
  return as_float32.reshape(n_freq, n_time, 1)

#### Data Generator

In [0]:
import keras

class DataGenerator(keras.utils.Sequence):
    """Keras ``Sequence`` that yields batches of spectrograms computed from .wav files.

    Each item is ``(X, y)``: ``X`` has shape ``(batch_size, *dim, n_channels)``
    and ``y`` is the one-hot encoding of the batch labels with ``n_classes``
    columns.
    """

    def __init__(self, list_IDs, labels, train=True, batch_size=32, dim=(512,300), n_channels=1,
                 n_classes=1251, shuffle=True):
        """Store the configuration and build the sample order for the first epoch.

        Args:
            list_IDs: sequence of .wav file paths.
            labels: sequence of integer class ids, aligned with ``list_IDs``.
            train: if True a random 3-second segment of every clip is used;
                otherwise the leading 3-second segment is used.
            batch_size: number of samples per batch.
            dim: (Freq, Time) size every spectrogram must have.
            n_channels: size of the last axis of ``X``.
            n_classes: number of columns of the one-hot label matrix.
            shuffle: whether to reshuffle the sample order at every epoch end.
        """
        self.train = train
        self.dim = dim
        self.batch_size = batch_size
        self.labels = labels
        self.list_IDs = list_IDs
        self.n_channels = n_channels
        self.n_classes = n_classes
        self.shuffle = shuffle
        self.on_epoch_end()

    def __len__(self):
        """Number of full batches per epoch (a trailing partial batch is dropped)."""
        return int(np.floor(len(self.list_IDs) / self.batch_size))

    def __getitem__(self, index):
        """Build and return batch number ``index`` as ``(X, y)``."""
        # Positions (into list_IDs / labels) of the samples of this batch
        indexes = self.indexes[index*self.batch_size:(index+1)*self.batch_size]

        list_IDs_temp = [self.list_IDs[k] for k in indexes]
        labels_temp = [self.labels[k] for k in indexes]

        X, y = self.__data_generation(list_IDs_temp, labels_temp)

        return X, y

    def on_epoch_end(self):
        """Rebuild the sample order; shuffled when ``shuffle`` is set."""
        self.indexes = np.arange(len(self.list_IDs))
        if self.shuffle:
            np.random.shuffle(self.indexes)

    def _load_spectrogram(self, audio_path):
        """Read one .wav file and return its normalized spectrogram as (Freq, Time, 1) float32."""
        rate, samples = wavfile.read(audio_path)

        ## parameters
        window = 'hamming'
        # window width and step size
        Tw = 25 # ms
        Ts = 10 # ms
        # window length (samples)
        Nw = int(rate * Tw * 1e-3)
        # overlap between consecutive windows (samples); passed as `noverlap`
        Ns = int(rate * (Tw - Ts) * 1e-3)
        # FFT size: smallest power of two that is >= Nw
        nfft = 2 ** (Nw - 1).bit_length()
        pre_emphasis = 0.97

        # preemphasis filter
        samples = np.append(samples[0], samples[1:] - pre_emphasis * samples[:-1])

        # removes DC component of the signal and add a small dither
        samples = signal.lfilter([1, -1], [1, -0.99], samples)
        dither = np.random.uniform(-1, 1, samples.shape)
        spow = np.std(samples)
        samples = samples + 1e-6 * spow * dither

        # Repeat the clip four times so short recordings can still provide a segment.
        samples = np.tile(samples, 4)

        # segment selection
        segment_len = 3 # sec
        segment_samples = segment_len * rate
        if len(samples) <= segment_samples:
            # Still too short: keep repeating until a full segment fits, otherwise
            # np.random.randint below would be called with an empty range.
            samples = np.tile(samples, segment_samples // len(samples) + 1)

        if self.train:
            start = np.random.randint(0, len(samples) - segment_samples)
        else:
            # Evaluation uses the leading segment: deterministic, and the
            # spectrogram gets the same fixed size as during training.
            start = 0
        samples = samples[start:start + segment_samples]

        # spectogram
        # NOTE(review): mode='complex' yields complex values and ToTensor casts to
        # float32, which keeps only the real part -- confirm 'magnitude' was not intended.
        _, _, spec = signal.spectrogram(samples, rate, window, Nw, Ns, nfft,
                                        mode='complex', return_onesided=False)

        # just multiplying it by 1600 makes spectrograms in the paper and here "the same"
        spec *= rate / 10

        spec = Normalize(spec)
        return ToTensor(spec)

    def __data_generation(self, list_IDs_temp, labels_temp):
        """Compute the spectrogram batch ``X`` and one-hot labels for the given files."""
        # X : (n_samples, *dim, n_channels)
        X = np.empty((self.batch_size, *self.dim, self.n_channels))
        y = np.empty((self.batch_size), dtype=int)

        for i, audio_path in enumerate(list_IDs_temp):
            # Store class
            y[i] = labels_temp[i]

            spec = self._load_spectrogram(audio_path)
            if spec.shape[:2] != tuple(self.dim):
                raise ValueError('spectrogram of {} has shape {}, expected dim {}'.format(
                    audio_path, spec.shape[:2], tuple(self.dim)))

            # Store sample
            X[i,] = spec

        return X, keras.utils.to_categorical(y, num_classes=self.n_classes)

In [0]:
from scipy import signal
import matplotlib.pyplot as plt
from google.colab.patches import cv2_imshow

In [0]:
# Load the recording; wavfile.read returns (sampling rate, sample array).
rate, samples = wavfile.read('madc0_si737.wav')

## analysis parameters
window = 'hamming'
pre_emphasis = 0.97
Tw = 25 # window width, ms
Ts = 10 # window step, ms
# window length (samples)
Nw = int(rate * Tw * 1e-3)
# overlap between consecutive windows (samples); used as `noverlap` below
Ns = int(rate * (Tw - Ts) * 1e-3)
# FFT size: smallest power of two that is >= Nw
nfft = 2 ** (Nw - 1).bit_length()

# pre-emphasis: first sample kept, every later one minus 0.97 * its predecessor
samples = np.append(samples[0], samples[1:] - pre_emphasis * samples[:-1])

# remove the DC component, then add a dither scaled by the signal's std
samples = signal.lfilter([1, -1], [1, -0.99], samples)
spow = np.std(samples)
dither = np.random.uniform(-1, 1, samples.shape)
samples = samples + 1e-6 * spow * dither

# Both spectrograms share every setting except `mode`.
stft_kwargs = dict(fs=rate, window=window, nperseg=Nw, noverlap=Ns, nfft=nfft,
                   return_onesided=False)

_, _, spec = signal.spectrogram(samples, mode='magnitude', **stft_kwargs)
# cv2_imshow(spec)

_, _, spec_phase = signal.spectrogram(samples, mode='phase', **stft_kwargs)
# cv2_imshow(spec_phase)

# Stack magnitude and phase as two channels on a new last axis.
spec = np.stack((spec, spec_phase), axis=-1)

spec.shape

(256, 478, 2)

In [0]:
def split_data(phase):
  """Return the .wav paths and integer labels of one phase of iden_split.txt.

  Args:
    phase: 'train' selects the rows marked 1 or 2; any other value selects
      the rows marked 3.

  Returns:
    (list_IDs, labels): file paths under DATA_PATH/wav and, for each path, the
    number left after stripping 'id1' from its first directory, minus one.
  """
  wanted_phases = [1, 2] if phase == 'train' else [3]

  # Every line of the split file is "<phase> <relative path>".
  table = pd.read_table(os.path.join(DATA_PATH, 'iden_split.txt'),
                        sep=' ', header=None, names=['phase', 'path'])
  selected = table['path'][table['phase'].isin(wanted_phases)].reset_index(drop=True)

  list_IDs = []
  labels = []
  for track_path in selected:
    list_IDs.append(os.path.join(DATA_PATH, 'wav', track_path))
    speaker_dir = track_path.split('/')[0]
    labels.append(int(speaker_dir.replace('id1', '')) - 1)

  return list_IDs, labels

In [0]:
from keras.models import Sequential

# Generator settings, forwarded to DataGenerator as keyword arguments
params = dict(dim=(512,298),
              batch_size=100,
              n_classes=1251,
              n_channels=1,
              shuffle=True)

# File paths and labels of the training phase
partition, labels = split_data('train')

# Batch generator used for training
training_generator = DataGenerator(partition, labels, train=True, **params)
# validation_generator = DataGenerator(partition['validation'], labels, **params)


In [0]:
from keras import Model
from keras.layers import Conv2D, Dense, Flatten, Activation, Input
from keras.layers import MaxPooling2D, AveragePooling2D, ZeroPadding2D
from keras.layers.normalization import BatchNormalization

def my_model(num_classes):
  """Build the convolutional classifier for (512, 298, 1) spectrogram inputs.

  Args:
    num_classes: number of units of the final softmax layer ('fc8').

  Returns:
    An uncompiled keras Model mapping the 'input' tensor to class probabilities.
  """

  def conv_unit(tensor, tag, filters, kernel, stride, **bn_options):
    # pad<tag> -> conv<tag> -> batch<tag> -> act<tag>
    tensor = ZeroPadding2D(1, name='pad' + tag)(tensor)
    tensor = Conv2D(filters, kernel, strides=stride, name='conv' + tag)(tensor)
    tensor = BatchNormalization(name='batch' + tag, **bn_options)(tensor)
    return Activation('relu', name='act' + tag)(tensor)

  inputs = Input(shape=(512,298,1), name='input')

  # Five convolutional units; max pooling follows units 1, 2 and 5.
  # Only the first batch-norm layer is created with trainable=False.
  net = conv_unit(inputs, '1', 96, 7, 2, trainable=False)
  net = MaxPooling2D(3, 2, name='mpool1')(net)

  net = conv_unit(net, '2', 256, 5, 2)
  net = MaxPooling2D(3, 2, name='mpool2')(net)

  net = conv_unit(net, '3', 384, 3, 1)
  net = conv_unit(net, '4', 256, 3, 1)
  net = conv_unit(net, '5', 256, 3, 1)
  net = MaxPooling2D(pool_size=(5,3), strides=(3,2), name='mpool5')(net)

  # 'fc6' is a convolution with a (9,1) kernel, followed by average pooling
  # over the full remaining width of axis 2.
  net = Conv2D(4096, (9,1), strides=1, name='fc6')(net)
  net = BatchNormalization(name='batch6')(net)
  net = Activation('relu', name='act6')(net)
  remaining_width = int(net.shape[2])
  net = AveragePooling2D(pool_size=(1,remaining_width), strides=1, name='apool6')(net)

  net = Flatten(name='flat1')(net)

  net = Dense(1024, name='fc7')(net)
  net = BatchNormalization(name='batch7')(net)
  net = Activation('relu', name='act7')(net)

  predictions = Dense(num_classes, activation='softmax', name='fc8')(net)

  return Model(inputs=inputs, outputs=predictions)

In [0]:
# Build the classifier with 1251 output classes.
model = my_model(1251)

# model.summary()

In [0]:
# optimizer = 'sgd'
# Configure training: SGD optimizer, categorical cross-entropy on the one-hot
# labels produced by the generator, accuracy reported as a metric.
model.compile(optimizer = 'sgd',
              loss = 'categorical_crossentropy',
              metrics=['accuracy'])


In [0]:
# validation_data=validation_generator,
# Train from the generator with 6 worker processes. No `epochs` argument is
# passed and no validation data is supplied.
model.fit_generator(generator=training_generator,
                    use_multiprocessing=True,
                    workers=6)

Epoch 1/1

Process ForkPoolWorker-125:
Process ForkPoolWorker-124:
Process ForkPoolWorker-126:
Process ForkPoolWorker-122:
Process ForkPoolWorker-121:
Process ForkPoolWorker-123:
Traceback (most recent call last):
Traceback (most recent call last):
  File "/usr/lib/python3.6/multiprocessing/process.py", line 258, in _bootstrap
    self.run()
  File "/usr/lib/python3.6/multiprocessing/process.py", line 258, in _bootstrap
    self.run()
Traceback (most recent call last):
Traceback (most recent call last):
  File "/usr/lib/python3.6/multiprocessing/process.py", line 93, in run
    self._target(*self._args, **self._kwargs)
Traceback (most recent call last):
  File "/usr/lib/python3.6/multiprocessing/process.py", line 93, in run
    self._target(*self._args, **self._kwargs)
Traceback (most recent call last):
  File "/usr/lib/python3.6/multiprocessing/process.py", line 258, in _bootstrap
    self.run()
  File "/usr/lib/python3.6/multiprocessing/process.py", line 258, in _bootstrap
    self.run()
  File

KeyboardInterrupt: ignored

 815/1452 [===============>..............] - ETA: 31:22 - loss: 6.6341 - acc: 0.0267
 
  467/1452 [========>.....................] - ETA: 29:15 - loss: 6.8480 - acc: 0.0172

#### Pre-Trained Model

In [0]:
# Download the MATLAB network file that is loaded with scipy.io.loadmat below.
! wget http://www.robots.ox.ac.uk/~vgg/data/voxceleb/models/vggvox_ident_net.mat

In [0]:
from scipy.io import loadmat

# Load the .mat file; struct_as_record=False exposes MATLAB struct fields as
# attributes (e.g. `net.layers`) instead of record-array fields.
net = loadmat('vggvox_ident_net.mat',
                matlab_compatible=False,
                struct_as_record=False)
# MATLAB values arrive wrapped in 2-D arrays: unwrap the `net` struct, then
# take the first row of its `layers` field.
net = net['net'][0,0]
layers = net.layers[0]

In [0]:
# Index every layer struct by its name so layers can be looked up by string.
layers_dict = {entry[0,0].name[0]: entry[0,0] for entry in layers}

In [0]:
# layers_name = ['conv1', 'conv2', 'conv3', 'conv4', 'conv5', 'fc6', 'fc7', 'fc8']
# for name in layers_name:
#   weights =[]
#   weights.append(layers_dict[name].weights[0,0])
#   weights.append(np.asarray([b[0] for b in layers_dict[name].weights[0,1]]))
#   model.get_layer(name).set_weights(weights)
  


#### Evaluation

In [0]:
from keras.models import Sequential

# Parameters
# Same settings as for training, except that batches hold a single sample.
params = {'dim': (512,298),
          'batch_size': 1,
          'n_classes': 1251,
          'n_channels': 1,
          'shuffle': True}

# Datasets
# Any phase other than 'train' selects the rows marked 3 in iden_split.txt.
test_set, test_labels = split_data('test') # IDs & Labels

# Generators
# train=False disables the random segment selection in DataGenerator.
test_generator = DataGenerator(test_set, test_labels, train=False, **params)


In [0]:
# Evaluate on the test generator; the two returned values are the loss and the
# 'accuracy' metric configured in model.compile.
loss, accuracy = model.evaluate_generator(test_generator)
print(loss, accuracy)

### Common Voice dataset

#### Download Persian dataset and save to Google Drive

In [0]:
# ! wget https://voice-prod-bundler-ee1969a6ce8178826482b88e843c335139bd3fb4.s3.amazonaws.com/cv-corpus-3/fa.tar.gz
# ! mv fa.tar.gz drive/My\ Drive/datasets

In [0]:
from google.colab import drive
# Mount Google Drive at /content/drive; the next cell copies the dataset archive from it.
drive.mount('/content/drive')

In [0]:
! cp drive/My\ Drive/datasets/fa.tar.gz fa.tar.gz
! mkdir common_voice
! tar -C common_voice -xf fa.tar.gz

In [0]:
import collections

# Read the list of validated clips.
with open('common_voice/validated.tsv', 'r') as val:
  lines = val.readlines()

# Rows are split on whitespace and only the first two fields are used: the
# client id and the clip file name (with 'mp3' replaced by 'wav'). The first
# line is skipped.
clients_id = []
files_name = []
for row in lines[1:]:
  columns = row.split()
  clients_id.append(columns[0])
  files_name.append(columns[1].replace('mp3','wav'))

# Keep only the speakers that have at least `sps` clips.
sps = 40
clips_per_client = collections.Counter(clients_id)
spk_id = [client for client, n_clips in clips_per_client.items() if n_clips >= sps]

In [0]:
# Number of speakers with at least `sps` validated clips.
print(len(spk_id))

196


In [0]:
import os

DATA_PATH = 'common_voice/wav'

# Group the clip file names by speaker, keeping file order. Looking up only a
# speaker's first row and slicing `sps` rows from there would pick up other
# speakers' clips whenever one speaker's rows are not contiguous in the file.
# (Builtin `zip` is avoided on purpose: an earlier cell may have rebound it.)
files_by_speaker = {}
for position, client in enumerate(clients_id):
  files_by_speaker.setdefault(client, []).append(files_name[position])

# For every selected speaker take the first `sps` clips: `spk_index` holds the
# integer label and `file_index` the matching .wav path, entry by entry.
spk_index = []
file_index = []
for i, sid in enumerate(spk_id):
  speaker_files = files_by_speaker[sid][:sps]
  spk_index.extend([i] * len(speaker_files))
  file_index.extend(os.path.join(DATA_PATH, f) for f in speaker_files)
  

In [0]:
# import os
# # mp3 to wav
# mp3_path = 'common_voice/clips'
# wav_path = 'common_voice/wav'
# for wav in file_index:
#   os.system('ffmpeg -i {}.mp3 -ar 16000 {}'.format(os.path.join(mp3_path, os.path.splitext(wav)[0]), os.path.join(wav_path, wav)))

In [0]:
import numpy as np

# Random permutation of all sample positions
indexes = np.arange(len(spk_index))
np.random.shuffle(indexes)

# Split boundaries: first 70 % train, next 10 % validation, last 20 % test
n_items = len(spk_index)
train_end = int(70*n_items/100)
valid_end = int(80*n_items/100)

train_part = indexes[:train_end]
valid_part = indexes[train_end : valid_end]
test_part = indexes[valid_end:]

train_set = [file_index[i] for i in train_part]
train_labels = [spk_index[i] for i in train_part]

valid_set = [file_index[i] for i in valid_part]
valid_labels = [spk_index[i] for i in valid_part]

test_set = [file_index[i] for i in test_part]
test_labels = [spk_index[i] for i in test_part]


In [0]:
# Append the training labels to train_labels.txt, one per line. The file is
# opened once and closed by `with`, instead of opening a new, never-closed
# handle for every label.
with open('train_labels.txt', 'a') as labels_file:
  for f in train_labels:
    print(f, file=labels_file)

In [0]:

# NOTE(review): in the visible notebook `cv_model` is first assigned in a later
# cell, so this cell appears to rely on out-of-order execution -- confirm.
# Parameters
params = {'dim': (512,298),
          'batch_size': 100,
          'n_classes': len(spk_id),
          'n_channels': 1,
          'shuffle': True}

# Generators
# NOTE(review): train=True is passed for the test data, so DataGenerator picks
# random segments during evaluation -- confirm this is intended.
test_generator = DataGenerator(test_set, test_labels, train=True, **params)
# validation_generator = DataGenerator(partition['validation'], labels, **params)

# Report the loss and the accuracy (as a percentage) on the test split.
loss, accuracy = cv_model.evaluate_generator(test_generator)
print(loss)
print("%.2f" % (accuracy * 100), '%')

5.361034297943116
0.20 %


In [0]:
# Build the Common Voice classifier with one output unit per selected speaker.
# NOTE(review): in the visible notebook `my_model_drop` is defined in a later cell.
cv_model = my_model_drop(len(spk_id))

In [0]:
# Layers that receive the weights stored in the loaded .mat network and are
# then excluded from training.
layers_name = ['conv1', 'conv2', 'conv3', 'conv4', 'conv5', 'fc6']
for name in layers_name:
  stored = layers_dict[name].weights
  kernel = stored[0,0]
  # The stored bias has one single-element row per unit; flatten it to 1-D.
  bias = np.asarray([b[0] for b in stored[0,1]])

  target_layer = cv_model.get_layer(name)
  target_layer.set_weights([kernel, bias])
  target_layer.trainable = False

In [0]:
# Parameters
# One output class per selected Common Voice speaker.
params = {'dim': (512,298),
          'batch_size': 100,
          'n_classes': len(spk_id),
          'n_channels': 1,
          'shuffle': True}

# Generators
# Both generators are created with train=True, so the validation data also
# uses randomly selected segments.
train_generator = DataGenerator(train_set, train_labels, train=True, **params)
validation_generator = DataGenerator(valid_set, valid_labels, train=True, **params)

# optimizer = 'sgd' 'rmsprob' 'adam'
# Compile with Adam, categorical cross-entropy and accuracy as the metric.
cv_model.compile(optimizer = 'adam',
              loss = 'categorical_crossentropy',
              metrics=['accuracy'])

In [0]:
# validation_data=validation_generator,
# Train for 30 epochs with 6 worker processes, validating on validation_generator.
cv_model.fit_generator(generator=train_generator,
                       validation_data=validation_generator,
                       epochs = 30,
                       use_multiprocessing=True,
                       workers=6)

In [0]:
from keras import Model
from keras.layers import Conv2D, Dense, Flatten, Activation, Input, Dropout
from keras.layers import MaxPooling2D, AveragePooling2D, ZeroPadding2D
from keras.layers.normalization import BatchNormalization

def my_model_drop(num_classes):
  """Build the convolutional classifier with an extra 512-unit block ('fc71').

  The input is a (512, 298, 1) spectrogram. Despite the function name, no
  Dropout layer is applied: both Dropout lines are commented out.

  Args:
    num_classes: number of units of the final softmax layer ('fc8').

  Returns:
    An uncompiled keras Model mapping the 'input' tensor to class probabilities.
  """
  inputs = Input(shape=(512,298,1), name='input')

  # Layers applied, in order, before the average pooling.
  feature_layers = [
    ZeroPadding2D(1, name='pad1'),
    Conv2D(96, 7, strides=2, name='conv1'),
    BatchNormalization(trainable=False, name='batch1'),
    Activation('relu', name='act1'),
    MaxPooling2D(3, 2, name='mpool1'),

    ZeroPadding2D(1, name='pad2'),
    Conv2D(256, 5, strides=2, name='conv2'),
    BatchNormalization(name='batch2'),
    Activation('relu', name='act2'),
    MaxPooling2D(3, 2, name='mpool2'),

    ZeroPadding2D(1, name='pad3'),
    Conv2D(384, 3, strides=1, name='conv3'),
    BatchNormalization(name='batch3'),
    Activation('relu', name='act3'),

    ZeroPadding2D(1, name='pad4'),
    Conv2D(256, 3, strides=1, name='conv4'),
    BatchNormalization(name='batch4'),
    Activation('relu', name='act4'),

    ZeroPadding2D(1, name='pad5'),
    Conv2D(256, 3, strides=1, name='conv5'),
    BatchNormalization(name='batch5'),
    Activation('relu', name='act5'),
    MaxPooling2D(pool_size=(5,3), strides=(3,2), name='mpool5'),

    Conv2D(4096, (9,1), strides=1, name='fc6'),
    BatchNormalization(name='batch6'),
    Activation('relu', name='act6'),
  ]

  # Layers applied, in order, after the average pooling.
  head_layers = [
    Flatten(name='flat1'),

    Dense(1024, name='fc7'),
    BatchNormalization(name='batch7'),
    Activation('relu', name='act7'),
    # Dropout(0.5) was disabled here

    Dense(512, name='fc71'),
    BatchNormalization(name='batch71'),
    Activation('relu', name='act71'),
    # Dropout(0.5) was disabled here

    Dense(num_classes, activation='softmax', name='fc8'),
  ]

  x = inputs
  for layer in feature_layers:
    x = layer(x)

  # The pooling window spans the whole remaining width of axis 2, which is only
  # known once the feature layers have been applied.
  x = AveragePooling2D(pool_size=(1,int(x.shape[2])), strides=1, name='apool6')(x)

  for layer in head_layers:
    x = layer(x)

  return Model(inputs=inputs, outputs=x)