From 49c12603ca0391d81fddcdacf8586de1bc1c163e Mon Sep 17 00:00:00 2001 From: Philippe Remy Date: Sat, 25 Apr 2020 11:26:33 +0900 Subject: [PATCH 01/13] GRU code. --- README.md | 2 +- batcher.py | 4 +- conv_models.py | 164 +++++++++++++++++------------------------ example.py | 8 +- test.py | 6 +- tests/batcher_test.py | 4 +- tests/batcher_test2.py | 4 +- tests/model.py | 58 +++++++++++++++ train.py | 6 +- 9 files changed, 142 insertions(+), 114 deletions(-) create mode 100644 tests/model.py diff --git a/README.md b/README.md index ddf6133..006d29c 100644 --- a/README.md +++ b/README.md @@ -59,7 +59,7 @@ import random from audio import read_mfcc from batcher import sample_from_mfcc from constants import SAMPLE_RATE, NUM_FRAMES -from conv_models import DeepSpeakerModel +from conv_models import ResCNNModel from test import batch_cosine_similarity # Reproducible results. diff --git a/batcher.py b/batcher.py index b26889c..4c03525 100644 --- a/batcher.py +++ b/batcher.py @@ -11,7 +11,7 @@ from audio import pad_mfcc, Audio from constants import NUM_FRAMES, NUM_FBANKS -from conv_models import DeepSpeakerModel +from conv_models import ResCNNModel, DeepSpeakerModel from utils import ensures_dir, load_pickle, load_npy, train_test_sp_to_utt logger = logging.getLogger(__name__) @@ -495,7 +495,7 @@ def get_speaker_verification_data(self, positive_speaker, num_different_speakers np.random.seed(123) ltb = LazyTripletBatcher(working_dir='/Users/premy/deep-speaker/', max_length=NUM_FRAMES, - model=DeepSpeakerModel()) + model=ResCNNModel()) for i in range(1000): print(i) start = time() diff --git a/conv_models.py b/conv_models.py index 416ad44..4ce03a1 100644 --- a/conv_models.py +++ b/conv_models.py @@ -1,71 +1,60 @@ +import abc import logging -import os -import numpy as np import tensorflow.keras.backend as K from tensorflow.keras import layers from tensorflow.keras import regularizers from tensorflow.keras.layers import BatchNormalization from tensorflow.keras.layers import Conv2D from tensorflow.keras.layers import Dropout +from tensorflow.keras.layers import GRU from tensorflow.keras.layers import Input from tensorflow.keras.layers import Lambda, Dense from tensorflow.keras.layers import Reshape from tensorflow.keras.models import Model -from tensorflow.keras.optimizers import Adam from constants import NUM_FBANKS, NUM_FRAMES -from triplet_loss import deep_speaker_loss logger = logging.getLogger(__name__) class DeepSpeakerModel: - # I thought it was 3 but maybe energy is added at a 4th dimension. - # would be better to have 4 dimensions: - # MFCC, DIFF(MFCC), DIFF(DIFF(MFCC)), ENERGIES (probably tiled across the frequency domain). - # this seems to help match the parameter counts. 
- def __init__(self, batch_input_shape=(None, NUM_FRAMES, NUM_FBANKS, 1), include_softmax=False, - num_speakers_softmax=None): + def __init__(self, + batch_input_shape=(None, NUM_FRAMES, NUM_FBANKS, 1), + include_softmax=False, + num_speakers_softmax=None, + name='ResCNN'): self.include_softmax = include_softmax + self.num_speakers_softmax = num_speakers_softmax if self.include_softmax: - assert num_speakers_softmax > 0 + assert self.num_speakers_softmax > 0 self.clipped_relu_count = 0 + inputs = Input(batch_shape=batch_input_shape, name='input') + x = self.graph_with_avg_softmax_and_ln(inputs) + self.m = Model(inputs, x, name=name) - # http://cs231n.github.io/convolutional-networks/ - # conv weights - # #params = ks * ks * nb_filters * num_channels_input - - # Conv128-s - # 5*5*128*128/2+128 - # ks*ks*nb_filters*channels/strides+bias(=nb_filters) - - # take 100 ms -> 4 frames. - # if signal is 3 seconds, then take 100ms per 100ms and average out this network. - # 8*8 = 64 features. - - # used to share all the layers across the inputs + @abc.abstractmethod + def graph(self, inputs): + pass - # num_frames = K.shape() - do it dynamically after. - inputs = Input(batch_shape=batch_input_shape, name='input') - x = self.cnn_component(inputs) + def graph_with_avg_softmax_and_ln(self, inputs): + x = self.graph(inputs) - x = Reshape((-1, 2048))(x) # Temporal average layer. axis=1 is time. x = Lambda(lambda y: K.mean(y, axis=1), name='average')(x) - if include_softmax: + if self.include_softmax: logger.info('Including a Dropout layer to reduce overfitting.') # used for softmax because the dataset we pre-train on might be too small. easy to overfit. x = Dropout(0.5)(x) x = Dense(512, name='affine')(x) - if include_softmax: + if self.include_softmax: # Those weights are just when we train on softmax. - x = Dense(num_speakers_softmax, activation='softmax')(x) + x = Dense(self.num_speakers_softmax, activation='softmax')(x) else: # Does not contain any weights. 
x = Lambda(lambda y: K.l2_normalize(y, axis=1), name='ln')(x) - self.m = Model(inputs, x, name='ResCNN') + return x def keras_model(self): return self.m @@ -82,6 +71,28 @@ def clipped_relu(self, inputs): self.clipped_relu_count += 1 return relu + def set_weights(self, w): + for layer, layer_w in zip(self.m.layers, w): + layer.set_weights(layer_w) + logger.info(f'Setting weights for [{layer.name}]...') + + +class ResCNNModel(DeepSpeakerModel): + + def __init__(self, + batch_input_shape=(None, NUM_FRAMES, NUM_FBANKS, 1), + include_softmax=False, + num_speakers_softmax=None): + super().__init__(batch_input_shape, include_softmax, num_speakers_softmax, 'ResCNN') + + def graph(self, inputs): + x = self.conv_and_res_block(inputs, 64, stage=1) + x = self.conv_and_res_block(x, 128, stage=2) + x = self.conv_and_res_block(x, 256, stage=3) + x = self.conv_and_res_block(x, 512, stage=4) + x = Reshape((-1, 2048))(x) + return x + def identity_block(self, input_tensor, kernel_size, filters, stage, block): conv_name_base = f'res{stage}_{block}_branch' @@ -128,72 +139,29 @@ def conv_and_res_block(self, inp, filters, stage): o = self.identity_block(o, kernel_size=3, filters=filters, stage=stage, block=i) return o - def cnn_component(self, inp): - x = self.conv_and_res_block(inp, 64, stage=1) - x = self.conv_and_res_block(x, 128, stage=2) - x = self.conv_and_res_block(x, 256, stage=3) - x = self.conv_and_res_block(x, 512, stage=4) - return x - def set_weights(self, w): - for layer, layer_w in zip(self.m.layers, w): - layer.set_weights(layer_w) - logger.info(f'Setting weights for [{layer.name}]...') +class GRUModel(DeepSpeakerModel): + def __init__(self, + batch_input_shape=(None, NUM_FRAMES, NUM_FBANKS, 1), + include_softmax=False, + num_speakers_softmax=None): + super().__init__(batch_input_shape, include_softmax, num_speakers_softmax, 'GRU') -def main(): - # Looks correct to me. - # I have 37K but paper reports 41K. which is not too far. - dsm = DeepSpeakerModel() - dsm.m.summary() - - # I suspect num frames to be 32. - # Then fbank=64, then total would be 32*64 = 2048. - # plot_model(dsm.m, to_file='model.png', dpi=300, show_shapes=True, expand_nested=True) - - -def _train(): - # x = np.random.uniform(size=(6, 32, 64, 4)) # 6 is multiple of 3. - # y_softmax = np.random.uniform(size=(6, 100)) - # dsm = DeepSpeakerModel(batch_input_shape=(None, 32, 64, 4), include_softmax=True, num_speakers_softmax=100) - # dsm.m.compile(optimizer=Adam(lr=0.01), loss='categorical_crossentropy') - # print(dsm.m.predict(x).shape) - # print(dsm.m.evaluate(x, y_softmax)) - # w = dsm.get_weights() - dsm = DeepSpeakerModel(batch_input_shape=(None, 32, 64, 4), include_softmax=False) - # dsm.m.set_weights(w) - dsm.m.compile(optimizer=Adam(lr=0.01), loss=deep_speaker_loss) - - # it works!!!!!!!!!!!!!!!!!!!! - # unit_batch_size = 20 - # anchor = np.ones(shape=(unit_batch_size, 32, 64, 4)) - # positive = np.array(anchor) - # negative = np.ones(shape=(unit_batch_size, 32, 64, 4)) * (-1) - # batch = np.vstack((anchor, positive, negative)) - # x = batch - # y = np.zeros(shape=(len(batch), 512)) # not important. - # print('Starting to fit...') - # while True: - # print(dsm.m.train_on_batch(x, y)) - - # should not work... and it does not work! - unit_batch_size = 20 - negative = np.ones(shape=(unit_batch_size, 32, 64, 4)) * (-1) - batch = np.vstack((negative, negative, negative)) - x = batch - y = np.zeros(shape=(len(batch), 512)) # not important. 
- print('Starting to fit...') - while True: - print(dsm.m.train_on_batch(x, y)) - - -def _test_checkpoint_compatibility(): - dsm = DeepSpeakerModel(batch_input_shape=(None, 32, 64, 4), include_softmax=True, num_speakers_softmax=10) - dsm.m.save_weights('test.h5') - dsm = DeepSpeakerModel(batch_input_shape=(None, 32, 64, 4), include_softmax=False) - dsm.m.load_weights('test.h5', by_name=True) - os.remove('test.h5') - - -if __name__ == '__main__': - _test_checkpoint_compatibility() + def graph(self, inputs): + x = Conv2D(64, kernel_size=5, strides=2, padding='same', kernel_initializer='glorot_uniform', + name='conv1', kernel_regularizer=regularizers.l2(l=0.0001))(inputs) + # shape = (BATCH_SIZE , num_frames/2, 64/2, 64) + x = BatchNormalization(name='bn1')(x) + x = self.clipped_relu(x) + + # 4d -> 3d. + _, frames_dim, fbank_dim, conv_output_dim = K.int_shape(x) + x = Reshape((frames_dim, fbank_dim * conv_output_dim))(x) + x = Reshape((frames_dim, fbank_dim * conv_output_dim))(x) + + # shape = (BATCH_SIZE , num_frames/2, 1024) + x = GRU(1024, name='GRU1', return_sequences=True)(x) + x = GRU(1024, name='GRU2', return_sequences=True)(x) + x = GRU(1024, name='GRU3', return_sequences=True)(x) + return x diff --git a/example.py b/example.py index 593527d..7cfed95 100644 --- a/example.py +++ b/example.py @@ -1,15 +1,17 @@ -import numpy as np import random + +import numpy as np + from audio import read_mfcc from batcher import sample_from_mfcc from constants import SAMPLE_RATE, NUM_FRAMES -from conv_models import DeepSpeakerModel +from conv_models import ResCNNModel from test import batch_cosine_similarity np.random.seed(123) random.seed(123) -model = DeepSpeakerModel() +model = ResCNNModel() model.m.load_weights('/Users/premy/deep-speaker/checkpoints/ResCNN_triplet_training_checkpoint_175.h5', by_name=True) mfcc_001 = sample_from_mfcc(read_mfcc('samples/PhilippeRemy/PhilippeRemy_001.wav', SAMPLE_RATE), NUM_FRAMES) diff --git a/test.py b/test.py index e3583f7..71327d0 100644 --- a/test.py +++ b/test.py @@ -6,7 +6,7 @@ from audio import Audio from batcher import LazyTripletBatcher from constants import NUM_FBANKS, NUM_FRAMES, CHECKPOINTS_TRIPLET_DIR, BATCH_SIZE -from conv_models import DeepSpeakerModel +from conv_models import ResCNNModel from eval_metrics import evaluate from utils import load_best_checkpoint, enable_deterministic @@ -25,7 +25,7 @@ def batch_cosine_similarity(x1, x2): return s -def eval_model(working_dir: str, model: DeepSpeakerModel): +def eval_model(working_dir: str, model: ResCNNModel): enable_deterministic() audio = Audio(working_dir) batcher = LazyTripletBatcher(working_dir, NUM_FRAMES, model) @@ -54,7 +54,7 @@ def eval_model(working_dir: str, model: DeepSpeakerModel): def test(working_dir, checkpoint_file=None): batch_input_shape = [None, NUM_FRAMES, NUM_FBANKS, 1] - dsm = DeepSpeakerModel(batch_input_shape) + dsm = ResCNNModel(batch_input_shape) if checkpoint_file is None: checkpoint_file = load_best_checkpoint(CHECKPOINTS_TRIPLET_DIR) if checkpoint_file is not None: diff --git a/tests/batcher_test.py b/tests/batcher_test.py index aee75db..12ce3b8 100644 --- a/tests/batcher_test.py +++ b/tests/batcher_test.py @@ -5,7 +5,7 @@ import triplet_loss from batcher import KerasFormatConverter, TripletBatcherSelectHardNegatives, TripletBatcher from constants import NUM_FBANKS, NUM_FRAMES, CHECKPOINTS_TRIPLET_DIR, CHECKPOINTS_SOFTMAX_DIR, BATCH_SIZE -from conv_models import DeepSpeakerModel +from conv_models import ResCNNModel from triplet_loss import deep_speaker_loss from utils 
import load_best_checkpoint @@ -57,7 +57,7 @@ def main(): # we select batches this way. batch_input_shape = [None, NUM_FRAMES, NUM_FBANKS, 1] print('Testing with the triplet losses.') - dsm = DeepSpeakerModel(batch_input_shape, include_softmax=False) + dsm = ResCNNModel(batch_input_shape, include_softmax=False) triplet_checkpoint = load_best_checkpoint(CHECKPOINTS_TRIPLET_DIR) pre_training_checkpoint = load_best_checkpoint(CHECKPOINTS_SOFTMAX_DIR) if triplet_checkpoint is not None: diff --git a/tests/batcher_test2.py b/tests/batcher_test2.py index cf1f9b4..848459d 100644 --- a/tests/batcher_test2.py +++ b/tests/batcher_test2.py @@ -2,13 +2,13 @@ from batcher import LazyTripletBatcher from constants import NUM_FBANKS, NUM_FRAMES -from conv_models import DeepSpeakerModel +from conv_models import ResCNNModel from triplet_loss import deep_speaker_loss def main2(): batch_input_shape = [None, NUM_FRAMES, NUM_FBANKS, 1] - dsm = DeepSpeakerModel(batch_input_shape, include_softmax=False) + dsm = ResCNNModel(batch_input_shape, include_softmax=False) dsm.m.compile(optimizer='adam', loss=deep_speaker_loss) dsm.m.load_weights('/Users/premy/deep-speaker/ResCNN_checkpoint_102.h5', by_name=True) dsm.m.summary() diff --git a/tests/model.py b/tests/model.py new file mode 100644 index 0000000..c0d2c14 --- /dev/null +++ b/tests/model.py @@ -0,0 +1,58 @@ +import numpy as np +import tensorflow.keras.backend as K +from tensorflow.keras.optimizers import Adam + +from constants import NUM_FRAMES, NUM_FBANKS +from conv_models import ResCNNModel, GRUModel +from triplet_loss import deep_speaker_loss + + +def _train(): + # x = np.random.uniform(size=(6, 32, 64, 4)) # 6 is multiple of 3. + # y_softmax = np.random.uniform(size=(6, 100)) + # dsm = DeepSpeakerModel(batch_input_shape=(None, 32, 64, 4), include_softmax=True, num_speakers_softmax=100) + # dsm.m.compile(optimizer=Adam(lr=0.01), loss='categorical_crossentropy') + # print(dsm.m.predict(x).shape) + # print(dsm.m.evaluate(x, y_softmax)) + # w = dsm.get_weights() + dsm = ResCNNModel(batch_input_shape=(None, 32, 64, 4), include_softmax=False) + # dsm.m.set_weights(w) + dsm.m.compile(optimizer=Adam(lr=0.01), loss=deep_speaker_loss) + + # it works!!!!!!!!!!!!!!!!!!!! + # unit_batch_size = 20 + # anchor = np.ones(shape=(unit_batch_size, 32, 64, 4)) + # positive = np.array(anchor) + # negative = np.ones(shape=(unit_batch_size, 32, 64, 4)) * (-1) + # batch = np.vstack((anchor, positive, negative)) + # x = batch + # y = np.zeros(shape=(len(batch), 512)) # not important. + # print('Starting to fit...') + # while True: + # print(dsm.m.train_on_batch(x, y)) + + # should not work... and it does not work! + unit_batch_size = 20 + negative = np.ones(shape=(unit_batch_size, 32, 64, 4)) * (-1) + batch = np.vstack((negative, negative, negative)) + x = batch + y = np.zeros(shape=(len(batch), 512)) # not important. + print('Starting to fit...') + while True: + print(dsm.m.train_on_batch(x, y)) + + +def test_gru(): + # Looks correct to me. + # I have 37K but paper reports 41K. which is not too far. + dsm = GRUModel() + dsm.m.summary() + dsm.m.predict(np.random.random(size=(2, NUM_FRAMES, NUM_FBANKS, 1))) + + i = np.random.random(size=(2, NUM_FRAMES, NUM_FBANKS, 1)) + y = K.reshape(i, (2, NUM_FRAMES, NUM_FBANKS * 1)) + z = K.reshape(y, (2, NUM_FRAMES, NUM_FBANKS, 1)) + + # I suspect num frames to be 32. + # Then fbank=64, then total would be 32*64 = 2048. 
+ # plot_model(dsm.m, to_file='model.png', dpi=300, show_shapes=True, expand_nested=True) diff --git a/train.py b/train.py index da9fe90..e1ffa8b 100644 --- a/train.py +++ b/train.py @@ -7,7 +7,7 @@ from batcher import KerasFormatConverter, LazyTripletBatcher from constants import BATCH_SIZE, CHECKPOINTS_SOFTMAX_DIR, CHECKPOINTS_TRIPLET_DIR, NUM_FRAMES, NUM_FBANKS -from conv_models import DeepSpeakerModel +from conv_models import ResCNNModel, DeepSpeakerModel from triplet_loss import deep_speaker_loss from utils import load_best_checkpoint, ensures_dir @@ -80,7 +80,7 @@ def start_training(working_dir, pre_training_phase=True): logger.info('Softmax pre-training.') kc = KerasFormatConverter(working_dir) num_speakers_softmax = len(kc.categorical_speakers.speaker_ids) - dsm = DeepSpeakerModel(batch_input_shape, include_softmax=True, num_speakers_softmax=num_speakers_softmax) + dsm = ResCNNModel(batch_input_shape, include_softmax=True, num_speakers_softmax=num_speakers_softmax) dsm.m.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy']) pre_training_checkpoint = load_best_checkpoint(CHECKPOINTS_SOFTMAX_DIR) if pre_training_checkpoint is not None: @@ -93,7 +93,7 @@ def start_training(working_dir, pre_training_phase=True): fit_model_softmax(dsm, kc.kx_train, kc.ky_train, kc.kx_test, kc.ky_test, initial_epoch=initial_epoch) else: logger.info('Training with the triplet loss.') - dsm = DeepSpeakerModel(batch_input_shape, include_softmax=False) + dsm = ResCNNModel(batch_input_shape, include_softmax=False) triplet_checkpoint = load_best_checkpoint(CHECKPOINTS_TRIPLET_DIR) pre_training_checkpoint = load_best_checkpoint(CHECKPOINTS_SOFTMAX_DIR) if triplet_checkpoint is not None: From e6198fd3f99d9ec04325622f583078fae1c203ab Mon Sep 17 00:00:00 2001 From: Philippe Remy Date: Sat, 25 Apr 2020 11:46:15 +0900 Subject: [PATCH 02/13] GRU code. 
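
The model is now selected by name. A minimal sketch of the Python-side selection (illustrative, not part of the diff below; shapes come from constants.py):

    from constants import NUM_FRAMES, NUM_FBANKS
    from conv_models import select_model_class, GRU_NAME

    # GRU_NAME -> GRUModel, RES_CNN_NAME -> ResCNNModel.
    model_class = select_model_class(GRU_NAME)
    dsm = model_class(batch_input_shape=(None, NUM_FRAMES, NUM_FBANKS, 1), include_softmax=False)
    dsm.m.summary()

On the command line, the same choice is exposed through the new --model_name option on the train-model and test-model commands.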
--- cli.py | 18 ++++++++---------- conv_models.py | 18 +++++++++++++++--- test.py | 17 +++++------------ train.py | 9 +++++---- 4 files changed, 33 insertions(+), 29 deletions(-) diff --git a/cli.py b/cli.py index a44c226..6c287ea 100644 --- a/cli.py +++ b/cli.py @@ -9,6 +9,7 @@ from audio import Audio from batcher import KerasFormatConverter from constants import SAMPLE_RATE, NUM_FRAMES +from conv_models import GRU_NAME, RES_CNN_NAME from test import test from train import start_training from utils import ClickType as Ct, ensures_dir @@ -53,8 +54,9 @@ def build_keras_inputs(working_dir, counts_per_speaker): @cli.command('test-model', short_help='Test a Keras model.') @click.option('--working_dir', required=True, type=Ct.input_dir()) +@click.option('--model_name', required=True, choices=[RES_CNN_NAME, GRU_NAME]) @click.option('--checkpoint_file', required=True, type=Ct.input_file()) -def test_model(working_dir, checkpoint_file=None): +def test_model(working_dir, model_name, checkpoint_file): # export CUDA_VISIBLE_DEVICES=0; python cli.py test-model # --working_dir /home/philippe/ds-test/triplet-training/ # --checkpoint_file ../ds-test/checkpoints-softmax/ResCNN_checkpoint_102.h5 @@ -64,20 +66,16 @@ def test_model(working_dir, checkpoint_file=None): # --working_dir /home/philippe/ds-test/triplet-training/ # --checkpoint_file ../ds-test/checkpoints-triplets/ResCNN_checkpoint_175.h5 # f-measure = 0.849, true positive rate = 0.798, accuracy = 0.997, equal error rate = 0.025 - test(working_dir, checkpoint_file) + test(working_dir, model_name, checkpoint_file) @cli.command('train-model', short_help='Train a Keras model.') @click.option('--working_dir', required=True, type=Ct.input_dir()) +@click.option('--model_name', required=True, choices=[RES_CNN_NAME, GRU_NAME]) @click.option('--pre_training_phase/--no_pre_training_phase', default=False, show_default=True) -def train_model(working_dir, pre_training_phase): +def train_model(working_dir, model_name, pre_training_phase): # PRE TRAINING - - # commit a5030dd7a1b53cd11d5ab7832fa2d43f2093a464 - # Merge: a11d13e b30e64e - # Author: Philippe Remy - # Date: Fri Apr 10 10:37:59 2020 +0900 - # LibriSpeech train-clean-data360 (600, 100). 0.985 on test set (enough for pre-training). + # LibriSpeech train-clean-data360 (600, 100). 0.991 on test set (enough for pre-training). # TRIPLET TRAINING # [...] 
@@ -89,7 +87,7 @@ def train_model(working_dir, pre_training_phase): # 2000/2000 [==============================] - 927s 464ms/step - loss: 0.0075 - val_loss: 0.0059 # Epoch 178/1000 # 2000/2000 [==============================] - 948s 474ms/step - loss: 0.0073 - val_loss: 0.0058 - start_training(working_dir, pre_training_phase) + start_training(working_dir, model_name, pre_training_phase) if __name__ == '__main__': diff --git a/conv_models.py b/conv_models.py index 4ce03a1..ab6ca27 100644 --- a/conv_models.py +++ b/conv_models.py @@ -17,6 +17,18 @@ logger = logging.getLogger(__name__) +RES_CNN_NAME = 'ResCNN' +GRU_NAME = 'GRU' + + +def select_model_class(name: str): + if name == RES_CNN_NAME: + return ResCNNModel + elif name == GRU_NAME: + return GRUModel + else: + raise Exception(f'Unknown model name: {name}.') + class DeepSpeakerModel: @@ -24,7 +36,7 @@ def __init__(self, batch_input_shape=(None, NUM_FRAMES, NUM_FBANKS, 1), include_softmax=False, num_speakers_softmax=None, - name='ResCNN'): + name=RES_CNN_NAME): self.include_softmax = include_softmax self.num_speakers_softmax = num_speakers_softmax if self.include_softmax: @@ -83,7 +95,7 @@ def __init__(self, batch_input_shape=(None, NUM_FRAMES, NUM_FBANKS, 1), include_softmax=False, num_speakers_softmax=None): - super().__init__(batch_input_shape, include_softmax, num_speakers_softmax, 'ResCNN') + super().__init__(batch_input_shape, include_softmax, num_speakers_softmax, RES_CNN_NAME) def graph(self, inputs): x = self.conv_and_res_block(inputs, 64, stage=1) @@ -146,7 +158,7 @@ def __init__(self, batch_input_shape=(None, NUM_FRAMES, NUM_FBANKS, 1), include_softmax=False, num_speakers_softmax=None): - super().__init__(batch_input_shape, include_softmax, num_speakers_softmax, 'GRU') + super().__init__(batch_input_shape, include_softmax, num_speakers_softmax, GRU_NAME) def graph(self, inputs): x = Conv2D(64, kernel_size=5, strides=2, padding='same', kernel_initializer='glorot_uniform', diff --git a/test.py b/test.py index 71327d0..cdbd3c6 100644 --- a/test.py +++ b/test.py @@ -5,10 +5,10 @@ from audio import Audio from batcher import LazyTripletBatcher -from constants import NUM_FBANKS, NUM_FRAMES, CHECKPOINTS_TRIPLET_DIR, BATCH_SIZE -from conv_models import ResCNNModel +from constants import NUM_FBANKS, NUM_FRAMES, BATCH_SIZE +from conv_models import ResCNNModel, select_model_class from eval_metrics import evaluate -from utils import load_best_checkpoint, enable_deterministic +from utils import enable_deterministic logger = logging.getLogger(__name__) @@ -41,22 +41,15 @@ def eval_model(working_dir: str, model: ResCNNModel): anchor_embedding = predictions[0] for j, other_than_anchor_embedding in enumerate(predictions[1:]): # positive + negatives y_pred[i][j] = batch_cosine_similarity([anchor_embedding], [other_than_anchor_embedding])[0] - # y_pred[i] = softmax(y_pred[i]) - # could apply softmax here. y_true = np.zeros_like(y_pred) # positive is at index 0. 
y_true[:, 0] = 1.0 - print(np.matrix(y_true)) - print(np.matrix(y_pred)) - print(np.min(y_pred), np.max(y_pred)) fm, tpr, acc, eer = evaluate(y_pred, y_true) return fm, tpr, acc, eer -def test(working_dir, checkpoint_file=None): +def test(working_dir, model_name, checkpoint_file): batch_input_shape = [None, NUM_FRAMES, NUM_FBANKS, 1] - dsm = ResCNNModel(batch_input_shape) - if checkpoint_file is None: - checkpoint_file = load_best_checkpoint(CHECKPOINTS_TRIPLET_DIR) + dsm = select_model_class(model_name)(batch_input_shape) if checkpoint_file is not None: logger.info(f'Found checkpoint [{checkpoint_file}]. Loading weights...') dsm.m.load_weights(checkpoint_file, by_name=True) diff --git a/train.py b/train.py index e1ffa8b..161fa42 100644 --- a/train.py +++ b/train.py @@ -7,7 +7,7 @@ from batcher import KerasFormatConverter, LazyTripletBatcher from constants import BATCH_SIZE, CHECKPOINTS_SOFTMAX_DIR, CHECKPOINTS_TRIPLET_DIR, NUM_FRAMES, NUM_FBANKS -from conv_models import ResCNNModel, DeepSpeakerModel +from conv_models import ResCNNModel, DeepSpeakerModel, select_model_class from triplet_loss import deep_speaker_loss from utils import load_best_checkpoint, ensures_dir @@ -72,15 +72,16 @@ def fit_model_softmax(dsm: DeepSpeakerModel, kx_train, ky_train, kx_test, ky_tes callbacks=[early_stopping, reduce_lr, checkpoint]) -def start_training(working_dir, pre_training_phase=True): +def start_training(working_dir, model_name, pre_training_phase=True): ensures_dir(CHECKPOINTS_SOFTMAX_DIR) ensures_dir(CHECKPOINTS_TRIPLET_DIR) batch_input_shape = [None, NUM_FRAMES, NUM_FBANKS, 1] + model_class = select_model_class(model_name) if pre_training_phase: logger.info('Softmax pre-training.') kc = KerasFormatConverter(working_dir) num_speakers_softmax = len(kc.categorical_speakers.speaker_ids) - dsm = ResCNNModel(batch_input_shape, include_softmax=True, num_speakers_softmax=num_speakers_softmax) + dsm = model_class(batch_input_shape, include_softmax=True, num_speakers_softmax=num_speakers_softmax) dsm.m.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy']) pre_training_checkpoint = load_best_checkpoint(CHECKPOINTS_SOFTMAX_DIR) if pre_training_checkpoint is not None: @@ -93,7 +94,7 @@ def start_training(working_dir, pre_training_phase=True): fit_model_softmax(dsm, kc.kx_train, kc.ky_train, kc.kx_test, kc.ky_test, initial_epoch=initial_epoch) else: logger.info('Training with the triplet loss.') - dsm = ResCNNModel(batch_input_shape, include_softmax=False) + dsm = model_class(batch_input_shape, include_softmax=False) triplet_checkpoint = load_best_checkpoint(CHECKPOINTS_TRIPLET_DIR) pre_training_checkpoint = load_best_checkpoint(CHECKPOINTS_SOFTMAX_DIR) if triplet_checkpoint is not None: From 641bfa0c161c5228ad4156c0f1e284659cfda199 Mon Sep 17 00:00:00 2001 From: Philippe Remy Date: Sat, 25 Apr 2020 11:52:42 +0900 Subject: [PATCH 03/13] GRU code. 
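
This drops the ad-hoc __main__ benchmark from batcher.py. If needed, the same timing check can still be run as a standalone snippet; a sketch (the working_dir path is illustrative):

    from time import time

    import numpy as np

    from batcher import LazyTripletBatcher
    from constants import NUM_FRAMES
    from conv_models import ResCNNModel

    np.random.seed(123)
    ltb = LazyTripletBatcher(working_dir='/path/to/deep-speaker/', max_length=NUM_FRAMES, model=ResCNNModel())
    for i in range(1000):
        start = time()
        ltb.get_batch_train(batch_size=9)  # time one training batch.
        print(i, time() - start)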
--- batcher.py | 15 +-------------- cli.py | 2 +- 2 files changed, 2 insertions(+), 15 deletions(-) diff --git a/batcher.py b/batcher.py index 4c03525..add5b36 100644 --- a/batcher.py +++ b/batcher.py @@ -11,7 +11,7 @@ from audio import pad_mfcc, Audio from constants import NUM_FRAMES, NUM_FBANKS -from conv_models import ResCNNModel, DeepSpeakerModel +from conv_models import DeepSpeakerModel from utils import ensures_dir, load_pickle, load_npy, train_test_sp_to_utt logger = logging.getLogger(__name__) @@ -489,16 +489,3 @@ def get_speaker_verification_data(self, positive_speaker, num_different_speakers data = [anchor, positive] data.extend([self._select_speaker_data(n) for n in negative_speakers]) return np.vstack(data) - - -if __name__ == '__main__': - np.random.seed(123) - ltb = LazyTripletBatcher(working_dir='/Users/premy/deep-speaker/', - max_length=NUM_FRAMES, - model=ResCNNModel()) - for i in range(1000): - print(i) - start = time() - ltb.get_batch_train(batch_size=9) - print(time() - start) - # ltb.get_batch(batch_size=96) diff --git a/cli.py b/cli.py index 6c287ea..121acde 100644 --- a/cli.py +++ b/cli.py @@ -17,7 +17,7 @@ logger = logging.getLogger(__name__) -VERSION = '3.0a' +VERSION = '3.0b' @click.group() From a69363f686f8e8571016cd65e6ce6244296c8710 Mon Sep 17 00:00:00 2001 From: Philippe Remy Date: Sat, 25 Apr 2020 12:31:14 +0900 Subject: [PATCH 04/13] GRU code. --- cli.py | 4 ++-- conv_models.py | 1 - triplet_loss.py | 1 - 3 files changed, 2 insertions(+), 4 deletions(-) diff --git a/cli.py b/cli.py index 121acde..7d997b2 100644 --- a/cli.py +++ b/cli.py @@ -54,7 +54,7 @@ def build_keras_inputs(working_dir, counts_per_speaker): @cli.command('test-model', short_help='Test a Keras model.') @click.option('--working_dir', required=True, type=Ct.input_dir()) -@click.option('--model_name', required=True, choices=[RES_CNN_NAME, GRU_NAME]) +@click.option('--model_name', required=True, type=click.Choice([RES_CNN_NAME, GRU_NAME])) @click.option('--checkpoint_file', required=True, type=Ct.input_file()) def test_model(working_dir, model_name, checkpoint_file): # export CUDA_VISIBLE_DEVICES=0; python cli.py test-model @@ -71,7 +71,7 @@ def test_model(working_dir, model_name, checkpoint_file): @cli.command('train-model', short_help='Train a Keras model.') @click.option('--working_dir', required=True, type=Ct.input_dir()) -@click.option('--model_name', required=True, choices=[RES_CNN_NAME, GRU_NAME]) +@click.option('--model_name', required=True, type=click.Choice([RES_CNN_NAME, GRU_NAME])) @click.option('--pre_training_phase/--no_pre_training_phase', default=False, show_default=True) def train_model(working_dir, model_name, pre_training_phase): # PRE TRAINING diff --git a/conv_models.py b/conv_models.py index ab6ca27..a520c51 100644 --- a/conv_models.py +++ b/conv_models.py @@ -52,7 +52,6 @@ def graph(self, inputs): def graph_with_avg_softmax_and_ln(self, inputs): x = self.graph(inputs) - # Temporal average layer. axis=1 is time. x = Lambda(lambda y: K.mean(y, axis=1), name='average')(x) if self.include_softmax: diff --git a/triplet_loss.py b/triplet_loss.py index 784b404..4179560 100644 --- a/triplet_loss.py +++ b/triplet_loss.py @@ -1,6 +1,5 @@ import keras.backend as K -# ALPHA = 0.2 # used in FaceNet https://arxiv.org/pdf/1503.03832.pdf ALPHA = 0.1 # used in Deep Speaker. 
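
For context, ALPHA is the margin of the cosine-similarity triplet loss from the Deep Speaker paper. A rough sketch of that formulation (a simplified view, not the exact code in triplet_loss.py; embeddings are L2-normalized by the 'ln' layer, so a dot product is the cosine similarity):

    import keras.backend as K

    def cosine_triplet_loss_sketch(anchor, positive, negative, alpha=0.1):
        sap = K.sum(anchor * positive, axis=1)  # cos(anchor, positive)
        san = K.sum(anchor * negative, axis=1)  # cos(anchor, negative)
        # Hinge on the margin: positives should score at least `alpha` higher than negatives.
        return K.mean(K.maximum(san - sap + alpha, 0.0))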
From e9b42f8dd2aaaf89f9601a0da37e366b66e9a34d Mon Sep 17 00:00:00 2001 From: Philippe Remy Date: Sun, 26 Apr 2020 09:59:28 +0900 Subject: [PATCH 05/13] GRU can train to around 0.935 without much effort now --- conv_models.py | 3 ++- train.py | 11 +++++++---- 2 files changed, 9 insertions(+), 5 deletions(-) diff --git a/conv_models.py b/conv_models.py index a520c51..3c3951b 100644 --- a/conv_models.py +++ b/conv_models.py @@ -57,6 +57,7 @@ def graph_with_avg_softmax_and_ln(self, inputs): if self.include_softmax: logger.info('Including a Dropout layer to reduce overfitting.') # used for softmax because the dataset we pre-train on might be too small. easy to overfit. + # x = Dropout(0.25)(x) # was for GRU. Does 0.5 work with GRU as well? x = Dropout(0.5)(x) x = Dense(512, name='affine')(x) if self.include_softmax: @@ -163,7 +164,7 @@ def graph(self, inputs): x = Conv2D(64, kernel_size=5, strides=2, padding='same', kernel_initializer='glorot_uniform', name='conv1', kernel_regularizer=regularizers.l2(l=0.0001))(inputs) # shape = (BATCH_SIZE , num_frames/2, 64/2, 64) - x = BatchNormalization(name='bn1')(x) + x = BatchNormalization(name='bn1')(x) # does it work with BN? x = self.clipped_relu(x) # 4d -> 3d. diff --git a/train.py b/train.py index 161fa42..81871a8 100644 --- a/train.py +++ b/train.py @@ -2,12 +2,12 @@ import os from tensorflow.keras.callbacks import ReduceLROnPlateau, EarlyStopping, ModelCheckpoint -from tensorflow.keras.optimizers import SGD +from tensorflow.keras.optimizers import SGD, Adam from tqdm import tqdm from batcher import KerasFormatConverter, LazyTripletBatcher from constants import BATCH_SIZE, CHECKPOINTS_SOFTMAX_DIR, CHECKPOINTS_TRIPLET_DIR, NUM_FRAMES, NUM_FBANKS -from conv_models import ResCNNModel, DeepSpeakerModel, select_model_class +from conv_models import ResCNNModel, DeepSpeakerModel, select_model_class, RES_CNN_NAME from triplet_loss import deep_speaker_loss from utils import load_best_checkpoint, ensures_dir @@ -82,13 +82,16 @@ def start_training(working_dir, model_name, pre_training_phase=True): kc = KerasFormatConverter(working_dir) num_speakers_softmax = len(kc.categorical_speakers.speaker_ids) dsm = model_class(batch_input_shape, include_softmax=True, num_speakers_softmax=num_speakers_softmax) - dsm.m.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy']) + # ResCNN can train with default Adam LR of 0.001. GRU is more sensitive. + lr = 0.001 if model_name == RES_CNN_NAME else 0.0003 + logger.info(f'Initial learning rate set to {lr}.') + dsm.m.compile(optimizer=Adam(learning_rate=lr), loss='sparse_categorical_crossentropy', metrics=['accuracy']) pre_training_checkpoint = load_best_checkpoint(CHECKPOINTS_SOFTMAX_DIR) if pre_training_checkpoint is not None: initial_epoch = int(pre_training_checkpoint.split('/')[-1].split('.')[0].split('_')[-1]) logger.info(f'Initial epoch is {initial_epoch}.') logger.info(f'Loading softmax checkpoint: {pre_training_checkpoint}.') - dsm.m.load_weights(pre_training_checkpoint) # latest one. + dsm.m.load_weights(pre_training_checkpoint, by_name=True) # latest one. 
else: initial_epoch = 0 fit_model_softmax(dsm, kc.kx_train, kc.ky_train, kc.kx_test, kc.ky_test, initial_epoch=initial_epoch) From 570c0593a97ac2d9e61da1f001cd9ad0af6648bc Mon Sep 17 00:00:00 2001 From: Philippe Remy Date: Sun, 26 Apr 2020 10:00:40 +0900 Subject: [PATCH 06/13] add model name as arg --- deep-speaker | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/deep-speaker b/deep-speaker index 07a3046..f8cfc81 100755 --- a/deep-speaker +++ b/deep-speaker @@ -38,13 +38,13 @@ build_model_inputs) train_softmax) # Pre-training (0.92k speakers). echo "[train_softmax] selected." - python cli.py train-model --working_dir "${PRE_TRAINING_WORKING_DIR}" --pre_training_phase + python cli.py train-model --model_name "$2" --working_dir "${PRE_TRAINING_WORKING_DIR}" --pre_training_phase ;; train_triplet) # Triplet-training (2.48k speakers). echo "[train_triplet] selected." - python cli.py train-model --working_dir "${TRIPLET_TRAINING_WORKING_DIR}" + python cli.py train-model --model_name "$2" --working_dir "${TRIPLET_TRAINING_WORKING_DIR}" ;; *) From 23f3c764100bdc7e50313a9b384e275d375b19b5 Mon Sep 17 00:00:00 2001 From: Philippe Remy Date: Sun, 26 Apr 2020 15:03:57 +0900 Subject: [PATCH 07/13] LR lowered to 0.0001 --- train.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/train.py b/train.py index 81871a8..dc5f17b 100644 --- a/train.py +++ b/train.py @@ -83,7 +83,7 @@ def start_training(working_dir, model_name, pre_training_phase=True): num_speakers_softmax = len(kc.categorical_speakers.speaker_ids) dsm = model_class(batch_input_shape, include_softmax=True, num_speakers_softmax=num_speakers_softmax) # ResCNN can train with default Adam LR of 0.001. GRU is more sensitive. - lr = 0.001 if model_name == RES_CNN_NAME else 0.0003 + lr = 0.001 if model_name == RES_CNN_NAME else 0.0001 logger.info(f'Initial learning rate set to {lr}.') dsm.m.compile(optimizer=Adam(learning_rate=lr), loss='sparse_categorical_crossentropy', metrics=['accuracy']) pre_training_checkpoint = load_best_checkpoint(CHECKPOINTS_SOFTMAX_DIR) From 3651192b314cfabbf28e4a0561e665bbea014662 Mon Sep 17 00:00:00 2001 From: Philippe Remy Date: Sun, 26 Apr 2020 19:24:12 +0900 Subject: [PATCH 08/13] LR lowered to 0.00005. has to be really low --- train.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/train.py b/train.py index dc5f17b..674b7c8 100644 --- a/train.py +++ b/train.py @@ -83,7 +83,7 @@ def start_training(working_dir, model_name, pre_training_phase=True): num_speakers_softmax = len(kc.categorical_speakers.speaker_ids) dsm = model_class(batch_input_shape, include_softmax=True, num_speakers_softmax=num_speakers_softmax) # ResCNN can train with default Adam LR of 0.001. GRU is more sensitive. - lr = 0.001 if model_name == RES_CNN_NAME else 0.0001 + lr = 0.001 if model_name == RES_CNN_NAME else 0.00005 logger.info(f'Initial learning rate set to {lr}.') dsm.m.compile(optimizer=Adam(learning_rate=lr), loss='sparse_categorical_crossentropy', metrics=['accuracy']) pre_training_checkpoint = load_best_checkpoint(CHECKPOINTS_SOFTMAX_DIR) From 152f19d83ec3ce4f0216e143f71952a2e51e004d Mon Sep 17 00:00:00 2001 From: Philippe Remy Date: Sun, 26 Apr 2020 19:26:33 +0900 Subject: [PATCH 09/13] conv_models.py to models.py. 
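
Pure rename: conv_models.py becomes models.py and all imports are updated accordingly. A short inference sketch with the new import path (the checkpoint path is illustrative):

    import numpy as np

    from audio import read_mfcc
    from batcher import sample_from_mfcc
    from constants import SAMPLE_RATE, NUM_FRAMES
    from models import GRUModel

    model = GRUModel()
    model.m.load_weights('checkpoints/GRU_checkpoint_100.h5', by_name=True)  # illustrative path.
    mfcc = sample_from_mfcc(read_mfcc('samples/PhilippeRemy/PhilippeRemy_001.wav', SAMPLE_RATE), NUM_FRAMES)
    embedding = model.m.predict(np.expand_dims(mfcc, axis=0))  # L2-normalized, shape (1, 512).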
--- README.md | 2 +- batcher.py | 2 +- cli.py | 2 +- example.py | 2 +- conv_models.py => models.py | 0 test.py | 2 +- tests/batcher_test.py | 2 +- tests/batcher_test2.py | 2 +- tests/model.py | 2 +- train.py | 2 +- 10 files changed, 9 insertions(+), 9 deletions(-) rename conv_models.py => models.py (100%) diff --git a/README.md b/README.md index 006d29c..64ebab8 100644 --- a/README.md +++ b/README.md @@ -59,7 +59,7 @@ import random from audio import read_mfcc from batcher import sample_from_mfcc from constants import SAMPLE_RATE, NUM_FRAMES -from conv_models import ResCNNModel +from models import ResCNNModel from test import batch_cosine_similarity # Reproducible results. diff --git a/batcher.py b/batcher.py index add5b36..78fe129 100644 --- a/batcher.py +++ b/batcher.py @@ -11,7 +11,7 @@ from audio import pad_mfcc, Audio from constants import NUM_FRAMES, NUM_FBANKS -from conv_models import DeepSpeakerModel +from models import DeepSpeakerModel from utils import ensures_dir, load_pickle, load_npy, train_test_sp_to_utt logger = logging.getLogger(__name__) diff --git a/cli.py b/cli.py index 7d997b2..e0a1a4b 100644 --- a/cli.py +++ b/cli.py @@ -9,7 +9,7 @@ from audio import Audio from batcher import KerasFormatConverter from constants import SAMPLE_RATE, NUM_FRAMES -from conv_models import GRU_NAME, RES_CNN_NAME +from models import GRU_NAME, RES_CNN_NAME from test import test from train import start_training from utils import ClickType as Ct, ensures_dir diff --git a/example.py b/example.py index 7cfed95..41391db 100644 --- a/example.py +++ b/example.py @@ -5,7 +5,7 @@ from audio import read_mfcc from batcher import sample_from_mfcc from constants import SAMPLE_RATE, NUM_FRAMES -from conv_models import ResCNNModel +from models import ResCNNModel from test import batch_cosine_similarity np.random.seed(123) diff --git a/conv_models.py b/models.py similarity index 100% rename from conv_models.py rename to models.py diff --git a/test.py b/test.py index cdbd3c6..3e01af0 100644 --- a/test.py +++ b/test.py @@ -6,7 +6,7 @@ from audio import Audio from batcher import LazyTripletBatcher from constants import NUM_FBANKS, NUM_FRAMES, BATCH_SIZE -from conv_models import ResCNNModel, select_model_class +from models import ResCNNModel, select_model_class from eval_metrics import evaluate from utils import enable_deterministic diff --git a/tests/batcher_test.py b/tests/batcher_test.py index 12ce3b8..2f50e01 100644 --- a/tests/batcher_test.py +++ b/tests/batcher_test.py @@ -5,7 +5,7 @@ import triplet_loss from batcher import KerasFormatConverter, TripletBatcherSelectHardNegatives, TripletBatcher from constants import NUM_FBANKS, NUM_FRAMES, CHECKPOINTS_TRIPLET_DIR, CHECKPOINTS_SOFTMAX_DIR, BATCH_SIZE -from conv_models import ResCNNModel +from models import ResCNNModel from triplet_loss import deep_speaker_loss from utils import load_best_checkpoint diff --git a/tests/batcher_test2.py b/tests/batcher_test2.py index 848459d..66810e8 100644 --- a/tests/batcher_test2.py +++ b/tests/batcher_test2.py @@ -2,7 +2,7 @@ from batcher import LazyTripletBatcher from constants import NUM_FBANKS, NUM_FRAMES -from conv_models import ResCNNModel +from models import ResCNNModel from triplet_loss import deep_speaker_loss diff --git a/tests/model.py b/tests/model.py index c0d2c14..056e839 100644 --- a/tests/model.py +++ b/tests/model.py @@ -3,7 +3,7 @@ from tensorflow.keras.optimizers import Adam from constants import NUM_FRAMES, NUM_FBANKS -from conv_models import ResCNNModel, GRUModel +from models import ResCNNModel, GRUModel 
from triplet_loss import deep_speaker_loss diff --git a/train.py b/train.py index 81871a8..6017208 100644 --- a/train.py +++ b/train.py @@ -7,7 +7,7 @@ from batcher import KerasFormatConverter, LazyTripletBatcher from constants import BATCH_SIZE, CHECKPOINTS_SOFTMAX_DIR, CHECKPOINTS_TRIPLET_DIR, NUM_FRAMES, NUM_FBANKS -from conv_models import ResCNNModel, DeepSpeakerModel, select_model_class, RES_CNN_NAME +from models import ResCNNModel, DeepSpeakerModel, select_model_class, RES_CNN_NAME from triplet_loss import deep_speaker_loss from utils import load_best_checkpoint, ensures_dir From 8806bbb53f21954481f9eb832f880407ea28829d Mon Sep 17 00:00:00 2001 From: Philippe Remy Date: Wed, 29 Apr 2020 10:15:42 +0900 Subject: [PATCH 10/13] more precision --- test.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/test.py b/test.py index 3e01af0..c1627d3 100644 --- a/test.py +++ b/test.py @@ -58,5 +58,5 @@ def test(working_dir, model_name, checkpoint_file): exit(1) fm, tpr, acc, eer = eval_model(working_dir, model=dsm) - logger.info(f'f-measure = {fm:.3f}, true positive rate = {tpr:.3f}, ' - f'accuracy = {acc:.3f}, equal error rate = {eer:.3f}') + logger.info(f'f-measure = {fm:.5f}, true positive rate = {tpr:.5f}, ' + f'accuracy = {acc:.5f}, equal error rate = {eer:.5f}') From 2223f93c4551ffdac61878bfaca9418fa6334193 Mon Sep 17 00:00:00 2001 From: Philippe Remy Date: Wed, 29 Apr 2020 10:16:02 +0900 Subject: [PATCH 11/13] havent tested those but looks promising. we need to break this GRU1024x3 --- models.py | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/models.py b/models.py index 3c3951b..03103cf 100644 --- a/models.py +++ b/models.py @@ -174,6 +174,11 @@ def graph(self, inputs): # shape = (BATCH_SIZE , num_frames/2, 1024) x = GRU(1024, name='GRU1', return_sequences=True)(x) + if self.include_softmax: + x = Dropout(0.2)(x) x = GRU(1024, name='GRU2', return_sequences=True)(x) + if self.include_softmax: + x = Dropout(0.2)(x) x = GRU(1024, name='GRU3', return_sequences=True)(x) return x + From 571ffbfa1f12574c1c029fe4a72e9253a159bc04 Mon Sep 17 00:00:00 2001 From: Philippe Remy Date: Wed, 29 Apr 2020 10:16:24 +0900 Subject: [PATCH 12/13] just to make sure we dont see dropout during triplet training --- train.py | 1 + 1 file changed, 1 insertion(+) diff --git a/train.py b/train.py index c9f2695..7db5b8b 100644 --- a/train.py +++ b/train.py @@ -110,4 +110,5 @@ def start_training(working_dir, model_name, pre_training_phase=True): # some of the layers have changed. 
dsm.m.load_weights(pre_training_checkpoint, by_name=True) dsm.m.compile(optimizer=SGD(), loss=deep_speaker_loss) + dsm.m.summary() fit_model(dsm, working_dir, NUM_FRAMES) From 36eb0ef2d8a362dae30995aa394531929a3ca35f Mon Sep 17 00:00:00 2001 From: Philippe Remy Date: Thu, 30 Apr 2020 11:18:15 +0900 Subject: [PATCH 13/13] better training --- train.py | 20 ++++++++++++++------ 1 file changed, 14 insertions(+), 6 deletions(-) diff --git a/train.py b/train.py index 7db5b8b..209ec4f 100644 --- a/train.py +++ b/train.py @@ -7,7 +7,7 @@ from batcher import KerasFormatConverter, LazyTripletBatcher from constants import BATCH_SIZE, CHECKPOINTS_SOFTMAX_DIR, CHECKPOINTS_TRIPLET_DIR, NUM_FRAMES, NUM_FBANKS -from models import ResCNNModel, DeepSpeakerModel, select_model_class, RES_CNN_NAME +from models import DeepSpeakerModel, select_model_class, RES_CNN_NAME from triplet_loss import deep_speaker_loss from utils import load_best_checkpoint, ensures_dir @@ -17,7 +17,8 @@ os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3' -def fit_model(dsm: DeepSpeakerModel, working_dir: str, max_length: int = NUM_FRAMES, batch_size=BATCH_SIZE): +def fit_model(dsm: DeepSpeakerModel, working_dir: str, max_length: int = NUM_FRAMES, + batch_size: int = BATCH_SIZE, initial_epoch: int = 0): batcher = LazyTripletBatcher(working_dir, max_length, dsm) # build small test set. @@ -37,9 +38,10 @@ def train_generator(): checkpoint_name = dsm.m.name + '_checkpoint' checkpoint_filename = os.path.join(CHECKPOINTS_TRIPLET_DIR, checkpoint_name + '_{epoch}.h5') checkpoint = ModelCheckpoint(monitor='val_loss', filepath=checkpoint_filename, save_best_only=True) + reduce_lr = ReduceLROnPlateau(monitor='val_loss', factor=0.5, patience=10, min_lr=1e-4, verbose=1) dsm.m.fit(x=train_generator(), y=None, steps_per_epoch=2000, shuffle=False, epochs=1000, validation_data=test_generator(), validation_steps=len(test_batches), - callbacks=[checkpoint]) + callbacks=[reduce_lr, checkpoint], initial_epoch=initial_epoch) def fit_model_softmax(dsm: DeepSpeakerModel, kx_train, ky_train, kx_test, ky_test, @@ -83,7 +85,7 @@ def start_training(working_dir, model_name, pre_training_phase=True): num_speakers_softmax = len(kc.categorical_speakers.speaker_ids) dsm = model_class(batch_input_shape, include_softmax=True, num_speakers_softmax=num_speakers_softmax) # ResCNN can train with default Adam LR of 0.001. GRU is more sensitive. - lr = 0.001 if model_name == RES_CNN_NAME else 0.00005 + lr = 0.001 if model_name == RES_CNN_NAME else 0.00003 logger.info(f'Initial learning rate set to {lr}.') dsm.m.compile(optimizer=Adam(learning_rate=lr), loss='sparse_categorical_crossentropy', metrics=['accuracy']) pre_training_checkpoint = load_best_checkpoint(CHECKPOINTS_SOFTMAX_DIR) @@ -102,13 +104,19 @@ def start_training(working_dir, model_name, pre_training_phase=True): pre_training_checkpoint = load_best_checkpoint(CHECKPOINTS_SOFTMAX_DIR) if triplet_checkpoint is not None: logger.info(f'Loading triplet checkpoint: {triplet_checkpoint}.') + initial_epoch = int(triplet_checkpoint.split('/')[-1].split('.')[0].split('_')[-1]) dsm.m.load_weights(triplet_checkpoint) elif pre_training_checkpoint is not None: logger.info(f'Loading pre-training checkpoint: {pre_training_checkpoint}.') # If `by_name` is True, weights are loaded into layers only if they share the # same name. This is useful for fine-tuning or transfer-learning models where # some of the layers have changed. 
+ initial_epoch = 0 dsm.m.load_weights(pre_training_checkpoint, by_name=True) - dsm.m.compile(optimizer=SGD(), loss=deep_speaker_loss) + else: + initial_epoch = 0 + # dsm.m.compile(optimizer=SGD(learning_rate=0.05, momentum=0.99), loss=deep_speaker_loss) dsm.m.summary() - fit_model(dsm, working_dir, NUM_FRAMES) + dsm.m.compile(optimizer=SGD(), loss=deep_speaker_loss) + fit_model(dsm, working_dir, max_length=NUM_FRAMES, initial_epoch=initial_epoch) +
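
With this change, triplet training resumes from the epoch encoded in the checkpoint filename, and ReduceLROnPlateau halves the learning rate (factor 0.5, patience 10) when val_loss plateaus. A tiny sketch of the filename parsing used in start_training (the path is illustrative):

    triplet_checkpoint = 'checkpoints-triplets/ResCNN_checkpoint_175.h5'
    initial_epoch = int(triplet_checkpoint.split('/')[-1].split('.')[0].split('_')[-1])
    assert initial_epoch == 175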