From 49c12603ca0391d81fddcdacf8586de1bc1c163e Mon Sep 17 00:00:00 2001 From: Philippe Remy Date: Sat, 25 Apr 2020 11:26:33 +0900 Subject: [PATCH 01/13] GRU code. --- README.md | 2 +- batcher.py | 4 +- conv_models.py | 164 +++++++++++++++++------------------------ example.py | 8 +- test.py | 6 +- tests/batcher_test.py | 4 +- tests/batcher_test2.py | 4 +- tests/model.py | 58 +++++++++++++++ train.py | 6 +- 9 files changed, 142 insertions(+), 114 deletions(-) create mode 100644 tests/model.py diff --git a/README.md b/README.md index ddf6133..006d29c 100644 --- a/README.md +++ b/README.md @@ -59,7 +59,7 @@ import random from audio import read_mfcc from batcher import sample_from_mfcc from constants import SAMPLE_RATE, NUM_FRAMES -from conv_models import DeepSpeakerModel +from conv_models import ResCNNModel from test import batch_cosine_similarity # Reproducible results. diff --git a/batcher.py b/batcher.py index b26889c..4c03525 100644 --- a/batcher.py +++ b/batcher.py @@ -11,7 +11,7 @@ from audio import pad_mfcc, Audio from constants import NUM_FRAMES, NUM_FBANKS -from conv_models import DeepSpeakerModel +from conv_models import ResCNNModel, DeepSpeakerModel from utils import ensures_dir, load_pickle, load_npy, train_test_sp_to_utt logger = logging.getLogger(__name__) @@ -495,7 +495,7 @@ def get_speaker_verification_data(self, positive_speaker, num_different_speakers np.random.seed(123) ltb = LazyTripletBatcher(working_dir='/Users/premy/deep-speaker/', max_length=NUM_FRAMES, - model=DeepSpeakerModel()) + model=ResCNNModel()) for i in range(1000): print(i) start = time() diff --git a/conv_models.py b/conv_models.py index 416ad44..4ce03a1 100644 --- a/conv_models.py +++ b/conv_models.py @@ -1,71 +1,60 @@ +import abc import logging -import os -import numpy as np import tensorflow.keras.backend as K from tensorflow.keras import layers from tensorflow.keras import regularizers from tensorflow.keras.layers import BatchNormalization from tensorflow.keras.layers import Conv2D from tensorflow.keras.layers import Dropout +from tensorflow.keras.layers import GRU from tensorflow.keras.layers import Input from tensorflow.keras.layers import Lambda, Dense from tensorflow.keras.layers import Reshape from tensorflow.keras.models import Model -from tensorflow.keras.optimizers import Adam from constants import NUM_FBANKS, NUM_FRAMES -from triplet_loss import deep_speaker_loss logger = logging.getLogger(__name__) class DeepSpeakerModel: - # I thought it was 3 but maybe energy is added at a 4th dimension. - # would be better to have 4 dimensions: - # MFCC, DIFF(MFCC), DIFF(DIFF(MFCC)), ENERGIES (probably tiled across the frequency domain). - # this seems to help match the parameter counts. 
- def __init__(self, batch_input_shape=(None, NUM_FRAMES, NUM_FBANKS, 1), include_softmax=False, - num_speakers_softmax=None): + def __init__(self, + batch_input_shape=(None, NUM_FRAMES, NUM_FBANKS, 1), + include_softmax=False, + num_speakers_softmax=None, + name='ResCNN'): self.include_softmax = include_softmax + self.num_speakers_softmax = num_speakers_softmax if self.include_softmax: - assert num_speakers_softmax > 0 + assert self.num_speakers_softmax > 0 self.clipped_relu_count = 0 + inputs = Input(batch_shape=batch_input_shape, name='input') + x = self.graph_with_avg_softmax_and_ln(inputs) + self.m = Model(inputs, x, name=name) - # http://cs231n.github.io/convolutional-networks/ - # conv weights - # #params = ks * ks * nb_filters * num_channels_input - - # Conv128-s - # 5*5*128*128/2+128 - # ks*ks*nb_filters*channels/strides+bias(=nb_filters) - - # take 100 ms -> 4 frames. - # if signal is 3 seconds, then take 100ms per 100ms and average out this network. - # 8*8 = 64 features. - - # used to share all the layers across the inputs + @abc.abstractmethod + def graph(self, inputs): + pass - # num_frames = K.shape() - do it dynamically after. - inputs = Input(batch_shape=batch_input_shape, name='input') - x = self.cnn_component(inputs) + def graph_with_avg_softmax_and_ln(self, inputs): + x = self.graph(inputs) - x = Reshape((-1, 2048))(x) # Temporal average layer. axis=1 is time. x = Lambda(lambda y: K.mean(y, axis=1), name='average')(x) - if include_softmax: + if self.include_softmax: logger.info('Including a Dropout layer to reduce overfitting.') # used for softmax because the dataset we pre-train on might be too small. easy to overfit. x = Dropout(0.5)(x) x = Dense(512, name='affine')(x) - if include_softmax: + if self.include_softmax: # Those weights are just when we train on softmax. - x = Dense(num_speakers_softmax, activation='softmax')(x) + x = Dense(self.num_speakers_softmax, activation='softmax')(x) else: # Does not contain any weights. 
x = Lambda(lambda y: K.l2_normalize(y, axis=1), name='ln')(x) - self.m = Model(inputs, x, name='ResCNN') + return x def keras_model(self): return self.m @@ -82,6 +71,28 @@ def clipped_relu(self, inputs): self.clipped_relu_count += 1 return relu + def set_weights(self, w): + for layer, layer_w in zip(self.m.layers, w): + layer.set_weights(layer_w) + logger.info(f'Setting weights for [{layer.name}]...') + + +class ResCNNModel(DeepSpeakerModel): + + def __init__(self, + batch_input_shape=(None, NUM_FRAMES, NUM_FBANKS, 1), + include_softmax=False, + num_speakers_softmax=None): + super().__init__(batch_input_shape, include_softmax, num_speakers_softmax, 'ResCNN') + + def graph(self, inputs): + x = self.conv_and_res_block(inputs, 64, stage=1) + x = self.conv_and_res_block(x, 128, stage=2) + x = self.conv_and_res_block(x, 256, stage=3) + x = self.conv_and_res_block(x, 512, stage=4) + x = Reshape((-1, 2048))(x) + return x + def identity_block(self, input_tensor, kernel_size, filters, stage, block): conv_name_base = f'res{stage}_{block}_branch' @@ -128,72 +139,29 @@ def conv_and_res_block(self, inp, filters, stage): o = self.identity_block(o, kernel_size=3, filters=filters, stage=stage, block=i) return o - def cnn_component(self, inp): - x = self.conv_and_res_block(inp, 64, stage=1) - x = self.conv_and_res_block(x, 128, stage=2) - x = self.conv_and_res_block(x, 256, stage=3) - x = self.conv_and_res_block(x, 512, stage=4) - return x - def set_weights(self, w): - for layer, layer_w in zip(self.m.layers, w): - layer.set_weights(layer_w) - logger.info(f'Setting weights for [{layer.name}]...') +class GRUModel(DeepSpeakerModel): + def __init__(self, + batch_input_shape=(None, NUM_FRAMES, NUM_FBANKS, 1), + include_softmax=False, + num_speakers_softmax=None): + super().__init__(batch_input_shape, include_softmax, num_speakers_softmax, 'GRU') -def main(): - # Looks correct to me. - # I have 37K but paper reports 41K. which is not too far. - dsm = DeepSpeakerModel() - dsm.m.summary() - - # I suspect num frames to be 32. - # Then fbank=64, then total would be 32*64 = 2048. - # plot_model(dsm.m, to_file='model.png', dpi=300, show_shapes=True, expand_nested=True) - - -def _train(): - # x = np.random.uniform(size=(6, 32, 64, 4)) # 6 is multiple of 3. - # y_softmax = np.random.uniform(size=(6, 100)) - # dsm = DeepSpeakerModel(batch_input_shape=(None, 32, 64, 4), include_softmax=True, num_speakers_softmax=100) - # dsm.m.compile(optimizer=Adam(lr=0.01), loss='categorical_crossentropy') - # print(dsm.m.predict(x).shape) - # print(dsm.m.evaluate(x, y_softmax)) - # w = dsm.get_weights() - dsm = DeepSpeakerModel(batch_input_shape=(None, 32, 64, 4), include_softmax=False) - # dsm.m.set_weights(w) - dsm.m.compile(optimizer=Adam(lr=0.01), loss=deep_speaker_loss) - - # it works!!!!!!!!!!!!!!!!!!!! - # unit_batch_size = 20 - # anchor = np.ones(shape=(unit_batch_size, 32, 64, 4)) - # positive = np.array(anchor) - # negative = np.ones(shape=(unit_batch_size, 32, 64, 4)) * (-1) - # batch = np.vstack((anchor, positive, negative)) - # x = batch - # y = np.zeros(shape=(len(batch), 512)) # not important. - # print('Starting to fit...') - # while True: - # print(dsm.m.train_on_batch(x, y)) - - # should not work... and it does not work! - unit_batch_size = 20 - negative = np.ones(shape=(unit_batch_size, 32, 64, 4)) * (-1) - batch = np.vstack((negative, negative, negative)) - x = batch - y = np.zeros(shape=(len(batch), 512)) # not important. 
- print('Starting to fit...') - while True: - print(dsm.m.train_on_batch(x, y)) - - -def _test_checkpoint_compatibility(): - dsm = DeepSpeakerModel(batch_input_shape=(None, 32, 64, 4), include_softmax=True, num_speakers_softmax=10) - dsm.m.save_weights('test.h5') - dsm = DeepSpeakerModel(batch_input_shape=(None, 32, 64, 4), include_softmax=False) - dsm.m.load_weights('test.h5', by_name=True) - os.remove('test.h5') - - -if __name__ == '__main__': - _test_checkpoint_compatibility() + def graph(self, inputs): + x = Conv2D(64, kernel_size=5, strides=2, padding='same', kernel_initializer='glorot_uniform', + name='conv1', kernel_regularizer=regularizers.l2(l=0.0001))(inputs) + # shape = (BATCH_SIZE , num_frames/2, 64/2, 64) + x = BatchNormalization(name='bn1')(x) + x = self.clipped_relu(x) + + # 4d -> 3d. + _, frames_dim, fbank_dim, conv_output_dim = K.int_shape(x) + x = Reshape((frames_dim, fbank_dim * conv_output_dim))(x) + x = Reshape((frames_dim, fbank_dim * conv_output_dim))(x) + + # shape = (BATCH_SIZE , num_frames/2, 1024) + x = GRU(1024, name='GRU1', return_sequences=True)(x) + x = GRU(1024, name='GRU2', return_sequences=True)(x) + x = GRU(1024, name='GRU3', return_sequences=True)(x) + return x diff --git a/example.py b/example.py index 593527d..7cfed95 100644 --- a/example.py +++ b/example.py @@ -1,15 +1,17 @@ -import numpy as np import random + +import numpy as np + from audio import read_mfcc from batcher import sample_from_mfcc from constants import SAMPLE_RATE, NUM_FRAMES -from conv_models import DeepSpeakerModel +from conv_models import ResCNNModel from test import batch_cosine_similarity np.random.seed(123) random.seed(123) -model = DeepSpeakerModel() +model = ResCNNModel() model.m.load_weights('/Users/premy/deep-speaker/checkpoints/ResCNN_triplet_training_checkpoint_175.h5', by_name=True) mfcc_001 = sample_from_mfcc(read_mfcc('samples/PhilippeRemy/PhilippeRemy_001.wav', SAMPLE_RATE), NUM_FRAMES) diff --git a/test.py b/test.py index e3583f7..71327d0 100644 --- a/test.py +++ b/test.py @@ -6,7 +6,7 @@ from audio import Audio from batcher import LazyTripletBatcher from constants import NUM_FBANKS, NUM_FRAMES, CHECKPOINTS_TRIPLET_DIR, BATCH_SIZE -from conv_models import DeepSpeakerModel +from conv_models import ResCNNModel from eval_metrics import evaluate from utils import load_best_checkpoint, enable_deterministic @@ -25,7 +25,7 @@ def batch_cosine_similarity(x1, x2): return s -def eval_model(working_dir: str, model: DeepSpeakerModel): +def eval_model(working_dir: str, model: ResCNNModel): enable_deterministic() audio = Audio(working_dir) batcher = LazyTripletBatcher(working_dir, NUM_FRAMES, model) @@ -54,7 +54,7 @@ def eval_model(working_dir: str, model: DeepSpeakerModel): def test(working_dir, checkpoint_file=None): batch_input_shape = [None, NUM_FRAMES, NUM_FBANKS, 1] - dsm = DeepSpeakerModel(batch_input_shape) + dsm = ResCNNModel(batch_input_shape) if checkpoint_file is None: checkpoint_file = load_best_checkpoint(CHECKPOINTS_TRIPLET_DIR) if checkpoint_file is not None: diff --git a/tests/batcher_test.py b/tests/batcher_test.py index aee75db..12ce3b8 100644 --- a/tests/batcher_test.py +++ b/tests/batcher_test.py @@ -5,7 +5,7 @@ import triplet_loss from batcher import KerasFormatConverter, TripletBatcherSelectHardNegatives, TripletBatcher from constants import NUM_FBANKS, NUM_FRAMES, CHECKPOINTS_TRIPLET_DIR, CHECKPOINTS_SOFTMAX_DIR, BATCH_SIZE -from conv_models import DeepSpeakerModel +from conv_models import ResCNNModel from triplet_loss import deep_speaker_loss from utils 
import load_best_checkpoint @@ -57,7 +57,7 @@ def main(): # we select batches this way. batch_input_shape = [None, NUM_FRAMES, NUM_FBANKS, 1] print('Testing with the triplet losses.') - dsm = DeepSpeakerModel(batch_input_shape, include_softmax=False) + dsm = ResCNNModel(batch_input_shape, include_softmax=False) triplet_checkpoint = load_best_checkpoint(CHECKPOINTS_TRIPLET_DIR) pre_training_checkpoint = load_best_checkpoint(CHECKPOINTS_SOFTMAX_DIR) if triplet_checkpoint is not None: diff --git a/tests/batcher_test2.py b/tests/batcher_test2.py index cf1f9b4..848459d 100644 --- a/tests/batcher_test2.py +++ b/tests/batcher_test2.py @@ -2,13 +2,13 @@ from batcher import LazyTripletBatcher from constants import NUM_FBANKS, NUM_FRAMES -from conv_models import DeepSpeakerModel +from conv_models import ResCNNModel from triplet_loss import deep_speaker_loss def main2(): batch_input_shape = [None, NUM_FRAMES, NUM_FBANKS, 1] - dsm = DeepSpeakerModel(batch_input_shape, include_softmax=False) + dsm = ResCNNModel(batch_input_shape, include_softmax=False) dsm.m.compile(optimizer='adam', loss=deep_speaker_loss) dsm.m.load_weights('/Users/premy/deep-speaker/ResCNN_checkpoint_102.h5', by_name=True) dsm.m.summary() diff --git a/tests/model.py b/tests/model.py new file mode 100644 index 0000000..c0d2c14 --- /dev/null +++ b/tests/model.py @@ -0,0 +1,58 @@ +import numpy as np +import tensorflow.keras.backend as K +from tensorflow.keras.optimizers import Adam + +from constants import NUM_FRAMES, NUM_FBANKS +from conv_models import ResCNNModel, GRUModel +from triplet_loss import deep_speaker_loss + + +def _train(): + # x = np.random.uniform(size=(6, 32, 64, 4)) # 6 is multiple of 3. + # y_softmax = np.random.uniform(size=(6, 100)) + # dsm = DeepSpeakerModel(batch_input_shape=(None, 32, 64, 4), include_softmax=True, num_speakers_softmax=100) + # dsm.m.compile(optimizer=Adam(lr=0.01), loss='categorical_crossentropy') + # print(dsm.m.predict(x).shape) + # print(dsm.m.evaluate(x, y_softmax)) + # w = dsm.get_weights() + dsm = ResCNNModel(batch_input_shape=(None, 32, 64, 4), include_softmax=False) + # dsm.m.set_weights(w) + dsm.m.compile(optimizer=Adam(lr=0.01), loss=deep_speaker_loss) + + # it works!!!!!!!!!!!!!!!!!!!! + # unit_batch_size = 20 + # anchor = np.ones(shape=(unit_batch_size, 32, 64, 4)) + # positive = np.array(anchor) + # negative = np.ones(shape=(unit_batch_size, 32, 64, 4)) * (-1) + # batch = np.vstack((anchor, positive, negative)) + # x = batch + # y = np.zeros(shape=(len(batch), 512)) # not important. + # print('Starting to fit...') + # while True: + # print(dsm.m.train_on_batch(x, y)) + + # should not work... and it does not work! + unit_batch_size = 20 + negative = np.ones(shape=(unit_batch_size, 32, 64, 4)) * (-1) + batch = np.vstack((negative, negative, negative)) + x = batch + y = np.zeros(shape=(len(batch), 512)) # not important. + print('Starting to fit...') + while True: + print(dsm.m.train_on_batch(x, y)) + + +def test_gru(): + # Looks correct to me. + # I have 37K but paper reports 41K. which is not too far. + dsm = GRUModel() + dsm.m.summary() + dsm.m.predict(np.random.random(size=(2, NUM_FRAMES, NUM_FBANKS, 1))) + + i = np.random.random(size=(2, NUM_FRAMES, NUM_FBANKS, 1)) + y = K.reshape(i, (2, NUM_FRAMES, NUM_FBANKS * 1)) + z = K.reshape(y, (2, NUM_FRAMES, NUM_FBANKS, 1)) + + # I suspect num frames to be 32. + # Then fbank=64, then total would be 32*64 = 2048. 
+ # plot_model(dsm.m, to_file='model.png', dpi=300, show_shapes=True, expand_nested=True) diff --git a/train.py b/train.py index da9fe90..e1ffa8b 100644 --- a/train.py +++ b/train.py @@ -7,7 +7,7 @@ from batcher import KerasFormatConverter, LazyTripletBatcher from constants import BATCH_SIZE, CHECKPOINTS_SOFTMAX_DIR, CHECKPOINTS_TRIPLET_DIR, NUM_FRAMES, NUM_FBANKS -from conv_models import DeepSpeakerModel +from conv_models import ResCNNModel, DeepSpeakerModel from triplet_loss import deep_speaker_loss from utils import load_best_checkpoint, ensures_dir @@ -80,7 +80,7 @@ def start_training(working_dir, pre_training_phase=True): logger.info('Softmax pre-training.') kc = KerasFormatConverter(working_dir) num_speakers_softmax = len(kc.categorical_speakers.speaker_ids) - dsm = DeepSpeakerModel(batch_input_shape, include_softmax=True, num_speakers_softmax=num_speakers_softmax) + dsm = ResCNNModel(batch_input_shape, include_softmax=True, num_speakers_softmax=num_speakers_softmax) dsm.m.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy']) pre_training_checkpoint = load_best_checkpoint(CHECKPOINTS_SOFTMAX_DIR) if pre_training_checkpoint is not None: @@ -93,7 +93,7 @@ def start_training(working_dir, pre_training_phase=True): fit_model_softmax(dsm, kc.kx_train, kc.ky_train, kc.kx_test, kc.ky_test, initial_epoch=initial_epoch) else: logger.info('Training with the triplet loss.') - dsm = DeepSpeakerModel(batch_input_shape, include_softmax=False) + dsm = ResCNNModel(batch_input_shape, include_softmax=False) triplet_checkpoint = load_best_checkpoint(CHECKPOINTS_TRIPLET_DIR) pre_training_checkpoint = load_best_checkpoint(CHECKPOINTS_SOFTMAX_DIR) if triplet_checkpoint is not None: From e6198fd3f99d9ec04325622f583078fae1c203ab Mon Sep 17 00:00:00 2001 From: Philippe Remy Date: Sat, 25 Apr 2020 11:46:15 +0900 Subject: [PATCH 02/13] GRU code. 
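
The model is now selected by name. A minimal sketch of the Python-side selection (illustrative, not part of the diff below; shapes come from constants.py):

    from constants import NUM_FRAMES, NUM_FBANKS
    from conv_models import select_model_class, GRU_NAME

    # GRU_NAME -> GRUModel, RES_CNN_NAME -> ResCNNModel.
    model_class = select_model_class(GRU_NAME)
    dsm = model_class(batch_input_shape=(None, NUM_FRAMES, NUM_FBANKS, 1), include_softmax=False)
    dsm.m.summary()

On the command line, the same choice is exposed through the new --model_name option on the train-model and test-model commands.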
--- cli.py | 18 ++++++++---------- conv_models.py | 18 +++++++++++++++--- test.py | 17 +++++------------ train.py | 9 +++++---- 4 files changed, 33 insertions(+), 29 deletions(-) diff --git a/cli.py b/cli.py index a44c226..6c287ea 100644 --- a/cli.py +++ b/cli.py @@ -9,6 +9,7 @@ from audio import Audio from batcher import KerasFormatConverter from constants import SAMPLE_RATE, NUM_FRAMES +from conv_models import GRU_NAME, RES_CNN_NAME from test import test from train import start_training from utils import ClickType as Ct, ensures_dir @@ -53,8 +54,9 @@ def build_keras_inputs(working_dir, counts_per_speaker): @cli.command('test-model', short_help='Test a Keras model.') @click.option('--working_dir', required=True, type=Ct.input_dir()) +@click.option('--model_name', required=True, choices=[RES_CNN_NAME, GRU_NAME]) @click.option('--checkpoint_file', required=True, type=Ct.input_file()) -def test_model(working_dir, checkpoint_file=None): +def test_model(working_dir, model_name, checkpoint_file): # export CUDA_VISIBLE_DEVICES=0; python cli.py test-model # --working_dir /home/philippe/ds-test/triplet-training/ # --checkpoint_file ../ds-test/checkpoints-softmax/ResCNN_checkpoint_102.h5 @@ -64,20 +66,16 @@ def test_model(working_dir, checkpoint_file=None): # --working_dir /home/philippe/ds-test/triplet-training/ # --checkpoint_file ../ds-test/checkpoints-triplets/ResCNN_checkpoint_175.h5 # f-measure = 0.849, true positive rate = 0.798, accuracy = 0.997, equal error rate = 0.025 - test(working_dir, checkpoint_file) + test(working_dir, model_name, checkpoint_file) @cli.command('train-model', short_help='Train a Keras model.') @click.option('--working_dir', required=True, type=Ct.input_dir()) +@click.option('--model_name', required=True, choices=[RES_CNN_NAME, GRU_NAME]) @click.option('--pre_training_phase/--no_pre_training_phase', default=False, show_default=True) -def train_model(working_dir, pre_training_phase): +def train_model(working_dir, model_name, pre_training_phase): # PRE TRAINING - - # commit a5030dd7a1b53cd11d5ab7832fa2d43f2093a464 - # Merge: a11d13e b30e64e - # Author: Philippe Remy - # Date: Fri Apr 10 10:37:59 2020 +0900 - # LibriSpeech train-clean-data360 (600, 100). 0.985 on test set (enough for pre-training). + # LibriSpeech train-clean-data360 (600, 100). 0.991 on test set (enough for pre-training). # TRIPLET TRAINING # [...] 
@@ -89,7 +87,7 @@ def train_model(working_dir, pre_training_phase): # 2000/2000 [==============================] - 927s 464ms/step - loss: 0.0075 - val_loss: 0.0059 # Epoch 178/1000 # 2000/2000 [==============================] - 948s 474ms/step - loss: 0.0073 - val_loss: 0.0058 - start_training(working_dir, pre_training_phase) + start_training(working_dir, model_name, pre_training_phase) if __name__ == '__main__': diff --git a/conv_models.py b/conv_models.py index 4ce03a1..ab6ca27 100644 --- a/conv_models.py +++ b/conv_models.py @@ -17,6 +17,18 @@ logger = logging.getLogger(__name__) +RES_CNN_NAME = 'ResCNN' +GRU_NAME = 'GRU' + + +def select_model_class(name: str): + if name == RES_CNN_NAME: + return ResCNNModel + elif name == GRU_NAME: + return GRUModel + else: + raise Exception(f'Unknown model name: {name}.') + class DeepSpeakerModel: @@ -24,7 +36,7 @@ def __init__(self, batch_input_shape=(None, NUM_FRAMES, NUM_FBANKS, 1), include_softmax=False, num_speakers_softmax=None, - name='ResCNN'): + name=RES_CNN_NAME): self.include_softmax = include_softmax self.num_speakers_softmax = num_speakers_softmax if self.include_softmax: @@ -83,7 +95,7 @@ def __init__(self, batch_input_shape=(None, NUM_FRAMES, NUM_FBANKS, 1), include_softmax=False, num_speakers_softmax=None): - super().__init__(batch_input_shape, include_softmax, num_speakers_softmax, 'ResCNN') + super().__init__(batch_input_shape, include_softmax, num_speakers_softmax, RES_CNN_NAME) def graph(self, inputs): x = self.conv_and_res_block(inputs, 64, stage=1) @@ -146,7 +158,7 @@ def __init__(self, batch_input_shape=(None, NUM_FRAMES, NUM_FBANKS, 1), include_softmax=False, num_speakers_softmax=None): - super().__init__(batch_input_shape, include_softmax, num_speakers_softmax, 'GRU') + super().__init__(batch_input_shape, include_softmax, num_speakers_softmax, GRU_NAME) def graph(self, inputs): x = Conv2D(64, kernel_size=5, strides=2, padding='same', kernel_initializer='glorot_uniform', diff --git a/test.py b/test.py index 71327d0..cdbd3c6 100644 --- a/test.py +++ b/test.py @@ -5,10 +5,10 @@ from audio import Audio from batcher import LazyTripletBatcher -from constants import NUM_FBANKS, NUM_FRAMES, CHECKPOINTS_TRIPLET_DIR, BATCH_SIZE -from conv_models import ResCNNModel +from constants import NUM_FBANKS, NUM_FRAMES, BATCH_SIZE +from conv_models import ResCNNModel, select_model_class from eval_metrics import evaluate -from utils import load_best_checkpoint, enable_deterministic +from utils import enable_deterministic logger = logging.getLogger(__name__) @@ -41,22 +41,15 @@ def eval_model(working_dir: str, model: ResCNNModel): anchor_embedding = predictions[0] for j, other_than_anchor_embedding in enumerate(predictions[1:]): # positive + negatives y_pred[i][j] = batch_cosine_similarity([anchor_embedding], [other_than_anchor_embedding])[0] - # y_pred[i] = softmax(y_pred[i]) - # could apply softmax here. y_true = np.zeros_like(y_pred) # positive is at index 0. 
y_true[:, 0] = 1.0 - print(np.matrix(y_true)) - print(np.matrix(y_pred)) - print(np.min(y_pred), np.max(y_pred)) fm, tpr, acc, eer = evaluate(y_pred, y_true) return fm, tpr, acc, eer -def test(working_dir, checkpoint_file=None): +def test(working_dir, model_name, checkpoint_file): batch_input_shape = [None, NUM_FRAMES, NUM_FBANKS, 1] - dsm = ResCNNModel(batch_input_shape) - if checkpoint_file is None: - checkpoint_file = load_best_checkpoint(CHECKPOINTS_TRIPLET_DIR) + dsm = select_model_class(model_name)(batch_input_shape) if checkpoint_file is not None: logger.info(f'Found checkpoint [{checkpoint_file}]. Loading weights...') dsm.m.load_weights(checkpoint_file, by_name=True) diff --git a/train.py b/train.py index e1ffa8b..161fa42 100644 --- a/train.py +++ b/train.py @@ -7,7 +7,7 @@ from batcher import KerasFormatConverter, LazyTripletBatcher from constants import BATCH_SIZE, CHECKPOINTS_SOFTMAX_DIR, CHECKPOINTS_TRIPLET_DIR, NUM_FRAMES, NUM_FBANKS -from conv_models import ResCNNModel, DeepSpeakerModel +from conv_models import ResCNNModel, DeepSpeakerModel, select_model_class from triplet_loss import deep_speaker_loss from utils import load_best_checkpoint, ensures_dir @@ -72,15 +72,16 @@ def fit_model_softmax(dsm: DeepSpeakerModel, kx_train, ky_train, kx_test, ky_tes callbacks=[early_stopping, reduce_lr, checkpoint]) -def start_training(working_dir, pre_training_phase=True): +def start_training(working_dir, model_name, pre_training_phase=True): ensures_dir(CHECKPOINTS_SOFTMAX_DIR) ensures_dir(CHECKPOINTS_TRIPLET_DIR) batch_input_shape = [None, NUM_FRAMES, NUM_FBANKS, 1] + model_class = select_model_class(model_name) if pre_training_phase: logger.info('Softmax pre-training.') kc = KerasFormatConverter(working_dir) num_speakers_softmax = len(kc.categorical_speakers.speaker_ids) - dsm = ResCNNModel(batch_input_shape, include_softmax=True, num_speakers_softmax=num_speakers_softmax) + dsm = model_class(batch_input_shape, include_softmax=True, num_speakers_softmax=num_speakers_softmax) dsm.m.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy']) pre_training_checkpoint = load_best_checkpoint(CHECKPOINTS_SOFTMAX_DIR) if pre_training_checkpoint is not None: @@ -93,7 +94,7 @@ def start_training(working_dir, pre_training_phase=True): fit_model_softmax(dsm, kc.kx_train, kc.ky_train, kc.kx_test, kc.ky_test, initial_epoch=initial_epoch) else: logger.info('Training with the triplet loss.') - dsm = ResCNNModel(batch_input_shape, include_softmax=False) + dsm = model_class(batch_input_shape, include_softmax=False) triplet_checkpoint = load_best_checkpoint(CHECKPOINTS_TRIPLET_DIR) pre_training_checkpoint = load_best_checkpoint(CHECKPOINTS_SOFTMAX_DIR) if triplet_checkpoint is not None: From 641bfa0c161c5228ad4156c0f1e284659cfda199 Mon Sep 17 00:00:00 2001 From: Philippe Remy Date: Sat, 25 Apr 2020 11:52:42 +0900 Subject: [PATCH 03/13] GRU code. 
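
This drops the ad-hoc __main__ benchmark from batcher.py. If needed, the same timing check can still be run as a standalone snippet; a sketch (the working_dir path is illustrative):

    from time import time

    import numpy as np

    from batcher import LazyTripletBatcher
    from constants import NUM_FRAMES
    from conv_models import ResCNNModel

    np.random.seed(123)
    ltb = LazyTripletBatcher(working_dir='/path/to/deep-speaker/', max_length=NUM_FRAMES, model=ResCNNModel())
    for i in range(1000):
        start = time()
        ltb.get_batch_train(batch_size=9)  # time one training batch.
        print(i, time() - start)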
--- batcher.py | 15 +-------------- cli.py | 2 +- 2 files changed, 2 insertions(+), 15 deletions(-) diff --git a/batcher.py b/batcher.py index 4c03525..add5b36 100644 --- a/batcher.py +++ b/batcher.py @@ -11,7 +11,7 @@ from audio import pad_mfcc, Audio from constants import NUM_FRAMES, NUM_FBANKS -from conv_models import ResCNNModel, DeepSpeakerModel +from conv_models import DeepSpeakerModel from utils import ensures_dir, load_pickle, load_npy, train_test_sp_to_utt logger = logging.getLogger(__name__) @@ -489,16 +489,3 @@ def get_speaker_verification_data(self, positive_speaker, num_different_speakers data = [anchor, positive] data.extend([self._select_speaker_data(n) for n in negative_speakers]) return np.vstack(data) - - -if __name__ == '__main__': - np.random.seed(123) - ltb = LazyTripletBatcher(working_dir='/Users/premy/deep-speaker/', - max_length=NUM_FRAMES, - model=ResCNNModel()) - for i in range(1000): - print(i) - start = time() - ltb.get_batch_train(batch_size=9) - print(time() - start) - # ltb.get_batch(batch_size=96) diff --git a/cli.py b/cli.py index 6c287ea..121acde 100644 --- a/cli.py +++ b/cli.py @@ -17,7 +17,7 @@ logger = logging.getLogger(__name__) -VERSION = '3.0a' +VERSION = '3.0b' @click.group() From a69363f686f8e8571016cd65e6ce6244296c8710 Mon Sep 17 00:00:00 2001 From: Philippe Remy Date: Sat, 25 Apr 2020 12:31:14 +0900 Subject: [PATCH 04/13] GRU code. --- cli.py | 4 ++-- conv_models.py | 1 - triplet_loss.py | 1 - 3 files changed, 2 insertions(+), 4 deletions(-) diff --git a/cli.py b/cli.py index 121acde..7d997b2 100644 --- a/cli.py +++ b/cli.py @@ -54,7 +54,7 @@ def build_keras_inputs(working_dir, counts_per_speaker): @cli.command('test-model', short_help='Test a Keras model.') @click.option('--working_dir', required=True, type=Ct.input_dir()) -@click.option('--model_name', required=True, choices=[RES_CNN_NAME, GRU_NAME]) +@click.option('--model_name', required=True, type=click.Choice([RES_CNN_NAME, GRU_NAME])) @click.option('--checkpoint_file', required=True, type=Ct.input_file()) def test_model(working_dir, model_name, checkpoint_file): # export CUDA_VISIBLE_DEVICES=0; python cli.py test-model @@ -71,7 +71,7 @@ def test_model(working_dir, model_name, checkpoint_file): @cli.command('train-model', short_help='Train a Keras model.') @click.option('--working_dir', required=True, type=Ct.input_dir()) -@click.option('--model_name', required=True, choices=[RES_CNN_NAME, GRU_NAME]) +@click.option('--model_name', required=True, type=click.Choice([RES_CNN_NAME, GRU_NAME])) @click.option('--pre_training_phase/--no_pre_training_phase', default=False, show_default=True) def train_model(working_dir, model_name, pre_training_phase): # PRE TRAINING diff --git a/conv_models.py b/conv_models.py index ab6ca27..a520c51 100644 --- a/conv_models.py +++ b/conv_models.py @@ -52,7 +52,6 @@ def graph(self, inputs): def graph_with_avg_softmax_and_ln(self, inputs): x = self.graph(inputs) - # Temporal average layer. axis=1 is time. x = Lambda(lambda y: K.mean(y, axis=1), name='average')(x) if self.include_softmax: diff --git a/triplet_loss.py b/triplet_loss.py index 784b404..4179560 100644 --- a/triplet_loss.py +++ b/triplet_loss.py @@ -1,6 +1,5 @@ import keras.backend as K -# ALPHA = 0.2 # used in FaceNet https://arxiv.org/pdf/1503.03832.pdf ALPHA = 0.1 # used in Deep Speaker. 
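
For context, ALPHA is the margin of the cosine-similarity triplet loss from the Deep Speaker paper. A rough sketch of that formulation (a simplified view, not the exact code in triplet_loss.py; embeddings are L2-normalized by the 'ln' layer, so a dot product is the cosine similarity):

    import keras.backend as K

    def cosine_triplet_loss_sketch(anchor, positive, negative, alpha=0.1):
        sap = K.sum(anchor * positive, axis=1)  # cos(anchor, positive)
        san = K.sum(anchor * negative, axis=1)  # cos(anchor, negative)
        # Hinge on the margin: positives should score at least `alpha` higher than negatives.
        return K.mean(K.maximum(san - sap + alpha, 0.0))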
From e9b42f8dd2aaaf89f9601a0da37e366b66e9a34d Mon Sep 17 00:00:00 2001 From: Philippe Remy Date: Sun, 26 Apr 2020 09:59:28 +0900 Subject: [PATCH 05/13] GRU can train to around 0.935 without much effort now --- conv_models.py | 3 ++- train.py | 11 +++++++---- 2 files changed, 9 insertions(+), 5 deletions(-) diff --git a/conv_models.py b/conv_models.py index a520c51..3c3951b 100644 --- a/conv_models.py +++ b/conv_models.py @@ -57,6 +57,7 @@ def graph_with_avg_softmax_and_ln(self, inputs): if self.include_softmax: logger.info('Including a Dropout layer to reduce overfitting.') # used for softmax because the dataset we pre-train on might be too small. easy to overfit. + # x = Dropout(0.25)(x) # was for GRU. Does 0.5 work with GRU as well? x = Dropout(0.5)(x) x = Dense(512, name='affine')(x) if self.include_softmax: @@ -163,7 +164,7 @@ def graph(self, inputs): x = Conv2D(64, kernel_size=5, strides=2, padding='same', kernel_initializer='glorot_uniform', name='conv1', kernel_regularizer=regularizers.l2(l=0.0001))(inputs) # shape = (BATCH_SIZE , num_frames/2, 64/2, 64) - x = BatchNormalization(name='bn1')(x) + x = BatchNormalization(name='bn1')(x) # does it work with BN? x = self.clipped_relu(x) # 4d -> 3d. diff --git a/train.py b/train.py index 161fa42..81871a8 100644 --- a/train.py +++ b/train.py @@ -2,12 +2,12 @@ import os from tensorflow.keras.callbacks import ReduceLROnPlateau, EarlyStopping, ModelCheckpoint -from tensorflow.keras.optimizers import SGD +from tensorflow.keras.optimizers import SGD, Adam from tqdm import tqdm from batcher import KerasFormatConverter, LazyTripletBatcher from constants import BATCH_SIZE, CHECKPOINTS_SOFTMAX_DIR, CHECKPOINTS_TRIPLET_DIR, NUM_FRAMES, NUM_FBANKS -from conv_models import ResCNNModel, DeepSpeakerModel, select_model_class +from conv_models import ResCNNModel, DeepSpeakerModel, select_model_class, RES_CNN_NAME from triplet_loss import deep_speaker_loss from utils import load_best_checkpoint, ensures_dir @@ -82,13 +82,16 @@ def start_training(working_dir, model_name, pre_training_phase=True): kc = KerasFormatConverter(working_dir) num_speakers_softmax = len(kc.categorical_speakers.speaker_ids) dsm = model_class(batch_input_shape, include_softmax=True, num_speakers_softmax=num_speakers_softmax) - dsm.m.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy']) + # ResCNN can train with default Adam LR of 0.001. GRU is more sensitive. + lr = 0.001 if model_name == RES_CNN_NAME else 0.0003 + logger.info(f'Initial learning rate set to {lr}.') + dsm.m.compile(optimizer=Adam(learning_rate=lr), loss='sparse_categorical_crossentropy', metrics=['accuracy']) pre_training_checkpoint = load_best_checkpoint(CHECKPOINTS_SOFTMAX_DIR) if pre_training_checkpoint is not None: initial_epoch = int(pre_training_checkpoint.split('/')[-1].split('.')[0].split('_')[-1]) logger.info(f'Initial epoch is {initial_epoch}.') logger.info(f'Loading softmax checkpoint: {pre_training_checkpoint}.') - dsm.m.load_weights(pre_training_checkpoint) # latest one. + dsm.m.load_weights(pre_training_checkpoint, by_name=True) # latest one. 
else: initial_epoch = 0 fit_model_softmax(dsm, kc.kx_train, kc.ky_train, kc.kx_test, kc.ky_test, initial_epoch=initial_epoch) From 570c0593a97ac2d9e61da1f001cd9ad0af6648bc Mon Sep 17 00:00:00 2001 From: Philippe Remy Date: Sun, 26 Apr 2020 10:00:40 +0900 Subject: [PATCH 06/13] add model name as arg --- deep-speaker | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/deep-speaker b/deep-speaker index 07a3046..f8cfc81 100755 --- a/deep-speaker +++ b/deep-speaker @@ -38,13 +38,13 @@ build_model_inputs) train_softmax) # Pre-training (0.92k speakers). echo "[train_softmax] selected." - python cli.py train-model --working_dir "${PRE_TRAINING_WORKING_DIR}" --pre_training_phase + python cli.py train-model --model_name "$2" --working_dir "${PRE_TRAINING_WORKING_DIR}" --pre_training_phase ;; train_triplet) # Triplet-training (2.48k speakers). echo "[train_triplet] selected." - python cli.py train-model --working_dir "${TRIPLET_TRAINING_WORKING_DIR}" + python cli.py train-model --model_name "$2" --working_dir "${TRIPLET_TRAINING_WORKING_DIR}" ;; *) From 23f3c764100bdc7e50313a9b384e275d375b19b5 Mon Sep 17 00:00:00 2001 From: Philippe Remy Date: Sun, 26 Apr 2020 15:03:57 +0900 Subject: [PATCH 07/13] LR lowered to 0.0001 --- train.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/train.py b/train.py index 81871a8..dc5f17b 100644 --- a/train.py +++ b/train.py @@ -83,7 +83,7 @@ def start_training(working_dir, model_name, pre_training_phase=True): num_speakers_softmax = len(kc.categorical_speakers.speaker_ids) dsm = model_class(batch_input_shape, include_softmax=True, num_speakers_softmax=num_speakers_softmax) # ResCNN can train with default Adam LR of 0.001. GRU is more sensitive. - lr = 0.001 if model_name == RES_CNN_NAME else 0.0003 + lr = 0.001 if model_name == RES_CNN_NAME else 0.0001 logger.info(f'Initial learning rate set to {lr}.') dsm.m.compile(optimizer=Adam(learning_rate=lr), loss='sparse_categorical_crossentropy', metrics=['accuracy']) pre_training_checkpoint = load_best_checkpoint(CHECKPOINTS_SOFTMAX_DIR) From 3651192b314cfabbf28e4a0561e665bbea014662 Mon Sep 17 00:00:00 2001 From: Philippe Remy Date: Sun, 26 Apr 2020 19:24:12 +0900 Subject: [PATCH 08/13] LR lowered to 0.00005. has to be really low --- train.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/train.py b/train.py index dc5f17b..674b7c8 100644 --- a/train.py +++ b/train.py @@ -83,7 +83,7 @@ def start_training(working_dir, model_name, pre_training_phase=True): num_speakers_softmax = len(kc.categorical_speakers.speaker_ids) dsm = model_class(batch_input_shape, include_softmax=True, num_speakers_softmax=num_speakers_softmax) # ResCNN can train with default Adam LR of 0.001. GRU is more sensitive. - lr = 0.001 if model_name == RES_CNN_NAME else 0.0001 + lr = 0.001 if model_name == RES_CNN_NAME else 0.00005 logger.info(f'Initial learning rate set to {lr}.') dsm.m.compile(optimizer=Adam(learning_rate=lr), loss='sparse_categorical_crossentropy', metrics=['accuracy']) pre_training_checkpoint = load_best_checkpoint(CHECKPOINTS_SOFTMAX_DIR) From 152f19d83ec3ce4f0216e143f71952a2e51e004d Mon Sep 17 00:00:00 2001 From: Philippe Remy Date: Sun, 26 Apr 2020 19:26:33 +0900 Subject: [PATCH 09/13] conv_models.py to models.py. 
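
Pure rename: conv_models.py becomes models.py and all imports are updated accordingly. A short inference sketch with the new import path (the checkpoint path is illustrative):

    import numpy as np

    from audio import read_mfcc
    from batcher import sample_from_mfcc
    from constants import SAMPLE_RATE, NUM_FRAMES
    from models import GRUModel

    model = GRUModel()
    model.m.load_weights('checkpoints/GRU_checkpoint_100.h5', by_name=True)  # illustrative path.
    mfcc = sample_from_mfcc(read_mfcc('samples/PhilippeRemy/PhilippeRemy_001.wav', SAMPLE_RATE), NUM_FRAMES)
    embedding = model.m.predict(np.expand_dims(mfcc, axis=0))  # L2-normalized, shape (1, 512).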
--- README.md | 2 +- batcher.py | 2 +- cli.py | 2 +- example.py | 2 +- conv_models.py => models.py | 0 test.py | 2 +- tests/batcher_test.py | 2 +- tests/batcher_test2.py | 2 +- tests/model.py | 2 +- train.py | 2 +- 10 files changed, 9 insertions(+), 9 deletions(-) rename conv_models.py => models.py (100%) diff --git a/README.md b/README.md index 006d29c..64ebab8 100644 --- a/README.md +++ b/README.md @@ -59,7 +59,7 @@ import random from audio import read_mfcc from batcher import sample_from_mfcc from constants import SAMPLE_RATE, NUM_FRAMES -from conv_models import ResCNNModel +from models import ResCNNModel from test import batch_cosine_similarity # Reproducible results. diff --git a/batcher.py b/batcher.py index add5b36..78fe129 100644 --- a/batcher.py +++ b/batcher.py @@ -11,7 +11,7 @@ from audio import pad_mfcc, Audio from constants import NUM_FRAMES, NUM_FBANKS -from conv_models import DeepSpeakerModel +from models import DeepSpeakerModel from utils import ensures_dir, load_pickle, load_npy, train_test_sp_to_utt logger = logging.getLogger(__name__) diff --git a/cli.py b/cli.py index 7d997b2..e0a1a4b 100644 --- a/cli.py +++ b/cli.py @@ -9,7 +9,7 @@ from audio import Audio from batcher import KerasFormatConverter from constants import SAMPLE_RATE, NUM_FRAMES -from conv_models import GRU_NAME, RES_CNN_NAME +from models import GRU_NAME, RES_CNN_NAME from test import test from train import start_training from utils import ClickType as Ct, ensures_dir diff --git a/example.py b/example.py index 7cfed95..41391db 100644 --- a/example.py +++ b/example.py @@ -5,7 +5,7 @@ from audio import read_mfcc from batcher import sample_from_mfcc from constants import SAMPLE_RATE, NUM_FRAMES -from conv_models import ResCNNModel +from models import ResCNNModel from test import batch_cosine_similarity np.random.seed(123) diff --git a/conv_models.py b/models.py similarity index 100% rename from conv_models.py rename to models.py diff --git a/test.py b/test.py index cdbd3c6..3e01af0 100644 --- a/test.py +++ b/test.py @@ -6,7 +6,7 @@ from audio import Audio from batcher import LazyTripletBatcher from constants import NUM_FBANKS, NUM_FRAMES, BATCH_SIZE -from conv_models import ResCNNModel, select_model_class +from models import ResCNNModel, select_model_class from eval_metrics import evaluate from utils import enable_deterministic diff --git a/tests/batcher_test.py b/tests/batcher_test.py index 12ce3b8..2f50e01 100644 --- a/tests/batcher_test.py +++ b/tests/batcher_test.py @@ -5,7 +5,7 @@ import triplet_loss from batcher import KerasFormatConverter, TripletBatcherSelectHardNegatives, TripletBatcher from constants import NUM_FBANKS, NUM_FRAMES, CHECKPOINTS_TRIPLET_DIR, CHECKPOINTS_SOFTMAX_DIR, BATCH_SIZE -from conv_models import ResCNNModel +from models import ResCNNModel from triplet_loss import deep_speaker_loss from utils import load_best_checkpoint diff --git a/tests/batcher_test2.py b/tests/batcher_test2.py index 848459d..66810e8 100644 --- a/tests/batcher_test2.py +++ b/tests/batcher_test2.py @@ -2,7 +2,7 @@ from batcher import LazyTripletBatcher from constants import NUM_FBANKS, NUM_FRAMES -from conv_models import ResCNNModel +from models import ResCNNModel from triplet_loss import deep_speaker_loss diff --git a/tests/model.py b/tests/model.py index c0d2c14..056e839 100644 --- a/tests/model.py +++ b/tests/model.py @@ -3,7 +3,7 @@ from tensorflow.keras.optimizers import Adam from constants import NUM_FRAMES, NUM_FBANKS -from conv_models import ResCNNModel, GRUModel +from models import ResCNNModel, GRUModel 
from triplet_loss import deep_speaker_loss diff --git a/train.py b/train.py index 81871a8..6017208 100644 --- a/train.py +++ b/train.py @@ -7,7 +7,7 @@ from batcher import KerasFormatConverter, LazyTripletBatcher from constants import BATCH_SIZE, CHECKPOINTS_SOFTMAX_DIR, CHECKPOINTS_TRIPLET_DIR, NUM_FRAMES, NUM_FBANKS -from conv_models import ResCNNModel, DeepSpeakerModel, select_model_class, RES_CNN_NAME +from models import ResCNNModel, DeepSpeakerModel, select_model_class, RES_CNN_NAME from triplet_loss import deep_speaker_loss from utils import load_best_checkpoint, ensures_dir From 8806bbb53f21954481f9eb832f880407ea28829d Mon Sep 17 00:00:00 2001 From: Philippe Remy Date: Wed, 29 Apr 2020 10:15:42 +0900 Subject: [PATCH 10/13] more precision --- test.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/test.py b/test.py index 3e01af0..c1627d3 100644 --- a/test.py +++ b/test.py @@ -58,5 +58,5 @@ def test(working_dir, model_name, checkpoint_file): exit(1) fm, tpr, acc, eer = eval_model(working_dir, model=dsm) - logger.info(f'f-measure = {fm:.3f}, true positive rate = {tpr:.3f}, ' - f'accuracy = {acc:.3f}, equal error rate = {eer:.3f}') + logger.info(f'f-measure = {fm:.5f}, true positive rate = {tpr:.5f}, ' + f'accuracy = {acc:.5f}, equal error rate = {eer:.5f}') From 2223f93c4551ffdac61878bfaca9418fa6334193 Mon Sep 17 00:00:00 2001 From: Philippe Remy Date: Wed, 29 Apr 2020 10:16:02 +0900 Subject: [PATCH 11/13] havent tested those but looks promising. we need to break this GRU1024x3 --- models.py | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/models.py b/models.py index 3c3951b..03103cf 100644 --- a/models.py +++ b/models.py @@ -174,6 +174,11 @@ def graph(self, inputs): # shape = (BATCH_SIZE , num_frames/2, 1024) x = GRU(1024, name='GRU1', return_sequences=True)(x) + if self.include_softmax: + x = Dropout(0.2)(x) x = GRU(1024, name='GRU2', return_sequences=True)(x) + if self.include_softmax: + x = Dropout(0.2)(x) x = GRU(1024, name='GRU3', return_sequences=True)(x) return x + From 571ffbfa1f12574c1c029fe4a72e9253a159bc04 Mon Sep 17 00:00:00 2001 From: Philippe Remy Date: Wed, 29 Apr 2020 10:16:24 +0900 Subject: [PATCH 12/13] just to make sure we dont see dropout during triplet training --- train.py | 1 + 1 file changed, 1 insertion(+) diff --git a/train.py b/train.py index c9f2695..7db5b8b 100644 --- a/train.py +++ b/train.py @@ -110,4 +110,5 @@ def start_training(working_dir, model_name, pre_training_phase=True): # some of the layers have changed. 
dsm.m.load_weights(pre_training_checkpoint, by_name=True) dsm.m.compile(optimizer=SGD(), loss=deep_speaker_loss) + dsm.m.summary() fit_model(dsm, working_dir, NUM_FRAMES) From 36eb0ef2d8a362dae30995aa394531929a3ca35f Mon Sep 17 00:00:00 2001 From: Philippe Remy Date: Thu, 30 Apr 2020 11:18:15 +0900 Subject: [PATCH 13/13] better training --- train.py | 20 ++++++++++++++------ 1 file changed, 14 insertions(+), 6 deletions(-) diff --git a/train.py b/train.py index 7db5b8b..209ec4f 100644 --- a/train.py +++ b/train.py @@ -7,7 +7,7 @@ from batcher import KerasFormatConverter, LazyTripletBatcher from constants import BATCH_SIZE, CHECKPOINTS_SOFTMAX_DIR, CHECKPOINTS_TRIPLET_DIR, NUM_FRAMES, NUM_FBANKS -from models import ResCNNModel, DeepSpeakerModel, select_model_class, RES_CNN_NAME +from models import DeepSpeakerModel, select_model_class, RES_CNN_NAME from triplet_loss import deep_speaker_loss from utils import load_best_checkpoint, ensures_dir @@ -17,7 +17,8 @@ os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3' -def fit_model(dsm: DeepSpeakerModel, working_dir: str, max_length: int = NUM_FRAMES, batch_size=BATCH_SIZE): +def fit_model(dsm: DeepSpeakerModel, working_dir: str, max_length: int = NUM_FRAMES, + batch_size: int = BATCH_SIZE, initial_epoch: int = 0): batcher = LazyTripletBatcher(working_dir, max_length, dsm) # build small test set. @@ -37,9 +38,10 @@ def train_generator(): checkpoint_name = dsm.m.name + '_checkpoint' checkpoint_filename = os.path.join(CHECKPOINTS_TRIPLET_DIR, checkpoint_name + '_{epoch}.h5') checkpoint = ModelCheckpoint(monitor='val_loss', filepath=checkpoint_filename, save_best_only=True) + reduce_lr = ReduceLROnPlateau(monitor='val_loss', factor=0.5, patience=10, min_lr=1e-4, verbose=1) dsm.m.fit(x=train_generator(), y=None, steps_per_epoch=2000, shuffle=False, epochs=1000, validation_data=test_generator(), validation_steps=len(test_batches), - callbacks=[checkpoint]) + callbacks=[reduce_lr, checkpoint], initial_epoch=initial_epoch) def fit_model_softmax(dsm: DeepSpeakerModel, kx_train, ky_train, kx_test, ky_test, @@ -83,7 +85,7 @@ def start_training(working_dir, model_name, pre_training_phase=True): num_speakers_softmax = len(kc.categorical_speakers.speaker_ids) dsm = model_class(batch_input_shape, include_softmax=True, num_speakers_softmax=num_speakers_softmax) # ResCNN can train with default Adam LR of 0.001. GRU is more sensitive. - lr = 0.001 if model_name == RES_CNN_NAME else 0.00005 + lr = 0.001 if model_name == RES_CNN_NAME else 0.00003 logger.info(f'Initial learning rate set to {lr}.') dsm.m.compile(optimizer=Adam(learning_rate=lr), loss='sparse_categorical_crossentropy', metrics=['accuracy']) pre_training_checkpoint = load_best_checkpoint(CHECKPOINTS_SOFTMAX_DIR) @@ -102,13 +104,19 @@ def start_training(working_dir, model_name, pre_training_phase=True): pre_training_checkpoint = load_best_checkpoint(CHECKPOINTS_SOFTMAX_DIR) if triplet_checkpoint is not None: logger.info(f'Loading triplet checkpoint: {triplet_checkpoint}.') + initial_epoch = int(triplet_checkpoint.split('/')[-1].split('.')[0].split('_')[-1]) dsm.m.load_weights(triplet_checkpoint) elif pre_training_checkpoint is not None: logger.info(f'Loading pre-training checkpoint: {pre_training_checkpoint}.') # If `by_name` is True, weights are loaded into layers only if they share the # same name. This is useful for fine-tuning or transfer-learning models where # some of the layers have changed. 
+ initial_epoch = 0 dsm.m.load_weights(pre_training_checkpoint, by_name=True) - dsm.m.compile(optimizer=SGD(), loss=deep_speaker_loss) + else: + initial_epoch = 0 + # dsm.m.compile(optimizer=SGD(learning_rate=0.05, momentum=0.99), loss=deep_speaker_loss) dsm.m.summary() - fit_model(dsm, working_dir, NUM_FRAMES) + dsm.m.compile(optimizer=SGD(), loss=deep_speaker_loss) + fit_model(dsm, working_dir, max_length=NUM_FRAMES, initial_epoch=initial_epoch) +
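
With this change, triplet training resumes from the epoch encoded in the checkpoint filename, and ReduceLROnPlateau halves the learning rate (factor 0.5, patience 10) when val_loss plateaus. A tiny sketch of the filename parsing used in start_training (the path is illustrative):

    triplet_checkpoint = 'checkpoints-triplets/ResCNN_checkpoint_175.h5'
    initial_epoch = int(triplet_checkpoint.split('/')[-1].split('.')[0].split('_')[-1])
    assert initial_epoch == 175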