Fusion #76

Open · wants to merge 10 commits into base: gru
72 changes: 27 additions & 45 deletions audio.py
@@ -14,23 +14,42 @@
logger = logging.getLogger(__name__)


def pad_mfcc(mfcc: np.ndarray, max_length: int):
    # Pad the MFCC with 0.0 frames up to max_length. With max_length = 160 (the
    # default), anything shorter than 1.6 s of speech requires padding.
    if len(mfcc) < max_length:
        mfcc = np.vstack((mfcc, np.tile(np.zeros(mfcc.shape[1]), (max_length - len(mfcc), 1))))
    return mfcc


def mfcc_fbank(signal: np.ndarray, sample_rate: int):  # 1D signal array.
    assert len(signal.shape) == 1
    # Returns normalized filter banks with shape (num_frames, n_filters). With the
    # delta features below enabled, the shape would be (num_frames, n_filters, 3).
    filter_banks, energies = fbank(signal, samplerate=sample_rate, nfilt=NUM_FBANKS)
    frames_features = normalize_mfcc_frames(filter_banks)
    # delta_1 = delta(filter_banks, N=1)
    # delta_2 = delta(delta_1, N=1)
    # frames_features = np.transpose(np.stack([filter_banks, delta_1, delta_2]), (1, 2, 0))
    return np.array(frames_features, dtype=np.float32)  # Float32 precision is enough here.


def normalize_mfcc_frames(m: np.ndarray, epsilon=1e-12):
    # Per-frame mean subtraction and variance normalization; epsilon avoids division by zero.
    return [(v - np.mean(v)) / max(np.std(v), epsilon) for v in m]


def read_mfcc(input_filename, sample_rate):
    audio = Audio.read(input_filename, sample_rate)
+    # TODO: could use trim_silence() here or a better VAD.
    energy = np.abs(audio)
    silence_threshold = np.percentile(energy, 95)
    offsets = np.where(energy > silence_threshold)[0]
    # left_blank_duration_ms = (1000.0 * offsets[0]) // self.sample_rate  # frame_id to duration (ms)
    # right_blank_duration_ms = (1000.0 * (len(audio) - offsets[-1])) // self.sample_rate
-    # TODO: could use trim_silence() here or a better VAD.
    audio_voice_only = audio[offsets[0]:offsets[-1]]
    mfcc = mfcc_fbank(audio_voice_only, sample_rate)
    return mfcc


-def extract_speaker_and_utterance_ids(filename: str):  # LIBRI.
+def extract_speaker_and_utterance_ids(libri_filename: str):  # LIBRI.
    # 'audio/dev-other/116/288045/116-288045-0000.flac'
-    speaker, _, basename = Path(filename).parts[-3:]
-    filename.split('-')
+    speaker, _, basename = Path(libri_filename).parts[-3:]
    utterance = os.path.splitext(basename.split('-', 1)[-1])[0]
    assert basename.split('-')[0] == speaker
    return speaker, utterance
@@ -54,32 +73,15 @@ def __init__(self, cache_dir: str, audio_dir: str = None, sample_rate: int = SAM
    def speaker_ids(self):
        return sorted(self.speakers_to_utterances)

-    @staticmethod
-    def trim_silence(audio, threshold):
-        """Removes silence at the beginning and end of a sample."""
-        energy = librosa.feature.rms(audio)
-        frames = np.nonzero(np.array(energy > threshold))
-        indices = librosa.core.frames_to_samples(frames)[1]
-
-        # Note: indices can be an empty array, if the whole audio was silence.
-        audio_trim = audio[0:0]
-        left_blank = audio[0:0]
-        right_blank = audio[0:0]
-        if indices.size:
-            audio_trim = audio[indices[0]:indices[-1]]
-            left_blank = audio[:indices[0]]  # slice before.
-            right_blank = audio[indices[-1]:]  # slice after.
-        return audio_trim, left_blank, right_blank

    @staticmethod
    def read(filename, sample_rate=SAMPLE_RATE):
        audio, sr = librosa.load(filename, sr=sample_rate, mono=True, dtype=np.float32)
        assert sr == sample_rate
        return audio

    def build_cache(self, audio_dir, sample_rate):
-        logger.info(f'audio_dir: {audio_dir}.')
-        logger.info(f'sample_rate: {sample_rate:,} hz.')
+        logger.info(f'Audio directory : {audio_dir}.')
+        logger.info(f'Sample rate : {sample_rate:,} hz.')
        audio_files = find_files(audio_dir, ext=self.ext)
        audio_files_count = len(audio_files)
        assert audio_files_count != 0, f'Could not find any {self.ext} files in {audio_dir}.'
@@ -98,23 +100,3 @@ def cache_audio_file(self, input_filename, sample_rate):
                np.save(cache_filename, mfcc)
            except librosa.util.exceptions.ParameterError as e:
                logger.error(e)


-def pad_mfcc(mfcc, max_length):  # num_frames, nfilt=64.
-    if len(mfcc) < max_length:
-        mfcc = np.vstack((mfcc, np.tile(np.zeros(mfcc.shape[1]), (max_length - len(mfcc), 1))))
-    return mfcc


-def mfcc_fbank(signal: np.array, sample_rate: int):  # 1D signal array.
-    # Returns MFCC with shape (num_frames, n_filters, 3).
-    filter_banks, energies = fbank(signal, samplerate=sample_rate, nfilt=NUM_FBANKS)
-    frames_features = normalize_frames(filter_banks)
-    # delta_1 = delta(filter_banks, N=1)
-    # delta_2 = delta(delta_1, N=1)
-    # frames_features = np.transpose(np.stack([filter_banks, delta_1, delta_2]), (1, 2, 0))
-    return np.array(frames_features, dtype=np.float32)  # Float32 precision is enough here.


-def normalize_frames(m, epsilon=1e-12):
-    return [(v - np.mean(v)) / max(np.std(v), epsilon) for v in m]
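For a sense of what the relocated helpers produce, here is a minimal sketch on synthetic audio. It assumes NUM_FBANKS = 64 and a 16 kHz sample rate from constants.py, and the default 25 ms window / 10 ms step of python_speech_features:

```python
import numpy as np

from audio import mfcc_fbank, pad_mfcc

# One second of synthetic mono audio at 16 kHz.
signal = np.random.uniform(-1, 1, 16000).astype(np.float32)
features = mfcc_fbank(signal, sample_rate=16000)
print(features.shape)  # About (99, 64): one frame per 10 ms, 64 filter banks.

# Fewer than 160 frames, so pad_mfcc zero-pads up to the fixed length.
padded = pad_mfcc(features, max_length=160)
print(padded.shape)  # (160, 64)
```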
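And a quick illustration of the LibriSpeech path convention that extract_speaker_and_utterance_ids relies on, using the path from its own comment:

```python
from audio import extract_speaker_and_utterance_ids

speaker, utterance = extract_speaker_and_utterance_ids('audio/dev-other/116/288045/116-288045-0000.flac')
print(speaker, utterance)  # 116 288045-0000
```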
45 changes: 29 additions & 16 deletions batcher.py
@@ -1,18 +1,18 @@
import json
import logging
import os
import random
from collections import deque, Counter
-from random import choice
from time import time

import dill
import numpy as np
from tqdm import tqdm

from audio import pad_mfcc, Audio
-from constants import NUM_FRAMES, NUM_FBANKS
+from constants import NUM_FRAMES, NUM_FBANKS, TRAIN_TEST_RATIO
from models import DeepSpeakerModel
-from utils import ensures_dir, load_pickle, load_npy, train_test_sp_to_utt
+from utils import ensures_dir, load_pickle, load_npy

logger = logging.getLogger(__name__)

@@ -21,18 +21,28 @@ def extract_speaker(utt_file):
    return utt_file.split('/')[-1].split('_')[0]


-def sample_from_mfcc(mfcc, max_length):
+def sample_from_mfcc(mfcc, max_length, seed=None):
    if mfcc.shape[0] >= max_length:
-        r = choice(range(0, len(mfcc) - max_length + 1))
+        random.seed(seed)
+        r = random.choice(range(0, len(mfcc) - max_length + 1))
        s = mfcc[r:r + max_length]
    else:
        s = pad_mfcc(mfcc, max_length)
    return np.expand_dims(s, axis=-1)


-def sample_from_mfcc_file(utterance_file, max_length):
+def sample_from_mfcc_file(utterance_file, max_length, seed=None):
    mfcc = np.load(utterance_file)
-    return sample_from_mfcc(mfcc, max_length)
+    return sample_from_mfcc(mfcc, max_length, seed)


def train_test_sp_to_utt(audio, is_test):
    sp_to_utt = {}
    for speaker_id, utterances in audio.speakers_to_utterances.items():
        utterances_files = sorted(utterances.values())
        train_test_sep = int(len(utterances_files) * TRAIN_TEST_RATIO)
        sp_to_utt[speaker_id] = utterances_files[train_test_sep:] if is_test else utterances_files[:train_test_sep]
    return sp_to_utt


class KerasFormatConverter:
@@ -147,8 +157,9 @@ def __init__(self, working_dir: str, max_length: int, model: DeepSpeakerModel):
        self.history_model_inputs = None

        self.batch_count = 0
-        for _ in tqdm(range(self.history_length), desc='Initializing the batcher'):  # init history.
-            self.update_triplets_history()
+        if self.model is not None:
+            for _ in tqdm(range(self.history_length), desc='Initializing the batcher'):  # init history.
+                self.update_triplets_history()

    def update_triplets_history(self):
        model_inputs = []
@@ -316,14 +327,17 @@ def get_batch_train(self, batch_size):

        return batch_x, batch_y

-    def get_speaker_verification_data(self, anchor_speaker, num_different_speakers):
-        speakers = list(self.audio.speakers_to_utterances.keys())
+    def get_speaker_verification_data(self, anchor_speaker, num_different_speakers, seed=123):
+        speakers = list(self.audio.speaker_ids)
        anchor_utterances = []
        positive_utterances = []
        negative_utterances = []
        np.random.seed(seed)
        negative_speakers = np.random.choice(list(set(speakers) - {anchor_speaker}), size=num_different_speakers)
        assert all(negative_speaker != anchor_speaker for negative_speaker in negative_speakers)
        np.random.seed(seed)
        pos_utterances = np.random.choice(self.sp_to_utt_test[anchor_speaker], 2, replace=False)
        np.random.seed(seed)
        neg_utterances = [np.random.choice(self.sp_to_utt_test[neg], 1, replace=True)[0] for neg in negative_speakers]
        anchor_utterances.append(pos_utterances[0])
        positive_utterances.append(pos_utterances[1])
@@ -336,13 +350,12 @@ def get_speaker_verification_data(self, anchor_speaker, num_different_speakers):
                              [extract_speaker(s) for s in anc_pos[1, :]]))

        batch_x = np.vstack([
-            [sample_from_mfcc_file(u, self.max_length) for u in anchor_utterances],
-            [sample_from_mfcc_file(u, self.max_length) for u in positive_utterances],
-            [sample_from_mfcc_file(u, self.max_length) for u in negative_utterances]
+            [sample_from_mfcc_file(u, self.max_length, seed) for u in anchor_utterances],
+            [sample_from_mfcc_file(u, self.max_length, seed) for u in positive_utterances],
+            [sample_from_mfcc_file(u, self.max_length, seed) for u in negative_utterances]
        ])

-        batch_y = np.zeros(shape=(len(batch_x), 1))  # dummy. sparse softmax needs something.
-        return batch_x, batch_y
+        return batch_x


class TripletBatcher:
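To make the semantics of the relocated train_test_sp_to_utt concrete, a small sketch with a stand-in for the Audio cache (TRAIN_TEST_RATIO = 0.8 is an assumed value from constants.py):

```python
from types import SimpleNamespace

from batcher import train_test_sp_to_utt

# Stand-in for the Audio cache: one speaker with ten cached utterance files.
audio = SimpleNamespace(speakers_to_utterances={
    '116': {f'utt-{i:02d}': f'cache/116_utt-{i:02d}.npy' for i in range(10)},
})

train = train_test_sp_to_utt(audio, is_test=False)
test = train_test_sp_to_utt(audio, is_test=True)
# Assuming a 0.8 ratio: the first 8 sorted files go to train, the last 2 to test.
print(len(train['116']), len(test['116']))  # 8 2
```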
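The seed that now threads through sample_from_mfcc is what makes the verification batches reproducible; a quick determinism check on a synthetic MFCC:

```python
import numpy as np

from batcher import sample_from_mfcc

mfcc = np.random.uniform(size=(300, 64))  # Longer than max_length, so a random crop is taken.

# Same seed, same crop offset, identical samples; this is the property
# get_speaker_verification_data now relies on.
a = sample_from_mfcc(mfcc, max_length=160, seed=123)
b = sample_from_mfcc(mfcc, max_length=160, seed=123)
assert np.array_equal(a, b)
print(a.shape)  # (160, 64, 1), with the trailing channel axis from np.expand_dims.
```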
21 changes: 11 additions & 10 deletions cli.py
@@ -3,6 +3,7 @@

import logging
import os
import sys

import click

@@ -17,12 +18,12 @@

logger = logging.getLogger(__name__)

-VERSION = '3.0b'
+VERSION = '4.0a'


@click.group()
def cli():
-    logging.basicConfig(format='%(asctime)12s - %(levelname)s - %(message)s', level=logging.INFO)
+    logging.basicConfig(format='%(asctime)12s - %(levelname)s - %(message)s', level=logging.INFO, stream=sys.stdout)
    init_pandas()


@@ -35,7 +36,7 @@ def version():
@click.option('--working_dir', required=True, type=Ct.output_dir())
@click.option('--audio_dir', default=None)
@click.option('--sample_rate', default=SAMPLE_RATE, show_default=True, type=int)
-def build_audio_cache(working_dir, audio_dir, sample_rate):
+def build_audio_cache(working_dir: str, audio_dir: str, sample_rate: int):
    ensures_dir(working_dir)
    if audio_dir is None:
        audio_dir = os.path.join(working_dir, 'LibriSpeech')
@@ -45,7 +46,7 @@ def build_audio_cache(working_dir, audio_dir, sample_rate):
@cli.command('build-keras-inputs', short_help='Build inputs to Keras.')
@click.option('--working_dir', required=True, type=Ct.input_dir())
@click.option('--counts_per_speaker', default='600,100', show_default=True, type=str) # train,test
-def build_keras_inputs(working_dir, counts_per_speaker):
+def build_keras_inputs(working_dir: str, counts_per_speaker: str):
    counts_per_speaker = [int(b) for b in counts_per_speaker.split(',')]
    kc = KerasFormatConverter(working_dir)
    kc.generate(max_length=NUM_FRAMES, counts_per_speaker=counts_per_speaker)
@@ -54,9 +55,9 @@ def build_keras_inputs(working_dir, counts_per_speaker):

@cli.command('test-model', short_help='Test a Keras model.')
@click.option('--working_dir', required=True, type=Ct.input_dir())
-@click.option('--model_name', required=True, type=click.Choice([RES_CNN_NAME, GRU_NAME]))
-@click.option('--checkpoint_file', required=True, type=Ct.input_file())
-def test_model(working_dir, model_name, checkpoint_file):
+@click.option('--model_name', multiple=True, type=click.Choice([RES_CNN_NAME, GRU_NAME]))
+@click.option('--checkpoint_file', multiple=True, required=True, type=Ct.input_file())
+def test_model(working_dir: str, model_name: tuple, checkpoint_file: tuple):
    # export CUDA_VISIBLE_DEVICES=0; python cli.py test-model
    # --working_dir /home/philippe/ds-test/triplet-training/
    # --checkpoint_file ../ds-test/checkpoints-softmax/ResCNN_checkpoint_102.h5
@@ -66,18 +67,18 @@ def test_model(working_dir, model_name, checkpoint_file):
    # --working_dir /home/philippe/ds-test/triplet-training/
    # --checkpoint_file ../ds-test/checkpoints-triplets/ResCNN_checkpoint_175.h5
    # f-measure = 0.849, true positive rate = 0.798, accuracy = 0.997, equal error rate = 0.025
    assert len(model_name) == len(checkpoint_file)
    test(working_dir, model_name, checkpoint_file)


@cli.command('train-model', short_help='Train a Keras model.')
@click.option('--working_dir', required=True, type=Ct.input_dir())
@click.option('--model_name', required=True, type=click.Choice([RES_CNN_NAME, GRU_NAME]))
@click.option('--pre_training_phase/--no_pre_training_phase', default=False, show_default=True)
-def train_model(working_dir, model_name, pre_training_phase):
+def train_model(working_dir: str, model_name: str, pre_training_phase: bool):
    # PRE TRAINING
    # LibriSpeech train-clean-data360 (600, 100). 0.991 on test set (enough for pre-training).

-    # TRIPLET TRAINING
+    # TRIPLET TRAINING with ResCNN.
    # [...]
    # Epoch 175/1000
    # 2000/2000 [==============================] - 919s 459ms/step - loss: 0.0077 - val_loss: 0.0058
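With multiple=True, click collects repeated options into tuples, so one run can now evaluate several fused models. A hypothetical invocation, in the same comment style the file itself uses (placeholder paths; 'ResCNN' and 'GRU' assume the values behind RES_CNN_NAME and GRU_NAME):

```python
# export CUDA_VISIBLE_DEVICES=0; python cli.py test-model \
#     --working_dir /path/to/triplet-training \
#     --model_name ResCNN --checkpoint_file /path/to/ResCNN_checkpoint_175.h5 \
#     --model_name GRU --checkpoint_file /path/to/GRU_checkpoint.h5
#
# Inside test_model, model_name == ('ResCNN', 'GRU') and checkpoint_file holds
# the matching files; the new assert enforces that the two tuples pair up.
```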
30 changes: 25 additions & 5 deletions eval_metrics.py
@@ -1,4 +1,29 @@
import numpy as np
from scipy.interpolate import interp1d
from scipy.optimize import brentq
from sklearn.metrics import roc_curve, f1_score, precision_score, accuracy_score


def evaluate2(y_pred, y_true):
    # TODO: still not perfect.
    # Three estimates of the equal error rate: from the FPR, from the FNR, and
    # from the point where the ROC curve crosses the diagonal (via brentq).
    fpr, tpr, threshold = roc_curve(y_true, y_pred, pos_label=1)
    fnr = 1 - tpr
    eer1 = fpr[np.nanargmin(np.absolute((fnr - fpr)))]
    eer2 = fnr[np.nanargmin(np.absolute((fnr - fpr)))]
    eer3 = brentq(lambda x: 1. - x - interp1d(fpr, tpr)(x), 0., 1.)

    # Pick the decision threshold that maximizes F1, then report the other metrics at it.
    thresholds = np.arange(-1, 1, 0.01)
    best_index = np.argmax([f1_score(y_true, y_pred > t) for t in thresholds])
    t = thresholds[best_index]
    f1 = f1_score(y_true, y_pred > t)
    precision = precision_score(y_true, y_pred > t)
    # roc = roc_auc_score(y_true, y_pred > t)
    acc = accuracy_score(y_true, y_pred > t)
    # recall = recall_score(y_true, y_pred > t)

    assert abs(eer1 - eer2) <= 1e-2
    assert abs(eer2 - eer3) <= 1e-2
    return f1, precision, acc, eer1


def evaluate(sims, labels):
@@ -10,18 +35,13 @@ def evaluate(sims, labels):


def calculate_roc(thresholds, sims, labels):
-    nrof_pairs = min(len(labels), len(sims))
    nrof_thresholds = len(thresholds)

    tprs = np.zeros((nrof_thresholds))
    fprs = np.zeros((nrof_thresholds))
    acc_train = np.zeros((nrof_thresholds))
    precisions = np.zeros((nrof_thresholds))
    fms = np.zeros((nrof_thresholds))
    accuracy = 0.0

-    indices = np.arange(nrof_pairs)
-
    # Find the best threshold for the fold

    for threshold_idx, threshold in enumerate(thresholds):
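The three EER estimates in evaluate2 should agree on any reasonably sized score set. A toy check with synthetic, well-separated similarity scores (assuming the sklearn/scipy dependencies imported above):

```python
import numpy as np

from eval_metrics import evaluate2

rng = np.random.RandomState(0)
# Synthetic scores: positive pairs centered above negative pairs.
y_pred = np.concatenate([rng.normal(0.7, 0.1, 500), rng.normal(0.3, 0.1, 500)])
y_true = np.concatenate([np.ones(500), np.zeros(500)])

f1, precision, acc, eer = evaluate2(y_pred, y_true)
print(f'f1={f1:.3f} precision={precision:.3f} acc={acc:.3f} eer={eer:.3f}')
```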
5 changes: 3 additions & 2 deletions example.py
@@ -8,8 +8,9 @@
from models import ResCNNModel
from test import batch_cosine_similarity

-np.random.seed(123)
-random.seed(123)
+seed = 123
+np.random.seed(seed)
+random.seed(seed)

model = ResCNNModel()
model.m.load_weights('/Users/premy/deep-speaker/checkpoints/ResCNN_triplet_training_checkpoint_175.h5', by_name=True)
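Continuing example.py from the model and seed defined above, a hedged sketch of the end-to-end comparison (the .wav paths are placeholders; read_mfcc, sample_from_mfcc and batch_cosine_similarity are as defined in this PR):

```python
import numpy as np

from audio import read_mfcc
from batcher import sample_from_mfcc
from constants import SAMPLE_RATE, NUM_FRAMES

# Embed two utterances (placeholder paths) and compare them.
mfcc_001 = sample_from_mfcc(read_mfcc('samples/speaker_a_001.wav', SAMPLE_RATE), NUM_FRAMES, seed=seed)
mfcc_002 = sample_from_mfcc(read_mfcc('samples/speaker_a_002.wav', SAMPLE_RATE), NUM_FRAMES, seed=seed)

embed_001 = model.m.predict(np.expand_dims(mfcc_001, axis=0))
embed_002 = model.m.predict(np.expand_dims(mfcc_002, axis=0))

# Cosine similarity near 1.0 suggests the same speaker.
print(batch_cosine_similarity(embed_001, embed_002))
```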