Fusion #76

Open · wants to merge 10 commits into base: gru
72 changes: 27 additions & 45 deletions audio.py
@@ -14,23 +14,42 @@
logger = logging.getLogger(__name__)


def pad_mfcc(mfcc: np.ndarray, max_length: int):
    # Pad the MFCC with 0.0 frames up to max_length. With max_length = 160 (the
    # default), anything shorter than 1.6 s of speech requires padding.
    if len(mfcc) < max_length:
        mfcc = np.vstack((mfcc, np.tile(np.zeros(mfcc.shape[1]), (max_length - len(mfcc), 1))))
    return mfcc


def mfcc_fbank(signal: np.ndarray, sample_rate: int):  # 1D signal array.
    assert len(signal.shape) == 1
    # Returns normalized filter banks with shape (num_frames, n_filters). With the
    # delta features below enabled, the shape would be (num_frames, n_filters, 3).
    filter_banks, energies = fbank(signal, samplerate=sample_rate, nfilt=NUM_FBANKS)
    frames_features = normalize_mfcc_frames(filter_banks)
    # delta_1 = delta(filter_banks, N=1)
    # delta_2 = delta(delta_1, N=1)
    # frames_features = np.transpose(np.stack([filter_banks, delta_1, delta_2]), (1, 2, 0))
    return np.array(frames_features, dtype=np.float32)  # Float32 precision is enough here.


def normalize_mfcc_frames(m: np.ndarray, epsilon=1e-12):
    # Per-frame mean subtraction and variance normalization; epsilon avoids division by zero.
    return [(v - np.mean(v)) / max(np.std(v), epsilon) for v in m]


def read_mfcc(input_filename, sample_rate):
    audio = Audio.read(input_filename, sample_rate)
+    # TODO: could use trim_silence() here or a better VAD.
    energy = np.abs(audio)
    silence_threshold = np.percentile(energy, 95)
    offsets = np.where(energy > silence_threshold)[0]
    # left_blank_duration_ms = (1000.0 * offsets[0]) // self.sample_rate  # frame_id to duration (ms)
    # right_blank_duration_ms = (1000.0 * (len(audio) - offsets[-1])) // self.sample_rate
-    # TODO: could use trim_silence() here or a better VAD.
    audio_voice_only = audio[offsets[0]:offsets[-1]]
    mfcc = mfcc_fbank(audio_voice_only, sample_rate)
    return mfcc


-def extract_speaker_and_utterance_ids(filename: str):  # LIBRI.
+def extract_speaker_and_utterance_ids(libri_filename: str):  # LIBRI.
    # 'audio/dev-other/116/288045/116-288045-0000.flac'
-    speaker, _, basename = Path(filename).parts[-3:]
-    filename.split('-')
+    speaker, _, basename = Path(libri_filename).parts[-3:]
    utterance = os.path.splitext(basename.split('-', 1)[-1])[0]
    assert basename.split('-')[0] == speaker
    return speaker, utterance
@@ -54,32 +73,15 @@ def __init__(self, cache_dir: str, audio_dir: str = None, sample_rate: int = SAM
    def speaker_ids(self):
        return sorted(self.speakers_to_utterances)

-    @staticmethod
-    def trim_silence(audio, threshold):
-        """Removes silence at the beginning and end of a sample."""
-        energy = librosa.feature.rms(audio)
-        frames = np.nonzero(np.array(energy > threshold))
-        indices = librosa.core.frames_to_samples(frames)[1]
-
-        # Note: indices can be an empty array, if the whole audio was silence.
-        audio_trim = audio[0:0]
-        left_blank = audio[0:0]
-        right_blank = audio[0:0]
-        if indices.size:
-            audio_trim = audio[indices[0]:indices[-1]]
-            left_blank = audio[:indices[0]]  # slice before.
-            right_blank = audio[indices[-1]:]  # slice after.
-        return audio_trim, left_blank, right_blank

    @staticmethod
    def read(filename, sample_rate=SAMPLE_RATE):
        audio, sr = librosa.load(filename, sr=sample_rate, mono=True, dtype=np.float32)
        assert sr == sample_rate
        return audio

    def build_cache(self, audio_dir, sample_rate):
-        logger.info(f'audio_dir: {audio_dir}.')
-        logger.info(f'sample_rate: {sample_rate:,} hz.')
+        logger.info(f'Audio directory : {audio_dir}.')
+        logger.info(f'Sample rate : {sample_rate:,} hz.')
        audio_files = find_files(audio_dir, ext=self.ext)
        audio_files_count = len(audio_files)
        assert audio_files_count != 0, f'Could not find any {self.ext} files in {audio_dir}.'
@@ -98,23 +100,3 @@ def cache_audio_file(self, input_filename, sample_rate):
                np.save(cache_filename, mfcc)
            except librosa.util.exceptions.ParameterError as e:
                logger.error(e)


-def pad_mfcc(mfcc, max_length):  # num_frames, nfilt=64.
-    if len(mfcc) < max_length:
-        mfcc = np.vstack((mfcc, np.tile(np.zeros(mfcc.shape[1]), (max_length - len(mfcc), 1))))
-    return mfcc


-def mfcc_fbank(signal: np.array, sample_rate: int):  # 1D signal array.
-    # Returns MFCC with shape (num_frames, n_filters, 3).
-    filter_banks, energies = fbank(signal, samplerate=sample_rate, nfilt=NUM_FBANKS)
-    frames_features = normalize_frames(filter_banks)
-    # delta_1 = delta(filter_banks, N=1)
-    # delta_2 = delta(delta_1, N=1)
-    # frames_features = np.transpose(np.stack([filter_banks, delta_1, delta_2]), (1, 2, 0))
-    return np.array(frames_features, dtype=np.float32)  # Float32 precision is enough here.


-def normalize_frames(m, epsilon=1e-12):
-    return [(v - np.mean(v)) / max(np.std(v), epsilon) for v in m]
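For a sense of what the relocated helpers produce, here is a minimal sketch on synthetic audio. It assumes NUM_FBANKS = 64 and a 16 kHz sample rate from constants.py, and the default 25 ms window / 10 ms step of python_speech_features:

```python
import numpy as np

from audio import mfcc_fbank, pad_mfcc

# One second of synthetic mono audio at 16 kHz.
signal = np.random.uniform(-1, 1, 16000).astype(np.float32)
features = mfcc_fbank(signal, sample_rate=16000)
print(features.shape)  # About (99, 64): one frame per 10 ms, 64 filter banks.

# Fewer than 160 frames, so pad_mfcc zero-pads up to the fixed length.
padded = pad_mfcc(features, max_length=160)
print(padded.shape)  # (160, 64)
```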
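And a quick illustration of the LibriSpeech path convention that extract_speaker_and_utterance_ids relies on, using the path from its own comment:

```python
from audio import extract_speaker_and_utterance_ids

speaker, utterance = extract_speaker_and_utterance_ids('audio/dev-other/116/288045/116-288045-0000.flac')
print(speaker, utterance)  # 116 288045-0000
```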
45 changes: 29 additions & 16 deletions batcher.py
@@ -1,18 +1,18 @@
import json
import logging
import os
import random
from collections import deque, Counter
-from random import choice
from time import time

import dill
import numpy as np
from tqdm import tqdm

from audio import pad_mfcc, Audio
-from constants import NUM_FRAMES, NUM_FBANKS
+from constants import NUM_FRAMES, NUM_FBANKS, TRAIN_TEST_RATIO
from models import DeepSpeakerModel
-from utils import ensures_dir, load_pickle, load_npy, train_test_sp_to_utt
+from utils import ensures_dir, load_pickle, load_npy

logger = logging.getLogger(__name__)

@@ -21,18 +21,28 @@ def extract_speaker(utt_file):
    return utt_file.split('/')[-1].split('_')[0]


-def sample_from_mfcc(mfcc, max_length):
+def sample_from_mfcc(mfcc, max_length, seed=None):
    if mfcc.shape[0] >= max_length:
-        r = choice(range(0, len(mfcc) - max_length + 1))
+        random.seed(seed)
+        r = random.choice(range(0, len(mfcc) - max_length + 1))
        s = mfcc[r:r + max_length]
    else:
        s = pad_mfcc(mfcc, max_length)
    return np.expand_dims(s, axis=-1)


-def sample_from_mfcc_file(utterance_file, max_length):
+def sample_from_mfcc_file(utterance_file, max_length, seed=None):
    mfcc = np.load(utterance_file)
-    return sample_from_mfcc(mfcc, max_length)
+    return sample_from_mfcc(mfcc, max_length, seed)


def train_test_sp_to_utt(audio, is_test):
    sp_to_utt = {}
    for speaker_id, utterances in audio.speakers_to_utterances.items():
        utterances_files = sorted(utterances.values())
        train_test_sep = int(len(utterances_files) * TRAIN_TEST_RATIO)
        sp_to_utt[speaker_id] = utterances_files[train_test_sep:] if is_test else utterances_files[:train_test_sep]
    return sp_to_utt


class KerasFormatConverter:
@@ -147,8 +157,9 @@ def __init__(self, working_dir: str, max_length: int, model: DeepSpeakerModel):
        self.history_model_inputs = None

        self.batch_count = 0
-        for _ in tqdm(range(self.history_length), desc='Initializing the batcher'):  # init history.
-            self.update_triplets_history()
+        if self.model is not None:
+            for _ in tqdm(range(self.history_length), desc='Initializing the batcher'):  # init history.
+                self.update_triplets_history()

    def update_triplets_history(self):
        model_inputs = []
@@ -316,14 +327,17 @@ def get_batch_train(self, batch_size):

        return batch_x, batch_y

-    def get_speaker_verification_data(self, anchor_speaker, num_different_speakers):
-        speakers = list(self.audio.speakers_to_utterances.keys())
+    def get_speaker_verification_data(self, anchor_speaker, num_different_speakers, seed=123):
+        speakers = list(self.audio.speaker_ids)
        anchor_utterances = []
        positive_utterances = []
        negative_utterances = []
        np.random.seed(seed)
        negative_speakers = np.random.choice(list(set(speakers) - {anchor_speaker}), size=num_different_speakers)
        assert all(negative_speaker != anchor_speaker for negative_speaker in negative_speakers)
        np.random.seed(seed)
        pos_utterances = np.random.choice(self.sp_to_utt_test[anchor_speaker], 2, replace=False)
        np.random.seed(seed)
        neg_utterances = [np.random.choice(self.sp_to_utt_test[neg], 1, replace=True)[0] for neg in negative_speakers]
        anchor_utterances.append(pos_utterances[0])
        positive_utterances.append(pos_utterances[1])
@@ -336,13 +350,12 @@ def get_speaker_verification_data(self, anchor_speaker, num_different_speakers):
                              [extract_speaker(s) for s in anc_pos[1, :]]))

        batch_x = np.vstack([
-            [sample_from_mfcc_file(u, self.max_length) for u in anchor_utterances],
-            [sample_from_mfcc_file(u, self.max_length) for u in positive_utterances],
-            [sample_from_mfcc_file(u, self.max_length) for u in negative_utterances]
+            [sample_from_mfcc_file(u, self.max_length, seed) for u in anchor_utterances],
+            [sample_from_mfcc_file(u, self.max_length, seed) for u in positive_utterances],
+            [sample_from_mfcc_file(u, self.max_length, seed) for u in negative_utterances]
        ])

-        batch_y = np.zeros(shape=(len(batch_x), 1))  # dummy. sparse softmax needs something.
-        return batch_x, batch_y
+        return batch_x


class TripletBatcher:
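To make the semantics of the relocated train_test_sp_to_utt concrete, a small sketch with a stand-in for the Audio cache (TRAIN_TEST_RATIO = 0.8 is an assumed value from constants.py):

```python
from types import SimpleNamespace

from batcher import train_test_sp_to_utt

# Stand-in for the Audio cache: one speaker with ten cached utterance files.
audio = SimpleNamespace(speakers_to_utterances={
    '116': {f'utt-{i:02d}': f'cache/116_utt-{i:02d}.npy' for i in range(10)},
})

train = train_test_sp_to_utt(audio, is_test=False)
test = train_test_sp_to_utt(audio, is_test=True)
# Assuming a 0.8 ratio: the first 8 sorted files go to train, the last 2 to test.
print(len(train['116']), len(test['116']))  # 8 2
```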
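The seed that now threads through sample_from_mfcc is what makes the verification batches reproducible; a quick determinism check on a synthetic MFCC:

```python
import numpy as np

from batcher import sample_from_mfcc

mfcc = np.random.uniform(size=(300, 64))  # Longer than max_length, so a random crop is taken.

# Same seed, same crop offset, identical samples; this is the property
# get_speaker_verification_data now relies on.
a = sample_from_mfcc(mfcc, max_length=160, seed=123)
b = sample_from_mfcc(mfcc, max_length=160, seed=123)
assert np.array_equal(a, b)
print(a.shape)  # (160, 64, 1), with the trailing channel axis from np.expand_dims.
```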
21 changes: 11 additions & 10 deletions cli.py
@@ -3,6 +3,7 @@

import logging
import os
import sys

import click

@@ -17,12 +18,12 @@

logger = logging.getLogger(__name__)

-VERSION = '3.0b'
+VERSION = '4.0a'


@click.group()
def cli():
-    logging.basicConfig(format='%(asctime)12s - %(levelname)s - %(message)s', level=logging.INFO)
+    logging.basicConfig(format='%(asctime)12s - %(levelname)s - %(message)s', level=logging.INFO, stream=sys.stdout)
    init_pandas()


@@ -35,7 +36,7 @@ def version():
@click.option('--working_dir', required=True, type=Ct.output_dir())
@click.option('--audio_dir', default=None)
@click.option('--sample_rate', default=SAMPLE_RATE, show_default=True, type=int)
-def build_audio_cache(working_dir, audio_dir, sample_rate):
+def build_audio_cache(working_dir: str, audio_dir: str, sample_rate: int):
    ensures_dir(working_dir)
    if audio_dir is None:
        audio_dir = os.path.join(working_dir, 'LibriSpeech')
@@ -45,7 +46,7 @@ def build_audio_cache(working_dir, audio_dir, sample_rate):
@cli.command('build-keras-inputs', short_help='Build inputs to Keras.')
@click.option('--working_dir', required=True, type=Ct.input_dir())
@click.option('--counts_per_speaker', default='600,100', show_default=True, type=str) # train,test
-def build_keras_inputs(working_dir, counts_per_speaker):
+def build_keras_inputs(working_dir: str, counts_per_speaker: str):
    counts_per_speaker = [int(b) for b in counts_per_speaker.split(',')]
    kc = KerasFormatConverter(working_dir)
    kc.generate(max_length=NUM_FRAMES, counts_per_speaker=counts_per_speaker)
@@ -54,9 +55,9 @@ def build_keras_inputs(working_dir, counts_per_speaker):

@cli.command('test-model', short_help='Test a Keras model.')
@click.option('--working_dir', required=True, type=Ct.input_dir())
-@click.option('--model_name', required=True, type=click.Choice([RES_CNN_NAME, GRU_NAME]))
-@click.option('--checkpoint_file', required=True, type=Ct.input_file())
-def test_model(working_dir, model_name, checkpoint_file):
+@click.option('--model_name', multiple=True, type=click.Choice([RES_CNN_NAME, GRU_NAME]))
+@click.option('--checkpoint_file', multiple=True, required=True, type=Ct.input_file())
+def test_model(working_dir: str, model_name: tuple, checkpoint_file: tuple):
    # export CUDA_VISIBLE_DEVICES=0; python cli.py test-model
    # --working_dir /home/philippe/ds-test/triplet-training/
    # --checkpoint_file ../ds-test/checkpoints-softmax/ResCNN_checkpoint_102.h5
@@ -66,18 +67,18 @@ def test_model(working_dir, model_name, checkpoint_file):
    # --working_dir /home/philippe/ds-test/triplet-training/
    # --checkpoint_file ../ds-test/checkpoints-triplets/ResCNN_checkpoint_175.h5
    # f-measure = 0.849, true positive rate = 0.798, accuracy = 0.997, equal error rate = 0.025
    assert len(model_name) == len(checkpoint_file)
    test(working_dir, model_name, checkpoint_file)


@cli.command('train-model', short_help='Train a Keras model.')
@click.option('--working_dir', required=True, type=Ct.input_dir())
@click.option('--model_name', required=True, type=click.Choice([RES_CNN_NAME, GRU_NAME]))
@click.option('--pre_training_phase/--no_pre_training_phase', default=False, show_default=True)
-def train_model(working_dir, model_name, pre_training_phase):
+def train_model(working_dir: str, model_name: str, pre_training_phase: bool):
    # PRE TRAINING
    # LibriSpeech train-clean-data360 (600, 100). 0.991 on test set (enough for pre-training).

-    # TRIPLET TRAINING
+    # TRIPLET TRAINING with ResCNN.
    # [...]
    # Epoch 175/1000
    # 2000/2000 [==============================] - 919s 459ms/step - loss: 0.0077 - val_loss: 0.0058
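With multiple=True, click collects repeated options into tuples, so one run can now evaluate several fused models. A hypothetical invocation, in the same comment style the file itself uses (placeholder paths; 'ResCNN' and 'GRU' assume the values behind RES_CNN_NAME and GRU_NAME):

```python
# export CUDA_VISIBLE_DEVICES=0; python cli.py test-model \
#     --working_dir /path/to/triplet-training \
#     --model_name ResCNN --checkpoint_file /path/to/ResCNN_checkpoint_175.h5 \
#     --model_name GRU --checkpoint_file /path/to/GRU_checkpoint.h5
#
# Inside test_model, model_name == ('ResCNN', 'GRU') and checkpoint_file holds
# the matching files; the new assert enforces that the two tuples pair up.
```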
30 changes: 25 additions & 5 deletions eval_metrics.py
@@ -1,4 +1,29 @@
import numpy as np
from scipy.interpolate import interp1d
from scipy.optimize import brentq
from sklearn.metrics import roc_curve, f1_score, precision_score, accuracy_score


def evaluate2(y_pred, y_true):
    # TODO: still not perfect.
    # Three estimates of the equal error rate: from the FPR, from the FNR, and
    # from the point where the ROC curve crosses the diagonal (via brentq).
    fpr, tpr, threshold = roc_curve(y_true, y_pred, pos_label=1)
    fnr = 1 - tpr
    eer1 = fpr[np.nanargmin(np.absolute((fnr - fpr)))]
    eer2 = fnr[np.nanargmin(np.absolute((fnr - fpr)))]
    eer3 = brentq(lambda x: 1. - x - interp1d(fpr, tpr)(x), 0., 1.)

    # Pick the decision threshold that maximizes F1, then report the other metrics at it.
    thresholds = np.arange(-1, 1, 0.01)
    best_index = np.argmax([f1_score(y_true, y_pred > t) for t in thresholds])
    t = thresholds[best_index]
    f1 = f1_score(y_true, y_pred > t)
    precision = precision_score(y_true, y_pred > t)
    # roc = roc_auc_score(y_true, y_pred > t)
    acc = accuracy_score(y_true, y_pred > t)
    # recall = recall_score(y_true, y_pred > t)

    assert abs(eer1 - eer2) <= 1e-2
    assert abs(eer2 - eer3) <= 1e-2
    return f1, precision, acc, eer1


def evaluate(sims, labels):
@@ -10,18 +35,13 @@ def evaluate(sims, labels):


def calculate_roc(thresholds, sims, labels):
-    nrof_pairs = min(len(labels), len(sims))
    nrof_thresholds = len(thresholds)

    tprs = np.zeros((nrof_thresholds))
    fprs = np.zeros((nrof_thresholds))
    acc_train = np.zeros((nrof_thresholds))
    precisions = np.zeros((nrof_thresholds))
    fms = np.zeros((nrof_thresholds))
    accuracy = 0.0

-    indices = np.arange(nrof_pairs)
-
    # Find the best threshold for the fold

    for threshold_idx, threshold in enumerate(thresholds):
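The three EER estimates in evaluate2 should agree on any reasonably sized score set. A toy check with synthetic, well-separated similarity scores (assuming the sklearn/scipy dependencies imported above):

```python
import numpy as np

from eval_metrics import evaluate2

rng = np.random.RandomState(0)
# Synthetic scores: positive pairs centered above negative pairs.
y_pred = np.concatenate([rng.normal(0.7, 0.1, 500), rng.normal(0.3, 0.1, 500)])
y_true = np.concatenate([np.ones(500), np.zeros(500)])

f1, precision, acc, eer = evaluate2(y_pred, y_true)
print(f'f1={f1:.3f} precision={precision:.3f} acc={acc:.3f} eer={eer:.3f}')
```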
5 changes: 3 additions & 2 deletions example.py
@@ -8,8 +8,9 @@
from models import ResCNNModel
from test import batch_cosine_similarity

-np.random.seed(123)
-random.seed(123)
+seed = 123
+np.random.seed(seed)
+random.seed(seed)

model = ResCNNModel()
model.m.load_weights('/Users/premy/deep-speaker/checkpoints/ResCNN_triplet_training_checkpoint_175.h5', by_name=True)
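Continuing example.py from the model and seed defined above, a hedged sketch of the end-to-end comparison (the .wav paths are placeholders; read_mfcc, sample_from_mfcc and batch_cosine_similarity are as defined in this PR):

```python
import numpy as np

from audio import read_mfcc
from batcher import sample_from_mfcc
from constants import SAMPLE_RATE, NUM_FRAMES

# Embed two utterances (placeholder paths) and compare them.
mfcc_001 = sample_from_mfcc(read_mfcc('samples/speaker_a_001.wav', SAMPLE_RATE), NUM_FRAMES, seed=seed)
mfcc_002 = sample_from_mfcc(read_mfcc('samples/speaker_a_002.wav', SAMPLE_RATE), NUM_FRAMES, seed=seed)

embed_001 = model.m.predict(np.expand_dims(mfcc_001, axis=0))
embed_002 = model.m.predict(np.expand_dims(mfcc_002, axis=0))

# Cosine similarity near 1.0 suggests the same speaker.
print(batch_cosine_similarity(embed_001, embed_002))
```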