# Test Speaker Recognition with SpeechBrain

In [None]:
import json
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import os
import random
import speechbrain as sb
import torch
import torchaudio
from moviepy.editor import VideoFileClip
from hyperpyyaml.core import load_hyperpyyaml
from IPython.display import Audio, display
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC
from speechbrain.dataio.dataset import DynamicItemDataset
from speechbrain.dataio.encoder import CategoricalEncoder
from speechbrain.dataio.dataloader import SaveableDataLoader
from speechbrain.lobes.models.ECAPA_TDNN import ECAPA_TDNN
from speechbrain.pretrained import EncoderClassifier, VAD


In [None]:
# Root directory of the project data; every other path below is built from it
# (coding CSV, hyper-parameter YAML, and the sub-directories that follow).
SAVE_DIR = "speaker_id_debate_21"
# Sub-directory of SAVE_DIR holding the `batch<N>` folders with the .mp4 shots.
VIDEO_DIR = "video_camera_shots"
# Sub-directory of SAVE_DIR that receives the extracted .wav files.
AUDIO_DIR = "audio_files"
# Sub-directory of SAVE_DIR holding the training/validation/test splits and
# their JSON annotation files.
DATA_DIR = "datasets"


In [None]:
candidates = ["Rutte", "Wilders", "Hoekstra", "Marijnissen", "Klaver", "Kaag"]

coding_df = pd.read_csv(f"{SAVE_DIR}/camera_shots_coding.csv", sep=";")

random.seed(123484)

# Draw 20 shots per candidate. pandas does not sample with the stdlib `random`
# module, so seeding `random` alone leaves the draw different on every run;
# the seed has to be passed through `random_state` to make it reproducible.
shots_selected = (
    coding_df[coding_df.face_1.isin(candidates)]
    .groupby("face_1")
    .sample(n=20, random_state=123484)
    .reset_index()
)

shots_selected


In [None]:
def convert_video_to_audio(shots_df, path, out_dir, force=False, replace=False):
    """ Extracts the audio track of every shot into a 16 kHz .wav file.

    Arguments
    ---------
    shots_df : pandas.DataFrame
        One row per shot; the columns `shot_id`, `filename` and `face_1`
        are read.
    path : str
        Directory that contains the `batch<N>` folders with the .mp4 files.
    out_dir : str
        Directory that receives the .wav files.
    force : bool
        Re-extract the audio even when the output file already exists.
    replace : bool
        Delete all .wav files in `out_dir` before extracting.
    """
    if replace and os.path.isdir(out_dir):
        # Delete from Python rather than shelling out to `del`, which only
        # exists on Windows and breaks on paths that contain spaces.
        with os.scandir(out_dir) as entries:
            stale = [entry.path for entry in entries
                     if entry.is_file() and entry.name.lower().endswith(".wav")]
        for stale_path in stale:
            os.remove(stale_path)

    for _, shot in shots_df.iterrows():
        # Shots are stored 50 per folder: ids 1-50 in batch1, 51-100 in batch2, ...
        batch = int(np.ceil(shot.shot_id / 50))
        infile_name = os.path.join(
            path, f"batch{batch}", f"{shot.filename}.mp4")
        outfile_name = os.path.join(
            out_dir, f"batch_{batch}_shot_{int(shot.shot_id)}_{shot.face_1}.wav")

        if force or not os.path.isfile(outfile_name):
            with VideoFileClip(infile_name) as clip:
                clip.audio.write_audiofile(outfile_name, fps=16000)


In [None]:
# Extract the audio of all selected shots, starting from an empty audio folder.
# The paths are joined with os.path.join so that the separator is not
# hard-coded to the Windows backslash.
convert_video_to_audio(
    shots_selected,
    os.path.join(SAVE_DIR, VIDEO_DIR),
    os.path.join(SAVE_DIR, AUDIO_DIR),
    replace=True,
)


In [None]:
def split_train_valid_test_set(path, frac=0.1):
    """ Splits the files in a directory into training, validation and test sets.

    Files are grouped by the last "_"-separated part of their name (the
    speaker, e.g. "Rutte.wav"). From every group, `int(len(group) * frac)`
    files go to the validation set and the same number to the test set; the
    remaining files form the training set. Uses the global `random` state, so
    seed it beforehand for a reproducible split.

    Arguments
    ---------
    path : str
        Directory that contains the audio files.
    frac : float
        Fraction of each speaker's files used for validation and, separately,
        for testing.

    Returns
    -------
    train_set, valid_set, test_set : list of str
        File names (without directory) of the three disjoint sets.

    Raises
    ------
    ValueError
        If `frac` asks for more held-out files than a group contains.
    """
    with os.scandir(path) as sc:
        # os.scandir yields entries in arbitrary order; sorting makes the
        # split depend on the random seed only.
        filenames = sorted(entry.name for entry in sc if entry.is_file())

    files_by_speaker = {}
    for filename in filenames:
        files_by_speaker.setdefault(filename.split("_")[-1], []).append(filename)

    train_set = []
    valid_set = []
    test_set = []

    for speaker in sorted(files_by_speaker):
        files = files_by_speaker[speaker]
        n_held_out = int(len(files) * frac)

        # Draw all held-out indices at once, without replacement. Popping
        # sampled indices one by one from a shrinking list would shift the
        # remaining elements and can index past the end of the list.
        picked = random.sample(range(len(files)), k=2 * n_held_out)
        valid_idx = picked[:n_held_out]
        test_idx = picked[n_held_out:]
        held_out = set(picked)

        valid_set.extend(files[i] for i in valid_idx)
        test_set.extend(files[i] for i in test_idx)
        train_set.extend(
            name for i, name in enumerate(files) if i not in held_out)

    return train_set, valid_set, test_set


In [None]:
# Fix the seed of the stdlib generator that drives the split.
random.seed(12314)

train_set, valid_set, test_set = split_train_valid_test_set(
    path=f"{SAVE_DIR}/{AUDIO_DIR}"
)

# Report the size of each set, then the file names, in test/valid/train order.
print(*(len(subset) for subset in (test_set, valid_set, train_set)))
print(test_set, valid_set, train_set, sep=" \n\n ")


In [None]:
def move_train_valid_test_files(train_set, valid_set, test_set, path, out_dir, replace=True):
    """ Copies the files of each set into its own sub-directory of `out_dir`.

    Despite the name, the source files are copied, not moved. The target
    sub-directories are "training", "validation" and "test"; they are created
    when missing.

    Arguments
    ---------
    train_set, valid_set, test_set : list of str
        File names (relative to `path`) belonging to each set.
    path : str
        Directory that contains the source files.
    out_dir : str
        Directory that contains (or receives) the three sub-directories.
    replace : bool
        Delete the .wav files already present in the target sub-directories
        before copying.

    Raises
    ------
    OSError
        If a file cannot be deleted or copied (e.g. a listed file is missing).
    """
    # Local import: shutil is not among the notebook's top-level imports.
    import shutil

    targets = {
        "training": train_set,
        "validation": valid_set,
        "test": test_set,
    }

    # File operations are done in Python instead of through `os.system`
    # with `del`/`copy`: those commands exist only on Windows, break on paths
    # with spaces, and their failures went unnoticed.
    for subdir, filenames in targets.items():
        target_dir = os.path.join(out_dir, subdir)
        os.makedirs(target_dir, exist_ok=True)

        if replace:
            with os.scandir(target_dir) as entries:
                stale = [entry.path for entry in entries
                         if entry.is_file() and entry.name.lower().endswith(".wav")]
            for stale_path in stale:
                os.remove(stale_path)

        for filename in filenames:
            shutil.copy(os.path.join(path, filename), target_dir)


In [None]:
# Copy each set into its own folder below the dataset directory. The paths are
# joined with os.path.join so that the separator is not hard-coded to the
# Windows backslash.
move_train_valid_test_files(
    train_set,
    valid_set,
    test_set,
    os.path.join(SAVE_DIR, AUDIO_DIR),
    os.path.join(SAVE_DIR, DATA_DIR),
)


In [None]:
def create_data_annotation_file(data_dir, out_file, force=True):
    """ Creates a data annotation file in .json format.

    The file maps a running index ("0", "1", ...) to an entry with three
    fields:
        "file_path": Path to the sound file
        "spk_id": Name of the speaker (last "_"-separated part of the file
                  name, without extension)
        "length": Length of the sound signal (frames)

    Arguments
    ---------
    data_dir : str
        Directory that contains the sound files.
    out_file : str
        Path of the .json file to write.
    force : bool
        Overwrite `out_file` when it already exists. With `force=False` an
        existing file is left untouched.
    """
    if os.path.isfile(out_file) and not force:
        return

    with os.scandir(data_dir) as sc:
        # Sorted so that the same directory always yields the same ids.
        filenames = sorted(entry.name for entry in sc if entry.is_file())

    annotation_dict = {}

    for i, filename in enumerate(filenames):
        # os.path.join works with or without a trailing separator on
        # `data_dir`; plain string concatenation silently required one.
        file_path = os.path.join(data_dir, filename)
        signal, _ = torchaudio.load(file_path)
        annotation_dict[str(i)] = {
            "file_path": file_path,
            "spk_id": filename.split("_")[-1].split(".")[0],
            # Second dimension of the loaded signal is used as its length.
            "length": signal.shape[1],
        }

    with open(out_file, "w") as file:
        file.write(json.dumps(annotation_dict))

    print(f"Created data annotation file at {out_file}")


In [None]:
def create_data_annotation_files(save_dir, data_dir_names, force=True):
    """ Writes one annotation file per sub-directory of `save_dir`.

    For every name in `data_dir_names`, the files in `<save_dir>/<name>/` are
    annotated and the result is stored as `<save_dir>/<name>.json`.
    """
    for name in data_dir_names:
        source_dir = save_dir + "/" + name + "/"
        target_file = f"{save_dir}/{name}.json"
        create_data_annotation_file(source_dir, target_file, force=force)


In [None]:
# Write one JSON annotation file for each of the three data splits.
create_data_annotation_files(
    save_dir=f"{SAVE_DIR}/{DATA_DIR}",
    data_dir_names=["training", "validation", "test"],
)


In [None]:
def detect_voice_activation(signal, filename, vad):
    """
    Extracts voice activation (speech) segments from an audio signal using a neural VAD model:
    Computes posterior probability for speech segments from neural VAD model.
    Applies a threshold on the posterior probability to get candidate segments.
    Extracts speech segments using energy-based VAD.
    Merges segments that are close to each other.
    Removes short segments.
    Double checks the energy-based VAD using the neural VAD model.

    Arguments
    ---------
    signal : torch.Tensor
        Audio signal handed to `vad.get_speech_prob_chunk`.
    filename : str
        Path of the audio file the signal was loaded from; the VAD methods
        that work on the file itself receive it.
    vad : speechbrain.pretrained.VAD
        Pretrained voice activity detection model.

    Returns
    -------
    The result of `vad.upsample_boundaries` for the refined boundaries; the
    caller converts it to a boolean mask over the signal.
    """
    prob_chunks = vad.get_speech_prob_chunk(signal)
    # Collapse the first dimension to a single row of probabilities
    # (presumably the audio channels - confirm against the VAD output shape).
    prob_chunks_avg = prob_chunks.mean(dim=0, keepdim=True)
    prob_th = vad.apply_threshold(
        prob_chunks_avg, activation_th=0.5, deactivation_th=0.25).float()
    boundaries = vad.get_boundaries(prob_th)
    boundaries_energy = vad.energy_VAD(
        filename, boundaries, activation_th=0.8, deactivation_th=0.0)
    boundaries_merged = vad.merge_close_segments(
        boundaries_energy, close_th=0.250)
    boundaries_short_removed = vad.remove_short_segments(
        boundaries_merged, len_th=0.250)
    boundaries_checked = vad.double_check_speech_segments(
        boundaries_short_removed, filename, speech_th=0.5)

    # Upsample the fully refined boundaries. Upsampling the raw neural
    # boundaries instead would throw away every post-processing step above.
    return vad.upsample_boundaries(boundaries_checked, filename)


In [None]:
def prepare_dataio(filename, save_dir, training=False):
    """ Builds the dataset for the Brain class from a JSON annotation file.

    Each item provides the speech-only signal ("sig") and the encoded speaker
    label ("spk_id_encoded"). With `training=True` the speaker encoding is
    fitted on the dataset and saved to `<save_dir>/spk_id_encoder.txt`;
    otherwise the saved encoding is loaded from that file.
    """
    vad = VAD.from_hparams(source="speechbrain/vad-crdnn-libriparty",
                           savedir="pretrained_models/vad-crdnn-libriparty")
    spk_id_encoder = CategoricalEncoder()
    encoder_file = f"{save_dir}/spk_id_encoder.txt"

    @sb.utils.data_pipeline.takes("file_path")
    @sb.utils.data_pipeline.provides("sig")
    def audio_pipeline(file_path):
        # Load the waveform and keep only the samples marked as speech.
        waveform, _ = torchaudio.load(file_path)
        speech_mask = detect_voice_activation(waveform, file_path, vad).bool()
        return torch.masked_select(waveform, speech_mask)

    @sb.utils.data_pipeline.takes("spk_id")
    @sb.utils.data_pipeline.provides("spk_id", "spk_id_encoded")
    def label_pipeline(spk_id):
        # First provided key: the raw speaker name.
        yield spk_id
        # Second provided key: its integer code as a one-element tensor.
        label_index = spk_id_encoder.encode_label(spk_id)
        yield torch.LongTensor([label_index])

    dataset = sb.dataio.dataset.DynamicItemDataset.from_json(
        json_path=filename,
        dynamic_items=[audio_pipeline, label_pipeline],
        output_keys=["id", "sig", "spk_id_encoded"],
    )

    if not training:
        spk_id_encoder.load(encoder_file)
        return dataset

    spk_id_encoder.update_from_didataset(dataset, output_key="spk_id")
    spk_id_encoder.save(encoder_file)
    return dataset


In [None]:
def prepare_datasets(save_dir, data_dirs):
    """ Builds one dataset per entry of `data_dirs`.

    Every dataset is read from `<save_dir>/<name>.json`; only the entry named
    "training" is prepared in training mode. Returns a dict that maps each
    name to its dataset.
    """
    return {
        name: prepare_dataio(
            f"{save_dir}/{name}.json", save_dir, name == "training")
        for name in data_dirs
    }


In [None]:
# Build the training, validation and test datasets from their annotation files.
datasets = prepare_datasets(
    save_dir=f"{SAVE_DIR}/{DATA_DIR}",
    data_dirs=["training", "validation", "test"],
)


In [None]:
# Pretrained speaker-embedding model; the downloaded files are kept in `savedir`.
ecapa = EncoderClassifier.from_hparams(
    savedir="./pretrained_models/ecapa",
    source="speechbrain/spkrec-ecapa-voxceleb",
)


In [None]:
# Classifier on top of the embeddings: standardise the features, then an SVM.
ml_pipeline = make_pipeline(
    StandardScaler(),
    SVC(gamma="auto"),
)


In [None]:
# Embed every training clip and collect its encoded speaker label. The dataset
# is walked once: each access loads the file and runs the VAD, so building the
# two lists in separate passes would do that work twice per clip.
embeddings = []
speakers = []
for batch in datasets["training"]:
    embeddings.append(ecapa.encode_batch(batch["sig"]).squeeze())
    speakers.append(batch["spk_id_encoded"])


In [None]:
# Feature matrix (one row per clip) and integer speaker labels for training.
X_train = pd.DataFrame(np.array([e.numpy() for e in embeddings]))
y_train = [int(s) for s in speakers]

ml_pipeline.fit(X_train, y_train)
# Accuracy on the training data itself.
ml_pipeline.score(X_train, y_train)


In [None]:
# Embed every validation clip and collect its encoded speaker label in a single
# pass, so that each file is loaded and VAD-filtered only once.
embeddings_valid = []
speakers_valid = []
for batch in datasets["validation"]:
    embeddings_valid.append(ecapa.encode_batch(batch["sig"]).squeeze())
    speakers_valid.append(batch["spk_id_encoded"])


In [None]:
# Predictions and accuracy of the fitted classifier on the validation set.
X_valid = pd.DataFrame(np.array([e.numpy() for e in embeddings_valid]))
y_valid = [int(s) for s in speakers_valid]

preds = ml_pipeline.predict(X_valid)
acc = ml_pipeline.score(X_valid, y_valid)
print(y_valid, preds, acc)


In [None]:
# Embed every test clip and collect its encoded speaker label in a single pass,
# so that each file is loaded and VAD-filtered only once.
embeddings_test = []
speakers_test = []
for batch in datasets["test"]:
    embeddings_test.append(ecapa.encode_batch(batch["sig"]).squeeze())
    speakers_test.append(batch["spk_id_encoded"])


In [None]:
# Predictions and accuracy of the fitted classifier on the test set.
X_test = pd.DataFrame(np.array([e.numpy() for e in embeddings_test]))
y_test = [int(s) for s in speakers_test]

preds = ml_pipeline.predict(X_test)
acc = ml_pipeline.score(X_test, y_test)
print(y_test, preds, acc)


In [None]:
class SpkIdBrain(sb.Brain):
    """ New speaker recognition class that inherits from Brain base class.
        Requires at least compute_forward() and compute_objectives() methods.
    """

    def compute_forward(self, batch, stage):
        """Runs all the computation that transforms the input into the
        output probabilities over the N classes.
        Arguments
        ---------
        batch : PaddedBatch
            This batch object contains all the relevant tensors for computation.
        stage : sb.Stage
            One of sb.Stage.TRAIN, sb.Stage.VALID, or sb.Stage.TEST.
        Returns
        -------
        predictions : Tensor
            Tensor that contains the posterior probabilities over the N classes.
        """

        # We first move the batch to the appropriate device.
        batch = batch.to(self.device)

        # Compute features, embeddings, and predictions
        feats, lens = self.prepare_features(batch.sig, stage)
        embeddings = self.modules.embedding_model(feats, lens)
        predictions = self.modules.classifier(embeddings)

        return predictions

    def prepare_features(self, wavs, stage):
        """ Prepare the features for computation, including augmentation.
        Arguments
        ---------
        wavs : tuple
            Input signals (tensor) and their relative lengths (tensor).
        stage : sb.Stage
            The current stage of training.
        Returns
        -------
        feats : Tensor
            Normalized features of the (possibly augmented) signals.
        lens : Tensor
            Relative lengths matching `feats`.
        """
        wavs, lens = wavs

        # When an `env_corrupt` module is configured, append a corrupted copy
        # of the batch during training. compute_objectives() duplicates the
        # labels under exactly this condition, so the augmentation has to
        # happen here for predictions and labels to have the same batch size.
        if stage == sb.Stage.TRAIN and hasattr(self.modules, "env_corrupt"):
            wavs_noise = self.modules.env_corrupt(wavs, lens)
            wavs = torch.cat([wavs, wavs_noise], dim=0)
            lens = torch.cat([lens, lens])

        # Feature extraction and normalization
        feats = self.modules.compute_features(wavs)
        feats = self.modules.mean_var_norm(feats, lens)

        return feats, lens

    def compute_objectives(self, predictions, batch, stage):
        """ Computes the loss given the predicted and targeted outputs.
        Arguments
        ---------
        predictions : tensor
            The output tensor from `compute_forward`.
        batch : PaddedBatch
            This batch object contains all the relevant tensors for computation.
        stage : sb.Stage
            One of sb.Stage.TRAIN, sb.Stage.VALID, or sb.Stage.TEST.
        Returns
        -------
        loss : torch.Tensor
            A one-element tensor used for backpropagating the gradient.
        """

        _, lens = batch.sig
        spkid, _ = batch.spk_id_encoded

        # Concatenate labels (due to data augmentation): prepare_features()
        # doubles the batch under the same condition.
        if stage == sb.Stage.TRAIN and hasattr(self.modules, "env_corrupt"):
            spkid = torch.cat([spkid, spkid], dim=0)
            lens = torch.cat([lens, lens])

        # Compute the cost function
        loss = self.hparams.compute_cost(predictions, spkid, lens)

        # Append this batch of losses to the loss metric for easy summarization
        self.loss_metric.append(
            batch.id, predictions, spkid, lens, reduction="batch"
        )

        # Compute classification error during validation and test
        if stage != sb.Stage.TRAIN:
            self.error_metrics.append(batch.id, predictions, spkid, lens)

        return loss

    def on_stage_start(self, stage, epoch=None):
        """ Gets called at the beginning of each epoch.
        Arguments
        ---------
        stage : sb.Stage
            One of sb.Stage.TRAIN, sb.Stage.VALID, or sb.Stage.TEST.
        epoch : int
            The currently-starting epoch. This is passed
            `None` during the test stage.
        """

        # Set up statistics trackers for this stage
        self.loss_metric = sb.utils.metric_stats.MetricStats(
            metric=sb.nnet.losses.nll_loss
        )

        # Set up evaluation-only statistics trackers
        if stage != sb.Stage.TRAIN:
            self.error_metrics = self.hparams.error_stats()

    def on_stage_end(self, stage, stage_loss, epoch=None):
        """ Gets called at the end of an epoch.
        Arguments
        ---------
        stage : sb.Stage
            One of sb.Stage.TRAIN, sb.Stage.VALID, sb.Stage.TEST
        stage_loss : float
            The average loss for all of the data processed in this stage.
        epoch : int
            The currently-starting epoch. This is passed
            `None` during the test stage.
        """

        # Store the train loss until the validation stage.
        if stage == sb.Stage.TRAIN:
            self.train_loss = stage_loss

        # Summarize the statistics from the stage for record-keeping.
        # `stats` therefore exists for both the VALID and the TEST branch below.
        else:
            stats = {
                "loss": stage_loss,
                "error": self.error_metrics.summarize("average"),
            }

        # At the end of validation...
        if stage == sb.Stage.VALID:

            # Update the learning rate for the next epoch.
            old_lr, new_lr = self.hparams.lr_annealing(epoch)
            sb.nnet.schedulers.update_learning_rate(self.optimizer, new_lr)

            # The train_logger writes a summary to stdout and to the logfile.
            self.hparams.train_logger.log_stats(
                {"Epoch": epoch, "lr": old_lr},
                train_stats={"loss": self.train_loss},
                valid_stats=stats,
            )

            # Checkpointing is currently disabled. To save the current
            # checkpoint and delete previous checkpoints, re-enable:
            # self.checkpointer.save_and_keep_only(meta=stats, min_keys=["error"])

        # We also write statistics about test data to stdout and to the logfile.
        if stage == sb.Stage.TEST:
            # NOTE(review): relies on the hyper-parameters defining an
            # `epoch_counter` entry - confirm against the YAML file.
            self.hparams.train_logger.log_stats(
                {"Epoch loaded": self.hparams.epoch_counter.current},
                test_stats=stats,
            )

In [None]:
# Read the hyper-parameters (modules, optimizer, logger, ...) from the YAML file.
hparams_path = f"{SAVE_DIR}/hyperparams_pre.yaml"
with open(hparams_path) as hparams_file:
    hparams = load_hyperpyyaml(hparams_file)

In [None]:
# Instantiate the Brain with the modules and optimizer class from the YAML file.
spk_id_brain = SpkIdBrain(
    hparams=hparams,
    opt_class=hparams["opt_class"],
    modules=hparams["modules"],
)

In [None]:
# Train for five epochs, validating after each one.
spk_id_brain.fit(
    range(5),
    datasets["training"],
    valid_set=datasets["validation"],
    train_loader_kwargs=hparams["dataloader_options"],
)

In [None]:
# Evaluate the trained model on the held-out test set.
test_stat = spk_id_brain.evaluate(
    datasets["test"],
    min_key="error",
)