# Test Speaker Diarization with SpeechBrain

In [40]:
""" Explore speechbrain speaker diarization """
import matplotlib.pyplot as plt
import numpy as np
import speechbrain as sb
import torch
import torchaudio
from IPython.display import Audio, display
from sklearn.metrics import adjusted_rand_score, adjusted_mutual_info_score, normalized_mutual_info_score, v_measure_score
from speechbrain.pretrained import EncoderClassifier, VAD
from speechbrain.processing.PLDA_LDA import StatObject_SB
from speechbrain.processing import diarization as diar
from speechbrain.dataio.encoder import CategoricalEncoder


In [41]:
SAVE_DIR = "speaker_id_debate_21"
VIDEO_DIR = "video_camera_shots"
AUDIO_DIR = "audio_files"
DATA_DIR = "datasets"


In [42]:
def detect_voice_activation(signal, filename, vad):
    """
    Extracts voice activation (speech) segments from an audio signal using a neural VAD model:
    Computes posterior probability for speech segments from neural VAD model.
    Applies a threshold on the posterior probability to get candidate segments.
    Extracts speech segments using energy-based VAD.
    Merges segments that are close to each other.
    Removes short segments.
    Double check the energy-based VAD using the neural VAD model.
    """
    prob_chunks = vad.get_speech_prob_chunk(signal)
    prob_chunks_avg = prob_chunks.mean(dim=0, keepdim=True)
    prob_th = vad.apply_threshold(
        prob_chunks_avg, activation_th=0.5, deactivation_th=0.25).float()
    boundaries = vad.get_boundaries(prob_th)
    boundaries_energy = vad.energy_VAD(
        filename, boundaries, activation_th=0.8, deactivation_th=0.0)
    boundaries_merged = vad.merge_close_segments(
        boundaries_energy, close_th=0.250)
    boundaries_short_removed = vad.remove_short_segments(
        boundaries_merged, len_th=0.250)
    boundaries_checked = vad.double_check_speech_segments(
        boundaries_short_removed, filename, speech_th=0.5)

    return vad.upsample_boundaries(boundaries, filename)


In [43]:
def prepare_dataio(filename, save_dir, training=False):
    """ Prepares data sets for the Brain class.
        Encodes speaker names and saves the encoding.
    """
    spk_id_encoder = CategoricalEncoder()

    vad = VAD.from_hparams(source="speechbrain/vad-crdnn-libriparty",
                           savedir="pretrained_models/vad-crdnn-libriparty")

    @sb.utils.data_pipeline.takes("file_path")
    @sb.utils.data_pipeline.provides("sig")
    def audio_pipeline(file_path):
        sig, _ = torchaudio.load(file_path)
        vad_sig = detect_voice_activation(sig, file_path, vad)
        return torch.masked_select(sig, vad_sig.bool())

    @sb.utils.data_pipeline.takes("spk_id")
    @sb.utils.data_pipeline.provides("spk_id", "spk_id_encoded")
    def label_pipeline(spk_id):
        yield spk_id
        spk_id_encoded = torch.LongTensor(
            [spk_id_encoder.encode_label(spk_id)])
        yield spk_id_encoded

    dataset = sb.dataio.dataset.DynamicItemDataset.from_json(
        json_path=filename,
        dynamic_items=[audio_pipeline, label_pipeline],
        output_keys=["id", "sig", "spk_id_encoded"],
    )

    if training:
        spk_id_encoder.update_from_didataset(dataset, output_key="spk_id")
        spk_id_encoder.save(f"{save_dir}/spk_id_encoder.txt")
    else:
        spk_id_encoder.load(f"{save_dir}/spk_id_encoder.txt")

    return dataset


In [44]:
def prepare_datasets(save_dir, data_dirs):
    """ Prepares dataset for multiple directories """
    datasets = {}
    for data_dir in data_dirs:
        datasets[data_dir] = prepare_dataio(
            f"{save_dir}/{data_dir}.json", save_dir, data_dir == "training")

    return datasets


In [9]:
datasets = prepare_datasets(
    f"{SAVE_DIR}/{DATA_DIR}", ["training", "validation", "test"])


In [10]:
ecapa = EncoderClassifier.from_hparams(
    source="speechbrain/spkrec-ecapa-voxceleb", savedir="./pretrained_models/ecapa")


In [11]:
embeddings = [ecapa.encode_batch(batch["sig"]).squeeze()
              for batch in datasets["training"]]

speakers = [int(batch["spk_id_encoded"]) for batch in datasets["training"]]


In [12]:
embeddings_array = np.array([e.numpy() for e in embeddings])


In [19]:
stat_obj = StatObject_SB(
    modelset=np.array(
        ["EPACA_TDNN_embeddings" * embeddings_array.shape[0]], dtype="|O"),
    segset=np.array(
        [f"{i}_{i}_{i}" for i in range(embeddings_array.shape[0])], dtype="|O"),
    start=np.array([None] * embeddings_array.shape[0]),
    stop=np.array([None] * embeddings_array.shape[0]),
    stat0=np.array([[1.0]] * embeddings_array.shape[0]),
    stat1=embeddings_array.squeeze()
)


In [60]:
diar.do_spec_clustering(
    diary_obj=stat_obj,
    out_rttm_file=f"{SAVE_DIR}/debate_21.rttm",
    rec_id="debate_21",
    k=6,
    pval=0.025,  # This parameter can be fine-tuned
    affinity_type="cos",
    n_neighbors=None
)


In [61]:
def read_rttm(filename):
    """ Reads output of spectral clustering (.rttm file)
    and create segments [start, end, id] """
    segments = []

    with open(filename, "r") as file:
        for row in file:
            row_split = row.split(" ")
            start = float(row_split[3])
            end = float(row_split[3]) + float(row_split[4])
            spk_id = int(row_split[7].split("_")[-1])
            segments.append([start, end, spk_id])

    return segments


In [62]:
diary = read_rttm(f"{SAVE_DIR}/debate_21.rttm")


In [63]:
diary_speakers = [row[2] for row in diary]


In [64]:
adjusted_rand_score(speakers, diary_speakers)


0.15518346418843135

In [65]:
adjusted_mutual_info_score(speakers, diary_speakers)


0.18631488098088309

In [66]:
v_measure_score(speakers, diary_speakers)


0.25665124614810164