# Run Mexca's video subcomponent on test dataset

Note that this notebook is run on [Google Colab](https://colab.research.google.com/drive/1OLSfQX8xqw0jztRY-MDUIZ178vqBceMO?usp=sharing) so that we can use a GPU.

In [None]:
# install pyannote.audio requirements
!pip install -qq torch==1.11.0 torchvision==0.12.0 torchaudio==0.11.0 torchtext==0.12.0
!pip install -qq speechbrain==0.5.12

# install pyannote.audio
!pip install -qq pyannote.audio

# install huggingface to download pyannote's models
!pip -qq install huggingface_hub

In [None]:
from huggingface_hub import HfApi
from huggingface_hub import notebook_login
from pyannote.audio import Pipeline
from pyannote.database.util import load_rttm
from pyannote.metrics.diarization import DiarizationErrorRate, DiarizationCoverage, DiarizationPurity
from pyannote.core import Annotation, Segment
import pandas as pd
import numpy as np

In [None]:
# Log in to the Hugging Face Hub; the pipeline is loaded with use_auth_token=True.
notebook_login() # paste your access token when prompted

## Load pipeline

In [None]:
# Load the pretrained pipeline identified by "pyannote/speaker-diarization@develop".
# use_auth_token=True is passed so the request is authenticated
# (presumably with the token stored by notebook_login() - confirm).
pipeline = Pipeline.from_pretrained("pyannote/speaker-diarization@develop", use_auth_token=True)


## Run pyannote on list of audio files

In [None]:
# Run the diarization pipeline on every audio file listed in
# /content/list_audio_.txt (one path per line) and write each result to
# "<n_speakers>_<t_duration>.rttm" in the current working directory.

# Read the list up front inside a context manager so the handle is closed,
# and skip blank lines (e.g. a trailing empty line) that would otherwise
# break the integer parsing below.
with open('/content/list_audio_.txt','r') as f:
  audio_paths = [line.strip() for line in f if line.strip()]

for filepath in audio_paths:
  print(f'Computing video @ {filepath} ...')

  # The 3rd and 4th "_"-separated fields of the file name hold the number of
  # speakers and the duration (presumably
  # "list_audio_<n_speakers>_<t_duration>_..." - confirm against the dataset).
  name_fields = filepath.split("/")[-1].split("_")
  n_speakers = int(name_fields[2])
  t_duration = int(name_fields[3])
  print(n_speakers)
  print(t_duration)

  diarization = pipeline(filepath, num_speakers=n_speakers)

  # Close (and therefore flush) every RTTM file as soon as it is written.
  with open(f"{n_speakers}_{t_duration}.rttm","w") as t:
    diarization.write_rttm(t)

## Optimal mapping

Read all RTTM files (both the reference and pyannote's output) and find the optimal mapping between their speaker labels.

In [None]:
# For every (reference RTTM, pyannote RTTM) pair, compute the optimal mapping
# between the two sets of labels and save it to
# "mapping_<n_speakers>_<t_duration>.txt" (space-separated, no header/index).
# The two list files are paired line by line, so they must be in the same order.

# Read both lists inside a context manager so the handles are closed, and
# skip blank lines that would otherwise break the parsing below.
with open('/content/list_reference.txt','r') as f, open('/content/list_mexca.txt','r') as t:
  reference_paths = [line.strip() for line in f if line.strip()]
  mexca_paths = [line.strip() for line in t if line.strip()]

# zip() stops at the shorter list without complaint; make a mismatch visible.
if len(reference_paths) != len(mexca_paths):
  print(f"WARNING: {len(reference_paths)} reference files but {len(mexca_paths)} mexca files; extra entries are ignored")

metric = DiarizationErrorRate(collar=.25)

for filepath,mexca_path in zip(reference_paths, mexca_paths):
  print(mexca_path)
  print(filepath)

  # The pyannote output file name is parsed as "<n_speakers>_<t_duration>.<ext>".
  name_fields = mexca_path.split("/")[-1].split("_")
  n_speakers = int(name_fields[0])
  t_duration = int(name_fields[1].split(".")[0])

  print(n_speakers)
  print(t_duration)

  REFERENCE = filepath
  reference = load_rttm(REFERENCE)

  MEXCA = mexca_path
  mexca = load_rttm(MEXCA)

  # Key under which the pyannote annotation is looked up in the loaded RTTM;
  # the reference annotation is looked up under the fixed key "sample".
  uri = f"list_audio_{n_speakers}_{t_duration}_unbalanced"
  optimal_dict = metric.optimal_mapping(reference["sample"], mexca[uri])

  # One row per mapped label pair, tagged with the file it belongs to.
  df = pd.DataFrame(optimal_dict.items())
  df['file'] = uri
  df.to_csv(f'mapping_{n_speakers}_{t_duration}.txt', header=False, index=False, sep=' ')