In [None]:
import numpy as np

# Inference

In [None]:
from pyannote.database import get_protocol, FileFinder
# Look up the 'Debug.SpeakerDiarization.Debug' protocol. A FileFinder instance
# is registered as the preprocessor for the "audio" key of each file.
protocol = get_protocol(
    'Debug.SpeakerDiarization.Debug',
    preprocessors={"audio": FileFinder()},
)

## Train and apply a voice activity detection model

In [None]:
from pyannote.audio.tasks import VoiceActivityDetection
from pyannote.audio.models.debug import SimpleSegmentationModel
import pytorch_lightning as pl

In [None]:
# Voice activity detection task built on the debug protocol, and a simple
# segmentation model configured for that task.
vad = VoiceActivityDetection(
    protocol,
    duration=2.0,
    batch_size=16,
    num_workers=4,
)
model = SimpleSegmentationModel(task=vad)

# Train for a single epoch; 'inference/vad' is the root directory handed to
# Lightning (a later cell reloads a checkpoint from below it).
trainer = pl.Trainer(default_root_dir='inference/vad', max_epochs=1)
# Binding the result to `_` keeps it from being echoed as the cell output.
_ = trainer.fit(model, vad)

In [None]:
from pyannote.audio.core.inference import Inference
# Wrap the freshly trained VAD model in an Inference object with step=0.1.
# No window duration is passed here, so whatever default Inference uses applies.
# batch_size=128 is presumably the number of windows processed at once — confirm.
inference = Inference(model, step=0.1, batch_size=128)

In [None]:
# Inference on a complete file.
# Take the first file yielded by the protocol's development iterator; later
# cells reuse `dev_file`.
dev_file = next(protocol.development())
scores = inference(dev_file)
# Cell output: element-wise exponential of the scores
# (presumably the model emits log-probabilities — TODO confirm).
np.exp(scores)

In [None]:
# Inference restricted to an excerpt of the file, Segment(10, 15), via
# `inference.crop` instead of calling `inference` on the whole file.
from pyannote.core import Segment
scores = inference.crop(dev_file, Segment(10, 15))
# Cell output: element-wise exponential of the excerpt scores.
np.exp(scores)

In [None]:
# Inference on an excerpt shorter than the sliding window duration:
# Segment(10, 11.5) spans 1.5, while the VAD task was created with duration=2.
# (presumably that task duration is the window Inference defaults to — confirm).
scores = inference.crop(dev_file, Segment(10, 11.5))
# Cell output: element-wise exponential of the excerpt scores.
np.exp(scores)

In [None]:
# Inference on a whole chunk: rebind `inference` to an Inference object created
# with window="whole" (instead of the step-based configuration used above) and
# apply it to the excerpt Segment(10, 15).
inference = Inference(model, window="whole")
# The cell ends with an assignment, so nothing is displayed.
scores = inference.crop(dev_file, Segment(10, 15))

## Load and apply a pretrained VAD model

In [None]:
# Reload the VAD model from a checkpoint on disk instead of reusing the
# in-memory model, then run the same sliding-window inference as before.
#
# The checkpoint path is discovered rather than hard-coded to
# 'version_0/checkpoints/epoch=0.ckpt', because that fixed path
#   * goes stale as soon as the VAD training cell is re-run (Lightning then logs
#     to version_1, version_2, ... and version_0 keeps the old weights), and
#   * depends on the checkpoint file-naming scheme of the installed Lightning
#     version.
# The most recently written checkpoint below the VAD log directory is used.
from pathlib import Path

vad_checkpoints = sorted(
    Path('inference/vad/lightning_logs').glob('version_*/checkpoints/*.ckpt'),
    key=lambda checkpoint: checkpoint.stat().st_mtime,
)
if not vad_checkpoints:
    # Fail with an actionable message rather than an opaque loading error.
    raise FileNotFoundError(
        "no checkpoint found under 'inference/vad/lightning_logs'; "
        "run the VAD training cell first"
    )

model = SimpleSegmentationModel.load_from_checkpoint(str(vad_checkpoints[-1]))
inference = Inference(model, step=0.1, batch_size=128)
scores = inference(dev_file)
# Cell output: element-wise exponential of the scores
# (presumably the model emits log-probabilities — TODO confirm).
np.exp(scores)

## Train and apply a speaker embedding model

In [None]:
from pyannote.audio.models.debug import SimpleEmbeddingModel
from pyannote.audio.tasks.speaker_verification.task import SpeakerEmbeddingArcFace

# Speaker embedding task (ArcFace variant) on the same protocol, and a simple
# embedding model configured for that task.
emb = SpeakerEmbeddingArcFace(protocol, duration=2.0, num_workers=4)
model = SimpleEmbeddingModel(task=emb)

# Train for a single epoch with 'inference/emb' as Lightning's root directory.
# `model` and `trainer` are rebound here and no longer refer to the VAD objects.
trainer = pl.Trainer(default_root_dir='inference/emb', max_epochs=1)
_ = trainer.fit(model, emb)

In [None]:
# Sliding-window inference with the embedding model over the complete file,
# using duration=1.0 and step=0.5 for the window.
inference = Inference(model, duration=1.0, step=0.5)
embeddings = inference(dev_file)

# Pull the raw array and its sliding window out of the result, then show the
# array shape together with the window's start, duration and step.
data = embeddings.data
window = embeddings.sliding_window
(data.shape, window.start, window.duration, window.step)

In [None]:
# Same sliding-window inference, restricted to the excerpt Segment(5, 12).
embeddings = inference.crop(dev_file, Segment(5, 12))

# Show the array shape together with the window's start, duration and step.
data = embeddings.data
window = embeddings.sliding_window
(data.shape, window.start, window.duration, window.step)

In [None]:
# Sliding-window inference on an excerpt shorter than the window itself:
# Segment(11.1, 12) spans 0.9, below the duration=1. passed to Inference above.
embeddings = inference.crop(dev_file, Segment(11.1, 12))

# Show the array shape together with the window's start, duration and step.
data = embeddings.data
window = embeddings.sliding_window
(data.shape, window.start, window.duration, window.step)

In [None]:
# Inference on a whole chunk: rebind `inference` to an Inference object created
# with window="whole" and apply it to the excerpt Segment(5, 12).
inference = Inference(model, window="whole")
embeddings = inference.crop(dev_file, Segment(5, 12))

# Cell output: shape of the result. `.shape` is read directly on the result
# here, not through `.data` as in the sliding-window cells.
embeddings.shape

In [None]:
# Inference on a whole chunk shorter than the training duration:
# Segment(5, 5.2) spans 0.2, while the embedding task was created with duration=2.
embeddings = inference.crop(dev_file, Segment(5, 5.2))

# Cell output: shape of the result for this very short chunk.
embeddings.shape

## Train and apply a multi-task segmentation model

In [None]:
from pyannote.audio.models.debug import MultiTaskSegmentationModel
from pyannote.audio.tasks import MultiTaskSegmentation

# Multi-task segmentation task with the vad, scd and osd sub-tasks all enabled,
# and a model configured for that task.
xseg = MultiTaskSegmentation(
    protocol,
    duration=2.0,
    vad=True,
    scd=True,
    osd=True,
    batch_size=32,
    num_workers=4,
)
model = MultiTaskSegmentationModel(task=xseg)

# Train for a single epoch with 'inference/xseg' as Lightning's root directory.
# `model` and `trainer` are rebound here and no longer refer to the embedding objects.
trainer = pl.Trainer(default_root_dir='inference/xseg', max_epochs=1)
_ = trainer.fit(model, xseg)

In [None]:
# Sliding-window inference with the multi-task model over a complete file,
# using duration=2. and step=0.5 for the window.
inference = Inference(model, duration=2., step=0.5)
# Re-fetch the first development file (same expression as in the VAD section),
# so this section does not rely on `dev_file` from earlier cells.
dev_file = next(protocol.development())
scores = inference(dev_file)
# Cell output: the full result; the following cells index it by the keys
# 'vad', 'scd' and 'osd'.
scores

In [None]:
# Cell output: element-wise exponential of the 'vad' entry of the multi-task scores
# (presumably log-probabilities — TODO confirm).
np.exp(scores['vad'])

In [None]:
# Cell output: element-wise exponential of the 'scd' entry of the multi-task scores
# (presumably log-probabilities — TODO confirm).
np.exp(scores['scd'])

In [None]:
# Cell output: element-wise exponential of the 'osd' entry of the multi-task scores
# (presumably log-probabilities — TODO confirm).
np.exp(scores['osd'])

In [None]:
# Multi-task inference restricted to the excerpt Segment(10, 15).
# The cell ends with an assignment, so nothing is displayed; the following
# cells show the individual entries.
scores = inference.crop(dev_file, Segment(10, 15))

In [None]:
# Cell output: the 'vad' entry for the excerpt, shown as-is (no np.exp applied,
# unlike the whole-file cells above).
scores['vad']

In [None]:
# Cell output: the 'scd' entry for the excerpt, shown as-is (no np.exp applied).
scores['scd']

In [None]:
# Cell output: the 'osd' entry for the excerpt, shown as-is (no np.exp applied).
scores['osd']

In [None]:
# Multi-task inference on an excerpt shorter than the sliding window duration:
# Segment(10, 11.5) spans 1.5, below the duration=2. passed to Inference above.
scores = inference.crop(dev_file, Segment(10, 11.5))
# Cell output: the 'vad' entry only, shown as-is (no np.exp applied).
scores['vad']

In [None]:
# Multi-task inference on a whole chunk: rebind `inference` to an Inference
# object created with window="whole" and apply it to the excerpt Segment(10, 15).
inference = Inference(model, window="whole")
# The cell ends with an assignment, so nothing is displayed; the next cell
# inspects the result.
scores = inference.crop(dev_file, Segment(10, 15))

In [None]:
# Cell output: shape of the 'vad' entry produced by whole-chunk inference.
scores['vad'].shape