# Inference

In [None]:
from pyannote.database import FileFinder, get_protocol

# Load the debug protocol, registering a FileFinder instance as the
# preprocessor for the "audio" key.
audio_preprocessors = {"audio": FileFinder()}
protocol = get_protocol(
    'Debug.SpeakerDiarization.Debug',
    preprocessors=audio_preprocessors,
)

## Train and apply a voice activity detection model

In [None]:
from pyannote.audio.tasks.voice_activity_detection.task import VoiceActivityDetection
from pyannote.audio.models.debug import SimpleSegmentationModel
import pytorch_lightning as pl

In [None]:
# Voice activity detection task defined on top of the protocol.
vad = VoiceActivityDetection(
    protocol,
    num_workers=4,
    batch_size=16,
    duration=2.0,
)

# Debug segmentation model attached to that task.
model = SimpleSegmentationModel(task=vad)

# Fit for a single epoch, using 'inference/vad' as Lightning's root directory.
# The result is bound to `_` so the notebook does not display it.
trainer = pl.Trainer(default_root_dir='inference/vad', max_epochs=1)
_ = trainer.fit(model, vad)

In [None]:
from pyannote.audio.core.inference import Inference

# Wrap the trained model for inference with step=0.1 and batch_size=128.
inference = Inference(
    model,
    batch_size=128,
    step=0.1,
)

In [None]:
# Take the first item yielded by the protocol's development subset
# and run inference on it.
development_files = protocol.development()
dev_file = next(development_files)
scores, frames = inference(dev_file)

In [None]:
import numpy as np
from pyannote.core import SlidingWindowFeature

# Pair the scores with their frames, then exponentiate; the result is the
# cell's last expression so the notebook displays it.
score_feature = SlidingWindowFeature(scores, frames)
np.exp(score_feature)

In [None]:
from pyannote.core import Segment

# Run inference on an excerpt of the file only: Segment(10, 15).
excerpt = Segment(10, 15)
scores, frames = inference.crop(dev_file, excerpt)

score_feature = SlidingWindowFeature(scores, frames)
np.exp(score_feature)

In [None]:
# Same as above, but on an excerpt that is shorter than the sliding window
# duration: Segment(10, 11.5).
short_excerpt = Segment(10, 11.5)
scores, frames = inference.crop(dev_file, short_excerpt)

score_feature = SlidingWindowFeature(scores, frames)
np.exp(score_feature)

In [None]:
# Rebuild the inference object with window="whole" to process a whole chunk
# instead of a sliding window. Note that the result of `crop` is bound to a
# single name here rather than unpacked into (scores, frames).
inference = Inference(model, window="whole")
whole_chunk = Segment(10, 15)
scores = inference.crop(dev_file, whole_chunk)

## Load and apply a pretrained VAD model

In [None]:
from pathlib import Path

# Locate the checkpoint written by the VAD training run above.
# Lightning creates a new `version_N` directory for every run, and the
# checkpoint file name differs between Lightning releases, so a hard-coded
# 'version_0/checkpoints/epoch=0.ckpt' either fails or silently loads a stale
# model once the notebook has been run more than once. Pick the most recently
# written checkpoint instead.
checkpoint_candidates = Path('inference/vad/lightning_logs').glob(
    'version_*/checkpoints/*.ckpt'
)
latest_checkpoint = max(
    checkpoint_candidates,
    key=lambda path: path.stat().st_mtime,
    default=None,
)
if latest_checkpoint is None:
    # Same exception type a missing hard-coded path would have produced.
    raise FileNotFoundError(
        "no checkpoint found under 'inference/vad/lightning_logs'; "
        "train the voice activity detection model first"
    )

# Restore the model from disk and apply it exactly as the freshly trained one.
model = SimpleSegmentationModel.load_from_checkpoint(str(latest_checkpoint))
inference = Inference(model, step=0.1, batch_size=128)
scores, frames = inference(dev_file)
np.exp(SlidingWindowFeature(scores, frames))

## Train and apply a speaker embedding model

In [None]:
from pyannote.audio.models.debug import SimpleEmbeddingModel
from pyannote.audio.tasks.speaker_verification.task import SpeakerEmbeddingArcFace

# Speaker embedding (ArcFace) task defined on top of the protocol.
emb = SpeakerEmbeddingArcFace(
    protocol,
    num_workers=4,
    batch_size=32,
    duration=2.0,
)

# Debug embedding model attached to that task.
model = SimpleEmbeddingModel(task=emb)

# Fit for ten epochs, using 'inference/emb' as Lightning's root directory.
# The result is bound to `_` so the notebook does not display it.
trainer = pl.Trainer(default_root_dir='inference/emb', max_epochs=10)
_ = trainer.fit(model, emb)

In [None]:
# inference using a sliding window
inference = Inference(model, duration=1., step=0.5)
embeddings, window = inference(dev_file)
embeddings.shape, window.start, window.duration, window.step

In [None]:
# Sliding-window inference restricted to an excerpt: Segment(5, 12).
excerpt = Segment(5, 12)
embeddings, window = inference.crop(dev_file, excerpt)

# Display the embeddings' shape alongside the returned window's parameters.
(
    embeddings.shape,
    window.start,
    window.duration,
    window.step,
)

In [None]:
# Sliding-window inference on an excerpt that is shorter than the sliding
# window itself: Segment(11.1, 12).
short_excerpt = Segment(11.1, 12)
embeddings, window = inference.crop(dev_file, short_excerpt)

# Display the embeddings' shape alongside the returned window's parameters.
(
    embeddings.shape,
    window.start,
    window.duration,
    window.step,
)

In [None]:
# Rebuild the inference object with window="whole" to embed a whole chunk
# instead of sliding a window over it. The result of `crop` is bound to a
# single name here rather than unpacked into (embeddings, window).
inference = Inference(model, window="whole")
whole_chunk = Segment(5, 12)
embeddings = inference.crop(dev_file, whole_chunk)
embeddings.shape

In [None]:
# Whole-chunk inference again, this time on a chunk shorter than the
# duration used for training: Segment(5, 5.2).
tiny_chunk = Segment(5, 5.2)
embeddings = inference.crop(dev_file, tiny_chunk)
embeddings.shape