## Install Dependencies

In [1]:
#@title Install and Import Dependencies

# A suitable PyTorch build is expected to be installed already;
# uncomment the next line if torchaudio is missing.
# !pip install -q torchaudio

from pprint import pprint

import torch
from IPython.display import Audio

# Sampling rate passed to every audio-reading and inference call below.
SAMPLING_RATE = 16000

# Restrict torch to a single thread.
torch.set_num_threads(1)

# Fetch the example recording into the working directory.
torch.hub.download_url_to_file(
    'https://models.silero.ai/vad_models/en.wav',
    'en_example.wav',
)

100%|██████████| 1.83M/1.83M [00:01<00:00, 1.26MB/s]


In [2]:
USE_PIP = True  # True: load via the silero-vad pip package, False: via torch.hub
USE_ONNX = False  # set to True to try the ONNX model instead
# if USE_ONNX:
#     !pip install -q onnxruntime
if not USE_PIP:
    # torch.hub route: the call returns the model plus a bundle of helpers.
    model, utils = torch.hub.load(
        repo_or_dir='snakers4/silero-vad',
        model='silero_vad',
        force_reload=True,
        onnx=USE_ONNX,
    )
    # The helpers are unpacked positionally, so this order must not change.
    get_speech_timestamps, save_audio, read_audio, VADIterator, collect_chunks = utils
else:
    # pip route: helpers are imported by name from the package.
    # !pip install -q silero-vad
    from silero_vad import (
        load_silero_vad,
        read_audio,
        get_speech_timestamps,
        save_audio,
        VADIterator,
        collect_chunks,
    )
    model = load_silero_vad(onnx=USE_ONNX)

## Speech timestamps from full audio

In [3]:
# Load the example recording at the notebook-wide sampling rate.
wav = read_audio('en_example.wav', sampling_rate=SAMPLING_RATE)

# Run detection over the entire recording in one call and show the result.
speech_timestamps = get_speech_timestamps(
    wav,
    model,
    sampling_rate=SAMPLING_RATE,
)
pprint(speech_timestamps)

  wav, sr = torchaudio.sox_effects.apply_effects_file(path, effects=effects)


[{'end': 32736, 'start': 32},
 {'end': 75232, 'start': 42528},
 {'end': 110048, 'start': 79392},
 {'end': 213472, 'start': 149024},
 {'end': 241632, 'start': 216608},
 {'end': 253920, 'start': 245280},
 {'end': 286688, 'start': 260640},
 {'end': 313824, 'start': 293920},
 {'end': 521696, 'start': 325152},
 {'end': 569824, 'start': 523296},
 {'end': 602080, 'start': 571936},
 {'end': 623072, 'start': 607264},
 {'end': 692704, 'start': 637984},
 {'end': 714208, 'start': 697888},
 {'end': 749024, 'start': 720416},
 {'end': 799712, 'start': 781344},
 {'end': 867808, 'start': 817184},
 {'end': 918496, 'start': 871456},
 {'end': 953312, 'start': 920096}]


In [4]:
# Join every detected speech segment into one waveform, write it to disk,
# and render an audio player for the saved file.
speech_only = collect_chunks(speech_timestamps, wav)
save_audio('only_speech.wav', speech_only, sampling_rate=SAMPLING_RATE)
Audio('only_speech.wav')



## Entire audio inference

In [5]:
wav = read_audio('en_example.wav', sampling_rate=SAMPLING_RATE)
# Run the model over the whole recording in a single call.
# The audio is split into fixed-size windows and one prediction is produced per window.
# NOTE(review): the earlier wording said "31.25 ms long pieces"; 31.25 looks like the
# number of windows per second (16000 / 512), not a duration - a 512-sample window
# at 16 kHz lasts 32 ms. Confirm against audio_forward's actual window size.
# With that reading, output length equals ceil(input_length * 31.25 / SAMPLING_RATE),
# i.e. ceil(input_length / 512) when SAMPLING_RATE is 16000.
predicts = model.audio_forward(wav, sr=SAMPLING_RATE)

## Stream imitation example

In [6]:
## using VADIterator class

vad_iterator = VADIterator(model, sampling_rate=SAMPLING_RATE)
wav = read_audio('en_example.wav', sampling_rate=SAMPLING_RATE)

# Window length in samples: 512 at 16 kHz, 256 otherwise.
window_size_samples = 512 if SAMPLING_RATE == 16000 else 256

# Feed the recording to the iterator one window at a time, as a stream would.
for offset in range(0, len(wav), window_size_samples):
    chunk = wav[offset:offset + window_size_samples]
    if len(chunk) < window_size_samples:
        # Only the final window can be short; it is not processed.
        break
    speech_dict = vad_iterator(chunk, return_seconds=True)
    if speech_dict:
        # A non-empty result is printed as soon as it is produced.
        print(speech_dict, end=' ')

# reset model states after each audio
vad_iterator.reset_states()

{'start': 0.0} {'end': 2.0} {'start': 2.7} {'end': 4.7} {'start': 5.0} {'end': 6.9} {'start': 9.3} {'end': 13.3} {'start': 13.5} {'end': 15.1} {'start': 15.3} {'end': 15.9} {'start': 16.3} {'end': 17.9} {'start': 18.4} {'end': 19.6} {'start': 20.3} {'end': 32.6} {'start': 32.7} {'end': 35.6} {'start': 35.7} {'end': 37.6} {'start': 38.0} {'end': 38.9} {'start': 39.9} {'end': 43.3} {'start': 43.6} {'end': 44.6} {'start': 45.0} {'end': 46.8} {'start': 48.8} {'end': 50.0} {'start': 51.1} {'end': 54.2} {'start': 54.5} {'end': 57.4} {'start': 57.5} {'end': 59.6} {'start': 59.9} 

In [7]:
## just probabilities

wav = read_audio('en_example.wav', sampling_rate=SAMPLING_RATE)

# Window length in samples: 512 at 16 kHz, 256 otherwise.
window_size_samples = 512 if SAMPLING_RATE == 16000 else 256

# This cell calls the model directly, so its state is reset on the model itself.
# Resetting before the loop keeps the result independent of whichever cell ran
# last, and avoids relying on a `vad_iterator` object created in another cell.
model.reset_states()

speech_probs = []
for i in range(0, len(wav), window_size_samples):
    chunk = wav[i: i + window_size_samples]
    if len(chunk) < window_size_samples:
        # Only the final window can be short; it is not scored.
        break
    speech_prob = model(chunk, SAMPLING_RATE).item()
    speech_probs.append(speech_prob)
model.reset_states()  # reset model states after each audio

print(speech_probs[:10])  # predictions for the first 10 chunks

[0.20834210515022278, 0.8179430365562439, 0.8911959528923035, 0.9963614344596863, 0.9991777539253235, 0.9999479055404663, 0.9999006986618042, 0.9997742772102356, 0.9993517994880676, 0.9999291896820068]
