In [None]:
# Run it on Binder: https://mybinder.org/v2/gh/parkitny/notebook_demos/main

In [None]:
# Direct link to the MP3 recording that the next cell downloads.
url = 'https://media1.vocaroo.com/mp3/1n5KKFJs9A8n'
# Offset into the recording, in milliseconds, where the analysed excerpt starts.
start_ms = 100000
# Length of the analysed excerpt; the next cell converts it to milliseconds.
duration_minutes = 1 # minutes

In [None]:
import audiosegment
import librosa
import struct
import webrtcvad

import numpy as np
import matplotlib.pyplot as plt
import soundfile as sf

from matplotlib.patches import Rectangle
from pathlib import Path
from scipy.io import wavfile
from urllib.request import urlopen

from helpers.audio import audiosegment_to_librosawav
from helpers.misc import suppress_stdout_stderr
from helpers.webrtc_vad import read_wave, frame_generator, vad_collector, write_wave

# --- Download the recording, cut out the excerpt and resample it for the VAD ---
duration_ms = int(duration_minutes * 60 * 1000)  # excerpt length in ms
input_filename = "notebook_data/tmp.mp3"
resampled_filename = "notebook_data/tmp.wav"
# NOTE: deliberately the same path as `input_filename`. The export below
# replaces the full download with the trimmed excerpt, which is the file the
# audio-player cell later reads back through `input_filename`.
resampled_filename_mp3 = "notebook_data/tmp.mp3"

Path(input_filename).parent.mkdir(parents=True, exist_ok=True)
# Close the HTTP response deterministically instead of leaking the connection.
with urlopen(url) as response:
    Path(input_filename).write_bytes(response.read())

# AudioSegment slicing is expressed in milliseconds.
audio = audiosegment.from_file(input_filename)[start_ms: start_ms + duration_ms]
audio.export(resampled_filename_mp3, format="mp3")
sample_rate = audio.frame_rate
samples = audiosegment_to_librosawav(audio)

new_sample_rate = 16000
# `orig_sr` / `target_sr` must be passed by keyword: they are keyword-only in
# librosa >= 0.10, and the keyword form is accepted by older releases as well.
samples = librosa.resample(samples, orig_sr=sample_rate, target_sr=new_sample_rate)
sample_rate = new_sample_rate

# Round-trip through a WAV file so the VAD helpers can read it from disk.
# After this, `samples` holds whatever dtype was stored in the file
# (presumably int16 -- the later struct.pack("h") call relies on that; TODO confirm).
sf.write(resampled_filename, samples, sample_rate)
sample_rate, samples = wavfile.read(resampled_filename)
# --- Run the segment-level VAD and write each voiced segment to its own file ---
# set aggressiveness from 0 to 3
mode = 3
vad = webrtcvad.Vad(mode)  # one construction; Vad(mode) already sets the mode

# Raw bytes of the sample array (one native-endian 16-bit value per sample),
# used by the frame-by-frame classification further down.
raw_samples = struct.pack("%dh" % len(samples), *samples)

# Kept under its own name so the AudioSegment bound to `audio` is not shadowed.
pcm_audio, sample_rate = read_wave(resampled_filename)

frame_duration_ms = 30     # first argument of frame_generator / second of vad_collector
padding_duration_ms = 300  # third argument of vad_collector (presumably the smoothing window; verify in helpers.webrtc_vad)
frames = list(frame_generator(frame_duration_ms, pcm_audio, sample_rate))
voiced_segments = vad_collector(sample_rate, frame_duration_ms, padding_duration_ms, vad, frames)

with suppress_stdout_stderr():
    # The collector is consumed inside the context manager so that anything it
    # prints is suppressed as well.
    for i, segment in enumerate(voiced_segments):
        # f-string: without the prefix every segment was written to the same
        # literal file name 'chunk-{i:02}.wav', each overwriting the previous one.
        path = f'chunk-{i:02d}.wav'
        write_wave(path, segment, sample_rate)

# --- Classify every 30 ms window of the signal as speech / non-speech ---
window_duration = 0.03  # duration in seconds
samples_per_window = int(window_duration * sample_rate + 0.5)
bytes_per_sample = 2  # raw_samples was packed as 16-bit values
segments = []

for start in range(0, len(samples), samples_per_window):
    stop = start + samples_per_window
    if stop > len(samples):
        # Trailing partial window: webrtcvad only accepts frames of exactly
        # 10, 20 or 30 ms and raises on anything else, so it is left out.
        break

    window = raw_samples[start * bytes_per_sample: stop * bytes_per_sample]
    is_speech = vad.is_speech(window, sample_rate=sample_rate)

    # `start` / `stop` are sample indices; `stop` is exclusive.
    segments.append(dict(
        start=start,
        stop=stop,
        is_speech=is_speech))
    
# --- Plot the waveform and mark the windows classified as speech ---
fig, ax = plt.subplots(figsize=(10, 7))
ax.plot(samples)

# Plain Python int: keeps `ymax + bar_height` out of int16 arithmetic, which
# can wrap around for loud signals under NumPy 2 scalar promotion.
ymax = int(samples.max())
bar_height = 2000  # height of the orange marker bar, in sample-amplitude units

# Per-sample speech mask (1 = inside a window classified as speech).
label = np.zeros(len(samples), dtype=int)

# Plot segments identified as speech.
for segment in segments:
    if not segment['is_speech']:
        continue
    seg_start, seg_stop = segment['start'], segment['stop']
    # `stop` is already exclusive, so slice up to it; the previous `stop - 1`
    # left the last sample of every speech window unlabelled.
    label[seg_start:seg_stop] = 1
    ax.add_patch(Rectangle((seg_start, ymax), seg_stop - 1 - seg_start, bar_height,
                           edgecolor='orange', facecolor='orange', fill=True, zorder=3))

ax.set_xlabel('sample')
ax.set_ylabel('amplitude')
ax.set_title('Waveform with frames classified as speech (orange)')
ax.grid()
fig.savefig('tmp.png', dpi=150)

In [None]:
from ui.audio_player import get_audio_player

# Load the audio file from disk as raw bytes for the player widget.
data = Path(input_filename).read_bytes()

# Keep only the windows flagged as speech and convert their sample indices
# into (start, stop) pairs by dividing by the sample rate.
speech_segments = [
    (float(window['start']) / sample_rate, float(window['stop']) / sample_rate)
    for window in segments
    if window['is_speech']
]

audio_player = get_audio_player(data=bytearray(data), speech_segments=speech_segments)
audio_player