In [None]:
from AudioStreamer import AudioStreamer
import numpy as np
import pyaudio

from transformers import Wav2Vec2Tokenizer, Wav2Vec2ForCTC
import torch

In [None]:
# Tokenizer and acoustic model are loaded from the same pretrained checkpoint,
# so the ids the model predicts are decoded with the matching vocabulary.
# NOTE(review): recent transformers releases mark Wav2Vec2Tokenizer as
# deprecated in favour of Wav2Vec2Processor -- confirm against the installed
# version before upgrading.
PRETRAINED_CHECKPOINT = "facebook/wav2vec2-base-960h"
tokenizer = Wav2Vec2Tokenizer.from_pretrained(PRETRAINED_CHECKPOINT)
model = Wav2Vec2ForCTC.from_pretrained(PRETRAINED_CHECKPOINT)

In [None]:
def transcribe(audio_input, model, tokenizer):
    """Transcribe a single audio clip with greedy (argmax) CTC decoding.

    Args:
        audio_input: Raw audio samples, forwarded unchanged to ``tokenizer``.
            NOTE(review): presumably mono float samples at the sampling rate
            the model expects -- confirm against the caller.
        model: Callable that takes the tokenizer's ``input_values`` and
            returns an object exposing a ``logits`` tensor.
        tokenizer: Callable that turns ``audio_input`` into an object with an
            ``input_values`` attribute (called with ``return_tensors="pt"``)
            and that provides ``batch_decode`` for the predicted ids.

    Returns:
        The decoded text for the first item of the batch.
    """
    input_values = tokenizer(audio_input, return_tensors="pt").input_values
    # Inference only: disable autograd so no gradient graph is built for each
    # call. This function runs repeatedly inside the streaming loop.
    with torch.no_grad():
        logits = model(input_values).logits
    # Greedy decoding: most likely token id at every time step.
    predicted_ids = torch.argmax(logits, dim=-1)
    return tokenizer.batch_decode(predicted_ids)[0]

In [None]:
# Audio recording parameters (all three are handed to AudioStreamer)
SAMPLE_RATE = 16000
CHUNK = SAMPLE_RATE // 10  # samples per buffer, i.e. 100ms of audio
PA_FORMAT = pyaudio.paFloat32

# Streaming parameters, counted in frames yielded by the streamer
STREAM_CHUNK = 50    # run a transcription once every this many frames
STREAM_OVERLAP = 5   # trailing frames carried over into the next window
STREAM_SILENCE = 5   # trailing frames transcribed separately to detect silence

In [None]:
with AudioStreamer(SAMPLE_RATE, CHUNK, PA_FORMAT) as audio_streamer:
    # Rolling buffer of float32 samples that have not been flushed yet.
    data = np.array([], dtype=np.float32)
    for i, content in enumerate(audio_streamer.stream()):
        # Each yielded item is raw bytes, reinterpreted here as float32 samples.
        cur_data = np.frombuffer(content, np.float32)
        data = np.concatenate((data, cur_data))

        # Only act once every STREAM_CHUNK frames; otherwise keep buffering.
        frames_seen = i + 1
        if frames_seen % STREAM_CHUNK != 0:
            continue

        text = transcribe(data, model, tokenizer)
        print(frames_seen, text)

        # Tail lengths are expressed as multiples of the latest frame's size.
        frame_len = len(cur_data)
        tail = data[-STREAM_SILENCE * frame_len:]
        print(data.shape, tail.shape)

        # If the tail alone decodes to nothing, treat it as silence and drop
        # the whole buffer; otherwise keep a short overlap so the next window
        # starts with some already-heard audio.
        check_silence = transcribe(tail, model, tokenizer)
        if check_silence.strip() != "":
            data = data[-STREAM_OVERLAP * frame_len:]
        else:
            print("No transcriptions. Breaking away.")
            data = np.array([], dtype=np.float32)