# Speech Transcriber
### By Mason Howes - 07/05/2024

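Installs:
* Interactive widgets library (`ipywidgets`) that will later be used to 'Start' and 'Stop' recording microphone audio.
* Python Audio library (`pyaudio`) to allow access to the user's microphone and its data.
* Speech recognition library (`vosk`) whose model and recognizer are imported below.
* `transformers` and `torch`, which are also listed in the install cell below.

The install commands are commented out to prevent reinstalling the libraries on each run.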

# IMPORTANT!

This program runs locally, so every package the notebook uses must be installed on your own computer. Before running it for the first time, un-comment the lines in the code block below that apply to your operating system and run the cell to install the packages.

In [29]:
# %pip install --user ipywidgets
# %pip install --user vosk
# %pip install --user transformers
# %pip install --user torch

## FOR WINDOWS
# %pip install --user pipwin
# %pip install --user pyaudio

## FOR MAC
# !brew install portaudio
# %pip install --user pyaudio

Imports libraries needed for program

In [30]:
# Imports installed library
import ipywidgets as widgets
from IPython.display import display

# Allows Python to run code in the background
from threading import Thread

# Lets us pass messages between threads
from queue import Queue

# Imports Python Audio
import pyaudio

# Imports for Vosk model
import subprocess
import json
from vosk import Model, KaldiRecognizer

Lists the connected audio devices so you can find the index of your microphone (used as `DEVICE_INDEX` further down)

In [31]:
# Prints the info dictionary of every connected audio device so the main
# microphone's 'index' can be found
p = pyaudio.PyAudio()
try:
    for i in range(p.get_device_count()):
        print(p.get_device_info_by_index(i))
finally:
    # Always release the audio interface, even if querying a device fails
    p.terminate()

{'index': 0, 'structVersion': 2, 'name': 'Microsoft Sound Mapper - Input', 'hostApi': 0, 'maxInputChannels': 2, 'maxOutputChannels': 0, 'defaultLowInputLatency': 0.09, 'defaultLowOutputLatency': 0.09, 'defaultHighInputLatency': 0.18, 'defaultHighOutputLatency': 0.18, 'defaultSampleRate': 44100.0}
{'index': 1, 'structVersion': 2, 'name': 'Microphone (HyperX SoloCast)', 'hostApi': 0, 'maxInputChannels': 2, 'maxOutputChannels': 0, 'defaultLowInputLatency': 0.09, 'defaultLowOutputLatency': 0.09, 'defaultHighInputLatency': 0.18, 'defaultHighOutputLatency': 0.18, 'defaultSampleRate': 44100.0}
{'index': 2, 'structVersion': 2, 'name': 'Microphone (NexiGo N930AF FHD w', 'hostApi': 0, 'maxInputChannels': 1, 'maxOutputChannels': 0, 'defaultLowInputLatency': 0.09, 'defaultLowOutputLatency': 0.09, 'defaultHighInputLatency': 0.18, 'defaultHighOutputLatency': 0.18, 'defaultSampleRate': 44100.0}
{'index': 3, 'structVersion': 2, 'name': 'VoiceMeeter Output (VB-Audio Vo', 'hostApi': 0, 'maxInputChannels

Creates the shared queues and the widgets (two buttons and the transcript output area)

In [32]:
# --- Queues shared between the button callbacks and the worker threads ---

# Stop signal: the worker loops keep running while this queue is non-empty
messages = Queue()

# Batches of raw microphone data waiting to be transcribed
recordings = Queue()

# --- Widgets ---

# 'Record Audio' button (green, microphone icon)
recordWidget = widgets.Button(
    icon="microphone",
    button_style="success",
    description="Record Audio",
    disabled=False,
)

# 'Stop Recording' button (red, stop icon)
stopWidget = widgets.Button(
    icon="stop",
    button_style="danger",
    description="Stop Recording",
    disabled=False,
)

# Output area that receives the status messages and the transcribed text
transcript = widgets.Output()

Defines the audio settings and the recording helper `microphoneData`, which the button handlers defined further down run in a background thread

In [33]:
# --- Microphone stream settings (all passed to the input stream when it is opened) ---
DEVICE_INDEX = 1                 # 'index' of the microphone in the device list printed above
AUDIO_FORMAT = pyaudio.paInt16   # sample format
CHANNELS = 1                     # number of input channels
SAMPLING_RATE = 16000            # frames per second

# Presumably the size in bytes of one paInt16 sample; not referenced by the code shown
SAMPLE_SIZE = 2

# --- Batching ---
RECORDING_SECONDS = 2.5          # seconds of audio gathered before each batch is queued

# Recording loop started in a background thread by the 'Record Audio' button
def microphoneData(chunk=1024):
    """Read microphone audio and queue it in batches for transcription.

    Opens an input stream on DEVICE_INDEX and keeps reading while the
    ``messages`` queue is non-empty. Each time about RECORDING_SECONDS of
    audio has been collected, the batch (a list of byte strings) is put on
    the ``recordings`` queue. When recording stops, any partial batch is
    queued as well so the last words spoken are not lost.

    Args:
        chunk: Number of audio frames read from the stream per call.
    """
    p = pyaudio.PyAudio()
    try:
        stream = p.open(format=AUDIO_FORMAT,
                        channels=CHANNELS,
                        rate=SAMPLING_RATE,
                        input=True,
                        input_device_index=DEVICE_INDEX,
                        frames_per_buffer=chunk)
        try:
            # Number of reads that make up one batch of RECORDING_SECONDS
            reads_per_batch = (SAMPLING_RATE * RECORDING_SECONDS) / chunk
            frames = []

            while not messages.empty():
                # Reads `chunk` audio frames at a time
                frames.append(stream.read(chunk))

                # Hands the batch over and starts a new one once it is long enough
                if len(frames) >= reads_per_batch:
                    recordings.put(frames)
                    frames = []

            # Recording was stopped mid-batch: queue what was captured so far
            if frames:
                recordings.put(frames)
        finally:
            # Closes the connection to the microphone, even if reading failed
            stream.stop_stream()
            stream.close()
    finally:
        # Releases the audio interface, even if the stream could not be opened
        p.terminate()

In [34]:
# Loads the small US-English Vosk model by name (downloads it; presumably only
# when no local copy exists yet -- TODO confirm against the Vosk documentation)
voskModel = Model(model_name="vosk-model-small-en-us-0.15")
# Recognizer created with the same sampling rate the microphone stream is opened with
rec = KaldiRecognizer(voskModel, SAMPLING_RATE)
# Presumably adds word-level details to the results; the code in this cell
# only reads the "text" field
rec.SetWords(True)

def speechRecognition(transcript, poll_seconds=0.5):
    """Transcribe queued audio batches and append the text to ``transcript``.

    Takes batches off the ``recordings`` queue, feeds them to the Vosk
    recognizer ``rec`` and appends the "text" field of each result to the
    output widget. The loop keeps running while the ``messages`` queue is
    non-empty; after the stop signal it finishes any batches that are still
    queued and then returns, instead of blocking forever on an empty queue.

    Args:
        transcript: Output widget that receives the text via ``append_stdout``.
        poll_seconds: How long to wait for a batch before re-checking the
            stop signal.
    """
    # Local import: the notebook's import cell only brings in Queue
    from queue import Empty

    while True:
        # Read the stop signal *before* waiting, so that a batch queued just
        # before the recorder shut down still gets one full wait to arrive
        stopRequested = messages.empty()

        try:
            frames = recordings.get(timeout=poll_seconds)
        except Empty:
            if stopRequested:
                break
            continue

        rec.AcceptWaveform(b''.join(frames))
        text = json.loads(rec.Result())["text"]

        transcript.append_stdout(text + '\n')
        
        

Defines the callback functions that give the buttons their behaviour

In [35]:
def startAudio(data):
    """Click handler for 'Record Audio': start recording and transcribing.

    Does nothing if a session is already running. Otherwise a second click
    would start a second recorder/transcriber pair and queue a second flag,
    so a single 'Stop Recording' click would no longer stop the recording.

    Args:
        data: Argument supplied by the button's ``on_click`` callback (unused).
    """
    # A non-empty queue means the worker threads are already running
    if not messages.empty():
        return

    # Tells the threads to keep running
    messages.put(True)

    # Sends displayed output to the transcript widget
    with transcript:

        display("Recording Started:")

        # Runs microphoneData in a background thread
        record = Thread(target=microphoneData)
        record.start()

        # Runs speechRecognition in a background thread
        transcribe = Thread(target=speechRecognition, args=(transcript,))
        transcribe.start()

def stopAudio(data):
    """Click handler for 'Stop Recording': signal the worker threads to stop.

    Empties the ``messages`` queue without blocking. A blocking ``get()``
    would freeze the kernel if the button were clicked while nothing is
    recording, and removing only one flag would leave the threads running
    if several flags had been queued.

    Args:
        data: Argument supplied by the button's ``on_click`` callback (unused).
    """
    # Local import: the notebook's import cell only brings in Queue
    from queue import Empty

    wasRecording = False
    try:
        # Takes every flag off the queue so the worker loops see it empty
        while True:
            messages.get_nowait()
            wasRecording = True
    except Empty:
        pass

    # Only report a finished recording if one was actually running
    if wasRecording:
        with transcript:
            display("Recording Finished")

Registers the functions above as the click handlers of the two buttons

In [36]:
# Attach each click handler to its button
for button, handler in ((recordWidget, startAudio), (stopWidget, stopAudio)):
    button.on_click(handler)

Displays the buttons and transcription output

In [37]:
# Show both buttons followed by the transcript area, in that order
display(recordWidget, stopWidget, transcript)

Button(button_style='success', description='Record Audio', icon='microphone', style=ButtonStyle())

Button(button_style='danger', description='Stop Recording', icon='stop', style=ButtonStyle())

Output()