# Vosk

In [1]:
pip install vosk

Note: you may need to restart the kernel to use updated packages.


In [22]:
import vosk
from vosk import Model, KaldiRecognizer
model = Model('vosk-model-small-en-us-0.15')
recognizer = KaldiRecognizer(model,16000)

LOG (VoskAPI:ReadDataFiles():model.cc:213) Decoding params beam=10 max-active=3000 lattice-beam=2
LOG (VoskAPI:ReadDataFiles():model.cc:216) Silence phones 1:2:3:4:5:6:7:8:9:10
LOG (VoskAPI:RemoveOrphanNodes():nnet-nnet.cc:948) Removed 0 orphan nodes.
LOG (VoskAPI:RemoveOrphanComponents():nnet-nnet.cc:847) Removing 0 orphan components.
LOG (VoskAPI:CompileLooped():nnet-compile-looped.cc:345) Spent 0.041718 seconds in looped compilation.
LOG (VoskAPI:ReadDataFiles():model.cc:248) Loading i-vector extractor from vosk-model-small-en-us-0.15/ivector/final.ie
LOG (VoskAPI:ComputeDerivedVars():ivector-extractor.cc:183) Computing derived variables for iVector extractor
LOG (VoskAPI:ComputeDerivedVars():ivector-extractor.cc:204) Done.
LOG (VoskAPI:ReadDataFiles():model.cc:281) Loading HCL and G from vosk-model-small-en-us-0.15/graph/HCLr.fst vosk-model-small-en-us-0.15/graph/Gr.fst
LOG (VoskAPI:ReadDataFiles():model.cc:302) Loading winfo vosk-model-small-en-us-0.15/graph/phones/word_boundary.i

In [23]:
checkin = "where is the check in desk"
parents = "i have lost my parents"
suitcase = "please i have lost my suitcase"
what_time = "what time is my plane"
where = "where are the restaurants and shops"

en = [checkin, parents, suitcase, what_time, where]

In [24]:
import librosa as lr
import numpy as np
import os
from prettytable import PrettyTable
from jiwer import wer
import json
import wave

def translate(wf):
    transcription = []
    while True:
        data = wf.readframes(4000)
        if len(data) == 0:
            break
        if recognizer.AcceptWaveform(data):
            # Convert json output to dict
            result_dict = json.loads(recognizer.Result())
            # Extract text values and append them to transcription list
            transcription.append(result_dict.get("text", ""))

    # Get final bits of audio and flush the pipeline
    final_result = json.loads(recognizer.FinalResult())
    transcription.append(final_result.get("text", ""))

    # merge or join all list elements to one big string
    transcription_text = ' '.join(transcription)
    return transcription_text

def fileRead(directory):
    table = PrettyTable(["File","WER"])
    for filename in os.listdir(directory):
        filepath = os.path.join(directory, filename)
        if os.path.isfile(filepath):
            audio_file = filepath
            wf = wave.open(audio_file,'rb')
            res = translate(wf)
            transcription = filepath.strip("EN/")
            print(transcription)
            print(res)
            if transcription.startswith("checkin"):
                tablewer = wer(en[0],res)
            elif transcription.startswith("parents"):
                tablewer = wer(en[1],res)
            elif transcription.startswith("suitcase"):
                tablewer = wer(en[2],res)
            elif transcription.startswith("what_time"):
                tablewer = wer(en[3],res)
            elif transcription.startswith("where"):
                tablewer = wer(en[4],res)
        table.add_row([transcription, tablewer*100])
    print(table)

In [25]:
directory = 'EN/'
fileRead(directory)

where.wav
where are the restaurants and shops
parents.wav
i lost my parents
suitcase.wav
please have lost my suitcase
checkin.wav
where is the check in desk
what_time.wav
what time is my plane
+---------------+--------------------+
|      File     |        WER         |
+---------------+--------------------+
|   where.wav   |        0.0         |
|  parents.wav  |        20.0        |
|  suitcase.wav | 16.666666666666664 |
|  checkin.wav  |        0.0         |
| what_time.wav |        0.0         |
+---------------+--------------------+


Reference: https://towardsdatascience.com/transcribe-large-audio-files-offline-with-vosk-a77ee8f7aa28

# Google Speech Recognition

In [6]:
pip install SpeechRecognition

Note: you may need to restart the kernel to use updated packages.


In [26]:
import librosa as lr
import numpy as np
import os
from prettytable import PrettyTable
from jiwer import wer
import wave
import speech_recognition as sr

def fileRead(directory):
    table = PrettyTable(["File","WER"])
    for filename in os.listdir(directory):
        filepath = os.path.join(directory, filename)
        if os.path.isfile(filepath):
            AUDIO_FILE = filepath
            r = sr.Recognizer()
            with sr.AudioFile(AUDIO_FILE) as source:
                audio = r.record(source)  # read the entire audio file
            res = r.recognize_google(audio)
            transcription = filepath.strip("EN/")
            print(transcription)
            print(res)
            if transcription.startswith("checkin"):
                tablewer = wer(en[0],res)
            elif transcription.startswith("parents"):
                tablewer = wer(en[1],res)
            elif transcription.startswith("suitcase"):
                tablewer = wer(en[2],res)
            elif transcription.startswith("what_time"):
                tablewer = wer(en[3],res)
            elif transcription.startswith("where"):
                tablewer = wer(en[4],res)
        table.add_row([transcription, tablewer*100])
    print(table)

In [27]:
directory = 'EN'
fileRead(directory)

where.wav
where are the restaurants and shops
parents.wav
I have lost my parents
suitcase.wav
please I've lost my suitcase
checkin.wav
where is the check-in desk
what_time.wav
what time is my playing
+---------------+-------------------+
|      File     |        WER        |
+---------------+-------------------+
|   where.wav   |        0.0        |
|  parents.wav  |        20.0       |
|  suitcase.wav | 33.33333333333333 |
|  checkin.wav  | 33.33333333333333 |
| what_time.wav |        20.0       |
+---------------+-------------------+


Reference: https://github.com/Uberi/speech_recognition/blob/master/examples/audio_transcribe.py