# The Ultimate Guide To Speech Recognition With Python

In [None]:
# Tutorial
# https://realpython.com/python-speech-recognition/
# Sample audio files
# https://github.com/realpython/python-speech-recognition/tree/master/audio_files
# AAC-to-WAV converter
# https://convertio.co/aac-wav/

## Using speech_recognition

In [1]:
import speech_recognition as sr
import os
import sys

In [2]:
# Optional: uncomment to change into the project folder before running the cells below.
# os.chdir('./speech_recognition')
# Display the working directory; the relative 'audio_files' paths used later resolve against it.
os.getcwd()

'c:\\Users\\nn7fr\\Dropbox\\share\\nvnkrus\\advanced-python\\speech_recognition'

In [3]:
# Transcribe the bundled English sample with the Google Web Speech API.
fname = os.path.join('audio_files', 'harvard.wav')
r = sr.Recognizer()              # performs the recognition
harvard = sr.AudioFile(fname)    # file-backed audio source
with harvard as source:
    audio = r.record(source)     # read the source into an AudioData object
print(type(r), type(harvard), type(audio))
r.recognize_google(audio)  # no language given, so the default ('en-US') applies

<class 'speech_recognition.Recognizer'> <class 'speech_recognition.AudioFile'> <class 'speech_recognition.audio.AudioData'>


'the stale smell of old beer lingers it takes heat to bring out the odor a cold dip restores health and zest a salt pickle taste fine with ham tacos al pastor are my favorite a zestful food is the hot cross bun'

In [4]:
# https://cloud.google.com/speech-to-text/docs/speech-to-text-supported-languages
# Transcribe a Vietnamese recording. The language tag has the form
# <language>-<REGION>; Vietnamese (Vietnam) is 'vi-VN'. 'vi-VI' is not a valid
# tag ('VI' is not the region code for Vietnam), so the documented one is used.
r = sr.Recognizer()             # recognizer for speech recognition
fname = os.path.join('audio_files', 'ngoctram_voice_001.wav')
# NOTE(review): the name `harvard` is carried over from the previous cell even
# though this is a different recording; kept in case other cells refer to it.
harvard = sr.AudioFile(fname)   # audio object
with harvard as source:         # audio data
    audio = r.record(source)
print(type(r),type(harvard),type(audio))
r.recognize_google(audio,language='vi-VN') # vietnamese

<class 'speech_recognition.Recognizer'> <class 'speech_recognition.AudioFile'> <class 'speech_recognition.audio.AudioData'>


'Chồng yêu ơi chồng ở nhà chưa'

In [5]:
# Exploration aid: print the signature and docstring of recognize_google
# (shows the accepted keyword arguments such as `language` and `show_all`).
help(r.recognize_google)

Help on method recognize_legacy in module speech_recognition.recognizers.google:

recognize_legacy(audio_data: 'AudioData', key: 'str | None' = None, language: 'str' = 'en-US', pfilter: 'ProfanityFilterLevel' = 0, show_all: 'bool' = False, with_confidence: 'bool' = False) method of speech_recognition.Recognizer instance
    Performs speech recognition on ``audio_data`` (an ``AudioData`` instance), using the Google Speech Recognition API.
    
    The Google Speech Recognition API key is specified by ``key``. If not specified, it uses a generic key that works out of the box. This should generally be used for personal or testing purposes only, as it **may be revoked by Google at any time**.
    
    To obtain your own API key, simply following the steps on the `API Keys <http://www.chromium.org/developers/how-tos/api-keys>`__ page at the Chromium Developers site. In the Google Developers Console, Google Speech Recognition is listed as "Speech API".
    
    The recognition language is de

In [8]:
# One-time conversion of the AAC recording to WAV (requires pydub). Kept commented out;
# uncomment and run only if audio_files/ngoctram_voice_001.wav has to be regenerated.
# from pydub import AudioSegment

# # Load AAC audio from file (replace with your actual file path)
# fname = os.path.join('audio_files', 'ngoctram_voice_001.aac')
# aac_audio = AudioSegment.from_file(fname, format="aac")

# # Export to WAV format
# fname = os.path.join('audio_files', 'ngoctram_voice_001.wav')
# aac_audio.export(fname, format="wav")


In [10]:
import speech_recognition as sr
from guessing_game import recognize_speech_from_mic
# Fresh recognizer and microphone for the live-capture cells below.
r = sr.Recognizer()
# No device_index is passed here (presumably the system default input is used — confirm).
m = sr.Microphone()

In [11]:
# Helper imported from guessing_game; presumably records from `m` and transcribes
# with `r` (confirm against that module). In the recorded run it returned a dict
# with 'success', 'error' and 'transcription' keys.
recognize_speech_from_mic(r, m)  # speak after running this line

{'success': True, 'error': 'Unable to recognize speech', 'transcription': None}

In [12]:
# List the audio device names the library can see. A name's position in the
# list is presumably the value to pass as Microphone(device_index=...) — confirm.
sr.Microphone.list_microphone_names()

['Microsoft Sound Mapper - Input',
 'Microphone (A4tech FHD 1080P PC',
 'Microsoft Sound Mapper - Output',
 '헤드폰 (Realtek(R) Audio)',
 '스피커 (Realtek(R) Audio)',
 'Primary Sound Capture Driver',
 'Microphone (A4tech FHD 1080P PC Camera)',
 'Primary Sound Driver',
 '헤드폰 (Realtek(R) Audio)',
 '스피커 (Realtek(R) Audio)',
 '스피커 (Realtek(R) Audio)',
 '헤드폰 (Realtek(R) Audio)',
 'Microphone (A4tech FHD 1080P PC Camera)',
 'Headphones (Realtek HD Audio 2nd output)',
 'Microphone (Realtek HD Audio Mic input)',
 'Speakers (Realtek HD Audio output)',
 'Stereo Mix (Realtek HD Audio Stereo input)',
 'Headphones ()',
 'Headset (@System32\\drivers\\bthhfenum.sys,#2;%1 Hands-Free%0\r\n;(Nhan’s AirPods Pro))',
 'Headset (@System32\\drivers\\bthhfenum.sys,#2;%1 Hands-Free%0\r\n;(Nhan’s AirPods Pro))',
 'Microphone (A4tech FHD 1080P PC Camera)',
 'Output (@System32\\drivers\\bthhfenum.sys,#4;%1 Hands-Free HF Audio%0\r\n;(NIPHONE))',
 'Input (@System32\\drivers\\bthhfenum.sys,#4;%1 Hands-Free HF Audio%0\r\n;

In [None]:
# Bind a microphone by explicit device index instead of relying on the default.
# NOTE(review): index 1 is specific to the machine this notebook was run on, and
# `mic` is not used by the later cells shown here (they record from `m`) — confirm intent.
mic = sr.Microphone(device_index=1)

In [16]:
# Record one phrase from the microphone and transcribe it as Vietnamese.
with m as source:
    audio = r.listen(source)  # listen() records a single phrase from the source
# 'vi-VN' is the valid tag for Vietnamese (Vietnam); 'vi-VI' is not a valid tag.
r.recognize_google(audio,language='vi-VN') # vietnamese

'muốn tạo một bức tranh trong đó có một con đường Ron đang chạy trên một cánh đồng để cắt để thu hoạch những trái cây'

In [14]:
# Run recognition on the current `audio` without a language argument, so the
# default language of recognize_google ('en-US') is used.
r.recognize_google(audio)

'hello'

## Using VietAI ASR
https://github.com/vietai/ASR

In [1]:
from transformers import Wav2Vec2Processor, Wav2Vec2ForCTC
from datasets import load_dataset
import soundfile as sf
import torch

  torch.utils._pytree._register_pytree_node(


In [3]:
# load model and tokenizer (both come from the same Hugging Face checkpoint)
ASR_CHECKPOINT = "nguyenvulebinh/wav2vec2-base-vietnamese-250h"
processor = Wav2Vec2Processor.from_pretrained(ASR_CHECKPOINT)
model = Wav2Vec2ForCTC.from_pretrained(ASR_CHECKPOINT)

# define function to read in sound file
def map_to_array(batch):
    """Read the audio file named by ``batch["file"]`` into the batch.

    Args:
        batch: Mapping with a ``"file"`` entry holding a path readable by
            soundfile.

    Returns:
        The same ``batch`` object, updated in place with

        * ``"speech"``: the waveform as a 1-D array (multi-channel files are
          averaged down to mono), and
        * ``"sampling_rate"``: the sample rate reported by the file.
    """
    speech, sampling_rate = sf.read(batch["file"])
    # soundfile returns a (frames, channels) array for multi-channel audio.
    # A 2-D array would be read downstream as a batch of clips rather than
    # one recording, so collapse the channels into a single mono waveform.
    if speech.ndim > 1:
        speech = speech.mean(axis=1)
    batch["speech"] = speech
    # Keep the rate instead of discarding it so callers can check it against
    # the rate the model expects.
    batch["sampling_rate"] = sampling_rate
    return batch

# load dummy dataset and read soundfiles
audio_path = 'audio_files/ngoctram_voice_001.wav'
ds = map_to_array({
    "file": audio_path
})

# Read the recording's real sample rate from the file header. Passing it to the
# processor lets the feature extractor compare it with the rate the checkpoint
# was trained on and raise on a mismatch, instead of silently producing a
# garbage transcription. If this raises, resample the file to the expected
# rate before transcribing.
sampling_rate = sf.info(audio_path).samplerate

# tokenize
input_values = processor(
    ds["speech"],
    sampling_rate=sampling_rate,
    return_tensors="pt",
    padding="longest",
).input_values  # Batch size 1

# retrieve logits (inference only, so skip gradient tracking)
with torch.no_grad():
    logits = model(input_values).logits

# take argmax and decode
predicted_ids = torch.argmax(logits, dim=-1)
transcription = processor.batch_decode(predicted_ids)

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Some weights of the model checkpoint at nguyenvulebinh/wav2vec2-base-vietnamese-250h were not used when initializing Wav2Vec2ForCTC: ['wav2vec2.encoder.pos_conv_embed.conv.weight_v', 'wav2vec2.encoder.pos_conv_embed.conv.weight_g']
- This IS expected if you are initializing Wav2Vec2ForCTC from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing Wav2Vec2ForCTC from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of Wav2Vec2ForCTC were not initialized from the model checkpoint at nguyenvulebinh/wav2vec2-base-vietnamese-250h and are newly initialized: ['wav2vec2.encoder.pos_conv_embed.




In [4]:
transcription

['thần n đưa ra thần gán hàn xợ']