# Speech Recognition techniques

A walkthrough of the different functions in pyramidman that handle the recording, playing and processing of audio. It is mainly for didactic and testing purposes.

In [6]:
%load_ext autoreload
%autoreload 2

from pyramidman.audio_parameters import AudioParameters
from pyramidman.basic_audio_IO import play_audio, record_audio
from pyramidman.audio_utils import get_available_microphones, get_sysdefault_microphone_index, get_all_devices_str
from pyramidman.queue_utils import record_with_queue
from pyramidman.unwrapper import unwrap
from pyramidman.speech_recognizing import recognize_speech_from_mic

from pyramidman.hieroglyph import plot_timeseries_range_slider, create_tabs, plot_spectrogram
from pyramidman.hieroglyph import add_word_annotations

from pyramidman.audio_utils import calibrate_microphone

from pyramidman.deepspeech_tools import transcribe, DeepSpeechArgs

import speech_recognition as sr
import time
import numpy as np
from scipy import signal
from scipy.io import wavfile

import matplotlib.pyplot as plt
%matplotlib qt

import plotly.graph_objs as go
from IPython.display import display
import ipywidgets as widgets

from queue import Queue

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


# Instantiate and calibrate microphone

Ideally, we would like a background thread that, whenever a sentence is finished, transcribes and plots it. The following code sets up and calibrates the microphone needed for that.

In [7]:
# Create the audio configuration object shared by the recording, recognition
# and playback cells of this notebook.
audio_params = AudioParameters()
# Presumably selects the system-default microphone as the input device — confirm in pyramidman.
audio_params.set_sysdefault_microphone_index()
# Presumably fills in the default input (recording) settings — confirm in pyramidman.
audio_params.set_default_input_parameters()
# Disabled sample-rate override: 16000 is the rate the DeepSpeech notes in this
# notebook say the decoder requires.
# audio_params.sample_rate = 16000


In [13]:
# Microphone built from the shared audio parameters, plus the recogniser
# object that the later recording cell listens with.
mic = audio_params.get_microphone()
r = sr.Recognizer()

# Calibrate for 3 seconds with the dynamic energy threshold switched off.
calibrate_microphone(
    mic,
    r,
    duration=3,
    dynamic_energy_threshold=False,
)

Calibrating microphone for 3 seconds.
Calibrated


In [14]:
# The statements below only read recognizer/microphone attributes. Because they
# are bare expressions, only the last one (mic.SAMPLE_WIDTH) is displayed.

# Maximum number of seconds of non-speaking audio kept before and after the phrase.
r.non_speaking_duration

# Number of non-speaking seconds to be considered end of sentence.
r.pause_threshold

# Minimum number of seconds of a sentence.
r.phrase_threshold

# Energy level used as the speech/silence threshold (see the SpeechRecognition
# library reference for the exact definition).
r.energy_threshold

# Number of bytes in each audio sample.
mic.SAMPLE_WIDTH

2

## Record example message to process with the recognizers

In [15]:
# Local import: only this cell needs it, to create the output folder.
import os

# Destination of the example recording; reused by the plotting, DeepSpeech and
# playback cells further down.
filename_mic = '../audios/temp/hello_world.wav'

# Keep at most 0.3 s of silence around the phrase and treat 0.3 s of silence
# as the end of the sentence.
r.non_speaking_duration = 0.3
r.pause_threshold = 0.3

# Capture one phrase from the microphone.
with mic as source:
    audio = r.listen(source)

# The temp folder may not exist on a fresh checkout; create it so that open()
# does not fail with FileNotFoundError after the recording has been made.
os.makedirs(os.path.dirname(filename_mic), exist_ok=True)

# Persist the recording as a WAV file.
with open(filename_mic, "wb") as f:
    f.write(audio.get_wav_data())

In [169]:
# Build the tabbed audio viewer for the recorded WAV file and render it.
# NOTE(review): get_audio_menu_wav_file is not among the names imported in the
# first cell, so this raises NameError on a fresh kernel — confirm which
# pyramidman module provides it and add the import there.
tabs = get_audio_menu_wav_file(filename_mic)
display(tabs)

Tab(children=(FigureWidget({
    'data': [{'line': {'color': 'deepskyblue'},
              'name': 'AAPL High'…

## Speech recognition


In [20]:
# Run the pyramidman speech recogniser with the shared audio parameters.
# The result is indexed in the next cell by 'success', 'error' and 'transcription'.
response = recognize_speech_from_mic(audio_params)

In [21]:
# Report layout: status line, error line, then the transcription under a
# 17-character dashed rule.
report_template = '\nSuccess : {}\nError   : {}\n\nText from Speech\n{}\n\n{}'
dashed_rule = '-' * 17

report_text = report_template.format(
    response['success'],
    response['error'],
    dashed_rule,
    response['transcription'],
)
print(report_text)


Success : True
Error   : Unable to recognize speech

Text from Speech
-----------------

None


# DeepSpeech recognizer

We have installed and used this decoder. It has the following peculiarities:
- It only works with audio sampled at 16000 Hz, so we need to resample the 48000 Hz recording. It is still better to record at 48000 Hz and then low-pass filter before reducing the rate (not just downsample), to avoid aliasing.
- It can return metadata with the likelihood of the transcription and the start_time and duration of each word.
- There seems to be an error in the decoder, as it always assigns time 0 to the first letter it decodes, regardless of when that letter actually occurs.


In [171]:
# Transcribe the recorded WAV file with DeepSpeech, using a default-constructed
# DeepSpeechArgs. The result is indexed by "sentence", "words" and "characters"
# in the cells below.
args = DeepSpeechArgs()
metadata = transcribe(args, filename_mic)

In [173]:
# Full decoded sentence as a single string.
metadata["sentence"]

'no my friend'

In [174]:
# Per-word entries (word, start time, duration).
# NOTE(review): the displayed keys read 'start_time ' with a trailing space —
# looks like a typo in the code that builds these dicts; confirm before relying on the key.
metadata["words"]

[{'word': 'no', 'start_time ': 0.0, 'duration': 2.0},
 {'word': 'my', 'start_time ': 2.04, 'duration': 0.26},
 {'word': 'friend', 'start_time ': 2.3, 'duration': 0.26}]

In [172]:
# One [character, start_time, timestep] row per decoded character.
[
    [char_info.character, char_info.start_time, char_info.timestep]
    for char_info in metadata["characters"].items
]

[['n', 0.0, 0],
 ['o', 1.8199999332427979, 91],
 [' ', 2.0, 100],
 ['m', 2.0399999618530273, 102],
 ['y', 2.0999999046325684, 105],
 [' ', 2.299999952316284, 115],
 ['f', 2.299999952316284, 115],
 ['r', 2.359999895095825, 118],
 ['i', 2.440000057220459, 122],
 ['e', 2.4800000190734863, 124],
 ['n', 2.5199999809265137, 126],
 ['d', 2.559999942779541, 128]]

In [175]:
# Rebuild the tabbed audio viewer and pass its first tab, together with the
# DeepSpeech word entries, to add_word_annotations before rendering.
# NOTE(review): get_audio_menu_wav_file is not among the names imported in the
# first cell, so this raises NameError on a fresh kernel — confirm which
# pyramidman module provides it and add the import there.
tabs = get_audio_menu_wav_file(filename_mic)
add_word_annotations(tabs.children[0],metadata["words"])
display(tabs)

Tab(children=(FigureWidget({
    'data': [{'line': {'color': 'deepskyblue'},
              'name': 'AAPL High'…

In [117]:
# Play the recorded WAV file back using the shared audio parameters.
play_audio(audio_params, filename_mic)