Skip to content

Commit e3a64fd

Browse files
authored
Add --use-text-input and --use-text-output.
Add --use-text-input and --use-text-output.
2 parents 94a2e7c + e4c5894 commit e3a64fd

File tree

2 files changed

+98
-5
lines changed

2 files changed

+98
-5
lines changed

google-assistant-sdk/googlesamples/assistant/grpc/pushtotalk.py

Lines changed: 31 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -41,13 +41,15 @@
4141
assistant_helpers,
4242
audio_helpers,
4343
browser_helpers,
44-
device_helpers
44+
device_helpers,
45+
text_helpers
4546
)
4647
except (SystemError, ImportError):
4748
import assistant_helpers
4849
import audio_helpers
4950
import browser_helpers
5051
import device_helpers
52+
import text_helpers
5153

5254

5355
ASSISTANT_API_ENDPOINT = 'embeddedassistant.googleapis.com'
@@ -256,6 +258,10 @@ def gen_assist_requests(self):
256258
help='Enable visual display of Assistant responses in HTML.')
257259
@click.option('--verbose', '-v', is_flag=True, default=False,
258260
help='Verbose logging.')
261+
@click.option('--use-text-input', is_flag=True, default=False,
262+
help='Use the hacky text-to-speech stuff.')
263+
@click.option('--use-text-output', is_flag=True, default=False,
264+
help='Use the hacky speech-to-text stuff.')
259265
@click.option('--input-audio-file', '-i',
260266
metavar='<input file>',
261267
help='Path to input audio file. '
@@ -293,7 +299,8 @@ def gen_assist_requests(self):
293299
help='Force termination after a single conversation.')
294300
def main(api_endpoint, credentials, project_id,
295301
device_model_id, device_id, device_config,
296-
lang, display, verbose,
302+
lang, display, verbose, use_text_input,
303+
use_text_output,
297304
input_audio_file, output_audio_file,
298305
audio_sample_rate, audio_sample_width,
299306
audio_iter_size, audio_block_size, audio_flush_size,
@@ -336,7 +343,13 @@ def main(api_endpoint, credentials, project_id,
336343

337344
# Configure audio source and sink.
338345
audio_device = None
339-
if input_audio_file:
346+
if use_text_input:
347+
audio_source = text_helpers.TextSource(
348+
"",
349+
sample_rate=audio_sample_rate,
350+
sample_width=audio_sample_width
351+
)
352+
elif input_audio_file:
340353
audio_source = audio_helpers.WaveSource(
341354
open(input_audio_file, 'rb'),
342355
sample_rate=audio_sample_rate,
@@ -351,7 +364,13 @@ def main(api_endpoint, credentials, project_id,
351364
flush_size=audio_flush_size
352365
)
353366
)
354-
if output_audio_file:
367+
368+
if use_text_output:
369+
audio_sink = text_helpers.TextSink(
370+
sample_rate=audio_sample_rate,
371+
sample_width=audio_sample_width
372+
)
373+
elif output_audio_file:
355374
audio_sink = audio_helpers.WaveSink(
356375
open(output_audio_file, 'wb'),
357376
sample_rate=audio_sample_rate,
@@ -452,8 +471,15 @@ def blink(speed, number):
452471
# and playing back assistant response using the speaker.
453472
# When the once flag is set, don't wait for a trigger. Otherwise, wait.
454473
wait_for_user_trigger = not once
474+
first = True
455475
while True:
456-
if wait_for_user_trigger:
476+
if use_text_input:
477+
if not first and use_text_output:
478+
logging.info('Recognized response from Assistant: ' + audio_sink.recognize())
479+
audio_sink.reset()
480+
audio_source.reset_text(click.prompt(''))
481+
first = False
482+
elif wait_for_user_trigger:
457483
click.pause(info='Press Enter to send a new request...')
458484
continue_conversation = assistant.assist()
459485
# wait for user trigger if there is no follow-up turn in
Lines changed: 67 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,67 @@
1+
import io
2+
import wave
3+
4+
try:
5+
from . import audio_helpers
6+
except (SystemError, ImportError):
7+
# needs ffmpeg
8+
import audio_helpers
9+
10+
import speech_recognition as sr
11+
12+
from gtts import gTTS
13+
from pydub import AudioSegment
14+
15+
class TextSource(audio_helpers.WaveSource):
    """Audio source that uses text-to-speech as the backing store.

    The given text is synthesized to MP3 with gTTS, decoded to raw
    samples with pydub (requires ffmpeg on PATH), and served through the
    inherited ``WaveSource`` read path. Silence is returned when there
    is no current text.

    Args:
      text: the initial text to read (may be empty).
      sample_rate: sample rate in hertz.
      sample_width: size of a single sample in bytes.
    """
    def __init__(self, text, sample_rate, sample_width):
        self._text = text
        self._sample_rate = sample_rate
        self._sample_width = sample_width
        self._sleep_until = 0
        self._wavep = None
        # Always give the inherited read path a buffer to read from.
        # An empty buffer yields silence until reset_text() is called.
        # (Previously self._fp was left unset when text was falsy,
        # which raised AttributeError on the first read.)
        self._fp = io.BytesIO()
        if text:
            self.reset_text(text)

    def reset_text(self, text):
        """Replace the current text and re-synthesize the audio buffer.

        Args:
          text: the new text to speak on subsequent reads.
        """
        self._text = text
        with io.BytesIO() as fp:
            # Pass the language by keyword: newer gTTS releases changed
            # the positional parameter order, so gTTS(text, 'en') would
            # silently bind 'en' to the wrong parameter.
            tts = gTTS(text, lang='en')
            tts.write_to_fp(fp)
            fp.seek(0)
            # NOTE(review): pydub's MP3 decoding shells out to ffmpeg;
            # ._data is the raw sample bytes of the decoded segment.
            data = AudioSegment.from_mp3(fp)._data
        self._fp = io.BytesIO(data)

    def close(self):
        """No-op: the in-memory buffer needs no explicit cleanup."""
        pass
47+
48+
class TextSink(audio_helpers.WaveSink):
    """Audio sink that buffers assistant audio for speech-to-text.

    Samples written through the inherited ``WaveSink`` path accumulate
    in an in-memory WAV buffer; ``recognize()`` transcribes the buffer
    on demand with the speech_recognition Google recognizer.

    Args:
      sample_rate: sample rate in hertz.
      sample_width: size of a single sample in bytes.
    """
    def __init__(self, sample_rate, sample_width):
        self._sample_rate = sample_rate
        self._sample_width = sample_width

        self._r = sr.Recognizer()
        # reset() performs the actual WaveSink initialization.
        self.reset()

    def recognize(self):
        """Transcribe the buffered audio and return the recognized text.

        NOTE(review): recognize_google calls a network service and can
        raise sr.UnknownValueError / sr.RequestError on failure — confirm
        the caller is prepared to handle those.
        """
        # Rewind so the recognizer reads the whole captured WAV stream.
        self._buf.seek(0)
        with sr.AudioFile(self._buf) as source:
            audio = self._r.record(source)
            text = self._r.recognize_google(audio)
        return text

    def reset(self):
        """Discard buffered audio and rebind the sink to a fresh buffer."""
        self._buf = io.BytesIO()
        # Unusual but deliberate: re-running WaveSink.__init__ points the
        # parent's write path at the new BytesIO without duplicating its
        # setup logic here.
        super().__init__(self._buf, self._sample_rate, self._sample_width)
66+
67+

0 commit comments

Comments
 (0)