## Installing Required Packages

In [None]:
!pip install SpeechRecognition wavio ffmpeg-python gtts
!mkdir sounds
!wget https://raw.githubusercontent.com/myprogrammerpersonality/Voice_Recognition/master/Template.csv

Collecting SpeechRecognition
[?25l  Downloading https://files.pythonhosted.org/packages/26/e1/7f5678cd94ec1234269d23756dbdaa4c8cfaed973412f88ae8adf7893a50/SpeechRecognition-3.8.1-py2.py3-none-any.whl (32.8MB)
[K     |████████████████████████████████| 32.8MB 120kB/s 
[?25hCollecting wavio
  Downloading https://files.pythonhosted.org/packages/e6/98/8bf5ea39a3385cc806ba1146a280a113835e5df3b0ad25ca95eea8352040/wavio-0.0.4-py2.py3-none-any.whl
Collecting ffmpeg-python
  Downloading https://files.pythonhosted.org/packages/d7/0c/56be52741f75bad4dc6555991fabd2e07b432d333da82c11ad701123888a/ffmpeg_python-0.2.0-py3-none-any.whl
Collecting gtts
  Downloading https://files.pythonhosted.org/packages/a1/0c/4ca77eca3b739a4a08360930643f58d714e302fee0d2f8c654e67d9af8e7/gTTS-2.1.1-py3-none-any.whl
Collecting gtts-token>=1.1.3
  Downloading https://files.pythonhosted.org/packages/e7/25/ca6e9cd3275bfc3097fe6b06cc31db6d3dfaf32e032e0f73fead9c9a03ce/gTTS-token-1.1.3.tar.gz
Building wheels for collected pa

## Import Packages and Define Functions

In [None]:
import scipy
from scipy.io.wavfile import read as wav_read
import io
from IPython.display import HTML, Audio, clear_output
from google.colab.output import eval_js
from base64 import b64decode
import numpy as np
import ffmpeg
import IPython.display as ipd
from IPython.display import Javascript
import speech_recognition as sr
import matplotlib.pyplot as plt
import time
import wavio
import pandas as pd
from gtts import gTTS #Import Google Text to Speech

# JavaScript source injected into the notebook output by record() (via Javascript(RECORD))
# and then invoked from Python with eval_js('record(<milliseconds>)').
#
# It defines three browser-side helpers:
#   sleep(time)  - promise that resolves after `time` milliseconds.
#   b2text(blob) - promise that resolves with the blob read as a data URL (FileReader).
#   record(time) - asks for microphone access, records with MediaRecorder for `time`
#                  milliseconds, and resolves with the data-URL text of the recording.
#
# NOTE(review): `stream`, `recorder`, `chunks`, `blob` and `text` are assigned without
# const/let/var, and nothing in this snippet stops the media stream's tracks after
# recording - worth confirming whether the microphone stays open between calls.
RECORD = """
const sleep  = time => new Promise(resolve => setTimeout(resolve, time))
const b2text = blob => new Promise(resolve => {
  const reader = new FileReader()
  reader.onloadend = e => resolve(e.srcElement.result)
  reader.readAsDataURL(blob)
})
var record = time => new Promise(async resolve => {
  stream = await navigator.mediaDevices.getUserMedia({ audio: true })
  recorder = new MediaRecorder(stream)
  chunks = []
  recorder.ondataavailable = e => chunks.push(e.data)
  recorder.start()
  await sleep(time)
  recorder.onstop = async ()=>{
    blob = new Blob(chunks)
    text = await b2text(blob)
    resolve(text)
  }
  recorder.stop()
})
"""
# HTML/CSS template for the large on-screen prompt; filled in with str.format().
#
# It takes three positional placeholders, in order:
#   1. the text shown in the <legend>
#   2. the value shown before "==> Sample" in the heading
#   3. the value shown after "==> Sample" in the heading
#
# The CSS braces are doubled ({{ and }}) so that str.format() emits literal braces
# instead of treating them as placeholders.
# NOTE(review): the <section> opened near the end is never closed in this template.
output_html = """<style>
    fieldset {{
    font-family: sans-serif;
    border: 5px solid #1F497D;
    background: #ddd;
    border-radius: 5px;
    padding: 15px;
}}

fieldset legend {{
    background: #1F497D;
    color: #fff;
    padding: 5px 10px ;
    font-size: 32px;
    border-radius: 10px;
    box-shadow: 0 0 0 5px #ddd;
    margin-left: 20px;
}}
</style>

<section style="margin: 15px;">
<fieldset style="min-height:100px;">
<legend><b> {} </b> </legend>
<label> <h1 style="font-size: 80px;float: top;">{} ==> Sample {}</h1><br/> </label>
</fieldset>"""

def record(sec=3, file_name = 'temp.wav', verbose=False):
    """Record from the browser microphone and return the decoded audio samples.

    The RECORD JavaScript is injected into the cell output and asked to record
    for ``sec`` seconds. The browser returns the recording as a base64 data URL,
    which is decoded, piped through ffmpeg to obtain WAV bytes, and parsed with
    ``scipy.io.wavfile.read``.

    Args:
        sec: Recording length in seconds (sent to the browser as whole milliseconds).
        file_name: Not used by this function; kept so existing calls stay valid.
        verbose: When truthy, print a message before and after recording.

    Returns:
        A tuple ``(audio, sample_rate)``: the sample array and the sampling rate
        as parsed by ``wav_read`` (note the order is the reverse of ``wav_read``'s).

    Raises:
        ValueError: If ffmpeg produced too little output to contain a RIFF header;
            the message includes ffmpeg's stderr to make the failure diagnosable.
    """
    if verbose: print('Start Recording :')
    display(Javascript(RECORD))
    data_url = eval_js('record(%d)' % (sec*1000))
    # The data URL looks like "<header>,<base64 payload>"; keep only the payload.
    recorded_bytes = b64decode(data_url.split(',')[1])

    process = (ffmpeg
        .input('pipe:0')
        .output('pipe:1', format='wav')
        .run_async(pipe_stdin=True, pipe_stdout=True, pipe_stderr=True, quiet=True, overwrite_output=True))
    wav_bytes, ffmpeg_err = process.communicate(input=recorded_bytes)

    # Without at least the 8-byte RIFF preamble there is nothing to patch or parse;
    # fail here with ffmpeg's own diagnostics instead of an obscure parser error.
    if len(wav_bytes) < 8:
        raise ValueError(
            'ffmpeg did not produce WAV data: '
            + (ffmpeg_err or b'').decode(errors='replace')
        )

    # Overwrite bytes 4:8 with the actual RIFF chunk size (total length minus the
    # 8-byte preamble), little-endian. Presumably needed because ffmpeg cannot go
    # back and fill in the size when writing to a pipe - TODO confirm.
    riff_chunk_size = (len(wav_bytes) - 8) & 0xFFFFFFFF
    riff = wav_bytes[:4] + riff_chunk_size.to_bytes(4, 'little') + wav_bytes[8:]

    # Named sample_rate (not sr) so the speech_recognition alias `sr` is not shadowed.
    sample_rate, audio = wav_read(io.BytesIO(riff))

    if verbose: print('Recording Finished')
    return audio, sample_rate

def hearing(step_sec = 5, key_word = 'go', stop_word = 'stop', verbose = False):
    """Listen in fixed-length rounds until the key word or the stop word is heard.

    Each round records ``step_sec`` seconds with ``record()``, writes the audio
    to 'sound.wav', and sends it to Google speech recognition. Matching is a
    case-insensitive substring test on the recognised text, and the key word is
    checked before the stop word. Rounds repeat indefinitely until one matches.

    Args:
        step_sec: Length of each recording round in seconds.
        key_word: Word or phrase that means "continue".
        stop_word: Word or phrase that means "stop".
        verbose: Truthy prints the round number and per-part timings (timings are
            printed only for rounds that did not match); a value >= 2 also prints
            the recognised text.

    Returns:
        1 when the key word was heard, 0 when the stop word was heard.
    """
    key = key_word.lower()
    key_stop = stop_word.lower()
    # One recognizer serves every round; it is only used to read files here.
    recognizer = sr.Recognizer()
    num = 0
    while True:
        num += 1
        if verbose: print(f'Round{num}')

        # Part 1: Recording
        t1 = time.time()
        audio, sound_rate = record(sec=step_sec, verbose=False)

        # Part 2: Saving Audio File
        t2 = time.time()
        wavio.write('sound.wav', audio, sound_rate)

        # Part 3: Try to Recognize and Check for Key_Word
        t3 = time.time()
        # AudioFile is the current name; WavFile is only a legacy alias for it.
        with sr.AudioFile('sound.wav') as source:
            audio_data = recognizer.record(source)
        try:
            text = recognizer.recognize_google(audio_data).lower()
        except sr.UnknownValueError:
            # Nothing intelligible in this round - normal, just listen again.
            pass
        except Exception as exc:
            # Still best-effort (e.g. the recognition service was unreachable), but
            # no longer silent, and KeyboardInterrupt/SystemExit now propagate so
            # the loop can actually be interrupted.
            if verbose: print(f'Recognition failed: {exc}')
        else:
            if verbose >= 2: print(f'You Said :{text}')
            if key in text:
                return 1
            if key_stop in text:
                return 0

        if verbose:print(f'Part 1 {t2-t1}')
        if verbose:print(f'Part 2 {t3-t2}')
        if verbose:print(f'Part 3 {time.time()-t3}')

## Text to Speech

In [None]:
# Load the aliquoting template downloaded earlier in the notebook.
data = pd.read_csv('Template.csv')

# main_dict maps each value of the 'Metabolite' column to a two-item list:
#   [0] the values from every column after the first, taken from the first row
#       whose 'Metabolite' equals that name
#   [1] the positions of those values ordered by ascending value
main_dict = {}
for name in data['Metabolite']:
    vols = list(data.loc[data['Metabolite'] == name].iloc[:, 1:].values[0])
    ascending = sorted(range(len(vols)), key=vols.__getitem__)
    main_dict[name] = [vols, ascending]

# Pre-generate every spoken prompt: one announcement per metabolite, plus one
# per sample (file names use the zero-based position, the speech is one-based).
# NOTE(review): gTTS is understood to emit MP3 audio, so these '.wav' files
# presumably contain MP3 data - confirm before feeding them to a WAV-only tool.
for name, (vols, _ascending) in main_dict.items():
    tts = gTTS('Start Aliquoting {}'.format(name))
    tts.save('sounds/{}.wav'.format(name))
    for i, vol in enumerate(vols):
        tts = gTTS('{} in Sample {}'.format(vol, i+1))
        tts.save('sounds/{}_{}.wav'.format(name, i))

## Main Part

In [7]:
# Guided aliquoting: announce each metabolite, then step through its samples in
# ascending-volume order, waiting for a spoken command after every prompt.
for name, (vols, order) in main_dict.items():
    print('Start Aliquoting ', name)
    display(Audio('sounds/{}.wav'.format(name), autoplay=True))
    display(HTML(output_html.format(name, "#", "#")))
    time.sleep(4)
    clear_output(wait=True)
    time.sleep(2)
    for sample_idx in order:
        display(Audio('sounds/{}_{}.wav'.format(name, sample_idx), autoplay=True))
        display(HTML(output_html.format(name, vols[sample_idx], sample_idx+1)))
        # hearing() returns 1 for 'go' and 0 for 'stop'.
        keep_going = hearing(step_sec=5, key_word='go', stop_word='stop', verbose=2)
        clear_output(wait=True)
        if not keep_going:
            # 'stop' ends only this metabolite's samples; the outer loop continues.
            break

Round1


<IPython.core.display.Javascript object>

You Said :stop
