## Installing Required Packages

In [1]:
!pip install SpeechRecognition wavio ffmpeg-python gtts
!mkdir sounds
!wget https://raw.githubusercontent.com/myprogrammerpersonality/Voice_Recognition/master/Template.csv

Collecting SpeechRecognition
  Downloading SpeechRecognition-3.10.0-py2.py3-none-any.whl (32.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m32.8/32.8 MB[0m [31m48.3 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting wavio
  Downloading wavio-0.0.8-py3-none-any.whl (9.4 kB)
Collecting ffmpeg-python
  Downloading ffmpeg_python-0.2.0-py3-none-any.whl (25 kB)
Collecting gtts
  Downloading gTTS-2.4.0-py3-none-any.whl (29 kB)
Installing collected packages: wavio, ffmpeg-python, SpeechRecognition, gtts
Successfully installed SpeechRecognition-3.10.0 ffmpeg-python-0.2.0 gtts-2.4.0 wavio-0.0.8
--2023-10-21 15:31:58--  https://raw.githubusercontent.com/myprogrammerpersonality/Voice_Recognition/master/Template.csv
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.108.133, 185.199.109.133, 185.199.110.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.108.133|:443... connected.
HTTP request sent, awaiting response... 2

## Import Packages and Define Functions

In [2]:
import scipy
from scipy.io.wavfile import read as wav_read
import io
from IPython.display import HTML, Audio, clear_output
from google.colab.output import eval_js
from base64 import b64decode
import numpy as np
import ffmpeg
import IPython.display as ipd
from IPython.display import Javascript
import speech_recognition as sr
import matplotlib.pyplot as plt
import time
import wavio
import pandas as pd
from gtts import gTTS #Import Google Text to Speech

# JavaScript injected into the notebook output frame. It defines a global
# `record(ms)` function that captures `ms` milliseconds of microphone audio and
# resolves to the recording encoded as a base64 data URL.
#
# `record` is an async function (not `new Promise(async ...)`) so that a failed
# getUserMedia call (e.g. microphone permission denied) rejects the promise
# instead of leaving the Python side waiting forever.
RECORD = """
const sleep  = time => new Promise(resolve => setTimeout(resolve, time))
const b2text = blob => new Promise(resolve => {
  const reader = new FileReader()
  reader.onloadend = () => resolve(reader.result)
  reader.readAsDataURL(blob)
})
var record = async time => {
  const stream = await navigator.mediaDevices.getUserMedia({ audio: true })
  const recorder = new MediaRecorder(stream)
  const chunks = []
  recorder.ondataavailable = e => chunks.push(e.data)
  const stopped = new Promise(resolve => { recorder.onstop = resolve })
  recorder.start()
  await sleep(time)
  recorder.stop()
  await stopped
  // Release the microphone so the browser stops capturing between rounds.
  stream.getTracks().forEach(track => track.stop())
  return b2text(new Blob(chunks))
}
"""
# HTML card shown for each aliquoting step. It is filled with str.format, so
# literal CSS braces are doubled ({{ }}). The three positional placeholders
# are, in order: the legend title, the value shown in the headline, and the
# sample number.
output_html = """<style>
    fieldset {{
    font-family: sans-serif;
    border: 5px solid #1F497D;
    background: #ddd;
    border-radius: 5px;
    padding: 15px;
}}

fieldset legend {{
    background: #1F497D;
    color: #fff;
    padding: 5px 10px ;
    font-size: 32px;
    border-radius: 10px;
    box-shadow: 0 0 0 5px #ddd;
    margin-left: 20px;
}}
</style>

<section style="margin: 15px;">
<fieldset style="min-height:100px;">
<legend><b> {} </b> </legend>
<label> <h1 style="font-size: 80px;">{} ==> Sample {}</h1><br/> </label>
</fieldset>
</section>"""

def record(sec=3, file_name = 'temp.wav', verbose=False):
    """Record microphone audio in the browser and return it as decoded samples.

    The browser-side `record(ms)` function defined in RECORD captures the
    audio and returns it as a base64 data URL; ffmpeg then converts that
    payload to WAV, which is parsed with scipy.

    Parameters
    ----------
    sec : number
        Recording length in seconds (converted to milliseconds for the browser).
    file_name : str
        Accepted for backward compatibility; not used by this function.
    verbose : bool
        When truthy, print a message before and after recording.

    Returns
    -------
    (audio, rate)
        The sample array and the sample rate, as returned by
        scipy.io.wavfile.read.

    Raises
    ------
    RuntimeError
        If ffmpeg returns no output for the recorded payload.
    """
    if verbose: print('Start Recording :')
    display(Javascript(RECORD))
    data_url = eval_js('record(%d)' % (sec*1000))
    # A data URL looks like "data:<mime>;base64,<payload>"; keep the payload.
    encoded = b64decode(data_url.split(',')[1])

    process = (ffmpeg
        .input('pipe:0')
        .output('pipe:1', format='wav')
        .run_async(pipe_stdin=True, pipe_stdout=True, pipe_stderr=True, quiet=True, overwrite_output=True))
    wav_bytes, err = process.communicate(input=encoded)
    if not wav_bytes:
        # Surface ffmpeg's own message instead of a cryptic WAV parsing error.
        raise RuntimeError('ffmpeg produced no audio: ' + (err or b'').decode(errors='replace'))

    # Overwrite bytes 4:8 of the header with the real RIFF chunk size
    # (total length minus the 8-byte "RIFF"+size prefix), little-endian.
    # NOTE(review): presumably ffmpeg cannot fill this in when writing to a pipe.
    # The mask keeps the value within four bytes, as the header field requires.
    riff_chunk_size = (len(wav_bytes) - 8) & 0xFFFFFFFF
    riff = wav_bytes[:4] + riff_chunk_size.to_bytes(4, 'little') + wav_bytes[8:]

    # Named `sample_rate` (not `sr`) so the speech_recognition alias is not shadowed.
    sample_rate, audio = wav_read(io.BytesIO(riff))

    if verbose: print('Recording Finished')
    return audio, sample_rate

def hearing(step_sec = 5, key_word = 'next', stop_word = 'skip', verbose = False):
    """Listen in fixed-length rounds until a command word is recognised.

    Each round records `step_sec` seconds from the microphone, writes the
    audio to 'sound.wav', and sends it to Google speech recognition. The
    function keeps looping until one of the two words is heard.

    Parameters
    ----------
    step_sec : number
        Length of each recording round in seconds.
    key_word : str
        Word that means "continue"; matched case-insensitively as a substring
        of the recognised text. Checked before `stop_word`.
    stop_word : str
        Word that means "stop"; matched the same way.
    verbose : bool or int
        Truthy: print the round number and per-stage timings.
        2 or more: also print the recognised text.

    Returns
    -------
    int
        1 if `key_word` was heard, 0 if `stop_word` was heard.
    """
    key = key_word.lower()
    key_stop = stop_word.lower()
    recognizer = sr.Recognizer()  # one instance is enough for every round
    num = 0
    while True:
        num += 1
        if verbose: print(f'Round{num}')

        # Part 1: Recording
        t1 = time.time()
        samples, sound_rate = record(sec=step_sec, verbose=False)

        # Part 2: Saving Audio File
        t2 = time.time()
        wavio.write('sound.wav', samples, sound_rate)

        # Part 3: Try to Recognize and Check for Key_Word
        t3 = time.time()
        # AudioFile is the current name of the deprecated WavFile alias.
        with sr.AudioFile('sound.wav') as source:
            clip = recognizer.record(source)
        try:
            text = recognizer.recognize_google(clip).lower()
        except sr.UnknownValueError:
            # Nothing intelligible in this round; just listen again.
            text = ''
        except Exception as exc:
            # Best effort: keep listening after e.g. a network/API failure.
            # Unlike a bare `except`, this lets KeyboardInterrupt stop the loop.
            if verbose: print(f'Speech recognition failed: {exc}')
            text = ''

        if text:
            if verbose >= 2: print(f'You Said :{text}')
            if key in text:
                return 1
            if key_stop in text:
                return 0

        if verbose:print(f'Part 1 {t2-t1}')
        if verbose:print(f'Part 2 {t3-t2}')
        if verbose:print(f'Part 3 {time.time()-t3}')

## Text to Speech

In [7]:
# Load the template and transpose it: each CSV column becomes one row whose
# name sits in 'Metabolite' and whose values sit in columns 1..n
# (n = number of rows in the CSV).
data = pd.read_csv('Template.csv')
metabolite = data.columns
columns = list(range(1, len(data)+1))
data = pd.DataFrame(data=data.values.T, columns=columns)
data['Metabolite'] = metabolite
data = data[['Metabolite'] + columns]

# main_dict[name] = [values in sample order, sample indices sorted by ascending value]
main_dict = {}
for name, row in zip(data['Metabolite'], data[columns].values):
    vols = list(row)
    ascending = sorted(range(len(vols)), key=vols.__getitem__)
    main_dict[name] = [vols, ascending]

# Pre-generate one announcement per metabolite and one per (metabolite, sample).
# NOTE(review): gTTS output is MP3 data as far as I know, so the ".wav" suffix
# is only a file name (the playback cell opens the same names) -- confirm
# before relying on the container format.
for name in main_dict:
    print(f'reading {name} ...')
    tts = gTTS(f'Start Aliquoting {name}')
    tts.save(f'sounds/{name}.wav')
    for i, vol in enumerate(main_dict[name][0]):
        tts = gTTS(f'{vol} in Sample {i+1}')
        tts.save(f'sounds/{name}_{i}.wav')

reading Water ...
reading PEG ...


## Main Part

In [8]:
# For each metabolite: announce it, then step through its samples in ascending
# order of value. The voice command 'next' advances to the following sample;
# 'skip' abandons the current metabolite and moves on to the next one.
for name in main_dict:
    volumes, order = main_dict[name]

    print('Start Aliquoting ', name)
    display(Audio(f'sounds/{name}.wav', autoplay=True))
    display(HTML(output_html.format(name, "#", "#")))
    time.sleep(4)
    clear_output(wait=True)
    time.sleep(2)

    for sample_idx in order:
        display(Audio(f'sounds/{name}_{sample_idx}.wav', autoplay=True))
        display(HTML(output_html.format(name, volumes[sample_idx], sample_idx + 1)))
        proceed = hearing(step_sec=5, key_word='next', stop_word='skip', verbose=2)
        # The card is cleared in both cases before moving on.
        clear_output(wait=True)
        if not proceed:
            break

Round1


<IPython.core.display.Javascript object>

Part 1 6.693026542663574
Part 2 0.0014030933380126953
Part 3 0.45320653915405273
Round2


<IPython.core.display.Javascript object>

Part 1 6.657322406768799
Part 2 0.0011022090911865234
Part 3 0.32698726654052734
Round3


<IPython.core.display.Javascript object>

Part 1 6.605336427688599
Part 2 0.0048220157623291016
Part 3 0.2468724250793457
Round4


<IPython.core.display.Javascript object>

Part 1 6.334430456161499
Part 2 0.001316070556640625
Part 3 0.22466421127319336
Round5


<IPython.core.display.Javascript object>

Part 1 6.3170998096466064
Part 2 0.0011539459228515625
Part 3 0.5236673355102539
Round6


<IPython.core.display.Javascript object>

KeyboardInterrupt: ignored