<a href="https://colab.research.google.com/github/matakahas/portfolio/blob/main/Copy_of_speech_to_text_with_GUI.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## GUI-based speech-to-text tool 

The goal of this project is to develop a real-time speech-to-text (STT) tool with GUI functionality (using `Kivy`). <br>
Reference: [https://www.dskomei.com/entry/2020/04/26/182100](https://www.dskomei.com/entry/2020/04/26/182100)

### Install packages

In [None]:
!pip install google-cloud-speech
!apt install libasound2-dev portaudio19-dev libportaudio2 libportaudiocpp0 ffmpeg
!pip install pyaudio
!pip install kivy[base] kivy_examples

In [4]:
#packages used for real-time STT
from __future__ import division
import os
import io
import sys
import pyaudio
import glob
import json
from oauth2client.service_account import ServiceAccountCredentials
from google.cloud import speech
import numpy as np
import pandas as pd
from six.moves import queue
import threading

#packages used for GUI
from kivy.app import App
from kivy.config import Config
from kivy.uix.label import Label
from kivy.uix.widget import Widget
from kivy.core.text import LabelBase, DEFAULT_FONT
from kivy.properties import StringProperty 
from kivy.uix.boxlayout import BoxLayout

[INFO   ] [Logger      ] Record log in /root/.kivy/logs/kivy_22-02-12_2.txt
[INFO   ] [Kivy        ] v2.0.0
[INFO   ] [Kivy        ] Installed at "/usr/local/lib/python3.7/dist-packages/kivy/__init__.py"
[INFO   ] [Python      ] v3.7.12 (default, Jan 15 2022, 18:48:18) 
[GCC 7.5.0]
[INFO   ] [Python      ] Interpreter at "/usr/bin/python3"
[INFO   ] [Factory     ] 186 symbols loaded
[INFO   ] [Image       ] Providers: img_tex, img_dds, img_sdl2, img_pil (img_ffpyplayer ignored)
[INFO   ] [Text        ] Provider: sdl2


### Other required files
* json file needed for API authentication
* `speechtotext.kv`: To run an application built with Kivy, you need a Python file specifying the configuration of the GUI, and a Kivy file (with the `.kv` extension) specifying the design of the GUI. Kivy loads the `.kv` file automatically when it is named after the main app class, lower-cased and without the trailing `App` (e.g., `TestApp` → `test.kv`; here, `SpeechToTextApp` → `speechtotext.kv`).

### Set the environment variable for the Google Speech-to-Text API

In [5]:
# Tell the Google Cloud client library where the service-account key lives.
# The value below is a placeholder: replace it with the path to your own
# JSON key file before running the cells that follow.
os.environ["GOOGLE_APPLICATION_CREDENTIALS"] = "./[your JSON service key name].json"

### Main code for conducting STT with GUI

In [None]:
# --- Kivy window settings: windowed (not fullscreen), 1000 x 300 ---
Config.set('graphics', 'fullscreen', 0)
Config.set('graphics', 'width', '1000')
Config.set('graphics', 'height', '300')

# --- Audio / streaming parameters ---
# NOTE(review): STREAMING_LIMIT is not referenced anywhere in the visible
# code; presumably a stream duration limit in milliseconds - confirm.
STREAMING_LIMIT = 240000
SAMPLE_RATE = 16000              # sample rate passed to PyAudio and the recognizer
CHUNK_SIZE = SAMPLE_RATE // 10   # frames per audio buffer (one tenth of SAMPLE_RATE)

# --- State shared between the GUI and the recognition worker thread ---
speech_to_text_list = []   # transcribed lines, oldest first
stream_close = False       # set to True to ask the worker to stop

class TextWidget(Widget):
    """Root widget that shows the most recent transcriptions.

    The ``text`` property holds the string to display. The start/stop
    handlers below are presumably bound to buttons in ``speechtotext.kv``
    (that file is not part of this source - confirm there).
    """

    text = StringProperty()

    def __init__(self, **kwargs):
        super(TextWidget, self).__init__(**kwargs)
        self.text = ''
        self.number = 0

    def buttonClickedStart(self):
        """Start streaming recognition in a background thread.

        Recognition runs off the calling thread so that this handler
        returns immediately instead of blocking until the stream ends.
        """
        worker = threading.Thread(
            target=excecute_speech_to_text_streaming, args=(self,)
        )
        worker.start()

    def buttonClickedEnd(self):
        """Stop recognition, save the transcript and clear the display.

        Sets the module-level ``stream_close`` flag so the worker stops,
        writes every collected line to ``./streaming_result.txt`` (one
        transcript per line), then resets both the displayed text and the
        shared transcript list.
        """
        global stream_close
        global speech_to_text_list

        stream_close = True

        # Explicit UTF-8 so non-ASCII transcripts are written the same way
        # regardless of the platform's default encoding.
        with open('./streaming_result.txt', 'w', encoding='utf-8') as file:
            file.write('\n'.join(speech_to_text_list))

        self.text = ''
        speech_to_text_list = []

    def update(self):
        """Refresh the displayed text with the latest (up to 6) lines."""
        # NOTE(review): this is called from the recognition worker thread
        # (see listen_print_loop); Kivy generally expects UI updates on the
        # main thread - confirm whether this needs Clock scheduling.
        self.text = display_texts(max_n_text=6)
        

class SpeechToTextApp(App):
    """Kivy application whose root widget is a ``TextWidget``."""

    def __init__(self, **kwargs):
        super().__init__(**kwargs)
        # Set the app's title once the base App has been initialised.
        self.title = 'Speech to Text'

    def build(self):
        """Create and return the root widget of the application."""
        return TextWidget()


def display_texts(max_n_text=5):
    """Return the most recent transcriptions joined by newlines.

    Reads the module-level ``speech_to_text_list``.

    Args:
        max_n_text: Maximum number of trailing entries to include. A value
            of zero or less yields an empty string.

    Returns:
        The last ``max_n_text`` entries (or all of them when fewer exist),
        separated by ``'\\n'``.
    """
    # Guard explicitly: list[-0:] is the *whole* list, so a limit of 0
    # would otherwise show everything instead of nothing.
    if max_n_text <= 0:
        return ''

    # A negative-start slice past the front of the list is clamped, so no
    # separate "list is shorter than the limit" branch is needed.
    return '\n'.join(speech_to_text_list[-max_n_text:])


class ResumableMicrophoneStream:
    """Context manager that records microphone audio into a queue.

    Entering the context opens a PyAudio input stream whose callback pushes
    raw audio buffers onto an internal queue; ``generator()`` drains that
    queue. Leaving the context closes the stream and sets the module-level
    ``stream_close`` flag.
    """

    def __init__(self, rate, chunk_size):
        """Store the audio parameters and create the (empty) buffer.

        Args:
            rate: Sample rate passed to PyAudio.
            chunk_size: Frames per buffer passed to PyAudio.
        """
        self._rate = rate
        self.chunk_size = chunk_size
        self._num_channels = 1

        self._buff = queue.Queue()

        # Read and written by listen_print_loop. Starts as True so that the
        # first transcript of a new stream begins a new line rather than
        # overwriting text left over from an earlier stream.
        self.last_transcript_was_final = True

    def __enter__(self):
        """Clear ``stream_close`` and open the microphone input stream."""
        global stream_close
        stream_close = False
        self._audio_interface = pyaudio.PyAudio()
        self._audio_stream = self._audio_interface.open(
            format=pyaudio.paInt16,
            channels=self._num_channels,
            rate=self._rate,
            input=True,
            frames_per_buffer=self.chunk_size,
            # Audio is delivered through the callback instead of read() calls.
            stream_callback=self._fill_buffer,
        )

        return self

    def __exit__(self, type, value, traceback):
        """Close the audio stream and signal consumers to stop."""
        global stream_close
        try:
            self._audio_stream.stop_stream()
            self._audio_stream.close()
        finally:
            # Always run, even if stopping/closing the stream failed:
            # the None sentinel unblocks generator(), PortAudio resources
            # are released, and the shared flag marks the stream as closed.
            self._buff.put(None)
            self._audio_interface.terminate()
            stream_close = True

    def _fill_buffer(self, in_data, *args, **kwargs):
        """Stream callback: queue the captured buffer and keep recording."""
        self._buff.put(in_data)
        return None, pyaudio.paContinue

    def generator(self):
        """Yield queued audio, batching every chunk currently available.

        Blocks until at least one chunk is queued, then drains whatever else
        is already waiting and yields the concatenated bytes. Stops when
        ``stream_close`` is set or when the ``None`` sentinel is dequeued;
        if the sentinel shows up while draining, the chunks collected for
        that batch are not yielded.
        """
        while not stream_close:
            chunk = self._buff.get()
            if chunk is None:
                return
            data = [chunk]

            # Non-blocking drain of everything else that is already queued.
            while True:
                try:
                    chunk = self._buff.get(block=False)
                    if chunk is None:
                        return
                    data.append(chunk)
                except queue.Empty:
                    break

            yield b"".join(data)



def listen_print_loop(responses, stream, text_widget):
    """Fold streaming recognition responses into ``speech_to_text_list``.

    For every response that carries a transcript, the top alternative of
    the first result either starts a new line or replaces the last line
    (when that line was an interim result of the same utterance). The
    widget is refreshed after each processed response.

    Args:
        responses: Iterable of responses as returned by the client's
            ``streaming_recognize`` call; each needs ``results``, whose
            items expose ``alternatives`` and ``is_final``.
        stream: Object on which ``last_transcript_was_final`` is tracked
            between responses.
        text_widget: Object with an ``update()`` method, called after each
            processed response.
    """
    for response in responses:
        # Stop as soon as the GUI (or the stream's __exit__) asks us to.
        if stream_close:
            break

        if not response.results:
            continue

        result = response.results[0]

        if not result.alternatives:
            continue

        transcript = result.alternatives[0].transcript

        # Start a new line when there is nothing to overwrite (empty list,
        # e.g. first response or list cleared by the stop button) or when
        # the previous line is already final. The getattr default covers a
        # stream object that has not had the flag set yet.
        starts_new_line = (
            not speech_to_text_list
            or getattr(stream, 'last_transcript_was_final', True)
        )
        if starts_new_line:
            speech_to_text_list.append(transcript)
        else:
            # Previous line was an interim guess: refine it in place.
            speech_to_text_list[-1] = transcript

        stream.last_transcript_was_final = bool(result.is_final)

        text_widget.update()
            
    
def excecute_speech_to_text_streaming(text_widget):
    """Run one microphone-to-transcript streaming session.

    Opens the microphone, sends the captured audio to the Speech-to-Text
    streaming endpoint (LINEAR16, ``SAMPLE_RATE`` Hz, ``en-US``, interim
    results enabled) and hands the responses to ``listen_print_loop``.

    Args:
        text_widget: Widget forwarded to ``listen_print_loop`` so it can be
            refreshed as transcripts arrive.
    """
    print('Start Speech to Text Streaming')

    client = speech.SpeechClient()
    recognition_config = speech.RecognitionConfig(
        encoding=speech.RecognitionConfig.AudioEncoding.LINEAR16,
        sample_rate_hertz=SAMPLE_RATE,
        language_code='en-US',
    )
    streaming_config = speech.StreamingRecognitionConfig(
        config=recognition_config,
        interim_results=True,
    )

    def build_requests(audio_chunks):
        # Wrap each batch of raw audio bytes in a streaming request message.
        for content in audio_chunks:
            yield speech.StreamingRecognizeRequest(audio_content=content)

    with ResumableMicrophoneStream(SAMPLE_RATE, CHUNK_SIZE) as stream:
        requests = build_requests(stream.generator())
        responses = client.streaming_recognize(streaming_config, requests)

        # Consumes responses until the stream is closed or exhausted.
        listen_print_loop(responses, stream, text_widget)

    print('End Speech to Text Streaming')

if __name__ == '__main__':
    # Only a placeholder message is printed; the app launch is left disabled.
    # NOTE(review): presumably disabled because the notebook environment has
    # no display for the Kivy window - re-enable the line below to run the
    # GUI locally.
    print('hi')
    # SpeechToTextApp().run()