In [27]:
import io
import speech_recognition as sr
import pyttsx3
from google.cloud import speech
import os
import json
import logging
from io import BufferedReader
from pydub import AudioSegment
import urllib

In [29]:
# samples/snippets/quickstart.py

# Using Google's Asynchronous Speech Recognition

## Google Cloud: load credentials

In [30]:
# https://cloud.google.com/speech-to-text/docs/quickstart-client-libraries

In [31]:
from google.cloud import speech

In [32]:
# Pointer file whose first line holds the location of the JSON key file.
# Built with os.path.join so the relative path resolves on every OS, not only
# on Windows (on Windows this yields the same '.\links\...' string as before).
ptrfile_to_credentials = os.path.join('.', 'links', 'my_credentials_location.txt')

In [33]:
# Read the key-file location from the first line of the pointer file.
with open(ptrfile_to_credentials, 'r') as f:
    # readline() keeps the trailing newline; strip it (and surrounding
    # whitespace) so it does not become part of the path used later.
    myKeyFile_json = f.readline().strip()

Google's instructions recommend either setting an environment variable (`GOOGLE_APPLICATION_CREDENTIALS`) or installing the Cloud SDK. Neither is ideal: an environment variable set at the command prompt has to be set again in every new session (it is cleared on exit), and installing the SDK is overkill.

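If you've gone through the oauth2 client process and already obtained a developer credential token, you won't have to do either of those. You can load the key file directly with `service_account.Credentials.from_service_account_file` and pass the resulting credentials to the client, as shown below.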

In [34]:
#    !set GOOGLE_APPLICATION_CREDENTIALS="path_to_file_inside_these_quotes"

In [35]:
# https://stackoverflow.com/questions/50445556/how-to-add-credentials-to-google-text-to-speech-api
# Build a credentials object directly from the key file whose path is stored in
# myKeyFile_json, instead of relying on an environment variable.
from google.oauth2 import service_account
credentials = service_account.Credentials.from_service_account_file(myKeyFile_json)

In [36]:
client = speech.SpeechClient(credentials=credentials)

In [37]:
print(client)

<google.cloud.speech_v1.SpeechClient object at 0x000002A212119AF0>


Credentials successfully delivered to SpeechClient

## Asynchronous SR

Asynchronous Speech Recognition must use Google Cloud Storage. Audio files up to 480 minutes can be processed.

In [38]:
from google.cloud import storage
import wave
import time
import pickle
import datetime
import os

In [39]:
# Brooklyn Bridge example:

In [40]:
gcs_uri = "gs://cloud-samples-data/speech/brooklyn_bridge.raw"

In [41]:
# Example: synchronous recognition of Google's public Brooklyn Bridge sample.
# https://cloud.google.com/speech-to-text/docs/libraries#windows

gcs_uri = "gs://cloud-samples-data/speech/brooklyn_bridge.raw"

# Describe the audio format first, then point the request at the GCS object.
config = speech.RecognitionConfig(
    encoding=speech.RecognitionConfig.AudioEncoding.LINEAR16,
    sample_rate_hertz=16000,
    language_code="en-US",
)
audio = speech.RecognitionAudio(uri=gcs_uri)

# Run recognition and print the first alternative of every result.
response = client.recognize(config=config, audio=audio)
for result in response.results:
    first_alternative = result.alternatives[0]
    print("Transcript: {}".format(first_alternative.transcript))

Transcript: how old is the Brooklyn Bridge


### Setup

In [55]:
audio_file_name = "my_audio_recording.flac"
bucket_name = "my-bucket-name"

In [56]:
# This function only works if the input file is a WAV
def frame_rate_channel(audio_file_name):
    """Return ``(frame_rate, channels)`` read from the header of a WAV file.

    Parameters
    ----------
    audio_file_name : path of the WAV file to inspect.

    Returns
    -------
    tuple
        The sampling rate reported by ``getframerate()`` and the channel
        count reported by ``getnchannels()``.
    """
    wav_reader = wave.open(audio_file_name, "rb")
    try:
        return wav_reader.getframerate(), wav_reader.getnchannels()
    finally:
        # Always release the file handle, even if reading the header fails.
        wav_reader.close()

In [63]:
#print(frame_rate_channel('audio4gcs/VoiceRecording - Copy.wav'))
#(44100, 2)
#print(frame_rate_channel('audio4gcs/RICKER AUDIO.wav'))
#(22050, 1)

In [58]:
# if WAV
'''
(framerate,numchannels) = frame_rate_channel(os.path.join('audio4gcs',audio_file_name))
print((framerate,numchannels))
'''

"\n(framerate,numchannels) = frame_rate_channel(os.path.join('audio4gcs',audio_file_name))\nprint((framerate,numchannels))\n"

In [59]:
# Compose the gs:// URI of the uploaded recording from the bucket and object names.
gcs_uri = "gs://{}/{}".format(bucket_name, audio_file_name)
print(gcs_uri)

gs://my-bucket-name/my_audio_recording.flac


In [35]:
# Long-Running Recognize: build the request pieces (audio reference + config).

# Turn on speaker diarization for this request.
diarization_config = speech.SpeakerDiarizationConfig(enable_speaker_diarization=True)

# Options currently left out (they were only used for WAV input):
#   encoding=speech.RecognitionConfig.AudioEncoding.LINEAR16
#   sample_rate_hertz=framerate
#   audio_channel_count=numchannels   # defaults to one channel if omitted
#   enable_separate_recognition_per_channel=True
recognition_options = dict(
    language_code="en-US",
    diarization_config=diarization_config,
    enable_automatic_punctuation=True,
)
config = speech.RecognitionConfig(**recognition_options)

# The audio itself stays in Cloud Storage; only its URI is sent.
audio = speech.RecognitionAudio(uri=gcs_uri)

# Detects speech in the audio file
# 

In [36]:
speech.SpeakerDiarizationConfig

google.cloud.speech_v1.types.cloud_speech.SpeakerDiarizationConfig

In [62]:
audio

uri: "gs://my-bucket-name/my_audio_recording.flac"

### Execute Async SR

In [39]:
operation = client.long_running_recognize(config=config, audio=audio)
# You might see an error here if task completion would exceed your quota
# Quota can be configured within Google Developer account dashboard

In [41]:
# Block until the long-running operation finishes, then report elapsed seconds.
# perf_counter() is monotonic, so the measurement is not disturbed by system
# clock adjustments during a wait that can last an hour or more.
tic = time.perf_counter()
response = operation.result(timeout=30000)  # wait at most 30000 s (~8.3 h)
print(time.perf_counter() - tic)

# Some example running times:
# 10min 1-channel WAV took 120s
# 1h WAV with 2 channels took 10m (2 channels)
# M J-M Interview (4h) required 3631s (1h)

3631.5423810482025


In [60]:
# response     # output can be really long

In [42]:
len(response.results)

859

In [43]:
# Pickle the word-level fields of the final result in case we want to revisit
# them later without re-running recognition.
# NOTE: only response.results[-1] is saved here, not the entire response.
# NOTE(review): presumably the final result carries every word with its speaker
# tag when diarization is enabled -- confirm against the API documentation.

words_obj = response.results[-1].alternatives[0].words
google_output = {
    field: [getattr(word, field) for word in words_obj]
    for field in ('start_time', 'end_time', 'word', 'speaker_tag')
}

# Output name: 'output_' + <audio file name without extension> + <timestamp> + '.p'
audio_filenamebase = os.path.splitext(os.path.split(gcs_uri)[-1])[0]
now_str = datetime.datetime.now().strftime("%Y%m%d-%H%M%S")
output_filename = 'output_'+audio_filenamebase+now_str+'.p'
# Use a context manager so the file is flushed and closed deterministically.
with open(output_filename, "wb") as pickle_file:
    pickle.dump(google_output, pickle_file)

### Output Transcript

In [45]:
# Transcript: OPTION 1:
#   Use Speaker Diarization
#   New line whenever speaker ID changes

result = response.results[-1]
words_info = result.alternatives[0].words


def _format_speaker_line(speaker, start_seconds, end_seconds, words):
    """Format one transcript line for a run of words spoken by one speaker.

    speaker       -- speaker tag shown at the start of the line
    start_seconds -- whole seconds at which the run starts
    end_seconds   -- whole seconds at which the run ends
    words         -- list of word strings, joined with single spaces
    """
    (minutes_start, seconds_start) = divmod(start_seconds, 60)
    (minutes_end, seconds_end) = divmod(end_seconds, 60)
    return " speaker %s: (%02d:%02d-%02d:%02d) %s" % (
        speaker, minutes_start, seconds_start, minutes_end, seconds_end,
        " ".join(words))


transcript = []
if len(words_info) > 0:  # nothing to harvest when no words were recognized
    # Start from the first word's speaker rather than assuming speaker 1, so
    # no empty line is emitted when another speaker opens the recording.
    tag = words_info[0].speaker_tag
    segment_start = 0          # the first line starts at 00:00
    segment_words = []
    for word_info in words_info:
        if word_info.speaker_tag != tag:
            # Speaker changed: harvest the current line and start a new one
            # at the start time of the new speaker's first word.
            segment_end = word_info.start_time.seconds
            transcript.append(
                _format_speaker_line(tag, segment_start, segment_end, segment_words))
            tag = word_info.speaker_tag
            segment_start = segment_end
            segment_words = []
        segment_words.append(word_info.word)

    # Harvest the last line; it ends when the final word ends.
    transcript.append(
        _format_speaker_line(tag, segment_start,
                             words_info[-1].end_time.seconds, segment_words))

In [46]:
# Write the speaker-separated transcript, one entry per line.
# The file is opened in append mode, so running this cell again adds another
# copy of the transcript to the end of the existing file.
outfile = audio_filenamebase + '_v1.txt'
with open(outfile, 'a') as transcript_file:
    transcript_file.writelines(line + '\n' for line in transcript)

In [47]:
len(words_info)

30355

In [48]:
# Transcript: OPTION 2:
#   Fixed-length lines.
#   No speaker diarization
#   New line every 15sec audio

result = response.results[-1]
words_info = result.alternatives[0].words

timeblock_seconds = 15
ts_final = words_info[-1].start_time.seconds  # start second of the last word
transcript = []

j = 0  # index of the next word that has not been placed in a timeblock yet
for ts_start in range(0, ts_final + 1, timeblock_seconds):
    ts_end = ts_start + timeblock_seconds
    (m0, s0) = divmod(ts_start, 60)  # (minutes,seconds) start timeblock
    (m1, s1) = divmod(ts_end, 60)    # (minutes,seconds) end timeblock

    # Collect every remaining word that starts before this timeblock ends.
    block_words = []
    while j < len(words_info) and words_info[j].start_time.seconds < ts_end:
        block_words.append(words_info[j].word)
        j += 1

    # Each word is preceded by a single space; an empty block gives "".
    sentence = "".join(" " + word for word in block_words)
    transcript.append("(%02d:%02d-%02d:%02d) %s" % (m0, s0, m1, s1, sentence))

In [49]:
# Write the fixed-timeblock transcript, one entry per line.
# The file is opened in append mode, so running this cell again adds another
# copy of the transcript to the end of the existing file.
outfile = audio_filenamebase + '_v2.txt'
with open(outfile, 'a') as transcript_file:
    transcript_file.writelines(line + '\n' for line in transcript)