In [None]:
# Import the libraries used throughout this notebook
import sys
import os
from os.path import join, dirname
import json
from ibm_watson import SpeechToTextV1
from ibm_watson.speech_to_text_v1 import CustomWord
from ibm_cloud_sdk_core.authenticators import BearerTokenAuthenticator
from ibm_watson.websocket import RecognizeCallback, AudioSource
import warnings
warnings.filterwarnings("ignore")

In [None]:
# Connect to the CP4D Speech to Text service.
# The bearer token and the service URL are looked up in the environment
# (CP4D_BEARER_TOKEN / CP4D_STT_URL) so that real credentials never have to be
# saved inside the notebook. When a variable is not set, the original
# placeholder token / endpoint is used unchanged.
bearer = os.environ.get("CP4D_BEARER_TOKEN", "XXX")
authenticator = BearerTokenAuthenticator(bearer)

speech_to_text = SpeechToTextV1(
    authenticator=authenticator
)

speech_to_text.set_service_url(
    os.environ.get(
        "CP4D_STT_URL",
        'https://cp4d-cpd-cp4d.icp-poc.am.lilly.com/speech-to-text/watson-speech-base/instances/1608759635733/api',
    )
)
# NOTE(review): TLS certificate verification is switched off for this client.
# Kept to preserve the existing behaviour; confirm this is still required and
# prefer trusting the cluster's CA certificate instead.
speech_to_text.set_disable_ssl_verification(True)

In [None]:
# Ask the service for its list of speech models and pretty-print the answer
models_response = speech_to_text.list_models()
speech_models = models_response.get_result()
print(json.dumps(speech_models, indent=2))

In [None]:
# Name of the audio file to transcribe. The output file names used in this
# notebook are built from it by dropping its last four characters (e.g. ".mp3").
fileName = "1030237_FVSSS_June15th2020.mp3"

In [None]:
# Define the MyRecognizeCallback class
class MyRecognizeCallback(RecognizeCallback):
    """Recognize callback that prints events and saves the received data.

    Every handler prints what it receives to the cell output; ``on_data``
    additionally stores its payload as JSON on disk.
    """

    def __init__(self):
        RecognizeCallback.__init__(self)

    def on_transcription(self, transcript):
        """Print the transcript passed to the callback."""
        print(transcript)

    def on_connected(self):
        """Print a confirmation that the connection was established."""
        print('Connection was successful')

    def on_error(self, error):
        """Print the error passed to the callback."""
        print('Error received: {}'.format(error))

    def on_inactivity_timeout(self, error):
        """Print the inactivity-timeout error passed to the callback."""
        print('Inactivity timeout: {}'.format(error))

    def on_listening(self):
        """Print a notice that the service is listening."""
        print('Service is listening')

    def on_hypothesis(self, hypothesis):
        """Print the hypothesis passed to the callback."""
        print(hypothesis)

    def on_data(self, data):
        """Save ``data`` as JSON next to the audio file.

        The target is the notebook-level ``fileName`` with its last four
        characters replaced by ``_transcript.json``. The file is opened in
        write mode, so each call replaces whatever an earlier call stored.
        """
        # The context manager closes the handle even when serialising or
        # writing fails, which the manual open()/close() pair did not.
        with open(fileName[:-4] + "_transcript.json", "w") as f_data:
            f_data.write(json.dumps(data))

        
mycallback = MyRecognizeCallback()

# Start the transcription request.
# The audio file is opened in a ``with`` block so its handle is released as
# soon as the websocket call returns or raises, instead of staying open for
# the lifetime of the kernel.
# NOTE(review): this assumes recognize_using_websocket blocks until the
# session has finished — confirm for the installed SDK version.
with open(fileName, 'rb') as audio_file:
    audio_source = AudioSource(audio_file)
    t = speech_to_text.recognize_using_websocket(
        audio=audio_source,
        content_type="audio/mp3",
        recognize_callback=mycallback,
        speaker_labels=True,
        smart_formatting=True,
        model="en-US_ShortForm_NarrowbandModel",
        inactivity_timeout=600,
    )
t

In [None]:
# Open the transcript JSON file (<fileName minus its last 4 chars>_transcript.json)
with open(fileName[:-4] + "_transcript.json") as f:
    data = json.load(f)

# Retrieve the speaker labels section of the JSON
speaker_labels = data["speaker_labels"]


def _write_segment(index, speaker, text, transcript_file):
    """Store one speaker segment as its own JSON file and append it to the full transcript.

    index           -- word counter used as the suffix of the segment file name
    speaker         -- speaker id the segment belongs to
    text            -- the words spoken in the segment, joined by spaces
    transcript_file -- open, writable text file collecting all segments
    """
    with open(fileName[:-4] + "_" + str(index) + ".json", "w") as segment_file:
        segment_file.write(json.dumps({'speaker': speaker, 'text': text}, indent=4))
    transcript_file.write("Speaker: " + str(speaker) + " " + text + "\n")


# Running index into speaker_labels: one label is consumed per timestamp entry
i = 0
# None means "no segment started yet". Seeding with speaker 0 and an empty
# phrase used to emit a bogus empty "Speaker: 0" segment whenever the first
# label was not 0, and a leading space on the first phrase otherwise.
last_speaker = None
phrase = ""

# Full transcript file; the context manager closes it even if the loop raises
with open(fileName[:-4] + "_transcript.txt", "w") as f1_data:
    # Iterate through the results section of the JSON file (speaker labels are available)
    for result in data["results"]:
        for timestamp in result["alternatives"][0]["timestamps"]:
            # Retrieve each transcribed word and align it to its speaker
            current_speaker = speaker_labels[i]["speaker"]
            # Hesitation markers are blanked out, not dropped, so the word
            # still consumes its speaker label
            current_phrase = timestamp[0].replace("%HESITATION", "")
            if last_speaker is None:
                # Very first word: start the first segment
                phrase = current_phrase
                last_speaker = current_speaker
            elif current_speaker == last_speaker:
                # Same speaker: extend the running phrase
                phrase = phrase + " " + current_phrase
            else:
                # Speaker changed: flush the finished segment and start a new one
                _write_segment(i, last_speaker, phrase, f1_data)
                phrase = current_phrase
                last_speaker = current_speaker
            i = i + 1

    # Flush the last segment (nothing to write when there were no words at all)
    if last_speaker is not None:
        _write_segment(i, last_speaker, phrase, f1_data)

In [None]:
# Show the transcript text of every result, one per line (no speaker labels involved)
for result in data["results"]:
    best_alternative = result["alternatives"][0]
    print(best_alternative["transcript"])

In [None]:
# Fetch the custom language models known to the service and pretty-print them
models_response = speech_to_text.list_language_models()
language_models = models_response.get_result()
print(json.dumps(language_models, indent=2))

In [None]:
# Create a custom language model on top of the en-US narrowband base model
language_model = speech_to_text.create_language_model(
    'Eli Lilly Custom Language Model',
    'en-US_NarrowbandModel',
    description='Eli Lilly Custom Language Model',
)

# Show the service's reply to the create request
creation_result = language_model.get_result()
print(json.dumps(creation_result, indent=2))

# Keep the customization id returned by the STT service; the cells that manage
# corpora, words and training address the model through it
customization_id = creation_result['customization_id']
print(customization_id)

In [None]:
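# Specify the id of an existing custom model (save it for future reuse).
# V1 - uncomment the next line to work with that model instead of a newly created one:
#customization_id = '34723c52-19b0-4e25-b2cc-74262b65a2e4'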

In [None]:
# Look up the custom language model by its id and pretty-print its details
speech_model = speech_to_text.get_language_model(customization_id)
model_details = speech_model.get_result()
print(json.dumps(model_details, indent=2))

In [None]:
# List the custom language models again and pretty-print the result
language_models = speech_to_text.list_language_models().get_result()
models_as_json = json.dumps(language_models, indent=2)
print(models_as_json)

In [None]:
# delete a custom language model
#speech_to_text.delete_language_model("XXX")

In [None]:
# Fetch the corpora attached to the custom language model and pretty-print them
corpora_response = speech_to_text.list_corpora(customization_id)
corpora = corpora_response.get_result()
print(json.dumps(corpora, indent=2))

In [None]:
# Upload a corpus file to the custom language model
corpus_name = "LM_English v1"
# NOTE(review): Content-Type is set by hand without a multipart boundary -
# confirm the SDK replaces it when it builds the form upload.
headers = {'Content-Type': "multipart/form-data"}

# corpus.txt is opened in binary mode and handed to add_corpus as the corpus file
with open('corpus.txt', 'rb') as corpus_file:
    speech_to_text.add_corpus(
        customization_id=customization_id,
        corpus_name=corpus_name,
        corpus_file=corpus_file,
        headers=headers,
    )

In [None]:
# List the corpora of the custom language model again and pretty-print them
corpora = speech_to_text.list_corpora(customization_id).get_result()
corpora_as_json = json.dumps(corpora, indent=2)
print(corpora_as_json)

In [None]:
# Fetch the words of the custom language model and pretty-print them
words_response = speech_to_text.list_words(customization_id)
words = words_response.get_result()
print(json.dumps(words, indent=2))

In [None]:
# Describe the custom words (spelling, pronunciations, display form) and add
# them to the custom language model
custom_word_list = [
    CustomWord(word='NAME', sounds_like=['VAR', 'BAR'], display_as='NAME'),
    CustomWord(word='NAMEA', sounds_like=['VARA', 'BARA'], display_as='NAMEA'),
]
speech_to_text.add_words(customization_id, custom_word_list).get_result()

In [None]:
# List the words of the custom language model again and pretty-print them
words = speech_to_text.list_words(customization_id).get_result()
words_as_json = json.dumps(words, indent=2)
print(words_as_json)

In [None]:
# Delete the custom word 'NAME' from the custom language model identified by
# customization_id; the result of the call is the cell's output
speech_to_text.delete_word(customization_id, 'NAME').get_result()

In [None]:
# Send the training request for the custom language model and print the reply
training_response = speech_to_text.train_language_model(customization_id)
print(training_response.get_result())

In [None]:
# List the custom language models - make sure the status reads 'available'
status_response = speech_to_text.list_language_models()
language_models = status_response.get_result()
print(json.dumps(language_models, indent=2))

In [None]:
# Delete the corpus named by corpus_name from the custom language model
# identified by customization_id; the result of the call is the cell's output
speech_to_text.delete_corpus(customization_id, corpus_name).get_result()

In [None]:
# Delete the custom language model identified by customization_id; the result
# of the call is the cell's output
speech_to_text.delete_language_model(customization_id).get_result()

In [None]:
# List the custom language models one last time and pretty-print them
language_models = speech_to_text.list_language_models().get_result()
remaining_models_json = json.dumps(language_models, indent=2)
print(remaining_models_json)