# Notebook to transcribe all files in directory via Watson Speech-To-Text service

## TODO:
  1) Connect to IBM Cloud Storage Bucket instead of local directory 
  2) Set up callbacks and async I/O for the transcriptions
  3) Parallelize the transcription processing
  4) Walk the JSONs produced in the transcription process and load them back into memory
  5) Notebookify analyze.py, and scriptify the auto-transcribe notebook for easier use.

#### Customization variables

In [None]:
# Credentials for the Speech to Text instance: `apikey` is handed to the IAM
# authenticator and `endpoint` is used as the service URL. Both are blank here
# and presumably must be filled in before the cells below will run.
apikey = ''
endpoint = ''
# Audio MIME type sent with every recognize request.
# NOTE(review): file_type below is '.wav' while this is raw 'audio/l16';
# confirm against the service's audio-format documentation whether
# 'audio/wav' (or an explicit sampling rate) is required for these files.
content_type = 'audio/l16'
# Only files in file_dir whose suffix equals file_type are transcribed.
file_type = '.wav'
file_dir = 'static/audio/'
# Root directory for the per-file transcript JSONs and consolidated outputs.
transcription_dir = 'static/transcriptions/'
# Base model used for the model lookup and the narrowband passes.
model_name = 'en-US_NarrowbandModel'
# Passed as the language customization ID in the custom narrowband pass.
model_customization_id = ''
# Not referenced by any cell visible in this notebook.
language_customization_id = ''
# Not referenced by any cell visible in this notebook.
reference_transcriptions_file='static/transcriptions/reference-transcriptions.csv'
# Name of the consolidated transcription CSV (a '.json' sibling is derived from
# it). Note that this value already includes the transcriptions directory.
stt_transcriptions_file='static/transcriptions/auto-transcriptions.csv'

Authentication is via IAM token.

In [None]:
import json
from ibm_watson import SpeechToTextV1
from ibm_cloud_sdk_core.authenticators import IAMAuthenticator
from timeit import default_timer as timer

# Build the Speech to Text client: wrap the configured API key in an IAM
# authenticator, create the client with it, then point the client at the
# configured service endpoint URL.
authenticator = IAMAuthenticator(apikey)
service = SpeechToTextV1(authenticator=authenticator)
service.set_service_url(endpoint)

__service.list_models()__:
Lists all models supported by the endpoint.

In [None]:
# Fetch the list of models from the service and pretty-print the parsed
# response as indented JSON.
models = service.list_models().get_result()
print(json.dumps(models, indent=2))

__service.list_language_models()__: Lists all the customized language models supported by the endpoint.

In [None]:
# Fetch the list of custom language models from the service and pretty-print
# the parsed response as indented JSON.
language_models = service.list_language_models().get_result()
print(json.dumps(language_models, indent=2))

__service.get_model()__:
This next method gets a specific, named model, from the service, provided it is supported.

In [None]:
# Look up the configured base model and pretty-print the parsed response.
# Note: `model` is bound to the response body returned by the service, not to
# the model name string; use `model_name` wherever the name itself is needed.
model = service.get_model(model_name).get_result()
print(json.dumps(model, indent=2))

Using __pathlib.Path__, iterate over the files in *file_dir* and, for each one whose extension matches *file_type*, append a dictionary holding the input file name and the name of the JSON file its transcription will be written to.

In [None]:
from os.path import dirname, join
from pathlib import Path

# Collect every audio file located directly inside file_dir (no recursion),
# pairing each input file name with the name of the JSON file that will hold
# its transcription.
files = []
basepath = Path(file_dir)
# iterdir() yields entries in arbitrary order; sorting keeps the processing
# order (and therefore the consolidated output) reproducible between runs.
basepath_child_items = sorted(basepath.iterdir())
for item in basepath_child_items:
    if item.is_file() and item.suffix == file_type:
        # with_suffix() swaps only the trailing extension, whereas
        # str.replace() would also rewrite any earlier occurrence of
        # file_type inside the file name.
        files.append({'in_file': item.name,
                      'out_file': item.with_suffix('.json').name})
print(json.dumps(files))

**TODO**: For all file names in a list, access an IBM Cloud Object Storage service and retrieve the files, send them to the STT service, and upload the resulting JSON to the same IBM Cloud Object Storage service.

#### Setting up the CSV export

In [None]:
# Start the consolidated list with a header record so that the first row of
# the exported CSV carries the column titles.
transcriptions = [
    dict(file_name='Audio File Name',
         model_name='Model name',
         transcription='Transcription',
         time_to_transcribe='Time to transcribe'),
]

### Transcribe audio

For each file in the list created above, send it to the Watson Speech-To-Text service for transcription, and dump the response to a JSON file, named after the audio file it was produced from, in a per-model subdirectory of *transcription_dir*.

#### Narrowband model

In [None]:
# Transcribe every discovered audio file with the base model named by
# `model_name`, save each raw response as JSON under
# <transcription_dir>/narrowband/, and append one record per recognized
# result to the shared `transcriptions` list.
narrowband_dir = join(transcription_dir, 'narrowband')
# Create the output directory up front so a missing folder cannot fail the
# write after the service call has already been made.
Path(narrowband_dir).mkdir(parents=True, exist_ok=True)

for file in files:
    audio_path = join(file_dir, file['in_file'])
    print(f"{audio_path} being transcribed.")

    start = timer()
    with open(audio_path, 'rb') as audio_file:
        narrowband_transcription_result = service.recognize(
                audio=audio_file,
                # The SDK's keyword for the audio MIME type is `content_type`;
                # `content` is not a named parameter of recognize().
                content_type=content_type,
                timestamps=True,
                speaker_labels=True,
                end_of_phrase_silence_time=120.0,
                split_transcript_at_phrase_end=False,
                smart_formatting=True,
                model=model_name).get_result()

    with open(join(narrowband_dir, file['out_file']), 'w') as transcript_file:
        transcript_file.write(json.dumps(narrowband_transcription_result, indent=2))
    # The timer deliberately spans the request and the JSON write, as before.
    end = timer()

    # Read this cell's own response. The telephony result variable is not
    # defined until a later cell, and `model` holds the get_model() response
    # rather than the model name.
    for result in narrowband_transcription_result.get('results', []):
        transcription = {'file_name': file['in_file'],
                         'model_name': model_name,
                         'transcription': result['alternatives'][0]['transcript'],
                         'time_to_transcribe': (end - start)}
        transcriptions.append(transcription)
    print(f"Narrowband transcription complete in {(end - start)} seconds.")

#### Custom narrowband model

In [None]:
# Transcribe every discovered audio file with the base model named by
# `model_name` plus a language customization, save each raw response as JSON
# under <transcription_dir>/custom_narrowband/, and append one record per
# recognized result to the shared `transcriptions` list.
custom_narrowband_dir = join(transcription_dir, 'custom_narrowband')
# Create the output directory up front so a missing folder cannot fail the
# write after the service call has already been made.
Path(custom_narrowband_dir).mkdir(parents=True, exist_ok=True)

for file in files:
    audio_path = join(file_dir, file['in_file'])
    print(f"{audio_path} being transcribed.")

    start = timer()
    with open(audio_path, 'rb') as audio_file:
        custom_narrowband_transcription_result = service.recognize(
                audio=audio_file,
                # The SDK's keyword for the audio MIME type is `content_type`;
                # `content` is not a named parameter of recognize().
                content_type=content_type,
                timestamps=True,
                speaker_labels=True,
                end_of_phrase_silence_time=120.0,
                split_transcript_at_phrase_end=False,
                smart_formatting=True,
                model=model_name,
                # NOTE(review): the configuration cell also defines
                # `language_customization_id`, which is never used; confirm
                # that `model_customization_id` is the intended value here.
                language_customization_id=model_customization_id).get_result()

    with open(join(custom_narrowband_dir, file['out_file']), 'w') as transcript_file:
        transcript_file.write(json.dumps(custom_narrowband_transcription_result, indent=2))
    # The timer deliberately spans the request and the JSON write, as before.
    end = timer()

    # Read this cell's own response. The telephony result variable is not
    # defined until a later cell, and `model` holds the get_model() response
    # rather than the model name.
    # NOTE(review): records from this pass carry the same base model name as
    # the uncustomized narrowband pass; confirm whether downstream analysis
    # needs a distinct label.
    for result in custom_narrowband_transcription_result.get('results', []):
        transcription = {'file_name': file['in_file'],
                         'model_name': model_name,
                         'transcription': result['alternatives'][0]['transcript'],
                         'time_to_transcribe': (end - start)}
        transcriptions.append(transcription)
    print(f"Custom narrowband transcription complete in {(end - start)} seconds.")

#### (BETA) next-gen telephony model

In [None]:
# Transcribe every discovered audio file with the next-generation telephony
# model, save each raw response as JSON under <transcription_dir>/telephony/,
# and append one record per recognized result to the shared `transcriptions`
# list. Note that this rebinds `model` to the telephony model name.
model = 'en-US_Telephony'

telephony_dir = join(transcription_dir, 'telephony')
# Create the output directory up front so a missing folder cannot fail the
# write after the service call has already been made.
Path(telephony_dir).mkdir(parents=True, exist_ok=True)

for file in files:
    audio_path = join(file_dir, file['in_file'])
    print(f"{audio_path} being transcribed.")

    start = timer()
    # The `with` block closes the audio file on exit, so no explicit close()
    # is needed.
    with open(audio_path, 'rb') as audio_file:
        telephony_transcription_result = service.recognize(
                audio=audio_file,
                # The SDK's keyword for the audio MIME type is `content_type`;
                # `content` is not a named parameter of recognize().
                content_type=content_type,
                timestamps=True,
                speaker_labels=True,
                end_of_phrase_silence_time=120.0,
                split_transcript_at_phrase_end=False,
                smart_formatting=True,
                model=model).get_result()

    with open(join(telephony_dir, file['out_file']), 'w') as transcript_file:
        transcript_file.write(json.dumps(telephony_transcription_result, indent=2))
    # The timer deliberately spans the request and the JSON write, as before.
    end = timer()

    for result in telephony_transcription_result.get('results', []):
        transcription = {'file_name': file['in_file'],
                         'model_name': model,
                         'transcription': result['alternatives'][0]['transcript'],
                         'time_to_transcribe': (end - start)}
        transcriptions.append(transcription)
    print(f"Telephony transcription complete in {(end - start)} seconds.")

__(Optional)__: We write out the full `transcriptions` list as JSON to a file so that the transcriptions do not have to be rerun.

In [None]:
# Persist the in-memory records as JSON inside transcription_dir so a later
# session can reload them without re-running the transcriptions.
# stt_transcriptions_file may already carry a directory prefix (the default
# value does), so only its file name is joined onto transcription_dir;
# joining the full value would duplicate the directory in the path.
transcriptions_json_path = join(
    transcription_dir,
    Path(stt_transcriptions_file.replace('.csv', '.json')).name)
with open(transcriptions_json_path, 'w') as transcriptions_json:
    transcriptions_json.write(json.dumps(transcriptions, indent=2))

print(json.dumps(transcriptions, indent=2))

__(Optional)__: If you have a previous transcription session to load, use the cell below to open and read the transcriptions back into the kernel.

In [None]:
# Reload previously saved records from the JSON file inside transcription_dir.
# The directory variable is `transcription_dir` (singular). Only the file name
# of stt_transcriptions_file is joined onto it, because the configured value
# may already include a directory prefix.
transcriptions_json_path = join(
    transcription_dir,
    Path(stt_transcriptions_file.replace('.csv', '.json')).name)
with open(transcriptions_json_path, 'rb') as transcriptions_json:
    transcriptions = json.load(transcriptions_json)

__TODO__: __(Optional)__: Walk transcription JSONs back into memory
*Presently non-functional*

In [None]:
json_files = []

def transcribe(path):
    """Walk the transcription directory and return the records found there.

    This is still a placeholder: it lists the entries directly inside *path*,
    prints them as a JSON array of path strings, and returns an empty list
    because no loading logic has been written yet.

    Args:
        path: Directory containing the transcription output.

    Returns:
        A list of transcription records; currently always empty.

    Raises:
        OSError: If *path* cannot be listed (for example, it does not exist).
    """
    transcriptions = []
    transcription_basepath = Path(path)
    # Materialize the listing: the iterdir() generator cannot be serialized by
    # json.dumps and could only be iterated once.
    transcription_basepath_child_items = sorted(transcription_basepath.iterdir())
    print(json.dumps([str(item) for item in transcription_basepath_child_items]))
    for item in transcription_basepath_child_items:
        # TODO: load each transcript JSON and append its records.
        pass
    return transcriptions

# NOTE: this rebinds `transcriptions`, discarding any records held in memory
# from the cells above. transcribe() has no loading logic yet, so skip this
# cell unless that is intended.
transcriptions = transcribe(transcription_dir)
print(json.dumps(transcriptions))

We then translate the consolidated transcription output into comma-separated value notation and write it out to file to then be used with `analyze.py`.

In [None]:
import csv

# Write one CSV row per record in `transcriptions` (the first record is the
# header) to the consolidated file inside transcription_dir.
# stt_transcriptions_file may already carry a directory prefix (the default
# value does), so only its file name is joined onto transcription_dir;
# joining the full value would duplicate the directory in the path.
consolidated_csv_path = join(transcription_dir,
                             Path(stt_transcriptions_file).name)
# newline='' lets the csv module control line endings itself.
with open(consolidated_csv_path, 'w', newline='') as consolidated_csv:
    # csv.writer quotes fields containing commas, quotes or newlines, so a
    # transcript with punctuation no longer spills into extra columns. Rows
    # also no longer end with a dangling comma.
    csv_writer = csv.writer(consolidated_csv, lineterminator='\n')
    for transcription in transcriptions:
        csv_writer.writerow([transcription['file_name'],
                             transcription['model_name'],
                             transcription['transcription'],
                             transcription['time_to_transcribe']])