# Speech to Text classification experimentation notebook 

This notebook outlines the main steps of the speech-to-text classification workflow, including:
 1. Configuring the ML processes and pipelines
 2. Digital signal processing (DSP) of the audio files for filtering processes
 3. Performing the speech-to-text transcription for each of the (filtered) audio files
 4. Further NLP on the transcribed files, including: key phrase extraction, named entity recognition (NER) and topic modelling

In [None]:
import warnings
warnings.filterwarnings('ignore')

# setup the current paths
import os, sys, time
currentDir = os.path.dirname(os.getcwd())
print(f'Current working directory: {currentDir}')
sys.path.append(currentDir)
sys.path.append('./../')
sys.path.append('././')

# system related
import pandas as pd
import numpy as np
from dotenv import load_dotenv, find_dotenv
from azureml.core.authentication import AzureCliAuthentication

import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

# import from common setups & environments
from common.constants import *
from common.ontology import *
from common.azureml_configuration import *
from common.general_utilities import *
from common.signal_processing import *
from common.speech_services import *
from common.nlp_modelling import *
from common.ml_modelling import *

#load the env variables from the hidden file
print('Loading environmental variables', load_dotenv(find_dotenv(ENVIORNMENT_FILE)))

## I. Azure ML Configuration

In [None]:
# I. Configure the Azure ML workspace, datastores and datasets
#--------------------------------------------------------------
warnings.filterwarnings('ignore')

print('Configuring the Azure ML services')
print('---------------------------------')

# Read the subscription id and service keys from the environment (expected to be
# populated from the .env file by the setup cell). Other constants come from the
# `common` source files.
SUBSCRIPTION_ID = os.environ.get('SUBSCRIPTION_ID')
RESOURCE_GROUP = os.environ.get('RESOURCE_GROUP')
REGION = os.environ.get('REGION')
TENANT_ID = os.environ.get('TENANT_ID')
WORKSPACE_NAME = os.environ.get('WORKSPACE_NAME')
STORAGE_ACCOUNT = os.environ.get('STORAGE_ACCOUNT')
STORAGE_KEY = os.environ.get('STORAGE_KEY')
SPEECH_KEY = os.environ.get('SPEECH_KEY')
LOCATION=os.environ.get('LOCATION')
TEXT_ANALYTICS_KEY = os.environ.get('TEXT_ANALYTICS_KEY')

# os.environ.get() silently returns None for anything that is not set, which would
# otherwise only surface later inside a service call. Report unset settings up
# front; this is deliberately non-fatal because not every setting may be needed.
_missing_env_settings = [
    setting for setting in (
        'SUBSCRIPTION_ID', 'RESOURCE_GROUP', 'REGION', 'TENANT_ID', 'WORKSPACE_NAME',
        'STORAGE_ACCOUNT', 'STORAGE_KEY', 'SPEECH_KEY', 'LOCATION', 'TEXT_ANALYTICS_KEY',
    )
    if not os.environ.get(setting)
]
if _missing_env_settings:
    print('WARNING - environment variables not set: ' + ', '.join(_missing_env_settings))

# create the results directories - based on the use_case
utilConfig = GeneraltUtilities()
utilConfig.createTmpDir(dsp_results_folders)
utilConfig.createTmpDir(transcripts_results_folder)
utilConfig.createTmpDir(assessed_results_folder)


# configure Azure ML services
#-----------------------------
# initialise the azureml config class, authenticating through the Azure CLI
cli_auth = AzureCliAuthentication()
azuremlConfig = AzureMLConfiguration(workspace=WORKSPACE_NAME
                                    ,tenant_id=TENANT_ID
                                    ,subscription_id=SUBSCRIPTION_ID
                                    ,resource_group=RESOURCE_GROUP
                                    ,location=REGION
                                    ,auth=cli_auth)

# configure Azure ML workspace
azuremlConfig.configWorkspace()

# configure the azure ML compute
azuremlConfig.configCompute()

# configure the experiment(s)
azuremlConfig.configExperiment(experiment_name=EXPERIMENT_NAME)

# configure the environment - conda
azuremlConfig.configEnvironment(environment_name=ENVIRONMENT_NAME)

# configure and register the datastore(s) with Azure ML pipelines
raw_datastore = azuremlConfig.configDataStore(datastore=RAW_DATASTORE_NAME, container_name=RAW_CONTAINER_NAME)
processed_datastore = azuremlConfig.configDataStore(datastore=DSP_DATASTORE_NAME, container_name=DSP_CONATINER_NAME)
transcribed_datastore = azuremlConfig.configDataStore(datastore=TRANSCRIBED_DATASTORE_NAME, container_name=TRANSCRIBED_CONATINER_NAME)
assessed_datastore = azuremlConfig.configDataStore(datastore=ASSESSED_DATASTORE_NAME, container_name=ASSESSED_CONATINER_NAME)

# Prepare the datasets
#------------------------
# register the datasets associated with the datastore - recordings
raw_recordings_datasets = azuremlConfig.configDatasets(datastore=raw_datastore, file_path= RECORDINGS_FOLDER,
                                            dataset_name=RECORDINGS_DATASET_NAME, description='raw datasets')

# register the datasets associated with the datastore - truth transcription provided
truth_transcribed_datasets = azuremlConfig.configDatasets(datastore=raw_datastore, file_path = TRUTH_TRANSCRIPTED_FOLDER,
                                            dataset_name=TRUTH_DATASET_NAME, description='truth transcripted datasets')

# register the datasets associated with the datastore - key phrases / ontology
key_phrases_datasets = azuremlConfig.configDatasets(datastore=raw_datastore, file_path = ONTOLOGY_FOLDER,
                                            dataset_name=ONTOLOGY_DATASET_NAME, description='ontology datasets')

# register the datasets associated with the datastore - assessed data
# (registered only; it is not fetched locally in this cell)
assessed_datasets = azuremlConfig.configDatasets(datastore=assessed_datastore, file_path = RESULTS_ASSESSED_PATH,
                                            dataset_name=ASSESSED_DATASET_NAME, description='assessed datasets')

# Fetch the datasets locally
# --------------------------
# Each downloadDatasets() call returns a (context, mounted) pair; the second element
# is iterated as a collection of file names by the DSP cell.
# recordings - note: providing the root path since dataset name has the embedded path
raw_recordings_datasets_context, raw_recordings_datasets_mounted = azuremlConfig.downloadDatasets(datasets_registered=raw_recordings_datasets,
                                                download_path=RECORDINGS_MOUNT_PATH)

# truth transcripts
truth_transcribed_datasets_context, truth_transcribed_datasets_mounted = azuremlConfig.downloadDatasets(datasets_registered=truth_transcribed_datasets,
                                                download_path=TRUTH_MOUNT_PATH)

# ontology / key phrases - note: providing the root path since dataset name has the embedded path
key_phrases_datasets_context, key_phrases_datasets_mounted = azuremlConfig.downloadDatasets(datasets_registered=key_phrases_datasets,
                                                download_path=ONTOLOGY_MOUNT_PATH)

## II. Ontology Processing

We load the definitions used to analyze the audio files:
- `ontology_list` lists the ontology files to configure (details TBA)
- `KEY_PHRASES_SEARCH_FILENAME` defines special keywords and their categories used for text analytics.
- `HOMOPHONE_LIST_FILENAME` defines the homophone (word replacement) list used to replace extracted keywords.

In [None]:
# II. Configure the ontology and corpus
# -------------------------------------
print('Prepare the ontology', '--------------------', sep='\n')

# flag kept from the original cell; it is not referenced by any call below
enhance = True
ontology = Ontology()

# the ontologies of interest, passed to the configuration call as a list
ontology_list = [GENERAL_ONTOLOGY_FILENAME]

# locations of the supporting files inside the ontology folder
# (plain concatenation: the folder constant is expected to carry its own separator)
key_phrase_search_path = f'{ONTOLOGY_MOUNT_PATH}{KEY_PHRASES_SEARCH_FILENAME}'
homophone_list_path = f'{ONTOLOGY_MOUNT_PATH}{HOMOPHONE_LIST_FILENAME}'

# load the ontologies; no ontology is selected for enhancement here
ontology.configOntology(
    ONTOLOGY_MOUNT_PATH,
    ontology_list,
    ontology_to_enhance=None,
)

# dictionary used when searching for key phrases
ontology.configKeyPhraseSearch(key_phrase_search_path)

# homophone list (word replacement list)
ontology.configHomophone(homophone_list_path)

## III. Audio Signal Processing (DSP)

In this step, each audio file is pre-processed (e.g. filtering and noise reduction) before the actual analysis.

In [None]:
# III. Audio filtering (DSP)
#---------------------------
print('Perforing audio signal processing (DSP)', '---------------------------------------', sep='\n')

# toggle for the butterworth filter stage
butterworth_filter = True

# signal processing helper shared by every file in the loop
filterAudio = SignalProcessing()

# keep only the audio (.wav) entries of the fetched recordings dataset
raw_audio_files = [
    mounted_file
    for mounted_file in raw_recordings_datasets_mounted
    if mounted_file.endswith('.wav')
]

# filter every audio file in turn
for raw_audio_file in raw_audio_files:
    print(f'Performing DSP (filtering) for audio file : {raw_audio_file}')

    # (re)configure the filter for this file - kept inside the loop as in the
    # original, since it is not visible here whether the helper keeps per-file state
    filterAudio.configFilter(
        low_freq_cut=LOW_FREQ_CUTOFF,
        high_freq_cut=HIGH_FREQ_CUTOFF,
        order=FILTER_ORDER,
    )

    # load the audio data; results are held on the helper's own fields
    filterAudio.readAudioFile(
        audio_file_name=raw_audio_file,
        mount_path=RECORDINGS_MOUNT_PATH,
        stereo=False,
    )

    # optional butterworth stage
    if butterworth_filter:
        filterAudio.butterworthFilter()

    # further noise reduction
    filterAudio.fftFilter(
        stationary=True,
        thresh_stationary=THRESH_STATIONARY,
        prop_decrease=PROP_DECREASE,
        freq_smooth=FREQ_MASK_SMOOTH,
    )

    # write the filtered audio to the local DSP results folder
    filterAudio.saveAudioFiltered(
        filtered_audio_file_path=dsp_results_folders,
        volume=12,
        fix_Name=True,
    )

# once all files are processed, also push the results folder to the datastore
filterAudio.saveAudioFilteredtoDatastore(
    datastore=processed_datastore,
    filtered_audio_file_path=dsp_results_folders,
    target_path=RECORDINGS_FOLDER,
)

## IV. Speech to Text Processing

We transcribe the audio files with the Speech service in Azure Cognitive Services to extract the sentences spoken in them.

In [None]:

# IV. Perform the speech-to-text transcription
# --------------------------------------------
print('Performing speech-to-text transcription')
print('---------------------------------------')

# setup
validate = True   # compare against the ground-truth transcripts (WER analysis)
filtered = True   # transcribe the DSP-filtered audio rather than the raw recordings

# initialise the speech to text service wrapper
speech = AzureCognitiveSpeechServices(speech_key=SPEECH_KEY, location=LOCATION, validate=validate, filtered=filtered)

# prepare the ground truth transcripted text
if validate:
    speech.processTranscriptsTruth(f'{TRUTH_MOUNT_PATH}{TRANSCRIPTS_TRUTH_FILENAME}')

# select which folder to point to, and the file-name suffix identifying its audio files
if filtered:
    audio_folder = dsp_results_folders
    file_flag = f'_filtered{FILE_FLAG}'
else:
    audio_folder = RECORDINGS_MOUNT_PATH
    file_flag = FILE_FLAG

# Collect the audio files to transcribe. The directory is listed once, and the
# emptiness check below is made on the *filtered* list: a folder that only holds
# non-matching files must not trigger WER analysis / saving on empty results.
# A missing folder is treated like an empty one so the message below is reached.
if os.path.isdir(audio_folder):
    audio_files = [x for x in os.listdir(audio_folder) if x.endswith(file_flag)]
else:
    audio_files = []

if audio_files:
    # loop through each audio file and perform the speech-to-text process
    for audio_file in audio_files:
        print(f'Starting the filtered transcription for file: {audio_file}')

        # configure the recogniser for this file, adding the ontology phrases
        speech.configSpeechtoText(audio_file, file_path=audio_folder, add_phrases=True, dictionary=ontology.main_dictionary)

        # transcribe - also applying time delay to avoid race conditions
        time.sleep(0.25)
        speech.transcribeAudioFile()

        # format results, with the option of applying homophone list
        speech.transcribeResults(homophone_list = ontology.homophone_list)

    # perform validation (if required)
    if validate:
        speech.werAnalysis()
        print('The average WER is:', speech.transcript_performance_df['wer'].mean())

    # convert results to dataframe also
    speech.processDataframe()

    # save results to datastore
    speech.saveTranscripts(datastore=transcribed_datastore, file_path = transcripts_results_folder,
                            target_path = f'{use_case}/')

else:
    print('could not transcibe the filtered audio files')


## V. NLP Modelling

We continue with NLP modelling on the transcribed data: we tokenize the transcripts, remove stop words, extract nouns, perform key phrase extraction, etc.

These extracted features are used in `Advanced modelling` for classifying the audio files.

In [None]:
# V. NLP analysis
#---------------------
print('Performing NLP modelling')
print('------------------------')

# setup: True reuses the results held by the `speech` object from the previous
# cell, False re-reads the transcripts previously saved to disk
read_from_memory = False

# setup paths and previously transcribed files (if required)
# NOTE: this rebinds `assessed_results_folder`, which the configuration cell also uses
assessed_results_folder =  f'{RESULTS_PATH}{RESULTS_ASSESSED_PATH}{use_case}/'
transcribed_file_path = f'{transcripts_results_folder}{TRANSCRIBED_JSON_FILENAME}'
transcribed_df_file_path = f'{transcripts_results_folder}{TRANSCRIBED_DATAFRAME_FILENAME}'

# read the transcripted dictionary, either from disk or memory
if read_from_memory:
    transcripted_dictionary = speech.transcripted_dict_all
    transcripted_df = speech.transcripted_dataframe_all
else:
    with open(transcribed_file_path, 'r') as read_file:
        transcripted_dictionary = json.load(read_file)
    transcripted_df = pd.read_csv(transcribed_df_file_path, encoding = 'unicode_escape', engine ='python').reset_index(drop=True)


# initialise the NLP service class
nlp = NLPModelling(cogs_url=COGS_URL, nlp_key=TEXT_ANALYTICS_KEY)

# Perform the NLP steps only when there is something to process. A truthiness
# test is used so that an empty dictionary (not just None) takes the `else`
# branch, matching the "is empty" message printed there.
if transcripted_dictionary:

    # perform the tokenization
    nlp.tokenizeTranscript(transcripted_dictionary)

    # perform the stop word filtering - passed the class field as argument
    nlp.removeStopWords(nlp.tokenized_dict_all)

    # extract the nouns from the tokenised dictionary
    nlp.nounExtraction(nlp.filtered_tokenized_dict_all)

    # perform key phrase extraction - on original transcribes
    nlp.keyPhraseExtraction(nlp_url=NLP_KEY_PHRASE_URL, body=transcripted_dictionary)

    # perform non-domain specific NER extraction - on original transcribes
    nlp.nerExtraction(nlp_url=NLP_NER_URL, body=transcripted_dictionary)

    # extract custom key interests - on original transcribes
    nlp.customKeyPhraseExtraction(text_dictionary=transcripted_dictionary, main_dictionary=ontology.main_dictionary,
                                    key_phrase_dictionary=ontology.key_phrase_search_dictionary, word_to_num_dict=ontology.word_to_num_dict)

    # process the NLP results
    nlp.processNLPResults(transcripted_dictionary)

    # save the NLP results to local and datastore
    nlp.saveNLPResults(datastore=assessed_datastore, file_path = assessed_results_folder, target_path = f'{use_case}/',
                        transcript_dataframe=transcripted_df)

else:
    print(f'Transcripted dictionary {transcripted_dictionary} is empty')


## VI. Advanced Modelling

We classify the audio files with machine learning techniques, where the target column is defined in the variable `MESSAGE_CLASSIFICATION_GROUP`.

In [None]:
# VI. ML modelling
# ----------------
print('Performing ML classification modelling')
print('--------------------------------------')

# setup
# NOTE(review): EDA, autoML and heuristic are not referenced by any call in this
# cell; they are kept because other code may read them.
EDA = True

autoML = False
train = True
heuristic = False

# initialise the ml class
ml_modelling = MLModelling()

# prepare the ml dataframe from the NLP results produced by the previous cell
ml_modelling.configMLDataframe(nlp.nlp_dataframe_all, normalize=False)

# reduce the ML dataframe ready for modelling - driven by the `train` toggle
# above instead of a hard-coded True, so changing the toggle takes effect
ml_modelling.prepareMLDataframe(train = train)

# prepare the train and test split
ml_modelling.prepareTrainTestSplit(test_size=0.33)

# apply special case of random forest
ml_modelling.randomForest_special(plot=False, model_path=f'{assessed_results_folder}RandomForest-Special')

# save the ML dataframe (and the full NLP dataframe) locally and to the datastore
ml_modelling.saveMLResults(datastore=assessed_datastore,
                            file_path=assessed_results_folder,
                            ml_file_name='ML_dataframe.csv',
                            all_file_name='NLP_dataframe.csv',
                            target_path = f'{use_case}/',
                            ml_dataframe=ml_modelling.ml_dataframe_all,
                            all_dataframe=nlp.nlp_dataframe_all)


## VII. Submit Experiment

This cell shows an example of using Azure ML to train a machine learning model with the `train.py` script.

You can adapt it to your needs.

In [None]:
## CSV file the training script loads into a pandas DataFrame
ML_DATFRAME_FILENAME = 'NLP_dataframe.csv'

## Location of the training script, relative to the notebook's working directory
base_dir = os.getcwd() + '/'
TRAIN_PATH = f'{base_dir}../'
TRAIN_FILENAME = 'engine/machine_learning/train.py'

## Mount reference to the registered assessed dataset, handed to the script as its data folder
results_file_path_full = assessed_datasets.as_mount()

## Command-line options for the training script, in the order they are passed
train_options = {
    '--data-folder': results_file_path_full,
    '--file-name': ML_DATFRAME_FILENAME,
    '--regularization': 0.5,
    '--workspace': WORKSPACE_NAME,
    '--tenant_id': TENANT_ID,
    '--subscription_id': SUBSCRIPTION_ID,
    '--resource_group': RESOURCE_GROUP,
    '--location': REGION,
    '--datastore': ASSESSED_DATASTORE_NAME,
    '--container_name': ASSESSED_CONATINER_NAME,
}
# flatten to [flag, value, flag, value, ...]; dicts preserve insertion order
train_arg_list = [token for option in train_options.items() for token in option]

## Define configuration for experiments
ml_modelling.configScriptRun(
    arg_list=train_arg_list,
    script_dir=TRAIN_PATH,
    script=TRAIN_FILENAME,
    compute_target=azuremlConfig.compute_target,
    environment=azuremlConfig.environment,
)

## Submit the run; the last expression displays it as the cell output
run = azuremlConfig.experiment.submit(config=ml_modelling.script_run_config)
run


## VIII. Cleanup

In [10]:
# Remove the locally fetched dataset folders: recordings, truth transcripts, ontology.
for mount_path in (RECORDINGS_MOUNT_PATH, TRUTH_MOUNT_PATH, ONTOLOGY_MOUNT_PATH):
    azuremlConfig.removeLocalDatasets(remove_path=mount_path)

Error in: [Errno 2] No such file or directory: '1249120_44142156_65967262.wav'
No dataset files to remove.
No dataset files to remove.
