In [11]:
import os
import json
import re
import math
import string

In [12]:
from num2words import num2words  # Import num2words for number to word conversion
from nltk.corpus import cmudict
cmu_dict = cmudict.dict() # Initialize CMU Pronouncing Dictionary

In [13]:
import numpy as np
import whisper # whisperopenai

## Preparing filenames and their respective full filepaths

In [14]:
# Using the base directory to generate a list of filenames, and full filepaths
def file_names_paths(mooc_base_dir):
    """Collect every filename, and its full path, found one level below ``mooc_base_dir``.

    Each non-hidden sub-directory of ``mooc_base_dir`` is visited in sorted
    order and its entries are listed in sorted order.

    Parameters
    ----------
    mooc_base_dir : str
        Directory whose immediate sub-directories hold the audio files.

    Returns
    -------
    tuple[list[str], list[str]]
        ``(filenames, filepaths)``; the two lists are index-aligned, so
        ``filepaths[i]`` is the full path of ``filenames[i]``.
    """
    subfolders = []
    for entry in sorted(os.listdir(mooc_base_dir)):
        candidate = os.path.join(mooc_base_dir, entry)
        # The first sorted entry used to be dropped unconditionally on the
        # assumption that it was macOS' ".DS_Store". That silently lost a real
        # folder whenever no such file existed, so hidden entries and plain
        # files are now skipped explicitly instead.
        if entry.startswith('.') or not os.path.isdir(candidate):
            continue
        subfolders.append(candidate)

    flatten_mooc_filenames = []
    mooc_filepaths = []
    for subfolder in subfolders:
        for item in sorted(os.listdir(subfolder)):
            flatten_mooc_filenames.append(item)
            mooc_filepaths.append(os.path.join(subfolder, item))
    return flatten_mooc_filenames, mooc_filepaths


In [15]:
# Build the flat filename list and the matching full-path list for the corpus;
# the helper defined right below reports the intelligible/unintelligible split.
mooc_base_dir = "./mooc_audio"
flatten_mooc_filenames, mooc_filepaths = file_names_paths(mooc_base_dir)
def count_labels(mooc_filenames):
    """Print and return how many filenames carry each intelligibility label.

    A filename ending in ``'1.mp3'`` counts as "intelligible"; every other
    filename counts as "unintelligible".

    Parameters
    ----------
    mooc_filenames : iterable of str
        Filenames to tally.

    Returns
    -------
    tuple[int, int]
        ``(intelligible_count, unintelligible_count)``. Earlier this function
        returned ``None``, which made assigning its result pointless.
    """
    intelligible_count = 0
    unintelligible_count = 0
    for name in mooc_filenames:
        if name.endswith('1.mp3'):
            intelligible_count += 1
        else:
            unintelligible_count += 1
    print(f'There are {intelligible_count} "intelligible" labels,')
    print(f'and {unintelligible_count } "unintelligible" labels')
    return intelligible_count, unintelligible_count

In [16]:
# Sanity check: total number of files, then the first five names and paths
print(len(flatten_mooc_filenames))
print(flatten_mooc_filenames[:5])
print(mooc_filepaths[:5])

10230
['series01-s00000-Paul-1.mp3', 'series01-s000011-Paul-0.mp3', 'series01-s000012-Paul-1.mp3', 'series01-s000013a-Paul-0.mp3', 'series01-s000013b-Paul-0.mp3']
['./mooc_audio/Paul/series01-s00000-Paul-1.mp3', './mooc_audio/Paul/series01-s000011-Paul-0.mp3', './mooc_audio/Paul/series01-s000012-Paul-1.mp3', './mooc_audio/Paul/series01-s000013a-Paul-0.mp3', './mooc_audio/Paul/series01-s000013b-Paul-0.mp3']


In [17]:
# Report the intelligible/unintelligible label split over the whole corpus
count_labels(flatten_mooc_filenames)


There are 5726 "intelligible" labels,
and 4504 "unintelligible" labels


In [18]:
# Splitting the filenames and filepaths into equal distributions of test and dev 
def testdev_split(a, b):
    dev_filenames = []
    test_filenames = []
    dev_filepaths = []
    test_filepaths = []
    for (i_filename, filename), (i_filepath, filepath) in zip(enumerate(a),enumerate(b)):       
        if i_filename%2 == 0:
            dev_filenames.append(filename)
        elif i_filename%2 == 1:
            test_filenames.append(filename)    
    #for i, filepath in enumerate(b):  
        if i_filepath%2 == 0:
            dev_filepaths.append(filepath)
        elif i_filepath%2 == 1:
            test_filepaths.append(filepath)
    return dev_filenames, test_filenames, dev_filepaths, test_filepaths
    

In [19]:
# Split the corpus into dev/test halves: dev_fn/test_fn are filenames and
# dev_fp/test_fp are the corresponding full file paths.
dev_fn, test_fn, dev_fp, test_fp = testdev_split(flatten_mooc_filenames, mooc_filepaths)

In [20]:
# Preview a slice of each split, with a blank line between the four listings
print(dev_fn[5:10], dev_fp[:10], test_fn[:10], test_fp[:10], sep="\n\n")

['series01-s000026-Paul-0.mp3', 'series01-s000032-Paul-1.mp3', 'series01-s000036-Paul-0.mp3', 'series01-s00004-Paul-0.mp3', 'series01-s000043-Paul-0.mp3']

['./mooc_audio/Paul/series01-s00000-Paul-1.mp3', './mooc_audio/Paul/series01-s000012-Paul-1.mp3', './mooc_audio/Paul/series01-s000013b-Paul-0.mp3', './mooc_audio/Paul/series01-s000017-Paul-0.mp3', './mooc_audio/Paul/series01-s000021-Paul-0.mp3', './mooc_audio/Paul/series01-s000026-Paul-0.mp3', './mooc_audio/Paul/series01-s000032-Paul-1.mp3', './mooc_audio/Paul/series01-s000036-Paul-0.mp3', './mooc_audio/Paul/series01-s00004-Paul-0.mp3', './mooc_audio/Paul/series01-s000043-Paul-0.mp3']

['series01-s000011-Paul-0.mp3', 'series01-s000013a-Paul-0.mp3', 'series01-s000014-Paul-1.mp3', 'series01-s000018-Paul-0.mp3', 'series01-s000023-Paul-0.mp3', 'series01-s000030-Paul-1.mp3', 'series01-s000034-Paul-0.mp3', 'series01-s000038-Paul-0.mp3', 'series01-s000042-Paul-1.mp3', 'series01-s000046-Paul-0.mp3']

['./mooc_audio/Paul/series01-s000011-Pau

In [21]:
# Print the intelligible/unintelligible label counts for the dev and test splits
distribute_dev = count_labels(dev_fn)
distribute_test = count_labels(test_fn)

There are 2858 "intelligible" labels,
and 2257 "unintelligible" labels
There are 2868 "intelligible" labels,
and 2247 "unintelligible" labels


In [22]:
# Load the Whisper English-only base model. `model` is the module-level name
# that whisper_top_res reads. (Author TODO: switch to another model later.)
model = whisper.load_model('base.en')

In [23]:
# turning it into a function( WITH PRESET TEMPERATURE)
def whisper_top_res(audio_file, temperature, num_samples=20):
    """Decode one audio file repeatedly with Whisper and collect every hypothesis.

    Relies on the module-level ``model`` loaded via ``whisper.load_model``.

    Parameters
    ----------
    audio_file : str
        Path of the audio file to transcribe.
    temperature : float
        Sampling temperature handed to ``whisper.DecodingOptions``.
    num_samples : int, optional
        How many times the same audio is decoded (default 20, the value that
        used to be hard-coded).

    Returns
    -------
    list[dict]
        One ``{'text': ..., 'avg_logprob': ...}`` dict per decode, in the
        order the decodes were run.
    """
    audiosamp = whisper.load_audio(audio_file)
    audiosamp = whisper.pad_or_trim(audiosamp)  # pad/trim it to fit 30 seconds
    mel = whisper.log_mel_spectrogram(audiosamp).to(model.device)
    options = whisper.DecodingOptions(language="en", temperature=temperature, best_of=1)

    # The mel spectrogram and options are loop-invariant, so only the decode
    # itself is repeated. An extra decode whose result was thrown away used to
    # run before this loop; it has been removed.
    res_list = []
    for _ in range(num_samples):
        result = whisper.decode(model, mel, options)
        res_list.append({'text': result.text, 'avg_logprob': result.avg_logprob})

    return res_list

In [25]:
# Smoke test: run the repeated decoding on one sample file at temperature 0.4
whisper_top_res('previous_data_audio/furong_dev_all/s03_pause_1.mp3', 0.4)

[{'text': 'pause', 'avg_logprob': -1.2353898286819458},
 {'text': 'pause', 'avg_logprob': -1.6766149997711182},
 {'text': 'pause', 'avg_logprob': -1.6043052673339844},
 {'text': 'Buzz.', 'avg_logprob': -1.7566905975341798},
 {'text': 'pause', 'avg_logprob': -1.5188400745391846},
 {'text': 'Both.', 'avg_logprob': -1.3413328170776366},
 {'text': 'pause', 'avg_logprob': -1.2353898286819458},
 {'text': 'pause', 'avg_logprob': -1.2353898286819458},
 {'text': 'pause', 'avg_logprob': -1.9460774660110474},
 {'text': 'pause', 'avg_logprob': -2.0537474155426025},
 {'text': 'pause', 'avg_logprob': -1.7509652376174927},
 {'text': 'pause', 'avg_logprob': -1.2353898286819458},
 {'text': 'pause', 'avg_logprob': -1.2353898286819458},
 {'text': 'pause', 'avg_logprob': -1.5169219970703125},
 {'text': 'pause', 'avg_logprob': -1.2353898286819458},
 {'text': 'Both.', 'avg_logprob': -1.6104118347167968},
 {'text': 'both.', 'avg_logprob': -1.5019969940185547},
 {'text': 'pause', 'avg_logprob': -1.98871302604

In [26]:
## Using dummy files: the first 15 dev items serve as a small trial set
filenames_dm = dev_fn[:15]
filepaths_dm = dev_fp[:15]
print(filenames_dm, filepaths_dm, sep="\n\n")

['series01-s00000-Paul-1.mp3', 'series01-s000012-Paul-1.mp3', 'series01-s000013b-Paul-0.mp3', 'series01-s000017-Paul-0.mp3', 'series01-s000021-Paul-0.mp3', 'series01-s000026-Paul-0.mp3', 'series01-s000032-Paul-1.mp3', 'series01-s000036-Paul-0.mp3', 'series01-s00004-Paul-0.mp3', 'series01-s000043-Paul-0.mp3', 'series01-s00007-Paul-1.mp3', 'series02-s00000-Paul-1.mp3', 'series02-s00008-Paul-1.mp3', 'series02-s00016-Paul-1.mp3', 'series02-s00021-Paul-0.mp3']

['./mooc_audio/Paul/series01-s00000-Paul-1.mp3', './mooc_audio/Paul/series01-s000012-Paul-1.mp3', './mooc_audio/Paul/series01-s000013b-Paul-0.mp3', './mooc_audio/Paul/series01-s000017-Paul-0.mp3', './mooc_audio/Paul/series01-s000021-Paul-0.mp3', './mooc_audio/Paul/series01-s000026-Paul-0.mp3', './mooc_audio/Paul/series01-s000032-Paul-1.mp3', './mooc_audio/Paul/series01-s000036-Paul-0.mp3', './mooc_audio/Paul/series01-s00004-Paul-0.mp3', './mooc_audio/Paul/series01-s000043-Paul-0.mp3', './mooc_audio/Paul/series01-s00007-Paul-1.mp3', '

## Transcribing multiple predictions with Whisper, analogous to wav2vec's `multi_w2v_transcribe_audio_folder`

In [27]:
# Function to transcribe audio files in a specific folder and save the transcriptions to JSON files
def _dump_json_creating_dirs(payload, path):
    """Write ``payload`` as indented JSON to ``path``, creating parent folders first."""
    parent = os.path.dirname(path)
    if parent:  # os.makedirs('') raises, so only create when there is a folder part
        os.makedirs(parent, exist_ok=True)
    with open(path, 'w') as json_file:
        json.dump(payload, json_file, indent=4)


def mooc_multi_whisper_transcribe_audio_folder(filenames, filepaths, temperature, transcriptions_info_path, raw_text_path):
    """Transcribe each ``.mp3`` file several times and save the results as two JSON files.

    Filenames are expected to look like ``<part0>-<part1>-<word>-<label>.mp3``.
    ``<part0>-<part1>`` becomes the student ID, and a ``<label>`` of ``'1'``
    means "intelligible" (anything else means "unintelligible"). Entries that
    do not end in ``.mp3`` are skipped.

    Parameters
    ----------
    filenames : iterable of str
        Bare filenames, index-aligned with ``filepaths``.
    filepaths : iterable of str
        Full paths passed to ``whisper_top_res``.
    temperature : float
        Sampling temperature forwarded to ``whisper_top_res``.
    transcriptions_info_path : str
        Output JSON holding, per file, the student ID, word, true label and
        the list of hypotheses.
    raw_text_path : str
        Output JSON mapping each file (without extension) to its hypotheses.

    Returns
    -------
    None
        Results are written to disk. Missing output folders are created so
        that a long transcription run is not lost at write time.
    """
    raw_text = {}
    transcriptions_info = {}

    for filename, filepath in zip(filenames, filepaths):
        if not filename.endswith(".mp3"):
            continue

        base_filename = filename[:-4]  # drop the '.mp3' extension
        parts = base_filename.split('-')  # studentID (two parts), word, trueLabel
        studentID = parts[0] + '-' + parts[1]
        word = parts[2]
        trueLabel = 'intelligible' if parts[3] == '1' else 'unintelligible'

        result = whisper_top_res(filepath, temperature)

        # A single "segment" wraps the whole list of hypotheses; the key name
        # distinguishes it from single-result outputs.
        segments_info = [{"text and avg_logprob": result}]

        transcriptions_info[base_filename] = {
            "studentID": studentID,
            "word": word,
            "trueLabel": trueLabel,
            "segments": segments_info
        }
        raw_text[base_filename] = result

    _dump_json_creating_dirs(transcriptions_info, transcriptions_info_path)
    print(f"Transcription info saved to {transcriptions_info_path}")

    _dump_json_creating_dirs(raw_text, raw_text_path)
    print(f"Raw transcription saved to {raw_text_path}")

In [28]:
def mooc_multi_batch_transcribe_with_different_temperatures(filenames, filepaths, temperature_values, info_paths, raw_text_paths):
    """Run the folder transcription once per temperature.

    ``temperature_values``, ``info_paths`` and ``raw_text_paths`` are consumed
    in lockstep: the i-th temperature writes to the i-th info path and the
    i-th raw-text path. The same ``filenames``/``filepaths`` are used each time.
    """
    runs = zip(temperature_values, info_paths, raw_text_paths)
    for run in runs:
        temperature, info_out, raw_out = run
        mooc_multi_whisper_transcribe_audio_folder(filenames, filepaths, temperature, info_out, raw_out)

In [29]:
# Temperatures for the trial run; the output paths below follow a naming
# convention similar to the wav2vec outputs.
dummy_temperature_values = [0.4, 0.5]

# One output path per temperature (04 -> 0.4, 05 -> 0.5). "whisper" is added to
# the JSON file name after "result".
dummy_multi_whisper_transcription_info_paths =  ["./tst_json_files/transcription_output/whisper_raw_transcription_info/multi_raw_info_result_whisper04.json",
                                               "./tst_json_files/transcription_output/whisper_raw_transcription_info/multi_raw_info_result_whisper05.json"]

# Raw-text outputs, index-aligned with the temperatures above.
dummy_multi_whisper_raw_text_paths =  ["./tst_json_files/transcription_output/raw_text/multi_raw_text_result_whisper04.json",
                                     "./tst_json_files/transcription_output/raw_text/multi_raw_text_result_whisper05.json",]


In [None]:
## CALLING
# DUMMY VERSION: transcribe the 15-file trial set once per dummy temperature
mooc_multi_batch_transcribe_with_different_temperatures(filenames_dm,filepaths_dm, dummy_temperature_values, dummy_multi_whisper_transcription_info_paths, dummy_multi_whisper_raw_text_paths)


In [None]:
# Load the JSON data from a file
def load_json(json_file_path):
    """Read the JSON document stored at ``json_file_path`` and return the decoded object."""
    with open(json_file_path, 'r') as handle:
        return json.load(handle)

# Preprocesses the text by removing leading spaces and punctuations, and converting to lowercase.
def preprocess_text(text):
    """Normalise a transcription: trim surrounding whitespace, drop ASCII punctuation, lowercase."""
    drop_punctuation = str.maketrans('', '', string.punctuation)
    return text.strip().translate(drop_punctuation).lower()

# Convert numbers to words if necessary
def convert_number_to_word(text):
    """Spell out every whitespace-separated token that is a plain decimal number.

    Parameters
    ----------
    text : str
        Input text; it is split on whitespace.

    Returns
    -------
    str
        The tokens re-joined with single spaces, with each all-decimal token
        replaced by ``num2words(int(token))``.
    """
    converted_words = []
    for word in text.split():
        # str.isdecimal() is used instead of str.isdigit(): isdigit() is also
        # true for characters such as superscript '²', which int() rejects
        # with ValueError.
        if word.isdecimal():
            converted_words.append(num2words(int(word)))
        else:
            converted_words.append(word)
    return ' '.join(converted_words)

# Function to retrieve phonetic representations from CMU Pronouncing Dictionary.
def get_phonetic_representation(word):
    """Look up ``word`` (lowercased) in ``cmu_dict`` and return its first pronunciation.

    The phonemes of the first listed pronunciation are joined with spaces.
    ``'N/A'`` is returned when the lookup yields nothing.
    """
    pronunciations = cmu_dict.get(word.lower())
    if not pronunciations:
        return 'N/A'  # word not found in the dictionary
    return ' '.join(pronunciations[0])


In [None]:
## TESTING DUMMY
# NOTE(review): this folder ("raw_transcription_info") differs from the one the
# dummy run writes to ("whisper_raw_transcription_info"), and `dummy04` is not
# referenced anywhere else in the visible notebook -- confirm or remove.
dummy04 = "./tst_json_files/transcription_output/raw_transcription_info/multi_raw_info_result_whisper04.json"


In [None]:
# Preprocesses the segment texts and calculates the confidence score for each transcription -- modified version
def model_preprocess_and_calculate_multi(json_file_path):
    """Clean every hypothesis in a transcription-info JSON and attach a confidence score.

    For each audio entry, every ``{'text', 'avg_logprob'}`` hypothesis stored
    under ``segments[0]["text and avg_logprob"]`` is turned into a record with:

    * ``'model_transcription'`` -- the text after ``preprocess_text`` and
      ``convert_number_to_word``;
    * ``'confidence score'`` -- ``round(exp(avg_logprob), 3)``, or ``0`` when
      the raw text is empty;
    * ``'model_phonetic_rep'`` -- ``get_phonetic_representation`` of the
      cleaned transcription.

    Parameters
    ----------
    json_file_path : str
        Path of the transcription-info JSON to load.

    Returns
    -------
    dict
        Maps each entry key to a dict with ``studentID``, ``true_transcription``,
        ``trueLabel``, ``true_phonetic_rep`` and ``"top res"`` (the list of
        records above, in input order; empty if the entry had no hypotheses).
    """
    data = load_json(json_file_path)
    preprocessed_results = {}

    for key, value in data.items():
        studentID = value["studentID"]
        word = value["word"]
        trueLabel = value["trueLabel"]
        segments = value["segments"]

        # Reference side: spell out numbers, then look up the pronunciation.
        true_transcription_stripped = convert_number_to_word(word)
        true_phonetic_rep = get_phonetic_representation(true_transcription_stripped)

        # List of dicts, each holding one hypothesis text and its avg_logprob.
        text_avglog_list = segments[0]["text and avg_logprob"]

        # Built once per entry. Previously the list was rebuilt from scratch on
        # every hypothesis and was left undefined when there were none.
        lst_topn = []
        for candidate in text_avglog_list:
            raw_text = candidate['text']
            if len(raw_text) == 0:
                # No transcription at all: confidence is forced to 0.
                confidence_score = 0
                preprocessed_text = ''
                print(f"No transcriptions for audio file: {studentID}_{word}")
            else:
                confidence_score = round(math.exp(candidate["avg_logprob"]), 3)
                preprocessed_text = preprocess_text(raw_text)

            model_transcription = convert_number_to_word(preprocessed_text.strip())
            lst_topn.append({
                'model_transcription': model_transcription,
                'confidence score': confidence_score,
                'model_phonetic_rep': get_phonetic_representation(model_transcription),
            })

        preprocessed_results[key] = {
            "studentID": studentID,
            "true_transcription": word,
            "trueLabel": trueLabel,
            "true_phonetic_rep": true_phonetic_rep,
            "top res": lst_topn
        }

    return preprocessed_results

In [None]:
# Path of the transcription-info JSON to preprocess (example: temperature 0.4).
# NOTE(review): this path (".../raw_transcription_info/multi_raw_info_1result_whisper04.json")
# is not one of the paths written by the dummy transcription run earlier in the
# notebook (".../whisper_raw_transcription_info/multi_raw_info_result_whisper04.json")
# -- confirm the file exists or point this at the intended output.
multi_whisper_transcription_info_path04 =  "./tst_json_files/transcription_output/raw_transcription_info/multi_raw_info_1result_whisper04.json"
# Preprocess the JSON using model_preprocess_and_calculate_multi
whisper_multi_res04 = model_preprocess_and_calculate_multi(multi_whisper_transcription_info_path04)


In [None]:
# Output path for the preprocessed results (temperature 0.4)
multi_whisper_output_json_file_path04 = "./tst_json_files/transcription_output/processed_output/multi_whisper_output04.json"

# Save the preprocessed results as indented JSON
with open(multi_whisper_output_json_file_path04, 'w') as json_file:
    json.dump(whisper_multi_res04, json_file, indent=4)
print(f"Preprocessed results saved to {multi_whisper_output_json_file_path04}")  

In [None]:
# A FUNCTION TO FILTER NUMBER OF RESULTS USING PROCESSED RESULTS
def filter_topn(whisper_multi_res):
    """Keep, per audio entry, the first record of each distinct transcription.

    Parameters
    ----------
    whisper_multi_res : dict
        Preprocessed results; each value must hold a ``'top res'`` list of
        records. The transcription is read from ``'model_transcription'``
        (the key the preprocessing step writes), falling back to the older
        ``'whisper_transcription'`` key that this function used to require.

    Returns
    -------
    tuple[list[tuple[int, list[dict]]], list[int]]
        ``(fnl_lst_full, freq_lst)``: for each entry, ``(n_unique, records)``
        with the de-duplicated records in first-seen order, plus a parallel
        list holding just the ``n_unique`` values.
    """
    fnl_lst_full = []
    freq_lst = []
    for value in whisper_multi_res.values():
        seen = set()  # transcriptions already kept for this entry
        lst_full = []
        for item in value['top res']:
            if 'model_transcription' in item:
                transcription = item['model_transcription']
            else:
                transcription = item['whisper_transcription']
            if transcription not in seen:
                seen.add(transcription)
                lst_full.append(item)
        fnl_lst_full.append((len(lst_full), lst_full))
        freq_lst.append(len(lst_full))
    return fnl_lst_full, freq_lst

In [None]:
# De-duplicate the hypotheses of each audio file, then print how many distinct
# transcriptions each file produced
fnl_lst04,freq_lst04 =filter_topn(whisper_multi_res04) # TRYING TO PRINT THE TOP N RESULT OF EACH AUDIO
print(freq_lst04)

In [None]:
# Show the (count, de-duplicated records) pair for every audio file
print(fnl_lst04)

In [None]:
# A method to calculate the frequency of the number of top-N results
