In [141]:
import math
import os
import json
import string

In [139]:
from transformers import Wav2Vec2Tokenizer, Wav2Vec2ForCTC
from datasets import load_dataset
import torch

In [140]:
import soundfile as sf
import librosa

In [142]:
from IPython.display import Audio
from scipy.io import wavfile
import numpy as np

In [144]:
import logging #to disable warnings

# Let only CRITICAL records from the "pyctcdecode.alphabet" logger through,
# which hides the warnings it would otherwise emit.
logger = logging.getLogger("pyctcdecode.alphabet")
logger.setLevel(logging.CRITICAL) 

In [143]:
from num2words import num2words  # Import num2words for number to word conversion
from nltk.corpus import cmudict
# Word -> pronunciations lookup used by get_phonetic_representation and the coverage check.
# NOTE(review): presumably needs the NLTK "cmudict" corpus to be downloaded beforehand -- confirm.
cmu_dict = cmudict.dict() # Initialize CMU Pronouncing Dictionary

In [145]:
from transformers import Wav2Vec2ProcessorWithLM

In [146]:
# Processor that wav2vec_top_res uses to prepare the model inputs and to read the tokenizer vocabulary.
# NOTE(review): this checkpoint name differs from the one the acoustic model is loaded from --
# confirm that both share the same vocabulary.
processor = Wav2Vec2ProcessorWithLM.from_pretrained("patrickvonplaten/wav2vec2-base-100h-with-lm")

Fetching 4 files:   0%|          | 0/4 [00:00<?, ?it/s]

In [147]:
# CTC model whose logits are decoded in wav2vec_top_res.
model = Wav2Vec2ForCTC.from_pretrained("facebook/wav2vec2-base-100h")

Some weights of the model checkpoint at facebook/wav2vec2-base-100h were not used when initializing Wav2Vec2ForCTC: ['wav2vec2.mask_time_emb_vector']
- This IS expected if you are initializing Wav2Vec2ForCTC from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing Wav2Vec2ForCTC from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of Wav2Vec2ForCTC were not initialized from the model checkpoint at facebook/wav2vec2-base-100h and are newly initialized: ['wav2vec2.masked_spec_embed']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [148]:
# Single recording used below to listen to and to try out the decoder.
sample_file_name = 'previous_data_audio/furong_dev_all/s04_bard_0.mp3'

In [149]:
# Render an inline audio player for the sample recording.
Audio(sample_file_name)

In [150]:
# top few result using wav2vec and turning it into a function
def wav2vec_top_res(audio_file, top_n=10, beam_width=50):
    """Return the best CTC beam-search hypotheses for one audio file.

    The audio is loaded at 16 kHz, scored by the module-level ``model`` and
    decoded with a pyctcdecode beam search that uses no language model.

    Args:
        audio_file: Path of the audio file to transcribe.
        top_n: Maximum number of beams to keep (default 10).
        beam_width: Beam width handed to the decoder (default 50).

    Returns:
        A list of ``{'text': str, 'avg_logprob': float}`` dicts, sorted by
        ``avg_logprob`` in descending order. ``avg_logprob`` is the beam score
        divided by the number of non-space characters of its text (at least 1).
    """
    from pyctcdecode import build_ctcdecoder

    audio, _ = librosa.load(audio_file, sr=16000)
    inputs = processor(audio, sampling_rate=16000, return_tensors="pt", return_attention_mask=True, padding=True)

    # Only input_values are fed to the model, exactly as before.
    with torch.no_grad():
        logits = model(inputs.input_values).logits  # [batch_size, seq_len, vocab_size]

    # Order the labels by token id so that label i lines up with logit column i
    # instead of relying on the iteration order of the vocabulary dict.
    vocab = processor.tokenizer.get_vocab()
    vocab_list = sorted(vocab, key=vocab.get)

    decoder = build_ctcdecoder(vocab_list, None)
    beam_results = decoder.decode_beams(logits[0].cpu().numpy(), beam_width=beam_width)

    # No exp()/normalisation of the raw beam scores is done here: the values were
    # never used and exp() of very negative scores underflows to 0.0, which made
    # the normalising division fail.
    res_list = []
    for beam in beam_results[:top_n]:
        text = beam[0]
        score = beam[3]  # beam score; pyctcdecode documents this slot as the logit score
        num_tokens = len(text.replace(" ", ""))
        # Length-normalise so that short and long hypotheses are comparable.
        res_list.append({'text': text, 'avg_logprob': float(score / max(1, num_tokens))})

    res_list.sort(key=lambda x: x['avg_logprob'], reverse=True)
    return res_list
    

In [151]:
wav2vec_top_res(sample_file_name) # show the hypotheses (best avg_logprob first) for the sample file

[{'text': 'BAD', 'avg_logprob': -0.3500041755706074},
 {'text': 'BADE', 'avg_logprob': -0.7687228634394133},
 {'text': 'BAUD', 'avg_logprob': -0.8009523313731263},
 {'text': 'BARD', 'avg_logprob': -0.8601239986529743},
 {'text': 'BATD', 'avg_logprob': -0.9871410878691836},
 {'text': 'BADD', 'avg_logprob': -1.0157532510521434},
 {'text': 'BAHD', 'avg_logprob': -1.031194387455914},
 {'text': 'BOD', 'avg_logprob': -1.0692155284770475},
 {'text': 'BAWD', 'avg_logprob': -1.0902448205502724},
 {'text': 'BA', 'avg_logprob': -2.193876411815085}]

In [152]:
# Using the base directory to generate a list of filenames, and full filepaths
def file_names_paths(mooc_base_dir):
    """List every file found one level below ``mooc_base_dir``.

    Only visible sub-directories of ``mooc_base_dir`` are visited, in sorted
    order. Hidden entries such as ``.DS_Store`` and stray files at the top
    level are skipped explicitly, rather than dropping whatever entry happens
    to sort first.

    Args:
        mooc_base_dir: Directory that contains one sub-folder per word.

    Returns:
        A tuple ``(flatten_mooc_filenames, mooc_filepaths)`` of two parallel
        lists: the bare file names and the corresponding joined paths.
        Entries inside a sub-folder are listed in sorted order and are not
        filtered.
    """
    subfolders = sorted(
        path
        for path in (os.path.join(mooc_base_dir, entry)
                     for entry in os.listdir(mooc_base_dir)
                     if not entry.startswith('.'))
        if os.path.isdir(path)
    )

    flatten_mooc_filenames = []
    mooc_filepaths = []
    for subfolder in subfolders:
        for item in sorted(os.listdir(subfolder)):
            flatten_mooc_filenames.append(item)
            mooc_filepaths.append(os.path.join(subfolder, item))
    return flatten_mooc_filenames, mooc_filepaths

# Count how many file names carry the intelligible label and how many do not

def count_split(mooc_filenames):
    """Print how many names end in '1.mp3' (intelligible) and how many do not.

    Every name that does not end in '1.mp3' is counted as unintelligible.
    Nothing is returned.
    """
    flags = [name.endswith('1.mp3') for name in mooc_filenames]
    intelligible_count = sum(flags)
    unintelligible_count = len(flags) - intelligible_count
    print(f'There are {intelligible_count} "intelligible" labels,')
    print(f'and {unintelligible_count } "unintelligible" labels')
    return None

# Split the filenames and filepaths alternately: even positions go to dev, odd positions go to test
def split_only(a, b):
    """Alternate paired items of ``a`` and ``b`` between a dev and a test split.

    ``a`` (file names) and ``b`` (file paths) are walked in lockstep; pairs at
    even positions go to dev, pairs at odd positions go to test. Walking stops
    at the end of the shorter input.

    Returns:
        ``(dev_filenames, test_filenames, dev_filepaths, test_filepaths)``.
    """
    dev_filenames, test_filenames = [], []
    dev_filepaths, test_filepaths = [], []
    for position, (name, path) in enumerate(zip(a, b)):
        if position % 2:
            test_filenames.append(name)
            test_filepaths.append(path)
        else:
            dev_filenames.append(name)
            dev_filepaths.append(path)
    return dev_filenames, test_filenames, dev_filepaths, test_filepaths

In [153]:
# CALLING... / TO BE DELETED LATER (REDUNDANCY)
# List every file below the base directory, then alternate the files between dev and test.
dummooc_base_dir = "./mooc_audio"
flatten_mooc_filenames, mooc_filepaths = file_names_paths(dummooc_base_dir)
dev_fn, test_fn, dev_fp, test_fp = split_only(flatten_mooc_filenames, mooc_filepaths)

In [154]:
# Show the first five collected file names as a quick sanity check.
print(flatten_mooc_filenames[0:5])

['series01-s00000-Paul-1.mp3', 'series01-s000011-Paul-0.mp3', 'series01-s000012-Paul-1.mp3', 'series01-s000013a-Paul-0.mp3', 'series01-s000013b-Paul-0.mp3']


## Here we generate two JSON files. For every audio file, each JSON stores the top 10 hypotheses together with their average log-probabilities.

In [155]:
# Function to transcribe audio files in a specific folder and save the transcriptions to JSON files
# it takes the filenames and filepaths of each file and output a json file of raw text with avg log prob and info text with av log prob
def mooc_multi_w2v_transcribe_audio_folder(filenames, filepaths, transcriptions_info_path, raw_text_path):
    """Transcribe paired MOOC audio files and save the hypotheses as two JSON files.

    File names are expected to look like ``<series>-<student>-<word>-<label>.mp3``;
    names that do not end in ``.mp3`` are skipped.

    Args:
        filenames: Bare file names, parallel to ``filepaths``.
        filepaths: Paths handed to ``wav2vec_top_res``.
        transcriptions_info_path: Output JSON holding studentID, word, trueLabel
            and the hypotheses of every file.
        raw_text_path: Output JSON holding only the hypotheses of every file.

    Missing parent directories of both output paths are created.
    """
    raw_text = {}
    transcriptions_info = {}

    for filename, filepath in zip(filenames, filepaths):
        if not filename.endswith(".mp3"):
            continue
        base_filename = filename[:-4]  # drop the '.mp3' extension
        parts = base_filename.split('-')
        studentID = parts[0] + '-' + parts[1]
        word = parts[2]
        trueLabel = 'intelligible' if parts[3] == '1' else 'unintelligible'

        result = wav2vec_top_res(filepath)

        transcriptions_info[base_filename] = {
            "studentID": studentID,
            "word": word,
            "trueLabel": trueLabel,
            "segments": [{'text and avg_logprob': result}]
        }
        raw_text[base_filename] = result

    outputs = (
        (transcriptions_info_path, transcriptions_info, "Transcription info"),
        (raw_text_path, raw_text, "Raw transcription"),
    )
    for out_path, payload, label in outputs:
        parent = os.path.dirname(out_path)
        if parent:  # a bare file name has no directory to create
            os.makedirs(parent, exist_ok=True)
        with open(out_path, 'w') as json_file:
            json.dump(payload, json_file, indent=4)
        print(f"{label} saved to {out_path}")

In [156]:
# Output locations for the dev split: detailed transcription info and raw hypotheses.
multi_w2v_transcription_info_path =  "./tst_json_files/transcription_output/w2v_raw_transcription_info/mooc_multi_raw_info_result.json"
multi_w2v_raw_text_path =  "./tst_json_files/transcription_output/w2v_raw_text/mooc_multi_raw_text_result.json"


In [157]:
# Development split: file names and full paths as returned by split_only
filenames_mooc = dev_fn
filepaths_mooc = dev_fp

In [158]:
## Transcribe the dev files and write two JSON files: the info file (studentID, word,
## true label, hypotheses with avg log prob) and the raw file (hypotheses with avg log prob only)
mooc_multi_w2v_transcribe_audio_folder(filenames_mooc, filepaths_mooc, multi_w2v_transcription_info_path, multi_w2v_raw_text_path)

Transcription info saved to ./tst_json_files/transcription_output/w2v_raw_transcription_info/mooc_multi_raw_info_result.json
Raw transcription saved to ./tst_json_files/transcription_output/w2v_raw_text/mooc_multi_raw_text_result.json


In [159]:
## Inspect the first entry of the saved transcription info.
# The file is read with json directly so this cell does not depend on a helper
# that is only defined further down in the notebook.
with open(multi_w2v_transcription_info_path, 'r') as file:
    loaded_f = json.load(file)
for k, v in loaded_f.items():
    print(k)
    print(v)
    break

series01-s00000-Paul-1
{'studentID': 'series01-s00000', 'word': 'Paul', 'trueLabel': 'intelligible', 'segments': [{'text and avg_logprob': [{'text': 'CALLIN', 'avg_logprob': -0.2531940364808277}, {'text': 'CALLIM', 'avg_logprob': -0.36811439275183533}, {'text': 'CALLING', 'avg_logprob': -0.4035371777119488}, {'text': 'CALLIMG', 'avg_logprob': -0.501103558324397}, {'text': 'COLLIN', 'avg_logprob': -0.5485117393676265}, {'text': 'CALL IN', 'avg_logprob': -0.6171833224746351}, {'text': 'COLLIM', 'avg_logprob': -0.6703274062938022}, {'text': 'CALLEN', 'avg_logprob': -0.717398567663285}, {'text': 'CALLON', 'avg_logprob': -0.7192308144436805}, {'text': 'CALLI', 'avg_logprob': -0.8514732434352075}]}]}


In [160]:
# using the json path to obtain student and series numbers
def get_series_student_numbers(info_path):
    """Print the number of distinct student numbers, then of distinct series.

    Each entry's ``studentID`` is split on '-': the first part is taken as the
    series and the last part as the student number.

    Args:
        info_path: Path of a transcription-info JSON file.
    """
    # Read the file directly so the function works before any JSON helper is defined.
    with open(info_path, 'r') as file:
        loaded_f = json.load(file)

    student_numbers = set()
    series_numbers = set()
    for entry in loaded_f.values():
        parts = entry['studentID'].split('-')
        student_numbers.add(parts[-1])
        series_numbers.add(parts[0])

    # NOTE(review): student numbers are counted without their series prefix; if
    # numbering restarts in every series this undercounts the students -- confirm.
    print(len(student_numbers))
    print(len(series_numbers))
## Print the counts for the dev transcription info (student numbers first, then series)
get_series_student_numbers(multi_w2v_transcription_info_path)

170
7


In [161]:
# Load the JSON data from a file
def load_json(json_file_path):
    """Read the JSON file at ``json_file_path`` and return the parsed content."""
    with open(json_file_path, 'r') as handle:
        return json.load(handle)

# Preprocesses the text by removing leading spaces and punctuations, and converting to lowercase.
def preprocess_text(text):
    """Strip surrounding whitespace, delete ASCII punctuation, then lowercase.

    The strip happens before punctuation is removed, so whitespace that only
    becomes leading or trailing after the removal is kept.
    """
    drop_punctuation = str.maketrans('', '', string.punctuation)
    return text.strip().translate(drop_punctuation).lower()

# Convert numbers to words if necessary
def convert_number_to_word(text):
    """Spell out every all-digit token of ``text`` with ``num2words``.

    The text is split on whitespace and re-joined with single spaces, so runs
    of whitespace collapse; tokens that are not all digits are kept as they are.
    """
    return ' '.join(
        num2words(int(token)) if token.isdigit() else token
        for token in text.split()
    )

# Function to retrieve phonetic representations from CMU Pronouncing Dictionary.
def get_phonetic_representation(word):
    """Return the first ``cmu_dict`` pronunciation of ``word`` as a space-joined string.

    The lookup uses the lowercased word. 'N/A' is returned when the word has
    no (or an empty) entry in ``cmu_dict``.
    """
    pronunciations = cmu_dict.get(word.lower())
    if not pronunciations:
        return 'N/A'
    return ' '.join(pronunciations[0])


## The preprocessing step produces, for every hypothesis, its text, a confidence score derived from the average log-probability, and its CMUdict phonetic representation. Later, in the ensemble_result notebook, we look at the performance at each hypothesis index to decide which n to use for the top-n results.

In [162]:
# Preprocesses the segment texts and calculates the confidence score for each transcription -- modified version
def model_preprocess_and_calculate_multi(json_file_path):
    """Turn a transcription-info JSON into per-file lists of scored hypotheses.

    For every file the true word gets a phonetic representation, and every
    hypothesis is cleaned, given a confidence score ``round(exp(avg_logprob), 3)``
    and given its own phonetic representation. An empty hypothesis text gets
    confidence 0 and triggers a "No transcriptions" message.

    Args:
        json_file_path: Path of a transcription-info JSON file whose entries
            hold "studentID", "word", "trueLabel" and "segments".

    Returns:
        A dict keyed like the input. Each value holds "studentID",
        "true_transcription", "trueLabel", "true_phonetic_rep" and "top res",
        a list of ``{'model_transcription', 'confidence score',
        'model_phonetic_rep'}`` dicts in hypothesis order.
    """
    data = load_json(json_file_path)
    preprocessed_results = {}

    for key, value in data.items():
        studentID = value["studentID"]
        word = value["word"]
        trueLabel = value["trueLabel"]
        # Spell out digits in the reference word before the dictionary lookup.
        true_phonetic_rep = get_phonetic_representation(convert_number_to_word(word))

        # Each element is a dict with the hypothesis text and its average log prob.
        hypotheses = value["segments"][0]["text and avg_logprob"]

        # Start from an empty list for every file, so a file without hypotheses
        # cannot inherit the results of the previous file.
        lst_topn = []
        for hypothesis in hypotheses:
            if len(hypothesis['text']) == 0:
                confidence_score = 0
                cleaned_text = ''
                print(f"No transcriptions for audio file: {studentID}_{word}")
            else:
                confidence_score = round(math.exp(hypothesis["avg_logprob"]), 3)
                cleaned_text = preprocess_text(hypothesis['text'])

            model_transcription = convert_number_to_word(cleaned_text.strip())
            lst_topn.append({
                'model_transcription': model_transcription,
                'confidence score': confidence_score,
                'model_phonetic_rep': get_phonetic_representation(model_transcription),
            })

        preprocessed_results[key] = {
            "studentID": studentID,
            "true_transcription": word,
            "trueLabel": trueLabel,
            "true_phonetic_rep": true_phonetic_rep,
            "top res": lst_topn
            }

    return preprocessed_results

In [163]:
# Path of the dev transcription-info JSON read by the preprocessing step
# (same value as the one assigned before the transcription step).
multi_w2v_transcription_info_path =  "./tst_json_files/transcription_output/w2v_raw_transcription_info/mooc_multi_raw_info_result.json"


In [164]:
# Preprocess the dev hypotheses; prints one message per empty hypothesis text.
multi_pre_res = model_preprocess_and_calculate_multi(multi_w2v_transcription_info_path)


No transcriptions for audio file: series01-s000026_Paul
No transcriptions for audio file: series02-s00024_Paul
No transcriptions for audio file: series03-s000012_Paul
No transcriptions for audio file: series02-s00021_bay
No transcriptions for audio file: series03-s0000143_bay
No transcriptions for audio file: series03-s0000194_bay
No transcriptions for audio file: series03-s000057_bay
No transcriptions for audio file: series04-s00034_bay
No transcriptions for audio file: series04-s00095_bay
No transcriptions for audio file: series06-s00034_bay
No transcriptions for audio file: series07-s00028_bay
No transcriptions for audio file: series07-s00135_bay
No transcriptions for audio file: series07-s00145_bay
No transcriptions for audio file: series01-s000017_bee
No transcriptions for audio file: series02-s00012_bee
No transcriptions for audio file: series03-s0000151_bee
No transcriptions for audio file: series03-s0000222_bee
No transcriptions for audio file: series03-s000062_bee
No transcrip

In [165]:
multi_w2v_output_json_file_path = "./tst_json_files/transcription_output/w2v_processed_output/mooc_multi_w2v_output.json"

# Create the output directory first so the cell also works when it does not exist yet.
os.makedirs(os.path.dirname(multi_w2v_output_json_file_path), exist_ok=True)
with open(multi_w2v_output_json_file_path, 'w') as json_file:
    json.dump(multi_pre_res, json_file, indent=4)
print(f"Preprocessed results saved to {multi_w2v_output_json_file_path}")

Preprocessed results saved to ./tst_json_files/transcription_output/w2v_processed_output/mooc_multi_w2v_output.json


## Below is for the test data of this thesis

In [166]:
# Output locations for the test split: detailed transcription info and raw hypotheses
test_multi_w2v_transcription_info_path =  "./tst_json_files/transcription_output/w2v_raw_transcription_info/test_multi_raw_info_result.json"
test_multi_w2v_raw_text_path =  "./tst_json_files/transcription_output/w2v_raw_text/test_multi_raw_text_result.json"

In [167]:
# Test split: file names and full paths as returned by split_only
test_filenames_mooc = test_fn
test_filepaths_mooc = test_fp

In [168]:
# Transcribe the test files with wav2vec and write the info and raw-text JSON files
mooc_multi_w2v_transcribe_audio_folder(test_filenames_mooc, test_filepaths_mooc, test_multi_w2v_transcription_info_path, test_multi_w2v_raw_text_path)

Transcription info saved to ./tst_json_files/transcription_output/w2v_raw_transcription_info/test_multi_raw_info_result.json
Raw transcription saved to ./tst_json_files/transcription_output/w2v_raw_text/test_multi_raw_text_result.json


In [169]:
# Preprocess the wav2vec hypotheses of the test files; prints one message per empty hypothesis text
test_multi_pre_res = model_preprocess_and_calculate_multi(test_multi_w2v_transcription_info_path)


No transcriptions for audio file: series03-s0000119_Paul
No transcriptions for audio file: series04-s00002_Paul
No transcriptions for audio file: series07-s00032_Paul
No transcriptions for audio file: series07-s00135_Paul
No transcriptions for audio file: series04-s00032_bard
No transcriptions for audio file: series01-s00009_bay
No transcriptions for audio file: series04-s00032_bay
No transcriptions for audio file: series05-s00002_bay
No transcriptions for audio file: series07-s00032_bay
No transcriptions for audio file: series07-s00079_bay
No transcriptions for audio file: series04-s00038_bear
No transcriptions for audio file: series01-s000013_bee
No transcriptions for audio file: series01-s000015_bee
No transcriptions for audio file: series01-s000032_bee
No transcriptions for audio file: series01-s00004_bee
No transcriptions for audio file: series01-s00007_bee
No transcriptions for audio file: series02-s00016_bee
No transcriptions for audio file: series02-s00024_bee
No transcriptions

In [170]:
# Save the preprocessed test results to a JSON file
test_multi_w2v_output_json_file_path = "./tst_json_files/transcription_output/w2v_processed_output/test_multi_w2v_output.json"

# Create the output directory first so the cell also works when it does not exist yet.
os.makedirs(os.path.dirname(test_multi_w2v_output_json_file_path), exist_ok=True)
with open(test_multi_w2v_output_json_file_path, 'w') as json_file:
    json.dump(test_multi_pre_res, json_file, indent=4)
print(f"Preprocessed results saved to {test_multi_w2v_output_json_file_path}")

Preprocessed results saved to ./tst_json_files/transcription_output/w2v_processed_output/test_multi_w2v_output.json


## Here, we also obtain data from Furong's test data in 2024

In [171]:
## we need to use Furong's code and adapted the 'transcribe' into 'wav2vec_top_res(filepath)' 
# Function to transcribe audio files in a specific folder and save the transcriptions to JSON files
def transcribe_audio_folder(folder_path, transcriptions_info_path, raw_text_path):
    """Transcribe every ``.mp3`` file of a folder and save the hypotheses as two JSON files.

    File names are expected to look like ``<student>_<word>_<label>.mp3`` and
    are processed in sorted order; other files in the folder are skipped.

    Args:
        folder_path: Directory holding the audio files.
        transcriptions_info_path: Output JSON holding studentID, word, trueLabel
            and the hypotheses of every file.
        raw_text_path: Output JSON holding only the hypotheses of every file.

    Missing parent directories of both output paths are created.
    """
    raw_text = {}
    transcriptions_info = {}

    for filename in sorted(os.listdir(folder_path)):
        if not filename.endswith(".mp3"):
            continue
        base_filename = filename[:-4]  # drop the '.mp3' extension
        parts = base_filename.split('_')
        studentID = parts[0]
        word = parts[1]
        trueLabel = 'intelligible' if parts[2] == '1' else 'unintelligible'

        result = wav2vec_top_res(os.path.join(folder_path, filename))

        transcriptions_info[base_filename] = {
            "studentID": studentID,
            "word": word,
            "trueLabel": trueLabel,
            "segments": [{'text and avg_logprob': result}]
        }
        raw_text[base_filename] = result

    outputs = (
        (transcriptions_info_path, transcriptions_info, "Transcription info"),
        (raw_text_path, raw_text, "Raw transcription"),
    )
    for out_path, payload, label in outputs:
        parent = os.path.dirname(out_path)
        if parent:  # a bare file name has no directory to create
            os.makedirs(parent, exist_ok=True)
        with open(out_path, 'w') as json_file:
            json.dump(payload, json_file, indent=4)
        print(f"{label} saved to {out_path}")


In [172]:
# Folder holding the audio files of Furong's test set
furong_test_base_dir =  "./previous_data_audio/furong_test_all"


In [173]:
# Output locations for Furong's test set: detailed transcription info and raw hypotheses
furong_test_info_path = "./tst_json_files/transcription_output/w2v_raw_transcription_info/furong_test_raw_info_result.json"
furong_test_raw_text_path =  "./tst_json_files/transcription_output/w2v_raw_text/furong_test_raw_text_result.json"


In [175]:
# Transcribe Furong's test folder and write the info and raw-text JSON files
transcribe_audio_folder(furong_test_base_dir, furong_test_info_path, furong_test_raw_text_path)


Transcription info saved to ./tst_json_files/transcription_output/w2v_raw_transcription_info/furong_test_raw_info_result.json
Raw transcription saved to ./tst_json_files/transcription_output/w2v_raw_text/furong_test_raw_text_result.json


In [176]:
# Preprocess the wav2vec hypotheses of Furong's test files.
# The banner marks where the "No transcriptions" messages for this data set start.
print("----------below is no trans for Furong's test files-----------")
furong_test_preprocess_results = model_preprocess_and_calculate_multi(furong_test_info_path)


----------below is no trans for Furong's test files-----------
No transcriptions for audio file: s13_fair
No transcriptions for audio file: s15_paw
No transcriptions for audio file: s18_buy
No transcriptions for audio file: s18_pet
No transcriptions for audio file: s18_seedy
No transcriptions for audio file: s22_bee
No transcriptions for audio file: s22_tide
No transcriptions for audio file: s23_bee
No transcriptions for audio file: s23_pool
No transcriptions for audio file: s25_pit
No transcriptions for audio file: s26_fur
No transcriptions for audio file: s31_bee
No transcriptions for audio file: s32_bee
No transcriptions for audio file: s32_buy
No transcriptions for audio file: s36_bay
No transcriptions for audio file: s36_buy
No transcriptions for audio file: s36_fir
No transcriptions for audio file: s36_paw
No transcriptions for audio file: s36_pool
No transcriptions for audio file: s38_bee
No transcriptions for audio file: s41_bear
No transcriptions for audio file: s41_buy
No tra

In [178]:
furong_test_w2v_output_json_file_path = "./tst_json_files/transcription_output/w2v_processed_output/furong_test_w2v_output.json"

# Create the output directory first so the cell also works when it does not exist yet.
os.makedirs(os.path.dirname(furong_test_w2v_output_json_file_path), exist_ok=True)
with open(furong_test_w2v_output_json_file_path, 'w') as json_file:
    json.dump(furong_test_preprocess_results, json_file, indent=4)
print(f"Preprocessed results saved to {furong_test_w2v_output_json_file_path}")

Preprocessed results saved to ./tst_json_files/transcription_output/w2v_processed_output/furong_test_w2v_output.json


In [2]:

# Words whose CMUdict coverage is checked below.
words = [
    'seedy', 'caught', 'poor', 'pet', 'pour', 'buy', 'fair', 'father', 'Paul', 'fern',
    'bear', 'bay', 'pool', 'paws', 'tied', 'fur', 'cot', 'pit', 'doll', 'pole',
    'pot', 'putt', 'bee', 'weight', 'nose', 'pause', 'paw', 'board', 'bird', 'city',
    'pull', 'put', 'dance', 'meat', 'pat', 'bard', 'daft', 'boot', 'beer', 'fir',
    'farther', 'mate', 'boy', 'tide', 'wait', 'plate', 'bout', 'knows', 'pore', 'meet',
    'boat', 'hat', 'half',
]

# Trim and lowercase every entry so it matches the dictionary keys.
words = [entry.strip().lower() for entry in words]

# Sort each word into "found" (with its first pronunciation) or "missing".
found = {}
missing = []

for word in words:
    if word not in cmu_dict:
        missing.append(word)
        continue
    found[word] = cmu_dict[word][0]

# --- Report ---
print("✅ Words found in CMUdict (with phonemes):")
for w, ph in found.items():
    print(f"{w:10s} → {' '.join(ph)}")

print("\n❌ Words NOT found in CMUdict:")
print(missing or "All words found!")


✅ Words found in CMUdict (with phonemes):
seedy      → S IY1 D IY0
caught     → K AA1 T
poor       → P UH1 R
pet        → P EH1 T
pour       → P AO1 R
buy        → B AY1
fair       → F EH1 R
father     → F AA1 DH ER0
paul       → P AO1 L
fern       → F ER1 N
bear       → B EH1 R
bay        → B EY1
pool       → P UW1 L
paws       → P AO1 Z
tied       → T AY1 D
fur        → F ER1
cot        → K AA1 T
pit        → P IH1 T
doll       → D AA1 L
pole       → P OW1 L
pot        → P AA1 T
putt       → P AH1 T
bee        → B IY1
weight     → W EY1 T
nose       → N OW1 Z
pause      → P AO1 Z
paw        → P AO1
board      → B AO1 R D
bird       → B ER1 D
city       → S IH1 T IY0
pull       → P UH1 L
put        → P UH1 T
dance      → D AE1 N S
meat       → M IY1 T
pat        → P AE1 T
bard       → B AA1 R D
daft       → D AE1 F T
boot       → B UW1 T
beer       → B IH1 R
fir        → F ER1
farther    → F AA1 R DH ER0
mate       → M EY1 T
boy        → B OY1
tide       → T AY1 D
wait       → W EY1 T