In [2]:
from transformers import pipeline
from evaluate import load
import os
import pandas as pd

  from .autonotebook import tqdm as notebook_tqdm


In [4]:
data = pd.read_csv("/home/paige/Documents/vocal_ambiguity/tts_validation/tts-validation - whisper tests.csv")

In [5]:
audio_dir = "/home/paige/Documents/vocal_ambiguity/tts_validation/whisper_tests"

In [6]:
base = data[data['model']=='base']

In [7]:
emphasis = data[data['model']=='emphasis']

In [8]:
stretch = data[data['model']=='stretch']

In [9]:
clarity = data[data['model']=='clarity']

In [104]:
wer_metric = load("wer")

In [69]:
asr_pipeline = pipeline("automatic-speech-recognition", model="openai/whisper-medium.en")

Device set to use cuda:0


In [178]:
#treat homophones
homophones = {
    "cot": ["caught"],
    "peel": ["peal"],
    "rot": ["wrought"],
    "bin": ["been"],
    "wood": ["would"],
    "scene": ["seen"],
    "beat": ["beet"],
    "but": ["butt"],
    "bought": ["bot"],
}

In [202]:
def normalize_with_homophones(results):
    replaced = []
    for text in results:
        words = text.split()
        normalized = []
        for word in words:
            for key, equivalents in homophones.items():
                if word in equivalents or word == key:
                    normalized.append(key)
                    break
            else:
                normalized.append(word)
        replaced.append(" ".join(normalized))
    return replaced

## Base

In [171]:
ground_truth_transcriptions = base["phrase"]
files = base['filename']

In [179]:
transcriptions = []
for i, audio_file in enumerate(files):
        audio_path = os.path.join(audio_dir, audio_file)

        result = asr_pipeline(audio_path)
        transcriptions.append(result["text"])



In [180]:
wer = wer_metric.compute(predictions=transcriptions, references=ground_truth_transcriptions)

for ref, pred in zip(ground_truth_transcriptions, transcriptions):
    print(f"Reference: {ref}")
    print(f"Prediction: {pred}\n")

print("Average Word Error Rate (WER):", wer)

Reference: Ship was what I heard, but maybe I misunderstood.
Prediction:  Ship was what I heard, but maybe I misunderstood.

Reference: Sheep might have been what they meant, but it wasn’t clear.
Prediction:  Sheep might have been what they meant, but it wasn't clear.

Reference: Pool was the word I caught, though it could’ve been something else.
Prediction:  Pull was the word I caught, though it could have been something else.

Reference: Pull might have been what they were talking about, but I’m unsure
Prediction:  Poll might have been what they were talking about, but I'm unsure.

Reference: Cut was the word used, but it seemed odd in context.
Prediction:  Cut was the word used, but it seemed odd in context.

Reference: Cot could have been the intention, but I wasn’t sure.
Prediction:  Cott could have been the intention, but I wasn't sure.

Reference: I thought they said something about peel.
Prediction:  I thought they said something about peel.

Reference: I think the phrase ended

In [205]:
normalized_transcriptions = normalize_with_homophones(transcriptions)
wer = wer_metric.compute(predictions=normalized_transcriptions, references=ground_truth_transcriptions)

for ref, pred in zip(ground_truth_transcriptions, normalized_transcriptions):
    print(f"Reference: {ref}")
    print(f"Prediction: {pred}\n")

print("Average Word Error Rate (WER):", wer)

Reference: Ship was what I heard, but maybe I misunderstood.
Prediction: Ship was what I heard, but maybe I misunderstood.

Reference: Sheep might have been what they meant, but it wasn’t clear.
Prediction: Sheep might have bin what they meant, but it wasn't clear.

Reference: Pool was the word I caught, though it could’ve been something else.
Prediction: Pull was the word I caught, though it could have bin something else.

Reference: Pull might have been what they were talking about, but I’m unsure
Prediction: Poll might have bin what they were talking about, but I'm unsure.

Reference: Cut was the word used, but it seemed odd in context.
Prediction: Cut was the word used, but it seemed odd in context.

Reference: Cot could have been the intention, but I wasn’t sure.
Prediction: Cott could have bin the intention, but I wasn't sure.

Reference: I thought they said something about peel.
Prediction: I thought they said something about peel.

Reference: I think the phrase ended with full.

## Stretch

In [206]:
ground_truth_transcriptions = stretch["phrase"].reset_index()
ground_truth_transcriptions = ground_truth_transcriptions["phrase"]
files = stretch['filename']

In [207]:
transcriptions = []
for i, audio_file in enumerate(files):
        audio_path = os.path.join(audio_dir, audio_file)

        result = asr_pipeline(audio_path)
        transcriptions.append(result["text"])



In [208]:
wer = wer_metric.compute(predictions=transcriptions, references=ground_truth_transcriptions)

for ref, pred in zip(ground_truth_transcriptions, transcriptions):
    print(f"Reference: {ref}")
    print(f"Prediction: {pred}\n")

print("Average Word Error Rate (WER):", wer)

Reference: Ship was what I heard, but maybe I misunderstood.
Prediction:  Ship was what I heard, but maybe I misunderstood.

Reference: Sheep might have been what they meant, but it wasn’t clear.
Prediction:  Sheep might have been what they meant, but it wasn't clear.

Reference: Pool was the word I caught, though it could’ve been something else.
Prediction:  Fool was the word I caught, though it could have been something else.

Reference: Pull might have been what they were talking about, but I’m unsure
Prediction:  Paul might have been what they were talking about, but I'm unsure.

Reference: Cut was the word used, but it seemed odd in context.
Prediction:  Cut was the word used, but it seemed odd in context.

Reference: Cot could have been the intention, but I wasn’t sure.
Prediction:  Cut could have been the intention, but I wasn't sure.

Reference: I thought they said something about peel.
Prediction:  I thought they said something about peel.

Reference: I think the phrase ended 

In [209]:
normalized_transcriptions = normalize_with_homophones(transcriptions)
wer = wer_metric.compute(predictions=normalized_transcriptions, references=ground_truth_transcriptions)

for ref, pred in zip(ground_truth_transcriptions, normalized_transcriptions):
    print(f"Reference: {ref}")
    print(f"Prediction: {pred}\n")

print("Average Word Error Rate (WER):", wer)

Reference: Ship was what I heard, but maybe I misunderstood.
Prediction: Ship was what I heard, but maybe I misunderstood.

Reference: Sheep might have been what they meant, but it wasn’t clear.
Prediction: Sheep might have bin what they meant, but it wasn't clear.

Reference: Pool was the word I caught, though it could’ve been something else.
Prediction: Fool was the word I caught, though it could have bin something else.

Reference: Pull might have been what they were talking about, but I’m unsure
Prediction: Paul might have bin what they were talking about, but I'm unsure.

Reference: Cut was the word used, but it seemed odd in context.
Prediction: Cut was the word used, but it seemed odd in context.

Reference: Cot could have been the intention, but I wasn’t sure.
Prediction: Cut could have bin the intention, but I wasn't sure.

Reference: I thought they said something about peel.
Prediction: I thought they said something about peel.

Reference: I think the phrase ended with full.


## Clarity

In [210]:
ground_truth_transcriptions = clarity["phrase"].reset_index()
ground_truth_transcriptions = ground_truth_transcriptions["phrase"]
files = clarity['filename']

In [211]:
transcriptions = []
for i, audio_file in enumerate(files):
        audio_path = os.path.join(audio_dir, audio_file)

        result = asr_pipeline(audio_path)
        transcriptions.append(result["text"])



In [212]:
wer = wer_metric.compute(predictions=transcriptions, references=ground_truth_transcriptions)

for ref, pred in zip(ground_truth_transcriptions, transcriptions):
    print(f"Reference: {ref}")
    print(f"Prediction: {pred}\n")

print("Average Word Error Rate (WER):", wer)

Reference: Ship was what I heard, but maybe I misunderstood.
Prediction:  Ship was what I heard, but maybe I misunderstood.

Reference: Sheep might have been what they meant, but it wasn’t clear.
Prediction:  Sheep might have been what they meant, but it wasn't clear.

Reference: Pool was the word I caught, though it could’ve been something else.
Prediction:  Full was the word I caught, though it could have been something else.

Reference: Pull might have been what they were talking about, but I’m unsure
Prediction:  might have been what they were talking about, but I'm unsure.

Reference: Cut was the word used, but it seemed odd in context.
Prediction:  Cut was the word used, but it seemed odd in context.

Reference: Cot could have been the intention, but I wasn’t sure.
Prediction:  Cut could have been the intention, but I wasn't sure.

Reference: I thought they said something about peel.
Prediction:  I thought they said something about peel.

Reference: I think the phrase ended with 

In [213]:
normalized_transcriptions = normalize_with_homophones(transcriptions)
wer = wer_metric.compute(predictions=normalized_transcriptions, references=ground_truth_transcriptions)

for ref, pred in zip(ground_truth_transcriptions, normalized_transcriptions):
    print(f"Reference: {ref}")
    print(f"Prediction: {pred}\n")

print("Average Word Error Rate (WER):", wer)

Reference: Ship was what I heard, but maybe I misunderstood.
Prediction: Ship was what I heard, but maybe I misunderstood.

Reference: Sheep might have been what they meant, but it wasn’t clear.
Prediction: Sheep might have bin what they meant, but it wasn't clear.

Reference: Pool was the word I caught, though it could’ve been something else.
Prediction: Full was the word I caught, though it could have bin something else.

Reference: Pull might have been what they were talking about, but I’m unsure
Prediction: might have bin what they were talking about, but I'm unsure.

Reference: Cut was the word used, but it seemed odd in context.
Prediction: Cut was the word used, but it seemed odd in context.

Reference: Cot could have been the intention, but I wasn’t sure.
Prediction: Cut could have bin the intention, but I wasn't sure.

Reference: I thought they said something about peel.
Prediction: I thought they said something about peel.

Reference: I think the phrase ended with full.
Predi

## Emphasis

In [215]:
ground_truth_transcriptions = emphasis["phrase"].reset_index()
ground_truth_transcriptions = ground_truth_transcriptions["phrase"]
files = emphasis['filename']

In [219]:
transcriptions = []
for i, audio_file in enumerate(files):
        audio_path = os.path.join(audio_dir, audio_file)

        result = asr_pipeline(audio_path)
        transcriptions.append(result["text"])



In [220]:
wer = wer_metric.compute(predictions=transcriptions, references=ground_truth_transcriptions)

for ref, pred in zip(ground_truth_transcriptions, transcriptions):
    print(f"Reference: {ref}")
    print(f"Prediction: {pred}\n")

print("Average Word Error Rate (WER):", wer)

Reference: Ship was what I heard, but maybe I misunderstood.
Prediction:  Ship was what I heard, but maybe I misunderstood.

Reference: Sheep might have been what they meant, but it wasn’t clear.
Prediction:  Sheep might have been what they meant, but it wasn't clear.

Reference: Pool was the word I caught, though it could’ve been something else.
Prediction:  Full was the word I caught, though it could have been something else.

Reference: Pull might have been what they were talking about, but I’m unsure
Prediction:  The poll might have been what they were talking about, but I'm unsure.

Reference: Cut was the word used, but it seemed odd in context.
Prediction:  Cut was the word used, but it seemed odd in context.

Reference: Cot could have been the intention, but I wasn’t sure.
Prediction:  Cut could have been the intention, but I wasn't sure.

Reference: I thought they said something about peel.
Prediction:  I thought they said something about peel.

Reference: I think the phrase en

## Noise tests