In [1]:
import os
import json
import warnings
warnings.filterwarnings("ignore")
import torch
from transformers import AutoModelForSpeechSeq2Seq, AutoProcessor, pipeline

In [2]:
device = "cuda:0" if torch.cuda.is_available() else "cpu"
torch_dtype = torch.float16 if torch.cuda.is_available() else torch.float32

model_id = "openai/whisper-large-v3"

model = AutoModelForSpeechSeq2Seq.from_pretrained(
    model_id, torch_dtype=torch_dtype, low_cpu_mem_usage=True, use_safetensors=True
)
model.to(device)

WhisperForConditionalGeneration(
  (model): WhisperModel(
    (encoder): WhisperEncoder(
      (conv1): Conv1d(128, 1280, kernel_size=(3,), stride=(1,), padding=(1,))
      (conv2): Conv1d(1280, 1280, kernel_size=(3,), stride=(2,), padding=(1,))
      (embed_positions): Embedding(1500, 1280)
      (layers): ModuleList(
        (0-31): 32 x WhisperEncoderLayer(
          (self_attn): WhisperSdpaAttention(
            (k_proj): Linear(in_features=1280, out_features=1280, bias=False)
            (v_proj): Linear(in_features=1280, out_features=1280, bias=True)
            (q_proj): Linear(in_features=1280, out_features=1280, bias=True)
            (out_proj): Linear(in_features=1280, out_features=1280, bias=True)
          )
          (self_attn_layer_norm): LayerNorm((1280,), eps=1e-05, elementwise_affine=True)
          (activation_fn): GELUActivation()
          (fc1): Linear(in_features=1280, out_features=5120, bias=True)
          (fc2): Linear(in_features=5120, out_features=1280, bia

In [3]:
processor = AutoProcessor.from_pretrained(model_id)

In [4]:
pipe = pipeline(
    "automatic-speech-recognition",
    model=model,
    tokenizer=processor.tokenizer,
    feature_extractor=processor.feature_extractor,
    torch_dtype=torch_dtype,
    device=device,
)

Device set to use cuda:0


In [5]:
def calc_wer(gt_texts, asr_result):
    from jiwer import wer
    sum_wer = 0
    for gt_text, asr_text in zip(gt_texts, asr_result):
        gt_text = english_normalizer(gt_text)
        asr_text = english_normalizer(asr_text['text'])
        error = wer(gt_text, asr_text)
        sum_wer += error
    avg_wer = sum_wer / len(gt_texts)
    print(f"Average WER: {avg_wer:.4f}")
    return avg_wer

In [6]:
from whisper_normalizer.english import EnglishTextNormalizer
english_normalizer = EnglishTextNormalizer()

# StressTest

In [11]:
import os
import json

In [12]:
dirname = '/home/siqiouya/code/Expressive-S2S/data/stresstest'

In [13]:
with open(os.path.join(dirname, 'stresstest.jsonl'), 'r') as f:
    manifest = [json.loads(line) for line in f]

In [14]:
audio_paths = []
gt_texts = []
for idx, item in enumerate(manifest):
    text = item['transcription']
    audio_paths.append(os.path.join(dirname, 'ground_truth', 'audio_{}.wav'.format(idx)))
    gt_texts.append(text)

In [None]:
asr_result = pipe(audio_paths, batch_size=8)

Due to a bug fix in https://github.com/huggingface/transformers/pull/28687 transcription using a multilingual Whisper will default to language detection followed by transcription instead of translation to English.This might be a breaking change for your use case. If you want to instead always translate your audio to English, make sure to pass `language='en'`.
Passing a tuple of `past_key_values` is deprecated and will be removed in Transformers v4.43.0. You should pass an instance of `EncoderDecoderCache` instead, e.g. `past_key_values=EncoderDecoderCache.from_legacy_cache(past_key_values)`.


In [21]:
!pip install whisper_normalizer

Collecting whisper_normalizer
  Downloading whisper_normalizer-0.1.12-py3-none-any.whl.metadata (7.9 kB)
Collecting indic-numtowords (from whisper_normalizer)
  Downloading indic_numtowords-1.1.0-py3-none-any.whl.metadata (3.1 kB)
Downloading whisper_normalizer-0.1.12-py3-none-any.whl (36 kB)
Downloading indic_numtowords-1.1.0-py3-none-any.whl (71 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m71.6/71.6 kB[0m [31m7.4 MB/s[0m eta [36m0:00:00[0m
[?25h[33mDEPRECATION: omegaconf 2.0.6 has a non-standard dependency specifier PyYAML>=5.1.*. pip 24.0 will enforce this behaviour change. A possible replacement is to upgrade to a newer version of omegaconf or contact the author to suggest that they release a version with a conforming dependency specifiers. Discussion can be found at https://github.com/pypa/pip/issues/12063[0m[33m
[0mInstalling collected packages: indic-numtowords, whisper_normalizer
Successfully installed indic-numtowords-1.1.0 whisper_normalizer-0.1

In [22]:
from whisper_normalizer.english import EnglishTextNormalizer
english_normalizer = EnglishTextNormalizer()

In [23]:
!pip install jiwer

[33mDEPRECATION: omegaconf 2.0.6 has a non-standard dependency specifier PyYAML>=5.1.*. pip 24.0 will enforce this behaviour change. A possible replacement is to upgrade to a newer version of omegaconf or contact the author to suggest that they release a version with a conforming dependency specifiers. Discussion can be found at https://github.com/pypa/pip/issues/12063[0m[33m
[0m

In [36]:
def calc_wer(gt_texts, asr_result):
    from jiwer import wer
    sum_wer = 0
    for gt_text, asr_text in zip(gt_texts, asr_result):
        gt_text = english_normalizer(gt_text)
        asr_text = english_normalizer(asr_text['text'])
        error = wer(gt_text, asr_text)
        sum_wer += error
    avg_wer = sum_wer / len(gt_texts)
    print(f"Average WER: {avg_wer:.4f}")
    return avg_wer

In [38]:
# Ground Truth
calc_wer(gt_texts, asr_result)

Average WER: 0.0044


0.004434250764525994

In [39]:
gen_dir = '/data/user_data/siqiouya/runs/express-s2s'

In [42]:
# CosyVoice2
audio_paths = [
    os.path.join(gen_dir, 'CosyVoice2/gens/stresstest', '{}.wav'.format(i))
    for i in range(len(manifest))
]
asr_result = pipe(audio_paths, batch_size=8)



Average WER: 0.0297


0.02972748454399831

In [41]:
calc_wer(gt_texts, asr_result)

Average WER: 0.0297


0.02972748454399831

In [43]:
# IndexTTS2
audio_paths = [
    os.path.join(gen_dir, 'IndexTTS2/gens/stresstest', '{}.wav'.format(i))
    for i in range(len(manifest))
]
asr_result = pipe(audio_paths, batch_size=8)



In [44]:
calc_wer(gt_texts, asr_result)

Average WER: 0.0024


0.0024464831804281344

In [46]:
# Parler TTS
audio_paths = [
    os.path.join(gen_dir, 'parler/stresstest', '{}.wav'.format(i))
    for i in range(len(manifest))
]
asr_result = pipe(audio_paths, batch_size=8)



In [47]:
calc_wer(gt_texts, asr_result)

Average WER: 0.0313


0.03134920634920636

In [48]:
# GPT 4o
audio_paths = [
    os.path.join(gen_dir, 'openai/stresstest', '{}.wav'.format(i))
    for i in range(len(manifest))
]
asr_result = pipe(audio_paths, batch_size=8)



In [49]:
calc_wer(gt_texts, asr_result)

Average WER: 0.0032


0.0032292121741663024

# Expresso Emotion

In [7]:
dirname = '/home/siqiouya/code/Expressive-S2S/data/expresso'
with open(os.path.join(dirname, 'expresso.jsonl'), 'r') as f:
    manifest = [json.loads(line) for line in f]

In [8]:
audio_paths = []
gt_texts = []
for idx, item in enumerate(manifest):
    text = item['transcription']
    audio_paths.append(os.path.join(dirname, 'ground_truth', 'audio_{}.wav'.format(idx)))
    gt_texts.append(text)

In [None]:
# Ground Truth
asr_result = pipe(audio_paths, batch_size=8)
calc_wer(gt_texts, asr_result)

In [9]:
gen_dir = '/data/user_data/siqiouya/runs/express-s2s'

In [17]:
# CosyVoice2
audio_paths = [
    os.path.join(gen_dir, 'CosyVoice2/gens/expresso/wo_stress', '{}.wav'.format(i))
    for i in range(len(manifest))
]
asr_result = pipe(audio_paths, batch_size=8)
calc_wer(gt_texts, asr_result)

Average WER: 0.0561


0.05608358175806207

In [18]:
# IndexTTS2
audio_paths = [
    os.path.join(gen_dir, 'IndexTTS2/gens/expresso', '{}.wav'.format(i))
    for i in range(len(manifest))
]
asr_result = pipe(audio_paths, batch_size=8)
calc_wer(gt_texts, asr_result)

Average WER: 0.0410


0.04096595010944302

In [10]:
# Parler TTS
audio_paths = [
    os.path.join(gen_dir, 'parler/expresso_emotion', '{}.wav'.format(i))
    for i in range(len(manifest))
]
asr_result = pipe(audio_paths, batch_size=8)
calc_wer(gt_texts, asr_result)

Due to a bug fix in https://github.com/huggingface/transformers/pull/28687 transcription using a multilingual Whisper will default to language detection followed by transcription instead of translation to English.This might be a breaking change for your use case. If you want to instead always translate your audio to English, make sure to pass `language='en'`.
Passing a tuple of `past_key_values` is deprecated and will be removed in Transformers v4.43.0. You should pass an instance of `EncoderDecoderCache` instead, e.g. `past_key_values=EncoderDecoderCache.from_legacy_cache(past_key_values)`.


Average WER: 0.2645


0.264517745229405

In [20]:
# GPT 4o
audio_paths = [
    os.path.join(gen_dir, 'openai/expresso_emotion', '{}.wav'.format(i))
    for i in range(len(manifest))
]
asr_result = pipe(audio_paths, batch_size=8)
calc_wer(gt_texts, asr_result)

Average WER: 0.0318


0.03177572730314879

# Expresso Emotion + Stress

In [22]:
dirname = '/home/siqiouya/code/Expressive-S2S/data/expresso'
with open(os.path.join(dirname, 'expresso.jsonl'), 'r') as f:
    manifest = [json.loads(line) for line in f]

In [23]:
emphasis = []
for item in manifest:
    emphasis.append('*' in item['transcription'])

In [24]:
audio_paths = []
gt_texts = []
for idx, item in enumerate(manifest):
    if emphasis[idx]:
        text = item['transcription']
        audio_paths.append(os.path.join(dirname, 'ground_truth', 'audio_{}.wav'.format(idx)))
        gt_texts.append(text)

In [28]:
# Ground Truth
asr_result = pipe(audio_paths, batch_size=8)
calc_wer(gt_texts, asr_result)

Average WER: 0.0130


0.013013041810510165

In [29]:
gen_dir = '/data/user_data/siqiouya/runs/express-s2s'

In [30]:
# CosyVoice2
audio_paths = [
    os.path.join(gen_dir, 'CosyVoice2/gens/expresso/w_stress', '{}.wav'.format(i))
    for i in range(len(manifest)) if emphasis[i]
]
asr_result = pipe(audio_paths, batch_size=8)
calc_wer(gt_texts, asr_result)

Average WER: 0.0288


0.028794083801529813

In [32]:
# Parler TTS
audio_paths = [
    os.path.join(gen_dir, 'parler/expresso_emotion_stress', '{}.wav'.format(i))
    for i in range(len(manifest)) if emphasis[i]
]
asr_result = pipe(audio_paths, batch_size=8)
calc_wer(gt_texts, asr_result)

Average WER: 0.0275


0.027479587922625896

In [35]:
# GPT 4o
audio_paths = [
    os.path.join(gen_dir, 'openai/expresso_emotion', '{}.wav'.format(i))
    for i in range(len(manifest)) if emphasis[i]
]
asr_result = pipe(audio_paths, batch_size=8)
calc_wer(gt_texts, asr_result)

Average WER: 0.0125


0.012543153049482164