# Benchmarks

For the sake of reproducibility, we describe here the detailed evaluation steps behind the results reported in the EMNLP 2022 demo submission: https://arxiv.org/abs/2210.02545

Dependencies:
- joeynmt
- datasets
- transformers
- pytorch
- numpy
- sacrebleu
- editdistance

> **Note:**
> We normalized the ASR transcriptions for training only, not for testing. (If we had also normalized the target text in the test set, the scores would be better.)

check package versions

In [3]:
from pathlib import Path
import os
import numpy as np
import torch
import datasets
import transformers

# Report the installed version of each core library, one "<name> <version>" line apiece.
for label, lib in (('torch', torch), ('datasets', datasets), ('transformers', transformers)):
    print(label, lib.__version__)

torch 1.11.0+cu115
datasets 2.4.0
transformers 4.21.0


### metrics: WER, BLEU

use joeyS2T implementation

In [4]:
from joeynmt.metrics import wer, bleu
from joeynmt.tokenizers import EvaluationTokenizer

# Tokenizer handed to every `wer(...)` call in this notebook. It is built with
# lowercase=True, "13a" tokenization and no_punc=True -- presumably so that
# casing and punctuation differences are not counted as word errors
# (TODO confirm against joeynmt's EvaluationTokenizer).
tok = EvaluationTokenizer(lowercase=True, tokenize="13a", no_punc=True)

## LibriSpeech Benchmarks


In [3]:
from datasets import load_dataset

# Load the "all" configuration of LibriSpeech. The evaluation loops below score
# the validation.* and test.* splits of this DatasetDict.
librispeech_eval = load_dataset("librispeech_asr", name="all")
librispeech_eval

Reusing dataset librispeech_asr (/workspace/mitarb/ohta/cache/librispeech_asr/all/2.1.0/14c8bffddb861b4b3a4fcdff648a56980dbb808f3fc56f5a3d56b18ee88458eb)


  0%|          | 0/7 [00:00<?, ?it/s]

DatasetDict({
    train.clean.100: Dataset({
        features: ['file', 'audio', 'text', 'speaker_id', 'chapter_id', 'id'],
        num_rows: 28539
    })
    train.clean.360: Dataset({
        features: ['file', 'audio', 'text', 'speaker_id', 'chapter_id', 'id'],
        num_rows: 104014
    })
    train.other.500: Dataset({
        features: ['file', 'audio', 'text', 'speaker_id', 'chapter_id', 'id'],
        num_rows: 148688
    })
    validation.clean: Dataset({
        features: ['file', 'audio', 'text', 'speaker_id', 'chapter_id', 'id'],
        num_rows: 2703
    })
    validation.other: Dataset({
        features: ['file', 'audio', 'text', 'speaker_id', 'chapter_id', 'id'],
        num_rows: 2864
    })
    test.clean: Dataset({
        features: ['file', 'audio', 'text', 'speaker_id', 'chapter_id', 'id'],
        num_rows: 2620
    })
    test.other: Dataset({
        features: ['file', 'audio', 'text', 'speaker_id', 'chapter_id', 'id'],
        num_rows: 2939
    })
})

### SpeechBrain

https://huggingface.co/speechbrain/asr-transformer-transformerlm-librispeech

In [None]:
from speechbrain.pretrained import EncoderDecoderASR

# Fetch the pretrained SpeechBrain encoder-decoder ASR model, requesting the
# "cuda" device through run_opts.
asr_model = EncoderDecoderASR.from_hparams(
    source="speechbrain/asr-transformer-transformerlm-librispeech",
    run_opts={"device":"cuda"},
)
# Put the model into evaluation mode before transcribing.
asr_model.eval()

# Number of utterances handed to each `map_to_pred` call through
# `Dataset.map(..., batch_size=batch_size)` in the loop below.
batch_size = 4

def map_to_pred(batch):
    """Transcribe one batch of utterances with the SpeechBrain model.

    Args:
        batch: a batched ``Dataset.map`` slice. ``batch['audio']`` is a list of
            dicts whose ``'array'`` entry holds the waveform samples.

    Returns:
        The same ``batch`` with a ``"transcription"`` entry added, holding one
        hypothesis per utterance in the batch.
    """
    waveforms = [item['array'] for item in batch['audio']]
    lengths = [len(w) for w in waveforms]
    n_utts = len(lengths)
    max_len = max(lengths)

    # Size the padded arrays by the utterances actually received rather than by
    # the notebook-level `batch_size`: a final partial batch is then not padded
    # with phantom all-zero rows, and a batch larger than that global can no
    # longer index out of bounds.
    input_array = np.zeros((n_utts, max_len))
    length_array = np.zeros((n_utts,))

    for i, w in enumerate(waveforms):
        # Right-pad every waveform with zeros up to the longest one in the batch.
        input_array[i, :lengths[i]] = w
        # Length expressed as a fraction of the longest utterance in the batch.
        length_array[i] = lengths[i] / max_len

    transcription, _ = asr_model.transcribe_batch(torch.tensor(input_array), torch.tensor(length_array))

    batch["transcription"] = transcription[:n_utts]
    return batch

# Transcribe each evaluation split in batches and print its WER.
for split in ('validation.clean', 'validation.other', 'test.clean', 'test.other'):
    # No columns are removed, so the result still carries the reference "text".
    result = librispeech_eval[split].map(
        map_to_pred,
        batched=True,
        batch_size=batch_size,
    )

    split_wer = wer(hypotheses=result["transcription"],
                    references=result["text"],
                    tokenizer=tok)
    print(split, split_wer)



  0%|          | 0/676 [00:00<?, ?ba/s]

validation.clean 2.1322745487298262


  0%|          | 0/716 [00:00<?, ?ba/s]

validation.other 5.513464709115176


  0%|          | 0/655 [00:00<?, ?ba/s]

test.clean 2.3128423615337796


  0%|          | 0/735 [00:00<?, ?ba/s]

test.other 5.614886422253215


number of parameters

In [5]:
import numpy as np

# Count every parameter of the SpeechBrain model. Unlike the other
# parameter-count cells in this notebook, no `requires_grad` filter is applied.
n_params = sum(np.prod(p.size()) for p in asr_model.parameters())
n_params

164859096

### facebook wav2vec2

https://huggingface.co/facebook/wav2vec2-base-960h

In [8]:
from transformers import Wav2Vec2ForCTC, Wav2Vec2Processor

# Load the pretrained CTC model onto the GPU, together with the processor
# published under the same checkpoint name.
model = Wav2Vec2ForCTC.from_pretrained("facebook/wav2vec2-base-960h").to("cuda")
processor = Wav2Vec2Processor.from_pretrained("facebook/wav2vec2-base-960h")
# Put the model into evaluation mode before transcribing.
model.eval()

def map_to_pred(batch):
    """Transcribe a single example with the wav2vec2 CTC model.

    Args:
        batch: one dataset example; ``batch["audio"]`` provides the ``"array"``
            and ``"sampling_rate"`` entries passed to the processor.

    Returns:
        The same ``batch`` with ``"transcription"`` set to the output of
        ``processor.batch_decode``.
    """
    audio = batch["audio"]
    encoded = processor(audio["array"],
                        sampling_rate=audio["sampling_rate"],
                        return_tensors="pt", padding="longest")

    # Forward pass without gradient tracking.
    with torch.no_grad():
        logits = model(encoded.input_values.to("cuda")).logits

    # Pick the highest-scoring entry along the last axis, then decode the ids.
    batch["transcription"] = processor.batch_decode(torch.argmax(logits, dim=-1))
    return batch

# Transcribe each evaluation split one example at a time and print its WER.
for split in ('validation.clean', 'validation.other', 'test.clean', 'test.other'):
    result = librispeech_eval[split].map(map_to_pred, batched=False, batch_size=1)

    # `map_to_pred` stores the whole `batch_decode` output per example;
    # keep only its first element as the hypothesis string.
    hyp = [entry[0] for entry in result["transcription"]]
    print(split, wer(hypotheses=hyp, references=result['text'], tokenizer=tok))

Some weights of Wav2Vec2ForCTC were not initialized from the model checkpoint at facebook/wav2vec2-base-960h and are newly initialized: ['wav2vec2.masked_spec_embed']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


  0%|          | 0/2703 [00:00<?, ?ex/s]

validation.clean 3.167162971949561


  0%|          | 0/2864 [00:00<?, ?ex/s]

validation.other 8.860014132056214


  0%|          | 0/2620 [00:00<?, ?ex/s]

test.clean 3.3855751673767496


  0%|          | 0/2939 [00:00<?, ?ex/s]

test.other 8.568480981220029


number of parameters

In [9]:
import numpy as np

# Count only the parameters that have `requires_grad` set.
model_parameters = (p for p in model.parameters() if p.requires_grad)
n_params = sum(np.prod(p.size()) for p in model_parameters)
n_params

94396320

### facebook s2t

https://huggingface.co/facebook/s2t-medium-librispeech-asr

In [12]:
from transformers import Speech2TextForConditionalGeneration, Speech2TextProcessor
import torch

# Load the pretrained speech-to-text model onto the GPU. The processor comes
# from the same checkpoint and is created with do_upper_case=False.
model = Speech2TextForConditionalGeneration.from_pretrained("facebook/s2t-medium-librispeech-asr").to("cuda")
processor = Speech2TextProcessor.from_pretrained("facebook/s2t-medium-librispeech-asr", do_upper_case=False)
# Put the model into evaluation mode before transcribing.
model.eval()

def map_to_pred(batch):
    """Transcribe a single example with the Speech2Text model.

    Args:
        batch: one dataset example; ``batch["audio"]`` provides the ``"array"``
            and ``"sampling_rate"`` entries passed to the processor.

    Returns:
        The same ``batch`` with ``"transcription"`` set to the decoded output
        of ``model.generate`` (special tokens skipped).
    """
    audio = batch["audio"]
    features = processor(audio["array"],
                         sampling_rate=audio["sampling_rate"],
                         padding=True, return_tensors="pt")
    generate_kwargs = {
        "input_features": features.input_features.to("cuda"),
        "attention_mask": features.attention_mask.to("cuda"),
    }

    # Generation without gradient tracking.
    with torch.no_grad():
        gen_tokens = model.generate(**generate_kwargs)

    batch["transcription"] = processor.batch_decode(gen_tokens, skip_special_tokens=True)
    return batch

# Transcribe each evaluation split one example at a time and print its WER.
for split in ('validation.clean', 'validation.other', 'test.clean', 'test.other'):
    result = librispeech_eval[split].map(map_to_pred, batched=False, batch_size=1)

    # References are lowercased before scoring.
    ref = [text.lower() for text in result['text']]
    # `map_to_pred` stores the whole `batch_decode` output per example;
    # keep only its first element as the hypothesis string.
    hyp = [entry[0] for entry in result["transcription"]]
    print(split, wer(hypotheses=hyp, references=ref, tokenizer=tok))

  0%|          | 0/2703 [00:00<?, ?ex/s]

validation.clean 3.23149884195434


  0%|          | 0/2864 [00:00<?, ?ex/s]

validation.other 8.008165188034859


  0%|          | 0/2620 [00:00<?, ?ex/s]

test.clean 3.5225197808886186


  0%|          | 0/2939 [00:00<?, ?ex/s]

test.other 7.832948054181074


number of parameters

In [11]:
import numpy as np

# Count only the parameters that have `requires_grad` set.
model_parameters = (p for p in model.parameters() if p.requires_grad)
n_params = sum(np.prod(p.size()) for p in model_parameters)
n_params

71207936

### JoeyS2T

In [5]:
from pathlib import Path

# Directory holding one `<model>_seed<seed>` sub-directory per JoeyS2T run; the
# scoring cell below reads its hypothesis files from there.
root_dir = Path('/workspace/mitarb/ohta/models')

In [20]:
from collections import defaultdict

# Score the JoeyS2T LibriSpeech hypotheses: WER per split for each model and
# seed, followed by the mean and standard deviation over the three seeds.

# The references depend on neither the model nor the seed, so sort and
# lowercase each split once instead of once per (model, seed) pair.
# Hypothesis files are compared line by line against the split sorted by `id`.
refs = {
    split: [s.lower() for s in librispeech_eval[key].sort('id')['text']]
    for split, key in [('clean.dev', 'validation.clean'),
                       ('other.dev', 'validation.other'),
                       ('clean.test', 'test.clean'),
                       ('other.test', 'test.other')]
}

for model in ['librispeech100h', 'librispeech960h']:
    print('---', model, '---')
    scores = defaultdict(list)
    for seed in [321, 42, 987]:
        model_dir = root_dir / f'{model}_seed{seed}'
        for split in refs:
            ref = refs[split]
            hyp = (model_dir / f'avg10_{split}').read_text().splitlines()
            # One hypothesis line per reference; report which file is off otherwise.
            assert len(hyp) == len(ref), (model_dir.name, split, len(hyp), len(ref))

            score = wer(hypotheses=hyp, references=ref, tokenizer=tok)
            scores[split].append(score)
    for k, v in scores.items():
        print('%10s: mean=%.2f std=%.2f %r' % (k, np.mean(v), np.std(v), v))
    print()

Loading cached sorted indices for dataset at /workspace/mitarb/ohta/cache/librispeech_asr/all/2.1.0/14c8bffddb861b4b3a4fcdff648a56980dbb808f3fc56f5a3d56b18ee88458eb/cache-5adcea4a61262a78.arrow


--- librispeech100h ---
 clean.dev: mean=10.66 std=0.36 [11.036358957391272, 10.769824638800044, 10.179772802470497]
 other.dev: mean=23.82 std=0.34 [24.228625264976053, 23.847844861427337, 23.386590248881213]
clean.test: mean=12.02 std=0.32 [12.334525258673159, 12.150030432136335, 11.577525867315885]
other.test: mean=24.75 std=0.37 [25.204898458246568, 24.744473950671534, 24.310796094988824]

--- librispeech960h ---
 clean.dev: mean=3.79 std=0.27 [4.145068196022205, 3.501709495974413, 3.720451453990662]
 other.dev: mean=8.84 std=0.39 [9.362487241893696, 8.436052445630839, 8.728507497840935]
clean.test: mean=4.31 std=0.52 [5.025106512477176, 3.78119293974437, 4.123554473524042]
other.test: mean=8.66 std=0.35 [9.132071146094034, 8.318208738513269, 8.524540053111208]



## MuST-C v1 en-de Benchmarks

In [5]:
from datasets import DatasetDict, load_dataset

# Collect the three MuST-C v1 en-de evaluation splits into one DatasetDict,
# reading them from the local "data/MUSTC_v1.0" directory.
mustc_v1 = DatasetDict()
for split in ["validation", "tst.COMMON", "tst.HE"]:
    mustc_v1[split] = load_dataset("mustc",
                                   split=split,
                                   name="en-de",
                                   data_dir="data/MUSTC_v1.0")
mustc_v1

Using custom data configuration en-de-2981737b16a98f43
Reusing dataset mustc (/workspace/mitarb/ohta/cache/mustc/en-de-2981737b16a98f43/2.0.0/06c09e13605d29280ddf6eb3fb66314164294b532f2e7f344b6d4112acc47193)
Using custom data configuration en-de-2981737b16a98f43
Reusing dataset mustc (/workspace/mitarb/ohta/cache/mustc/en-de-2981737b16a98f43/2.0.0/06c09e13605d29280ddf6eb3fb66314164294b532f2e7f344b6d4112acc47193)
Using custom data configuration en-de-2981737b16a98f43
Reusing dataset mustc (/workspace/mitarb/ohta/cache/mustc/en-de-2981737b16a98f43/2.0.0/06c09e13605d29280ddf6eb3fb66314164294b532f2e7f344b6d4112acc47193)


DatasetDict({
    validation: Dataset({
        features: ['client_id', 'file', 'audio', 'sentence', 'translation', 'id'],
        num_rows: 1423
    })
    tst.COMMON: Dataset({
        features: ['client_id', 'file', 'audio', 'sentence', 'translation', 'id'],
        num_rows: 2641
    })
    tst.HE: Dataset({
        features: ['client_id', 'file', 'audio', 'sentence', 'translation', 'id'],
        num_rows: 600
    })
})

In [30]:
%env MUSTC_v1=/scratch5t/ohta/MUSTC_v1.0_new

env: MUSTC_v1=/scratch5t/ohta/MUSTC_v1.0_new


### fairseq s2t

https://github.com/facebookresearch/fairseq/blob/main/examples/speech_to_text/docs/mustc_example.md

In [31]:
# Decode every MuST-C v1 split (dev, tst-COMMON, tst-HE) for both tasks (asr, st)
# with the matching `mustc_de_<task>_transformer_s.pt` checkpoint and beam size 20,
# redirecting each run's stdout to `<split>_<task>.log` under ${MUSTC_v1}/en-de.
!for task in asr st; \
    do for split in dev tst-COMMON tst-HE; \
        do fairseq-generate ${MUSTC_v1}/en-de \
            --config-yaml config_${task}.yaml \
            --gen-subset ${split}_${task} \
            --task speech_to_text \
            --path ${MUSTC_v1}/en-de/mustc_de_${task}_transformer_s.pt \
            --max-tokens 10000 \
            --beam 20 > ${MUSTC_v1}/en-de/${split}_${task}.log; \
    done; \
done

2022-08-01 17:15:36 | INFO | fairseq.tasks.text_to_speech | Please install tensorboardX: pip install tensorboardX
2022-08-01 17:15:39 | INFO | fairseq.tasks.speech_to_text | dictionary size (spm_unigram_5000.txt): 5,000
2022-08-01 17:15:39 | INFO | fairseq_cli.generate | loading model(s) from /scratch5t/ohta/MUSTC_v1.0_new/en-de/mustc_de_asr_transformer_s.pt
2022-08-01 17:15:43 | INFO | fairseq.tasks.speech_to_text | pre-tokenizer: {'tokenizer': None}
2022-08-01 17:15:43 | INFO | fairseq.tasks.speech_to_text | tokenizer: {'bpe': 'sentencepiece', 'sentencepiece_model': '/scratch5t/ohta/MUSTC_v1.0_new/en-de/spm_unigram_5000.model'}
2022-08-01 17:15:43 | INFO | fairseq.data.audio.speech_to_text_dataset | 'dev_asr' has 0.00% OOV
2022-08-01 17:15:43 | INFO | fairseq.data.audio.speech_to_text_dataset | SpeechToTextDataset(split="dev_asr", n_samples=1_423, prepend_tgt_lang_tag=False, shuffle=False, transforms=CompositeAudioFeatureTransform(
    UtteranceCMVN(norm_means=True, norm_vars=True)
)

In [32]:
# Extract the hypotheses from each log: keep the lines starting with "D", order
# them with a version sort under the C locale, and write tab-separated fields 3+
# to `<split>_<task>.hyp`. Presumably this restores the dataset order that the
# scoring cell below relies on (TODO confirm against fairseq-generate's output).
!for task in asr st; \
    do for split in dev tst-COMMON tst-HE; \
        do grep ^D ${MUSTC_v1}/en-de/${split}_${task}.log | LC_ALL=C sort -V | cut -f3- > ${MUSTC_v1}/en-de/${split}_${task}.hyp; \
    done; \
done

In [33]:
# Score the fairseq hypotheses on MuST-C v1: WER for the asr task, BLEU for st.
for task in ('asr', 'st'):
    print('---', task, '---')
    for split, key in (('dev', 'validation'), ('tst-COMMON', 'tst.COMMON'), ('tst-HE', 'tst.HE')):
        ref = mustc_v1[key]
        hyp_path = Path(os.environ['MUSTC_v1']) / f'en-de/{split}_{task}.hyp'
        hyp = hyp_path.read_text().splitlines()
        # Hypotheses are compared line by line with the references.
        assert len(ref) == len(hyp)

        # Only 'asr' and 'st' are iterated, so anything that is not 'st' is 'asr'.
        if task == 'st':
            score = bleu(hypotheses=hyp, references=ref['translation'], tokenize="13a")
        else:
            score = wer(hypotheses=hyp, references=ref['sentence'], tokenizer=tok)

        print('%12s %2.2f' % (split, score))
    print()

--- asr ---
         dev 13.07
  tst-COMMON 12.72
      tst-HE 10.93

--- st ---
         dev 22.05
  tst-COMMON 22.70
      tst-HE 21.70



In [7]:
from collections import defaultdict
from pathlib import Path
import numpy as np
import pandas as pd

def load_tsv(path):
    """Read a tab-separated manifest into a DataFrame.

    The first line is treated as a header and skipped. Every remaining line is
    split on tabs and its first five fields are stored as ``id``, ``audio``,
    ``n_frames`` (converted to ``int``), ``tgt_text`` and ``speaker``; any
    further fields on a line are ignored. Lines are split by hand, so quote
    characters inside the text are left untouched.

    Args:
        path: location of the TSV file (``str`` or ``pathlib.Path``).

    Returns:
        pandas.DataFrame with the five columns above, one row per data line.
        The columns are present even when the file contains no data lines.

    Raises:
        IndexError: if a data line has fewer than five tab-separated fields.
        ValueError: if the third field of a data line is not an integer.
    """
    columns = ['id', 'audio', 'n_frames', 'tgt_text', 'speaker']

    # Decode explicitly as UTF-8 so the result does not depend on the locale of
    # the machine running the notebook.
    # NOTE(review): assumes the manifests are UTF-8 encoded -- confirm.
    lines = Path(path).read_text(encoding='utf-8').splitlines()

    entries = []
    for line in lines[1:]:  # lines[0] is the header row
        fields = line.split('\t')
        entries.append({
            'id': fields[0],
            'audio': fields[1],
            'n_frames': int(fields[2]),
            'tgt_text': fields[3],
            'speaker': fields[4],
        })

    # Naming the columns keeps `df['n_frames']` etc. valid for a header-only file.
    return pd.DataFrame(entries, columns=columns)

# Directory holding one `<model>_seed<seed>` sub-directory per JoeyS2T run; the
# scoring cell below reads its hypothesis files from there.
root_dir = Path('/workspace/mitarb/ohta/models')

In [37]:
# Score the JoeyS2T hypotheses stored in the `mustc_v2_<task>_seed<seed>`
# directories against the MuST-C v1 references (`*_v1.<split>` files): WER for
# asr, BLEU for mt / mt_cascade / st, then mean and std over the three seeds.
for task, t in [('asr', ''), ('mt', ''), ('mt', '_cascade'), ('st', '')]:
    model = f'mustc_v2_{task}'
    print('---', f'{model}{t}', '---')
    scores = defaultdict(list)

    # These settings depend only on the task, so compute them once per task.
    ckpt = "avg10" if task in ["asr", "st"] else "avg5"
    ext = ".en" if task in ["asr"] else ""
    # For asr, st and the cascade, the hypothesis file has no line for
    # utterances that are missing from the joey TSV or that have <= 10 frames,
    # so its lines must be re-aligned with the references.
    needs_alignment = f'{task}{t}' in ["asr", "st", "mt_cascade"]
    tt = task if task in ["asr", "st"] else "asr"  # the cascade uses the asr manifest

    for seed in [321, 42, 987]:
        model_dir = root_dir / f'{model}_seed{seed}'
        for split, key in [('dev', 'validation'), ('tst-COMMON', 'tst.COMMON'), ('tst-HE', 'tst.HE')]:
            ref = mustc_v1[key]
            hyp_raw = (model_dir / f'{ckpt}{t}_v1.{split}{ext}').read_text().splitlines()

            if needs_alignment:
                df = load_tsv(Path(os.environ['MUSTC_v1']) / f'en-de/joey_{split}_{tt}.tsv')
                # Build the id lookups once per split as sets; rebuilding two
                # lists for every reference row made this loop quadratic.
                known_ids = set(df['id'])
                short_ids = set(df.loc[df['n_frames'] <= 10, 'id'])

                hyp = []
                n_used = 0  # number of lines of hyp_raw consumed so far
                # Only the `id` column is needed, so read that column instead
                # of iterating over complete dataset rows.
                for idx in ref['id']:
                    if (idx not in known_ids) or (idx in short_ids):
                        # No hypothesis exists for this utterance: score it as empty.
                        hyp.append('')
                    else:
                        hyp.append(hyp_raw[n_used])
                        n_used += 1
                # NOTE(review): lines of hyp_raw left over at this point
                # (n_used < len(hyp_raw)) are silently ignored -- consider
                # asserting n_used == len(hyp_raw).
            else:
                hyp = hyp_raw
            assert len(hyp) == len(ref), (len(hyp), len(ref))

            if task in ["asr"]:
                score = wer(hypotheses=hyp, references=ref['sentence'], tokenizer=tok)

            elif task in ["mt", "st"]:
                score = bleu(hypotheses=hyp, references=ref['translation'], tokenize="13a")

            scores[split].append(score)

    for k, v in scores.items():
        print('%12s: mean=%.2f std=%.2f %r' % (k, np.mean(v), np.std(v), v))
    print()

--- mustc_v2_asr ---
         dev: mean=18.79 std=0.61 [19.57116617310792, 18.716497357274058, 18.08674138771226]
  tst-COMMON: mean=18.86 std=0.37 [19.384788927936135, 18.538765239272852, 18.66458414681765]
      tst-HE: mean=15.19 std=0.56 [15.956973953240288, 14.650191128100277, 14.97021957507334]

--- mustc_v2_mt ---
         dev: mean=21.65 std=0.24 [21.772558277350843, 21.85346651122046, 21.313055322761763]
  tst-COMMON: mean=23.07 std=0.14 [23.184641282742795, 23.148724630486548, 22.87574048857216]
      tst-HE: mean=20.21 std=0.17 [20.296177594561602, 20.372050882152354, 19.969481160302298]

--- mustc_v2_mt_cascade ---
         dev: mean=21.43 std=0.63 [20.54657959064018, 21.79377175639091, 21.95741642064011]
  tst-COMMON: mean=21.89 std=0.64 [20.993011117357035, 22.328004351471453, 22.36151621616054]
      tst-HE: mean=21.03 std=0.66 [20.22587289794963, 21.848313074776083, 21.013385180851117]

--- mustc_v2_st ---
         dev: mean=20.48 std=0.48 [20.494653179350077, 19.884716

## MuST-C v2 en-de Benchmarks



In [133]:
from datasets import DatasetDict, load_dataset

# Collect the three MuST-C v2 en-de evaluation splits into one DatasetDict,
# reading them from the local "data/MUSTC_v2.0" directory.
mustc_v2 = DatasetDict()
for split in ["validation", "tst.COMMON", "tst.HE"]:
    mustc_v2[split] = load_dataset("mustc",
                                   split=split,
                                   name="en-de",
                                   data_dir="data/MUSTC_v2.0")
mustc_v2

Using custom data configuration en-de-f1a63e2b2e62995c
Reusing dataset mustc (/workspace/mitarb/ohta/cache/mustc/en-de-f1a63e2b2e62995c/2.0.0/06c09e13605d29280ddf6eb3fb66314164294b532f2e7f344b6d4112acc47193)
Using custom data configuration en-de-f1a63e2b2e62995c
Reusing dataset mustc (/workspace/mitarb/ohta/cache/mustc/en-de-f1a63e2b2e62995c/2.0.0/06c09e13605d29280ddf6eb3fb66314164294b532f2e7f344b6d4112acc47193)
Using custom data configuration en-de-f1a63e2b2e62995c
Reusing dataset mustc (/workspace/mitarb/ohta/cache/mustc/en-de-f1a63e2b2e62995c/2.0.0/06c09e13605d29280ddf6eb3fb66314164294b532f2e7f344b6d4112acc47193)


DatasetDict({
    validation: Dataset({
        features: ['client_id', 'file', 'audio', 'sentence', 'translation', 'id'],
        num_rows: 1415
    })
    tst.COMMON: Dataset({
        features: ['client_id', 'file', 'audio', 'sentence', 'translation', 'id'],
        num_rows: 2580
    })
    tst.HE: Dataset({
        features: ['client_id', 'file', 'audio', 'sentence', 'translation', 'id'],
        num_rows: 600
    })
})

In [134]:
%env MUSTC_v2=/scratch5t/ohta/MUSTC_v2.0_new

env: MUSTC_v2=/scratch5t/ohta/MUSTC_v2.0_new


### fairseq s2t

https://github.com/facebookresearch/fairseq/blob/main/examples/speech_to_text/docs/mustc_example.md

In [82]:
# Decode every MuST-C v2 split (dev, tst-COMMON, tst-HE) for both tasks (asr, st)
# with the matching `mustc_de_<task>_transformer_s.pt` checkpoint and beam size 20,
# redirecting each run's stdout to `<split>_<task>.log` under ${MUSTC_v2}/en-de.
!for task in asr st; \
    do for split in dev tst-COMMON tst-HE; \
        do fairseq-generate ${MUSTC_v2}/en-de \
            --config-yaml config_${task}.yaml \
            --gen-subset ${split}_${task} \
            --task speech_to_text \
            --path ${MUSTC_v2}/en-de/mustc_de_${task}_transformer_s.pt \
            --max-tokens 10000 \
            --beam 20 > ${MUSTC_v2}/en-de/${split}_${task}.log; \
    done; \
done

2022-07-29 18:27:15 | INFO | fairseq.tasks.text_to_speech | Please install tensorboardX: pip install tensorboardX
2022-07-29 18:27:17 | INFO | fairseq.tasks.speech_to_text | dictionary size (spm_unigram_5000.txt): 5,000
2022-07-29 18:27:17 | INFO | fairseq_cli.generate | loading model(s) from /scratch5t/ohta/MUSTC_v2.0_new/en-de/mustc_de_asr_transformer_s.pt
2022-07-29 18:27:20 | INFO | fairseq.tasks.speech_to_text | pre-tokenizer: {'tokenizer': None}
2022-07-29 18:27:20 | INFO | fairseq.tasks.speech_to_text | tokenizer: {'bpe': 'sentencepiece', 'sentencepiece_model': '/scratch5t/ohta/MUSTC_v2.0_new/en-de/spm_unigram_5000.model'}
2022-07-29 18:27:20 | INFO | fairseq.data.audio.speech_to_text_dataset | 'dev_asr' has 0.42% OOV
2022-07-29 18:27:20 | INFO | fairseq.data.audio.speech_to_text_dataset | SpeechToTextDataset(split="dev_asr", n_samples=1_415, prepend_tgt_lang_tag=False, shuffle=False, transforms=CompositeAudioFeatureTransform(
    UtteranceCMVN(norm_means=True, norm_vars=True)
)

In [83]:
# Extract the hypotheses from each log: keep the lines starting with "D", order
# them with a version sort under the C locale, and write tab-separated fields 3+
# to `<split>_<task>.hyp`. Presumably this restores the dataset order that the
# scoring cell below relies on (TODO confirm against fairseq-generate's output).
!for task in asr st; \
    do for split in dev tst-COMMON tst-HE; \
        do grep ^D ${MUSTC_v2}/en-de/${split}_${task}.log | LC_ALL=C sort -V | cut -f3- > ${MUSTC_v2}/en-de/${split}_${task}.hyp; \
    done; \
done

In [89]:
# Score the fairseq hypotheses on MuST-C v2: WER for the asr task, BLEU for st.
for task in ['asr', 'st']:
    print('---', task, '---')
    for split, key in [('dev', 'validation'), ('tst-COMMON', 'tst.COMMON'), ('tst-HE', 'tst.HE')]:
        # The v2 references live in `mustc_v2`; the previously used name
        # `mustc_ende` is not defined anywhere in this notebook, so the cell
        # failed with a NameError on a fresh kernel.
        ref = mustc_v2[key]
        hyp = (Path(os.environ['MUSTC_v2']) / f'en-de/{split}_{task}.hyp').read_text().splitlines()
        # Hypotheses are compared line by line with the references.
        assert len(ref) == len(hyp)

        if task == 'asr':
            score = wer(hypotheses=hyp, references=ref['sentence'], tokenizer=tok)
        elif task == 'st':
            score = bleu(hypotheses=hyp, references=ref['translation'], tokenize="13a")

        print('%12s %2.2f' % (split, score))
    print()

--- asr ---
         dev 9.11
  tst-COMMON 11.88
      tst-HE 10.43

--- st ---
         dev 23.38
  tst-COMMON 23.20
      tst-HE 22.23



### JoeyS2T

In [108]:
from pathlib import Path

# Directory holding one `<model>_seed<seed>` sub-directory per JoeyS2T run; the
# scoring cell below reads its hypothesis files from there.
root_dir = Path('/workspace/mitarb/ohta/models')

In [138]:
# Score the JoeyS2T hypotheses stored in the `mustc_v2_<task>_seed<seed>`
# directories against the MuST-C v2 references: WER for asr, BLEU for
# mt / mt_cascade / st, then mean and std over the three seeds.
for task, t in [('asr', ''), ('mt', ''), ('mt', '_cascade'), ('st', '')]:
    model = f'mustc_v2_{task}'
    print('---', f'{model}{t}', '---')
    scores = defaultdict(list)

    # These settings depend only on the task, so compute them once per task.
    ckpt = "avg10" if task in ["asr", "st"] else "avg5"
    ext = ".en" if task in ["asr"] else ""
    # For asr, st and the cascade, the hypothesis file has no line for
    # utterances that are missing from the joey TSV or that have <= 10 frames,
    # so its lines must be re-aligned with the references.
    needs_alignment = f'{task}{t}' in ["asr", "st", "mt_cascade"]
    tt = task if task in ["asr", "st"] else "asr"  # the cascade uses the asr manifest

    for seed in [321, 42, 987]:
        model_dir = root_dir / f'{model}_seed{seed}'
        for split, key in [('dev', 'validation'), ('tst-COMMON', 'tst.COMMON'), ('tst-HE', 'tst.HE')]:
            ref = mustc_v2[key]
            hyp_raw = (model_dir / f'{ckpt}{t}.{split}{ext}').read_text().splitlines()

            if needs_alignment:
                df = load_tsv(Path(os.environ['MUSTC_v2']) / f'en-de/joey_{split}_{tt}.tsv')
                # Build the id lookups once per split as sets; rebuilding two
                # lists for every reference row made this loop quadratic.
                known_ids = set(df['id'])
                short_ids = set(df.loc[df['n_frames'] <= 10, 'id'])

                hyp = []
                n_used = 0  # number of lines of hyp_raw consumed so far
                # Only the `id` column is needed, so read that column instead
                # of iterating over complete dataset rows.
                for idx in ref['id']:
                    if (idx not in known_ids) or (idx in short_ids):
                        # No hypothesis exists for this utterance: score it as empty.
                        hyp.append('')
                    else:
                        hyp.append(hyp_raw[n_used])
                        n_used += 1
                # NOTE(review): lines of hyp_raw left over at this point
                # (n_used < len(hyp_raw)) are silently ignored -- consider
                # asserting n_used == len(hyp_raw).
            else:
                hyp = hyp_raw
            assert len(hyp) == len(ref), (len(hyp), len(ref))

            if task in ["asr"]:
                score = wer(hypotheses=hyp, references=ref['sentence'], tokenizer=tok)

            elif task in ["mt", "st"]:
                score = bleu(hypotheses=hyp, references=ref['translation'], tokenize="13a")

            scores[split].append(score)

    for k, v in scores.items():
        print('%12s: mean=%.2f std=%.2f %r' % (k, np.mean(v), np.std(v), v))
    print()

--- mustc_v2_asr ---
         dev: mean=10.27 std=0.43 [10.819426214138383, 10.226889180573785, 9.769360585036564]
  tst-COMMON: mean=12.95 std=0.32 [13.276147348253941, 13.062788469912043, 12.511974222764085]
      tst-HE: mean=11.16 std=0.31 [11.427057576827975, 11.329918756623101, 10.72942423172024]

--- mustc_v2_mt ---
         dev: mean=26.45 std=0.75 [25.390705502875505, 26.978046442438234, 26.99326579089932]
  tst-COMMON: mean=27.17 std=0.63 [26.27450170545455, 27.612297404963513, 27.608497352365507]
      tst-HE: mean=24.85 std=0.68 [23.890310521482185, 25.397185257314472, 25.25525286854287]

--- mustc_v2_mt_cascade ---
         dev: mean=23.86 std=0.76 [22.788992194115746, 24.377921594559243, 24.422131504399257]
  tst-COMMON: mean=23.95 std=0.59 [23.11313444237887, 24.344195901542143, 24.400466368856243]
      tst-HE: mean=22.65 std=0.58 [21.834181399166415, 22.99811303119327, 23.131362026298824]

--- mustc_v2_st ---
         dev: mean=23.52 std=0.53 [23.28003961548913, 23.025