In [8]:
import os
import subprocess
from tqdm import tqdm
import numpy as np

from ConST.prepare_data.data_utils import load_df_from_tsv, save_df_to_tsv

# Generate Translation

In [57]:
work_dir = '/home/siqiouyang/work/projects/ConST/'

In [None]:
# Base
prefix = '/mnt/data2/siqiouyang/runs/ConST'
names = ['ablation_data_efficiency_1h_baseline', 'ablation_data_efficiency_10h_baseline']
tags = ['base_ft_1h', 'base_ft_10h']
for name, tag in zip(tqdm(names), tags):
    cmd = """CUDA_VISIBLE_DEVICES=0 python fairseq_cli/generate.py /mnt/data/siqiouyang/datasets/must-c-v1.0/ --gen-subset tst-COMMON_st_de --task speech_to_text \
    --prefix-size 1 --max-tokens 4000000 --max-source-positions 4000000 --beam 10 --lenpen 0.6 --scoring sacrebleu \
    --config-yaml config_st_de.yaml  --path {}/{}/checkpoint_best.pt \
    --results-path /home/siqiouyang/work/projects/ConST/ConST/analysis/generation/{}""".format(prefix, name, tag)
    pipe = subprocess.Popen('cd {}; {}'.format(work_dir, cmd), stdin=subprocess.DEVNULL, stdout=subprocess.DEVNULL, stderr=subprocess.PIPE, shell=True)
    res = pipe.communicate()

In [58]:
# CTC and Sent
prefix = '/mnt/data2/siqiouyang/runs/ConST'
name_temp = 'ablation_pretrain_{}{}_ft_{}h'
for method in ['ctc', 'sent']:
    for pt_h in [1348]: # [10, 100, 370, 1348]:
        for ft_h in [1, 10]:
            if ft_h < pt_h:
                name = name_temp.format(method, '_{}h'.format(pt_h) if pt_h != 370 else '', ft_h)
                tag = '{}_pt_{}h_ft_{}h'.format(method, pt_h, ft_h)
                cmd = """CUDA_VISIBLE_DEVICES=0 python fairseq_cli/generate.py /mnt/data/siqiouyang/datasets/must-c-v1.0/ --gen-subset tst-COMMON_st_de --task speech_to_text \
                --prefix-size 1 --max-tokens 4000000 --max-source-positions 4000000 --beam 10 --lenpen 0.6 --scoring sacrebleu \
                --config-yaml config_st_de.yaml  --path {}/{}/checkpoint_best.pt \
                --results-path /home/siqiouyang/work/projects/ConST/ConST/analysis/generation/{}""".format(prefix, name, tag)
                print('Running', tag)
                pipe = subprocess.Popen('cd {}; {}'.format(work_dir, cmd), stdin=subprocess.DEVNULL, stdout=subprocess.DEVNULL, stderr=subprocess.PIPE, shell=True)
                res = pipe.communicate()
                # print(res)
                # break             
            # break
        # break
    # break

Running ctc_pt_1348h_ft_1h
Running ctc_pt_1348h_ft_10h
Running sent_pt_1348h_ft_1h
Running sent_pt_1348h_ft_10h


In [59]:
# WACO
prefix = '/mnt/data/siqiouyang/runs/ConST'
name_temp = 'ablation_pretrain_token_mfat{}_t0.20_ft_{}h'
for pt_h in [1348]: # [10, 100, 370, 1348]:
    for ft_h in [1, 10]:
        if ft_h < pt_h:
            name = name_temp.format('_{}h'.format(pt_h) if pt_h != 370 else '', ft_h)
            tag = 'token_pt_{}h_ft_{}h'.format(pt_h, ft_h)
            cmd = """CUDA_VISIBLE_DEVICES=0 python fairseq_cli/generate.py /mnt/data/siqiouyang/datasets/must-c-v1.0/ --gen-subset tst-COMMON_st_de --task speech_to_text \
            --prefix-size 1 --max-tokens 4000000 --max-source-positions 4000000 --beam 10 --lenpen 0.6 --scoring sacrebleu \
            --config-yaml config_st_de.yaml  --path {}/{}/checkpoint_best.pt \
            --results-path /home/siqiouyang/work/projects/ConST/ConST/analysis/generation/{}""".format(prefix, name, tag)
            print('Running', tag)
            pipe = subprocess.Popen('cd {}; {}'.format(work_dir, cmd), stdin=subprocess.DEVNULL, stdout=subprocess.DEVNULL, stderr=subprocess.PIPE, shell=True)
            res = pipe.communicate()
            # print(res)
            # break
        # break
    # break

Running token_pt_1348h_ft_1h
Running token_pt_1348h_ft_10h


# Collect Translation

In [9]:
from fairseq.scoring.tokenizer import EvaluationTokenizer

In [60]:
tokenizer = EvaluationTokenizer(
    tokenizer_type='13a',
    lowercase=False,
    character_tokenization=False,
)

In [61]:
def extract(path):
    refs = []
    gens = []
    with open(path, 'r') as r:
        for line in r.readlines():
            line = line.strip('\n')
            parts = line.split('\t')
            if line.startswith('T-'):
                refs.append(tokenizer.tokenize(parts[1]))
            elif line.startswith('D-'):
                gens.append(tokenizer.tokenize(parts[2]))
    return refs, gens

In [62]:
gen_root = '/home/siqiouyang/work/projects/ConST/ConST/analysis/generation'

In [64]:
translation = {}
for pt_h in [10, 100, 370, 1348]:
    for ft_h in [1, 10]:
        if ft_h < pt_h:
            ref, base = extract(os.path.join(gen_root, 'base_ft_{}h'.format(ft_h), 'generate-tst-COMMON_st_de.txt'))
            ref, ctc = extract(os.path.join(gen_root, 'ctc_pt_{}h_ft_{}h'.format(pt_h, ft_h), 'generate-tst-COMMON_st_de.txt'))
            ref, sent = extract(os.path.join(gen_root, 'sent_pt_{}h_ft_{}h'.format(pt_h, ft_h), 'generate-tst-COMMON_st_de.txt'))
            ref, token = extract(os.path.join(gen_root, 'token_pt_{}h_ft_{}h'.format(pt_h, ft_h), 'generate-tst-COMMON_st_de.txt'))
            translation[(pt_h, ft_h)] = [np.array(base), np.array(ctc), np.array(sent), np.array(token), np.array(ref)]

# Resample Subsets

In [65]:
import sacrebleu

In [66]:
n_resample = 1000
subset_ratio = 0.2

In [67]:
def compute_bleu(ref, gen):
    return sacrebleu.corpus_bleu(
        gen, [ref], tokenize="none"
    ).score

In [69]:
var_ctc = {}
var_sent = {}
var_token = {}
for pt_h, ft_h in translation:
    base, ctc, sent, token, ref = translation[(pt_h, ft_h)]

    n_sample = len(ref)
    indices = list(range(n_sample))
    subset_size = int(n_sample * subset_ratio)

    b_ctcs = []
    b_sents = []
    b_tokens = []
    
    for _ in tqdm(range(n_resample), desc='pt {}h ft {}h'.format(pt_h, ft_h)):
        subset_indices = np.random.choice(indices, subset_size, replace=True)
        
        s_base = base[subset_indices]
        s_ctc = ctc[subset_indices]
        s_sent = sent[subset_indices]
        s_token = token[subset_indices]
        s_ref = ref[subset_indices]

        # b_base = compute_bleu(s_ref, s_base)
        b_ctc = compute_bleu(s_ref, s_ctc)
        b_sent = compute_bleu(s_ref, s_sent)
        b_token = compute_bleu(s_ref, s_token)

        b_ctcs.append(b_ctc)
        b_sents.append(b_sent)
        b_tokens.append(b_token)

    var_ctc[(pt_h, ft_h)] = np.std(b_ctcs)
    var_sent[(pt_h, ft_h)] = np.std(b_sents)
    var_token[(pt_h, ft_h)] = np.std(b_tokens)

pt 10h ft 1h: 100%|██████████| 1000/1000 [04:06<00:00,  4.05it/s]
pt 100h ft 1h: 100%|██████████| 1000/1000 [04:39<00:00,  3.58it/s]
pt 100h ft 10h: 100%|██████████| 1000/1000 [04:40<00:00,  3.57it/s]
pt 370h ft 1h: 100%|██████████| 1000/1000 [04:57<00:00,  3.36it/s]
pt 370h ft 10h: 100%|██████████| 1000/1000 [04:50<00:00,  3.44it/s]
pt 1348h ft 1h: 100%|██████████| 1000/1000 [04:46<00:00,  3.49it/s]
pt 1348h ft 10h: 100%|██████████| 1000/1000 [04:41<00:00,  3.56it/s]


In [70]:
var_ctc

{(10, 1): 0.06868473959952008,
 (100, 1): 0.6377625731120373,
 (100, 10): 0.7929620046123323,
 (370, 1): 0.6824153498927058,
 (370, 10): 0.7864747298705266,
 (1348, 1): 0.6865655595291114,
 (1348, 10): 0.7765692260585267}

In [71]:
var_sent

{(10, 1): 0.3091822828372738,
 (100, 1): 0.4558751497996544,
 (100, 10): 0.7114891039330945,
 (370, 1): 0.6013871011759279,
 (370, 10): 0.7227868288842159,
 (1348, 1): 0.7222100609970917,
 (1348, 10): 0.7462541912677322}

In [72]:
var_token

{(10, 1): 0.7479468247559244,
 (100, 1): 0.7826973369312127,
 (100, 10): 0.7891808722509748,
 (370, 1): 0.9486531381462882,
 (370, 10): 0.834735564908792,
 (1348, 1): 0.8216657202401139,
 (1348, 10): 0.8248390147723607}