In [2]:
import os
import re
import random

import pandas as pd
import torchaudio
from tqdm import tqdm
import csv

In [3]:
def load_df_from_tsv(path: str, enc='utf-8'):
    """Read a tab-separated file whose first row is the header into a DataFrame.

    Args:
        path: location of the TSV file.
        enc: text encoding of the file (defaults to UTF-8).

    Returns:
        The parsed ``pandas.DataFrame``. Quote characters are treated as
        ordinary text, backslash is the escape character, and NA detection is
        switched off.
    """
    read_options = dict(
        sep="\t",
        header=0,
        encoding=enc,
        escapechar="\\",
        quoting=csv.QUOTE_NONE,
        na_filter=False,
    )
    return pd.read_csv(path, **read_options)


def save_df_to_tsv(dataframe, path):
    """Write ``dataframe`` to ``path`` as a UTF-8 tab-separated file.

    The header row is written, the index is not. Values are never quoted;
    backslash is used as the escape character.
    """
    write_options = {
        "sep": "\t",
        "header": True,
        "index": False,
        "encoding": "utf-8",
        "escapechar": "\\",
        "quoting": csv.QUOTE_NONE,
    }
    dataframe.to_csv(path, **write_options)

In [4]:
# Root directories of the three datasets used below. "XXX" is a placeholder:
# set each one to the local dataset location before running the notebook.
path_to_iwslt_mt_en = "XXX"
path_to_mustc = "XXX"
path_to_commonvoice13 = "XXX"

# ASR from IWSLT

In [23]:
# MT_EN directory under the IWSLT path; the sub-directories used below hang off it.
root = '{}/MT_EN'.format(path_to_iwslt_mt_en)

In [24]:
# Inputs: recordings and Maltese transcription files of the "mt_only" subset.
asr_audio_root = os.path.join(root, 'mt_only', 'Audio')
asr_root = os.path.join(root, 'mt_only', 'Transcription', 'Malti')
# Outputs: segmented WAV clips go here; the manifests are written under mustc_root.
new_wav_root = '{}/mt-en/data/asr/wav'.format(path_to_mustc)
mustc_root = path_to_mustc

In [25]:
new_wav_root

'/mnt/data7/siqiouyang/datasets/must-c-v1.0/mt-en/data/asr/wav'

In [26]:
def detect_annotator(items):
    """Derive the shared trailing label of the annotator-like entries in ``items``.

    An entry is considered when its lowercase form contains 'annotator' or
    'speaker', or has 'sp' within its last three characters. The first such
    entry seeds the result. Each later one is compared with it from the end
    backwards, and the result is cut down to the characters that follow the
    first position where they differ. A difference at a position where the
    current result holds a digit is ignored.

    Returns:
        The remaining suffix, or "" when no entry qualifies.
    """
    suffix = ""
    for candidate in items:
        lowered = candidate.lower()
        is_label = 'annotator' in lowered or 'sp' in lowered[-3:] or 'speaker' in lowered
        if not is_label:
            continue
        if suffix == "":
            suffix = candidate
            continue
        # Walk both strings from their last character towards the front.
        for offset, (kept, seen) in enumerate(zip(reversed(suffix), reversed(candidate))):
            if kept != seen and not kept.isdigit():
                # Keep only the `offset` trailing characters that agreed so far.
                suffix = suffix[len(suffix) - offset:]
                break
    return suffix

In [27]:
def detect_machine(items):
    """Derive the shared trailing label of the entries in ``items`` containing 'Machine'.

    The first matching entry seeds the result. Every further match is compared
    with it character by character from the end, and the result is truncated
    to the part after the first differing position. A difference is ignored
    when the current result has a digit at that position.

    Returns:
        The remaining suffix, or "" when nothing contains 'Machine'.
    """
    common_suffix = ""
    for item in (entry for entry in items if 'Machine' in entry):
        if not common_suffix:
            common_suffix = item
            continue
        overlap = min(len(common_suffix), len(item))
        for j in range(overlap):
            tail_char = common_suffix[len(common_suffix) - 1 - j]
            if tail_char != item[len(item) - 1 - j] and not tail_char.isdigit():
                # Drop everything up to and including the mismatching position.
                common_suffix = common_suffix[len(common_suffix) - j:]
                break
    return common_suffix

In [28]:
def equal(s1, s2):
    """Compare two strings, tolerating differences where ``s1`` has a digit.

    Returns True only when both strings have the same length and every
    position either matches exactly or holds a digit in ``s1``.
    """
    return len(s1) == len(s2) and all(
        left == right or left.isdigit() for left, right in zip(s1, s2)
    )

In [29]:
def make_df(quads, wav_id, speaker_id, wav, rate):
    """Cut transcribed segments out of one channel, save them and list them in a DataFrame.

    Args:
        quads: sequence of ``(start_sec, end_sec, speaker_label, text)`` tuples.
        wav_id: recording identifier, used as the prefix of every clip id.
        speaker_id: speaker index embedded in the clip id.
        wav: sample sequence of a single channel (callers pass one row of the
            resampled waveform).
        rate: sample rate of ``wav``, used to convert seconds to sample indices.

    Returns:
        DataFrame with columns id, audio, n_frames, src_text, speaker and
        src_lang, one row per clip written to ``new_wav_root``.

    A segment is kept when it lasts between 0.5 and 30 seconds (exclusive),
    has non-empty text after tag removal, and the text contains no '/'.
    """
    df = pd.DataFrame(columns=['id', 'audio', 'n_frames', 'src_text', 'speaker', 'src_lang'])

    clip_ids = []
    audio = []
    n_frames = []
    src_text = []
    speaker = []
    src_lang = []

    os.makedirs(new_wav_root, exist_ok=True)
    total_samples = len(wav)

    for idx, q in enumerate(quads):
        duration = q[1] - q[0]
        # NOTE(review): the pattern is greedy, so everything between the first
        # '<' and the last '>' is removed, including any text between two
        # tags. Confirm this is intended.
        text = re.sub('<.*>', ' ', q[3]).strip()
        if not (0.5 < duration < 30) or text == "" or '/' in text:
            continue

        subwav_id = wav_id + '_spk_{}_{}'.format(speaker_id, idx)
        # Clamp to the audio that actually exists so that n_frames matches the
        # number of samples written, even if a timestamp runs past the end.
        wav_s = max(int(q[0] * rate), 0)
        wav_e = min(int(q[1] * rate), total_samples)
        if wav_e <= wav_s:
            # The segment lies entirely outside the recording; nothing to save.
            continue

        torchaudio.save(os.path.join(new_wav_root, subwav_id + '.wav'), wav[wav_s : wav_e].unsqueeze(0), rate)

        clip_ids.append(subwav_id)
        audio.append('mt-en/data/asr/wav/{}.wav'.format(subwav_id))
        n_frames.append(wav_e - wav_s)
        src_text.append(text)
        speaker.append(q[2])
        src_lang.append('mt')

        print(subwav_id, q[0], q[1], text, sep='\t')

    df['id'] = clip_ids
    df['audio'] = audio
    # Pin the dtype so a recording with no kept segments still yields an
    # integer column (same approach as asr_make_df / st_make_df).
    df['n_frames'] = pd.Series(n_frames, dtype=int)
    df['src_text'] = src_text
    df['speaker'] = speaker
    df['src_lang'] = src_lang
    return df

In [30]:
# Accumulator for the per-recording DataFrames; stays None until the first one is produced.
all_df = None

In [31]:
# Parse every transcription file in asr_root, cut the matching recording into
# per-segment clips with make_df, and append the resulting rows to all_df.
# NOTE(review): sum_d is assigned here but never read in this cell.
sum_d = 0
for fn in os.listdir(asr_root):
    print(fn)

    # The transcription is read whole (UTF-16) and split on tabs only.
    with open(os.path.join(asr_root, fn), 'r', encoding='utf-16') as r:
        text = r.read()
    
    items = text.split('\t')
    
    # Every third item from index 3 on is scanned for tier labels.
    # Presumably each of those items holds one row's text followed by the
    # next row's tier label -- TODO confirm against the export format.
    # The variable names are swapped on purpose relative to the helper names:
    # str_ann (the tier that is kept) is the 'Machine' suffix, and str_mac is
    # the annotator/speaker suffix. The original, unswapped assignment was:
    # str_ann = detect_annotator(items[3::3])
    # str_mac = detect_machine(items[3::3])

    str_mac = detect_annotator(items[3::3])
    str_ann = detect_machine(items[3::3])

    # (start, end, tier label, text) tuples for rows whose tier matches str_ann.
    quads = []

    for i in range(3, len(items), 3):
        # items[i] must end with the kept tier label ...
        if equal(items[i][-len(str_ann):], str_ann):
            if i + 3 < len(items):
                # ... and the text is items[i + 3] with a trailing tier label
                # (of either kind) stripped, if one is present.
                if equal(items[i + 3][-len(str_ann):], str_ann):
                    text = items[i + 3][:-len(str_ann)]
                elif equal(items[i + 3][-len(str_mac):], str_mac):
                    text = items[i + 3][:-len(str_mac)]
                else:
                    text = items[i + 3]

                quads.append((float(items[i + 1]), float(items[i + 2]), items[i][-len(str_ann):], text))

    # Tuples sort by their first field, i.e. by start time.
    quads = sorted(quads)
    annotators = set([q[2] for q in quads])
    print(annotators)

    # The recording shares the transcription's file name, with a .wav extension.
    wav, rate = torchaudio.load(os.path.join(asr_audio_root, fn[:-4] + '.wav'))
    resampled_wav = torchaudio.functional.resample(wav, orig_freq=rate, new_freq=16000)
    if wav.size()[0] == 2 and len(annotators) == 2:
        # Two channels, two tiers: channel i is paired with the tier whose label contains str(i + 1).
        for i in range(2):
            cur_df = make_df([q for q in quads if str(i + 1) in q[2]], fn[:-4], i, resampled_wav[i], 16000)
            all_df = cur_df if all_df is None else pd.concat([all_df, cur_df], ignore_index=True)
    elif wav.size()[0] == 1 and len(annotators) == 1:
        # Mono recording with a single tier.
        cur_df = make_df(quads, fn[:-4], 0, resampled_wav[0], 16000)
        all_df = cur_df if all_df is None else pd.concat([all_df, cur_df], ignore_index=True)
    elif wav.size()[0] == 2 and len(annotators) == 1:
        # Two channels but one tier: the channels are summed into one signal.
        cur_df = make_df(quads, fn[:-4], 0, resampled_wav[0] + resampled_wav[1], 16000)
        all_df = cur_df if all_df is None else pd.concat([all_df, cur_df], ignore_index=True)
    else:
        # Any other channel/tier combination is skipped silently.
        pass
        

dia_discuss_topic05_18f_12m.tsv
{'Spk 1 Machine', 'Spk 2 Machine'}
dia_discuss_topic05_18f_12m_spk_0_4	13.187847314404472	15.58	eżattament e
dia_discuss_topic05_18f_12m_spk_0_8	19.3	21.14	tirrifletti ill-karattru ta' dak li jkun u
dia_discuss_topic05_18f_12m_spk_0_10	22.92	25.86	u m'għandu jkun hemm ebda stigma fejn il-utatus huma x' inhuma ttatus
dia_discuss_topic05_18f_12m_spk_0_13	32.76	34.7	dil-ħaġa li jien naf pereżempju jiġi jgħidulek f' ċerti postijiet tax-xogħol ma jaċċettawx nies bit-tatus għalija msorija injolenza l-kbira naraha diskriminazzjoni kbira
dia_discuss_topic05_18f_12m_spk_0_15	37.88	49.56	għax bnidem li jkollu tatus kapaċi jkun
dia_discuss_topic05_18f_12m_spk_0_17	51.04	53.42	kompetenti fix-xogħol
dia_discuss_topic05_18f_12m_spk_0_21	57.5	58.82	jekk bniedem jagħmel tator jagħmilha
dia_discuss_topic05_18f_12m_spk_0_23	60.8	62.98	ħa bħala gost u aħdan bħal meta tagħmel heir stile bħal meta tagħmel tilbess ċertu ħwejjeġ
dia_discuss_topic05_18f_12m_spk_0_25	64.76	71.4	

In [32]:
# torchaudio.save('mt_en/dia_spont01_13m_14m_spk_0.wav', resampled_wav, 16000)
# torchaudio.save('mt_en/dia_spont01_13m_14m_spk_1.wav', resampled_wav[1:], 16000)

In [33]:
# Total clip duration in hours: n_frames counts samples at 16 kHz.
all_df['n_frames'].sum() / 16000 / 3600

0.8228403472222222

In [34]:
# Keep an independent copy under a section-specific name; all_df is reused by later sections.
asr_iwslt_df = all_df.copy()

In [35]:
# Random 90/10 train/dev split of the IWSLT ASR rows.
split_ratio = [0.9, 0.1]
split_seed = 0
n_sample = len(asr_iwslt_df)
indices = list(range(n_sample))
# A dedicated seeded generator makes the split identical on every re-run;
# with an unseeded shuffle, rows move between train and dev on each run.
random.Random(split_seed).shuffle(indices)
n_train = int(n_sample * split_ratio[0])
asr_iwslt_train_df = asr_iwslt_df.iloc[indices[:n_train]].sort_index().copy()
asr_iwslt_dev_df = asr_iwslt_df.iloc[indices[n_train:]].sort_index().copy()

In [36]:
# Write the train and dev manifests as TSV files under mustc_root.
save_df_to_tsv(asr_iwslt_train_df, os.path.join(mustc_root, 'train_asr_mt_iwslt.tsv'))
save_df_to_tsv(asr_iwslt_dev_df, os.path.join(mustc_root, 'dev_asr_mt_iwslt.tsv'))

# ASR from CommonVoice

In [12]:
# Common Voice inputs (the 'mt' locale directory and its clips) and the output
# locations. The configured variable is path_to_commonvoice13; the previous
# name, path_to_commonvoice, is not defined anywhere and raised a NameError.
root = '{}/mt'.format(path_to_commonvoice13)
asr_audio_root = os.path.join(root, 'clips')
new_wav_root = '{}/mt-en/data/asr/wav'.format(path_to_mustc)
mustc_root = path_to_mustc

In [13]:
# Load the Common Voice 'validated.tsv' listing (one row per clip).
ori_df = load_df_from_tsv(os.path.join(root, 'validated.tsv'))

In [14]:
ori_df

Unnamed: 0,client_id,path,sentence,up_votes,down_votes,age,gender,accents,variant,locale,segment
0,119ecbc0c91da642d62bcc7b87db0e6bc4c104715290ca...,common_voice_mt_22259676.mp3,Ix-xogħol mistenni jibda dan ix-xahar stess u ...,2,0,,,,,mt,
1,2812a0daa8849062529bbe704f20be91c8667a03445da9...,common_voice_mt_22463716.mp3,It-tifla rringrazzjatha ta' dak kollu li għaml...,2,0,,,,,mt,
2,4328dce099b8584ea25b687213a72ea7d7666c88f09ab3...,common_voice_mt_22196702.mp3,Bl-ikbar responsabbilta' nerġa' ntenni li jien...,2,0,fifties,male,,,mt,
3,4e65a29d25e88c47b77cda815a5f1ec02097c8b4102a66...,common_voice_mt_23742149.mp3,Jiena nsib ruħi idjota niddiskuti affarijiet l...,2,0,,,,,mt,
4,5136da6537f4a0d4acf487b0f8e9d6546405f0ee393d1d...,common_voice_mt_21921959.mp3,Jekk hemm problema hemm problema u wieħed irid...,2,0,,,,,mt,
...,...,...,...,...,...,...,...,...,...,...,...
6378,013358357c251280edcdc585291df70be4d52c209c3d39...,common_voice_mt_21991213.mp3,Issa ħalli nispjega xi ħaġa għax hija importanti.,2,0,fifties,female,,,mt,
6379,013358357c251280edcdc585291df70be4d52c209c3d39...,common_voice_mt_22176840.mp3,"Jekk nara dan it-taħżiż hawn, nista' nkun naf?",2,0,fifties,female,,,mt,
6380,013358357c251280edcdc585291df70be4d52c209c3d39...,common_voice_mt_22177053.mp3,"Ovvjament hemmhekk imbagħad inqabad, intbagħat...",2,0,fifties,female,,,mt,
6381,013358357c251280edcdc585291df70be4d52c209c3d39...,common_voice_mt_22205002.mp3,Kienu jgħidu: Aħna niddeċiedu mbagħad niġu qud...,2,0,fifties,female,,,mt,


In [15]:
# Convert every validated Common Voice clip to a 16 kHz WAV under new_wav_root
# and collect one manifest row per clip in new_df.
new_df = pd.DataFrame(columns=['id', 'audio', 'n_frames', 'src_text', 'speaker', 'src_lang'])

# Named clip_ids rather than id so the builtin id() is not shadowed at module scope.
clip_ids = []
audio = []
n_frames = []
src_text = []
speaker = []
src_lang = []

# The output directory is not guaranteed to exist when this section is run on
# its own (previously only make_df created it).
os.makedirs(new_wav_root, exist_ok=True)

for _, row in tqdm(ori_df.iterrows(), total=len(ori_df)):
    # 'path' is the clip file name; drop its 4-character extension to get the id.
    ind = row['path'][:-4]

    wav, rate = torchaudio.load(os.path.join(asr_audio_root, ind + '.mp3'))
    resampled_wav = torchaudio.functional.resample(wav, orig_freq=rate, new_freq=16000)
    torchaudio.save(os.path.join(new_wav_root, ind + '.wav'), resampled_wav, 16000)

    clip_ids.append(ind)
    audio.append('mt-en/data/asr/wav/{}.wav'.format(ind))
    # Length along the last (time) axis of the resampled clip.
    n_frames.append(int(resampled_wav.size(1)))
    src_text.append(row['sentence'])
    # The last 10 characters of client_id serve as a short speaker id.
    speaker.append(row['client_id'][-10:])
    src_lang.append('mt')

new_df['id'] = clip_ids
new_df['audio'] = audio
# Pin the dtype so the column is integer even when no clip was processed.
new_df['n_frames'] = pd.Series(n_frames, dtype=int)
new_df['src_text'] = src_text
new_df['speaker'] = speaker
new_df['src_lang'] = src_lang

100%|██████████| 6383/6383 [00:59<00:00, 107.44it/s]


In [16]:
# Total Common Voice duration in hours: n_frames counts samples at 16 kHz.
new_df['n_frames'].sum() / 16000 / 3600

8.276866666666667

In [17]:
# Independent copy of the Common Voice manifest under a section-specific name.
asr_cv_df = new_df.copy()

In [18]:
# Random 90/10 train/dev split of the Common Voice rows.
split_ratio = [0.9, 0.1]
split_seed = 0
n_sample = len(asr_cv_df)
indices = list(range(n_sample))
# Seeded generator: the same rows land in train and dev on every re-run
# (an unseeded shuffle produced a different split each time).
random.Random(split_seed).shuffle(indices)
n_train = int(n_sample * split_ratio[0])
asr_cv_train_df = asr_cv_df.iloc[indices[:n_train]].sort_index().copy()
asr_cv_dev_df = asr_cv_df.iloc[indices[n_train:]].sort_index().copy()

In [19]:
# Write the Common Voice train and dev manifests as TSV files under mustc_root.
save_df_to_tsv(asr_cv_train_df, os.path.join(mustc_root, 'train_asr_mt_cv.tsv'))
save_df_to_tsv(asr_cv_dev_df, os.path.join(mustc_root, 'dev_asr_mt_cv.tsv'))

# ST from IWSLT

In [20]:
# Point root back at the IWSLT MT_EN directory (the Common Voice section rebound it).
root = '{}/MT_EN'.format(path_to_iwslt_mt_en)

In [21]:
# Inputs: recordings plus Maltese and English transcriptions of the parallel subset.
st_audio_root = os.path.join(root, 'mt_en_parallel', 'Audio')
mt_root = os.path.join(root, 'mt_en_parallel', 'Transcription', 'Malti')
en_root = os.path.join(root, 'mt_en_parallel', 'Transcription', 'English')
# Outputs: new_wav_root now points at the ST clip directory, while
# new_asr_wav_root is the ASR clip directory that asr_make_df writes to.
new_wav_root = '{}/mt-en/data/st/wav'.format(path_to_mustc)
new_asr_wav_root = '{}/mt-en/data/asr/wav'.format(path_to_mustc)
mustc_root = path_to_mustc

In [22]:
def asr_make_df(mt_en_df, wav_id, speaker_id, wav, rate):
    """Cut the Maltese segments of one speaker out of a channel and build ASR rows.

    Args:
        mt_en_df: merged transcription rows with 'Start', 'End', 'Speaker' and
            'Text_y' columns ('Text_y' supplies the source text).
        wav_id: recording identifier, used as the prefix of every clip id.
        speaker_id: speaker index embedded in the clip id and directory name.
        wav: sample sequence of a single channel (callers pass one row of the
            resampled waveform).
        rate: sample rate of ``wav``, used to convert seconds to sample indices.

    Returns:
        DataFrame with columns id, audio, n_frames, src_text, speaker and
        src_lang. Each clip is written twice: to ``new_asr_wav_root`` and to a
        per-speaker directory under ``asr/align_mfat``.

    A segment is kept when it lasts between 0.5 and 30 seconds (exclusive),
    its text is non-empty after tag removal, and the text contains no '/'.
    """
    df = pd.DataFrame(columns=['id', 'audio', 'n_frames', 'src_text', 'speaker', 'src_lang'])

    clip_ids = []
    audio = []
    n_frames = []
    src_text = []
    speaker = []
    src_lang = []

    # Both output directories are created up front; previously only the
    # alignment directory was, so a fresh run failed on the first save.
    align_dir = os.path.join('{}/mt-en/data/asr/align_mfat/'.format(path_to_mustc), wav_id + '_spk_{}'.format(speaker_id))
    os.makedirs(align_dir, exist_ok=True)
    os.makedirs(new_asr_wav_root, exist_ok=True)
    total_samples = len(wav)

    for idx, q in tqdm(mt_en_df.iterrows(), total=len(mt_en_df)):
        duration = q['End'] - q['Start']
        # NOTE(review): greedy pattern -- removes everything between the first
        # '<' and the last '>'. Confirm this is intended.
        s_text = re.sub('<.*>', ' ', q['Text_y']).strip()
        if not (0.5 < duration < 30) or '/' in s_text or s_text == "":
            continue

        subwav_id = wav_id + '_spk_{}_{}'.format(speaker_id, idx)
        # Clamp to the available audio so n_frames equals the samples written.
        wav_s = max(int(q['Start'] * rate), 0)
        wav_e = min(int(q['End'] * rate), total_samples)
        if wav_e <= wav_s:
            # Segment lies outside the recording; nothing to save.
            continue

        clip = wav[wav_s : wav_e].unsqueeze(0)
        torchaudio.save(os.path.join(new_asr_wav_root, subwav_id + '.wav'), clip, rate)
        torchaudio.save(os.path.join(align_dir, subwav_id + '.wav'), clip, rate)

        clip_ids.append(subwav_id)
        audio.append('mt-en/data/asr/wav/{}.wav'.format(subwav_id))
        n_frames.append(wav_e - wav_s)
        src_text.append(s_text)
        speaker.append(q['Speaker'])
        src_lang.append('mt')

    df['id'] = clip_ids
    df['audio'] = audio
    # Explicit dtype keeps the column integer even when no segment was kept.
    df['n_frames'] = pd.Series(n_frames, dtype=int)
    df['src_text'] = src_text
    df['speaker'] = speaker
    df['src_lang'] = src_lang
    return df

In [75]:
def st_make_df(mt_en_df, wav_id, speaker_id, wav, rate, save_audio=False):
    """Build speech-translation rows (Maltese source, English target) for one speaker.

    Args:
        mt_en_df: merged transcription rows with 'Start', 'End', 'Speaker',
            'Text_y' (source text) and 'Text_x' (target text) columns.
        wav_id: recording identifier, used as the prefix of every clip id.
        speaker_id: speaker index embedded in the clip id.
        wav: sample sequence of a single channel (callers pass one row of the
            resampled waveform).
        rate: sample rate of ``wav``, used to convert seconds to sample indices.
        save_audio: when True, each kept clip is also written to
            ``new_wav_root``. Defaults to False, which matches the previous
            behaviour of only describing the clips.

    Returns:
        DataFrame with columns id, audio, n_frames, src_text, tgt_text,
        speaker, src_lang and tgt_lang.

    A segment is kept when it lasts between 0.5 and 30 seconds (exclusive),
    both texts are non-empty after tag removal, and neither contains '/'.
    """
    df = pd.DataFrame(columns=['id', 'audio', 'n_frames', 'src_text', 'tgt_text', 'speaker', 'src_lang', 'tgt_lang'])

    clip_ids = []
    audio = []
    n_frames = []
    src_text = []
    tgt_text = []
    speaker = []
    src_lang = []
    tgt_lang = []

    if save_audio:
        os.makedirs(new_wav_root, exist_ok=True)
    total_samples = len(wav)

    for idx, q in tqdm(mt_en_df.iterrows(), total=len(mt_en_df)):
        duration = q['End'] - q['Start']
        # NOTE(review): greedy pattern -- removes everything between the first
        # '<' and the last '>'. Confirm this is intended.
        s_text = re.sub('<.*>', ' ', q['Text_y']).strip()
        t_text = re.sub('<.*>', ' ', q['Text_x']).strip()
        if not (0.5 < duration < 30) or '/' in s_text + t_text or s_text == "" or t_text == "":
            continue

        subwav_id = wav_id + '_spk_{}_{}'.format(speaker_id, idx)
        # Clamp to the available audio so n_frames never exceeds what the
        # clip can actually contain.
        wav_s = max(int(q['Start'] * rate), 0)
        wav_e = min(int(q['End'] * rate), total_samples)
        if wav_e <= wav_s:
            # Segment lies outside the recording; skip it.
            continue

        if save_audio:
            torchaudio.save(os.path.join(new_wav_root, subwav_id + '.wav'), wav[wav_s : wav_e].unsqueeze(0), rate)

        clip_ids.append(subwav_id)
        audio.append('mt-en/data/st/wav/{}.wav'.format(subwav_id))
        n_frames.append(wav_e - wav_s)
        src_text.append(s_text)
        tgt_text.append(t_text)
        speaker.append(q['Speaker'])
        src_lang.append('mt')
        tgt_lang.append('en')

        print(subwav_id, q['Start'], q['End'], s_text)

    df['id'] = clip_ids
    df['audio'] = audio
    # Explicit dtype keeps the column integer even when no segment was kept.
    df['n_frames'] = pd.Series(n_frames, dtype=int)
    df['src_text'] = src_text
    df['tgt_text'] = tgt_text
    df['speaker'] = speaker
    df['src_lang'] = src_lang
    df['tgt_lang'] = tgt_lang
    return df

In [51]:
# ST DF
# Build the speech-translation manifest from every recording in st_audio_root
# that has both an English and a Maltese transcription.
all_df = None
# All files are processed (a leftover [:1] slice limited the loop to the first
# one); sorting makes the row order independent of the filesystem.
for fn in sorted(os.listdir(st_audio_root)):
    wav_id = fn[:-4]
    try:
        en_df = load_df_from_tsv(os.path.join(en_root, wav_id + '.tsv'))
        mt_df = load_df_from_tsv(os.path.join(mt_root, wav_id + '.tsv'), enc='utf-16')
    except Exception as err:
        # Best effort: a missing or unreadable transcription skips the
        # recording, but KeyboardInterrupt/SystemExit are no longer swallowed.
        print('Skip ' + fn + ' ({!r})'.format(err))
        continue

    # English is the left frame and Maltese the right one, so columns present
    # in both and not used as keys get the _x (English) / _y (Maltese) suffixes.
    merge_keys = ['Speaker', 'Start', 'End'] if 'Speaker' in en_df.columns else ['Start', 'End']
    mt_en_df = pd.merge(en_df, mt_df, on=merge_keys)

    wav, rate = torchaudio.load(os.path.join(st_audio_root, wav_id + '.wav'))
    resampled_wav = torchaudio.functional.resample(wav, orig_freq=rate, new_freq=16000)

    annotators = mt_en_df['Speaker'].unique().tolist()
    # Put the label containing '2' second so that index i maps to channel i.
    if annotators and '2' in annotators[0]:
        annotators = annotators[::-1]
    print(fn, annotators)

    # Decide which (rows, speaker index, channel) combinations to process.
    n_channels = wav.size(0)
    if n_channels == 2 and len(annotators) == 2:
        jobs = [(mt_en_df[mt_en_df['Speaker'] == annotators[i]], i, resampled_wav[i]) for i in range(2)]
    elif n_channels == 1 and len(annotators) == 1:
        jobs = [(mt_en_df, 0, resampled_wav[0])]
    elif n_channels == 2 and len(annotators) == 1:
        # Two channels but one speaker label: the channels are summed.
        jobs = [(mt_en_df, 0, resampled_wav[0] + resampled_wav[1])]
    else:
        # Any other channel/speaker combination is skipped, as before.
        jobs = []

    for rows, speaker_idx, channel in jobs:
        cur_df = st_make_df(rows, wav_id, speaker_idx, channel, 16000)
        all_df = cur_df if all_df is None else pd.concat([all_df, cur_df], ignore_index=True)

    # all_df is still None if every recording so far was skipped.
    if all_df is not None:
        print(all_df['n_frames'].dtype)

Dia_Spont02_13M_14M.wav ['Annotator SP 1 ', 'Annotator SP 2 ']


100%|██████████| 170/170 [00:00<00:00, 4006.29it/s]


Dia_Spont02_13M_14M_spk_0_0 0.006881827 2.245248 Il-ħobż.
Dia_Spont02_13M_14M_spk_0_14 17.73556 19.25959 M'għandniex inħalluh haw'.
Dia_Spont02_13M_14M_spk_0_16 21.64732 22.34847 Eħe+
Dia_Spont02_13M_14M_spk_0_18 23.21309 24.53217 U jien ma ko...
Dia_Spont02_13M_14M_spk_0_20 24.53217 25.60337 mhux kulħadd kien jaf.
Dia_Spont02_13M_14M_spk_0_21 25.60337 26.69405 Jiena fil-bidu ma kontx naf
Dia_Spont02_13M_14M_spk_0_22 26.69405 28.3586 għax li kien qed jiġri
Dia_Spont02_13M_14M_spk_0_23 28.3586 30.20887 jiena tfajtu lura fil-ħamrija
Dia_Spont02_13M_14M_spk_0_25 30.69995 32.10041 u kuljum nerġa' mmur
Dia_Spont02_13M_14M_spk_0_26 32.10041 33.6001 l-iskola dejjem narah
Dia_Spont02_13M_14M_spk_0_27 33.6001 34.68986 resaq
Dia_Spont02_13M_14M_spk_0_28 34.68986 35.34974 pulzier
Dia_Spont02_13M_14M_spk_0_30 35.34974 37.53992 iktar lejna.
Dia_Spont02_13M_14M_spk_0_31 37.53992 38.82699 Kont ħa niġġennen
Dia_Spont02_13M_14M_spk_0_32 38.82699 41.14492 da... qis... qisu qed jiġri warajja wara li faqa

100%|██████████| 287/287 [00:00<00:00, 3620.14it/s]

Dia_Spont02_13M_14M_spk_1_1 0.007271497 1.02 Irrekordjata.
Dia_Spont02_13M_14M_spk_1_3 3.337748 5.090631 Mela [i]l-kantun sewwa
Dia_Spont02_13M_14M_spk_1_6 6.25335 8.23995 wara li faqa' subgħajh
Dia_Spont02_13M_14M_spk_1_7 8.23995 10.00644 għedna orrajt   x'ħa nagħmlu bih dal-kantun?
Dia_Spont02_13M_14M_spk_1_8 10.00644 11.98655 u lo... erġajna tfajnieh lura f'postu+.
Dia_Spont02_13M_14M_spk_1_9 11.98655 13.22717 Em imma mbagħad
Dia_Spont02_13M_14M_spk_1_10 13.22717 15.18162 ta' studenti bravi li konna għedna dan
Dia_Spont02_13M_14M_spk_1_13 16.57744 18.99346 Dan mhux... m'għandniex inħalluh fejn qiegħed.
Dia_Spont02_13M_14M_spk_1_15 18.99346 22.42 Għedna orrajt ejja nitfgħuh ik... ik... iktar viċin tagħna speċi
Dia_Spont02_13M_14M_spk_1_19 23.5 24.59778 taħt siġra.
Dia_Spont02_13M_14M_spk_1_24 30.06418 30.80429 Eżatt
Dia_Spont02_13M_14M_spk_1_41 49.17134 51.32488 Le mhux jien mhux jien kont qed inressqu.
Dia_Spont02_13M_14M_spk_1_48 57.52443 58.14281 Eżatt
Dia_Spont02_13M_14M_spk_1_54




In [None]:
# Keep the ST manifest under its own name; all_df is rebound by the "ASR DF" cell below.
st_iwslt_df = all_df.copy()

In [96]:
# ASR DF
# Build an ASR manifest (Maltese text only) from the same parallel recordings,
# writing the clips through asr_make_df.
all_df = None
# Sorting makes the row order independent of the filesystem.
for fn in sorted(os.listdir(st_audio_root)):
    wav_id = fn[:-4]
    try:
        en_df = load_df_from_tsv(os.path.join(en_root, wav_id + '.tsv'))
        mt_df = load_df_from_tsv(os.path.join(mt_root, wav_id + '.tsv'), enc='utf-16')
    except Exception as err:
        # Best effort: a missing or unreadable transcription skips the
        # recording, but KeyboardInterrupt/SystemExit are no longer swallowed.
        print('Skip ' + fn + ' ({!r})'.format(err))
        continue

    # English is the left frame and Maltese the right one, so columns present
    # in both and not used as keys get the _x (English) / _y (Maltese) suffixes.
    merge_keys = ['Speaker', 'Start', 'End'] if 'Speaker' in en_df.columns else ['Start', 'End']
    mt_en_df = pd.merge(en_df, mt_df, on=merge_keys)

    wav, rate = torchaudio.load(os.path.join(st_audio_root, wav_id + '.wav'))
    resampled_wav = torchaudio.functional.resample(wav, orig_freq=rate, new_freq=16000)

    annotators = mt_en_df['Speaker'].unique().tolist()
    # Put the label containing '2' second so that index i maps to channel i.
    if annotators and '2' in annotators[0]:
        annotators = annotators[::-1]
    print(fn, annotators)

    # Decide which (rows, speaker index, channel) combinations to process.
    n_channels = wav.size(0)
    if n_channels == 2 and len(annotators) == 2:
        jobs = [(mt_en_df[mt_en_df['Speaker'] == annotators[i]], i, resampled_wav[i]) for i in range(2)]
    elif n_channels == 1 and len(annotators) == 1:
        jobs = [(mt_en_df, 0, resampled_wav[0])]
    elif n_channels == 2 and len(annotators) == 1:
        # Two channels but one speaker label: the channels are summed.
        jobs = [(mt_en_df, 0, resampled_wav[0] + resampled_wav[1])]
    else:
        # Any other channel/speaker combination is skipped, as before.
        jobs = []

    for rows, speaker_idx, channel in jobs:
        cur_df = asr_make_df(rows, wav_id, speaker_idx, channel, 16000)
        all_df = cur_df if all_df is None else pd.concat([all_df, cur_df], ignore_index=True)

    # all_df is still None if every recording so far was skipped.
    if all_df is not None:
        print(all_df['n_frames'].dtype)

Dia_Spont02_13M_14M.wav ['Annotator SP 1 ', 'Annotator SP 2 ']


100%|██████████| 170/170 [00:00<00:00, 1835.54it/s]
100%|██████████| 287/287 [00:00<00:00, 1946.97it/s]


int64
Dia_Task_MMap03_13F_10M.wav ['Spk 1 Annotator ', 'Spk 2 Annotator ']


100%|██████████| 76/76 [00:00<00:00, 1189.12it/s]
100%|██████████| 41/41 [00:00<00:00, 329.33it/s]

int64





Dia_Task_Pics01_14F_03M.wav ['Spk 1 Annotator ', 'Spk 2 Annotator ']


100%|██████████| 500/500 [00:00<00:00, 3925.80it/s]
100%|██████████| 450/450 [00:00<00:00, 28187.53it/s]

int64





Dia_Task_Pics02_13F_10M.wav ['Spk 1 Annotator ', 'Spk 2 Annotator ']


100%|██████████| 85/85 [00:00<00:00, 1190.77it/s]
100%|██████████| 81/81 [00:00<00:00, 1544.49it/s]

int64





Dia_Task_Pics02_14F_03M.wav ['Spk 1 Annotator ', 'Spk 2 Annotator ', '2 ']
int64
Dia_Task_Pics03_01F_06M.wav ['Spk 1 Annotator ', 'Spk 2 Annotator ']


100%|██████████| 84/84 [00:00<00:00, 759.25it/s]
100%|██████████| 97/97 [00:00<00:00, 20788.28it/s]

int64





Mono_Discuss_Topic02_03M.wav ['Annotator ']


100%|██████████| 153/153 [00:00<00:00, 1226.29it/s]

int64





Mono_Discuss_Topic03_07M.wav ['Annotator ']


100%|██████████| 81/81 [00:00<00:00, 1237.77it/s]


int64
Mono_Discuss_Topic04_03M.wav ['Annotator ']


100%|██████████| 65/65 [00:00<00:00, 763.72it/s]


int64
Mono_Discuss_Topic05_02F.wav ['SP1 ']


100%|██████████| 42/42 [00:00<00:00, 2181.61it/s]


int64
Mono_Discuss_Topic06_01M.wav ['Annotator ']


100%|██████████| 115/115 [00:00<00:00, 949.87it/s]

int64
Mono_Recipe_05M.wav ['1', 'SP1 ']
int64





Mono_Recipe_17F.wav ['SP1 ']


100%|██████████| 22/22 [00:00<00:00, 455.02it/s]


int64
Mono_Recipe_22F.wav ['1', 'Annotator ']
int64
Mono_Retell_Vid02_21F.wav ['Annotator ']


100%|██████████| 48/48 [00:00<00:00, 594.52it/s]

int64





Mono_Spont01_21F.wav ['Annotator ']


100%|██████████| 11/11 [00:00<00:00, 475.43it/s]

int64
Skip Mono_Spont01_22F.wav





Mono_Task_MMap02_13M.wav ['Annotator ']


100%|██████████| 174/174 [00:00<00:00, 1039.22it/s]

int64





In [69]:
# Rebinds asr_iwslt_df to the ASR rows built from the parallel recordings,
# replacing the frame of the same name from the "ASR from IWSLT" section.
asr_iwslt_df = all_df.copy()

In [70]:
# Random 90/10 train/dev split of the ASR rows built from the parallel recordings.
split_ratio = [0.9, 0.1]
split_seed = 0
n_sample = len(asr_iwslt_df)
indices = list(range(n_sample))
# Seeded generator: the same rows land in train and dev on every re-run
# (an unseeded shuffle produced a different split each time).
random.Random(split_seed).shuffle(indices)
n_train = int(n_sample * split_ratio[0])
asr_iwslt_train_df = asr_iwslt_df.iloc[indices[:n_train]].sort_index().copy()
asr_iwslt_dev_df = asr_iwslt_df.iloc[indices[n_train:]].sort_index().copy()

In [71]:
# Write the train and dev manifests as TSV files under mustc_root.
# NOTE(review): these are the same two file names that the "ASR from IWSLT"
# section writes (train_asr_mt_iwslt.tsv / dev_asr_mt_iwslt.tsv), so running the
# notebook top to bottom overwrites those earlier files -- confirm this is intended.
save_df_to_tsv(asr_iwslt_train_df, os.path.join(mustc_root, 'train_asr_mt_iwslt.tsv'))
save_df_to_tsv(asr_iwslt_dev_df, os.path.join(mustc_root, 'dev_asr_mt_iwslt.tsv'))

In [28]:
st_iwslt_df

Unnamed: 0,id,audio,n_frames,src_text,tgt_text,speaker,src_lang,tgt_lang
0,Dia_Spont02_13M_14M_spk_0_0,mt-en/data/st/wav/Dia_Spont02_13M_14M_spk_0_0.wav,35813,Il-ħobż.,The bread.,Annotator SP 1,mt,en
1,Dia_Spont02_13M_14M_spk_0_14,mt-en/data/st/wav/Dia_Spont02_13M_14M_spk_0_14...,24385,M'għandniex inħalluh haw'.,We should not leave it here.,Annotator SP 1,mt,en
2,Dia_Spont02_13M_14M_spk_0_16,mt-en/data/st/wav/Dia_Spont02_13M_14M_spk_0_16...,11218,Eħe+,Ok,Annotator SP 1,mt,en
3,Dia_Spont02_13M_14M_spk_0_18,mt-en/data/st/wav/Dia_Spont02_13M_14M_spk_0_18...,21105,U jien ma ko...,And I didn't kno...,Annotator SP 1,mt,en
4,Dia_Spont02_13M_14M_spk_0_20,mt-en/data/st/wav/Dia_Spont02_13M_14M_spk_0_20...,17139,mhux kulħadd kien jaf.,not everyone knew.,Annotator SP 1,mt,en
...,...,...,...,...,...,...,...,...
1106,Mono_Task_MMap02_13M_spk_0_161,mt-en/data/st/wav/Mono_Task_MMap02_13M_spk_0_1...,71012,mela... mela dik &għadna kif iffaċilitajna ix-...,so... so that's how we facilitated the work of...,Annotator,mt,en
1107,Mono_Task_MMap02_13M_spk_0_163,mt-en/data/st/wav/Mono_Task_MMap02_13M_spk_0_1...,124963,Kif ukoll imbagħad però realistikament jekk......,As well as then however realistically if... le...,Annotator,mt,en
1108,Mono_Task_MMap02_13M_spk_0_166,mt-en/data/st/wav/Mono_Task_MMap02_13M_spk_0_1...,78795,jekk dawn qegħdin jikkomunikaw kollha flimkien...,if they are all communicating together they wi...,Annotator,mt,en
1109,Mono_Task_MMap02_13M_spk_0_168,mt-en/data/st/wav/Mono_Task_MMap02_13M_spk_0_1...,115712,Imbagħad jekk ner... n... naqbdu dan l-argumen...,Then if we... n... We pick the argument of the...,Annotator,mt,en


In [150]:
# Fractions for the train / dev / test split of the ST data.
split_ratio = [0.8, 0.1, 0.1]

In [158]:
# Shuffled row positions of st_iwslt_df, used to draw the train/dev/test split.
n_sample = len(st_iwslt_df)
indices = list(range(n_sample))
# Seeded generator so the split is identical on every re-run; an unseeded
# shuffle would move rows between train, dev and test each time.
random.Random(0).shuffle(indices)

In [162]:
# Slice the shuffled positions into train / dev / test according to split_ratio.
# The rows must come from st_iwslt_df: `indices` was built from its length, and
# all_df has been rebound to the ASR frame (no tgt_text column) by the "ASR DF" cell.
n_train = int(n_sample * split_ratio[0])
n_train_dev = int(n_sample * (split_ratio[0] + split_ratio[1]))
st_train_df = st_iwslt_df.iloc[indices[:n_train]].sort_index().copy()
st_dev_df = st_iwslt_df.iloc[indices[n_train:n_train_dev]].sort_index().copy()
st_test_df = st_iwslt_df.iloc[indices[n_train_dev:]].sort_index().copy()

In [170]:
# Write the ST train, dev and test manifests as TSV files under mustc_root.
save_df_to_tsv(st_train_df, os.path.join(mustc_root, 'train_st_mt_en.tsv'))
save_df_to_tsv(st_dev_df, os.path.join(mustc_root, 'dev_st_mt_en.tsv'))
save_df_to_tsv(st_test_df, os.path.join(mustc_root, 'test_st_mt_en.tsv'))

In [17]:
st_train_df

NameError: name 'st_train_df' is not defined