In [4]:
# pip install praat-textgrids

In [1]:
import os
import random
from string import punctuation

from tqdm import tqdm

import sentencepiece
import torchaudio
import textgrids

import numpy as np
import torch as th
import pandas as pd

from g2p_en import G2p

from fairseq.data import PhonemeDictionary
from ConST.prepare_data.data_utils import load_df_from_tsv, save_df_to_tsv

In [39]:
# Dataset root directory and target-language code; both are interpolated into
# the file paths built by the cells below.
root = '/mnt/data/siqiouyang/datasets/must-c-v1.0'
lang = 'es'

In [40]:
# SentencePiece processor built from the model file stored under `root` for the chosen `lang`.
spm = sentencepiece.SentencePieceProcessor(os.path.join(root, 'spm_unigram10000_st_{}.model'.format(lang)))

In [41]:
# Name of the data split; used in the TSV file name and in the alignment output directory.
split = 'train'

In [42]:
# Load the manifest TSV for the selected split and language into a DataFrame.
df = load_df_from_tsv(os.path.join(root, '{}_st_{}.tsv'.format(split, lang)))
# df = load_df_from_tsv(os.path.join(root, 'train-1h_asr_10h.tsv'))

In [43]:
# Total audio length in hours, assuming `n_frames` counts samples at 16 kHz — TODO confirm.
df['n_frames'].sum() / 16000 / 3600

479.9419181770833

In [18]:
# Sum the frame counts of every file in the en-de dev wav directory and
# express the total in hours (the divisor 16000 assumes 16 kHz audio).
dev_wav_dir = '/mnt/data/siqiouyang/datasets/must-c-v1.0/en-de/data/dev/wav'
n_frames = sum(
    torchaudio.info(dev_wav_dir + '/' + wav_name).num_frames
    for wav_name in os.listdir(dev_wav_dir)
)
n_frames / 16000 / 3600

2.8958341319444445

In [30]:
import yaml

# Parse the en-de training segment list into `y`.
# NOTE(review): yaml.Loader can construct arbitrary Python objects; if this
# file is not fully trusted, yaml.safe_load would be the safer choice.
train_yaml_path = '/mnt/data/siqiouyang/datasets/must-c-v1.0/en-de/data/train/txt/train.yaml'
with open(train_yaml_path) as yaml_file:
    y = yaml.load(yaml_file, Loader=yaml.Loader)

In [33]:
# Add up the 'duration' field of every entry in `y` and show the total divided
# by 3600 (hours, if the field is in seconds — TODO confirm).
duration = sum(entry['duration'] for entry in y)
duration / 3600

400.0411619297576

In [16]:
# indices = list(range(train_df.shape[0]))
# random.shuffle(indices)
# save_df_to_tsv(train_df.iloc[indices[:10000]], os.path.join(root, 'train-tiny_asr.tsv'))

In [17]:
# Output directory for the per-utterance wav/txt/pt files written by the cells below.
save_dir = os.path.join(root, 'en-{}'.format(lang), 'data', split, 'align_mfat_10h')

In [18]:
# Create the output directory (and parents) if it does not exist yet.
os.makedirs(save_dir, exist_ok=True)

In [19]:
# Cut every utterance listed in `df` out of its source recording and write it
# to `save_dir` as '<id>.wav'.
#
# Each 'audio' entry has the form '<path>:<offset>:<num_frames>'. The decoded
# waveform is cached, so consecutive rows that point at the same file are
# served from memory instead of being re-read from disk.
last_audio_path = None
for idx in tqdm(range(len(df))):
    # Split from the right so a ':' inside the path itself cannot break
    # parsing, and join with `root` exactly once (the previous double join
    # only worked because `root` is absolute).
    rel_path, offset, num_frames = df['audio'][idx].rsplit(':', 2)
    offset, num_frames = int(offset), int(num_frames)
    audio_path = os.path.join(root, rel_path)
    if audio_path != last_audio_path:
        waveform, frame_rate = torchaudio.load(audio_path)
        last_audio_path = audio_path
    torchaudio.save(
        os.path.join(save_dir, '{}.wav'.format(df['id'][idx])),
        waveform[:, offset : offset + num_frames],
        sample_rate=frame_rate,
    )

100%|██████████| 1418/1418 [00:02<00:00, 627.47it/s]


In [20]:
# Source-side transcripts as a plain Python list.
# NOTE(review): no other cell visible in this notebook reads `sentences`; the
# tokenisation cell iterates df['src_text'] directly.
sentences = df['src_text'].tolist()

In [21]:
def covered(s, punctuation):
    """Return True when every character of `s` occurs in `punctuation`.

    An empty `s` yields True, since there is no character that could fail
    the membership check.
    """
    return all(c in punctuation for c in s)

# Group the SentencePiece pieces of every source sentence into word-level
# segments. For each sentence this produces
#   * segmentss[k]           -- list of (first_piece, last_piece) index pairs
#   * tokenized_sentences[k] -- the word text of each segment, with the
#                               SentencePiece space marker removed
space = '▁'
tokenized_sentences = []
segmentss = []
punctuation += '—’'
for sent in tqdm(df['src_text'].tolist()):
    pieces = spm.EncodeAsPieces(sent)
    n_pieces = len(pieces)
    segments = []
    word_start = -1  # stays -1 until the first boundary piece has been seen
    for pos, piece in enumerate(pieces):
        is_punct = covered(piece, punctuation)
        # Only a piece that starts with the space marker or consists purely
        # of punctuation closes the running word.
        if not (piece.startswith(space) or is_punct):
            continue
        if word_start != -1 and word_start < pos:
            segments.append((word_start, pos - 1))
        # A bare space marker, pure punctuation, or marker + punctuation
        # carries no word text, so the next word starts one piece later.
        carries_no_text = (
            piece == space
            or is_punct
            or (piece.startswith(space) and len(piece) > 1 and covered(piece[1:], punctuation))
        )
        word_start = pos + 1 if carries_no_text else pos

    if word_start < n_pieces:
        segments.append((word_start, n_pieces - 1))

    tokenized_sentence = []
    for first, final in segments:
        word = ''.join(pieces[first : final + 1]).replace(space, '')
        without_commas = word.replace(',', '')
        # Numbers written with thousands separators are stored without the commas.
        tokenized_sentence.append(without_commas if without_commas.isnumeric() else word)

    tokenized_sentences.append(tokenized_sentence)
    segmentss.append(segments)

100%|██████████| 1418/1418 [00:00<00:00, 17489.68it/s]


In [22]:
# Write one space-separated transcript per utterance to '<save_dir>/<id>.txt'.
for i, utt_id in enumerate(tqdm(df['id'])):
    transcript_path = os.path.join(save_dir, '{}.txt'.format(utt_id))
    # Encode explicitly as UTF-8: the text may contain non-ASCII characters
    # (e.g. '—', '’'), and the platform default encoding is locale-dependent.
    with open(transcript_path, 'w', encoding='utf-8') as w:
        w.write(' '.join(tokenized_sentences[i]))

100%|██████████| 1418/1418 [00:00<00:00, 29121.98it/s]


```bash
mfa align . english_mfa english_mfa textgrids --clean
mfa train -o model/acoustic_model --phone_set IPA --output_format long_textgrid --include_original_text -t /mnt/data/siqiouyang/cache/MFA/mfat_enes ./ english_mfa ./textgrids --clean
```

In [23]:
# For every utterance that has a TextGrid, save the word-level alignment as
# '<id>.pt' containing [piece segments, word intervals], with the intervals
# divided by the utterance duration. Utterances whose TextGrid word count
# differs from the tokenised sentence are counted in `n_outlier` and skipped.
filtered_grids = []
n_outlier = 0
for i, utt_id in enumerate(tqdm(df['id'])):
    grid_path = os.path.join(save_dir, 'textgrids/{}.TextGrid'.format(utt_id))
    if not os.path.exists(grid_path):
        continue

    grid = textgrids.TextGrid(grid_path)
    # Drop empty-text entries of the 'words' tier.
    filtered_grid = [tok for tok in grid['words'] if tok.text != '']
    if len(filtered_grid) != len(tokenized_sentences[i]):
        n_outlier += 1
        continue

    audio_path = os.path.join(save_dir, '{}.wav'.format(utt_id))
    info = torchaudio.info(audio_path)
    duration = info.num_frames / info.sample_rate
    interval = np.array([(word.xmin, word.xmax) for word in filtered_grid]) / duration

    th.save([segmentss[i], interval], os.path.join(save_dir, '{}.pt'.format(utt_id)))

100%|██████████| 1418/1418 [00:00<00:00, 2190.55it/s]


In [27]:
# Number of utterances counted as outliers, as a fraction of the rows in `df`.
n_outlier / len(df)

0.0007768005814467718

In [28]:
# Absolute number of utterances counted as outliers.
n_outlier

202

In [None]:
# For every utterance that has a TextGrid, save the intervals of its 'phones'
# tier (divided by the utterance duration) as '<id>.phone.pt'.
# NOTE(review): `phones` is built but never used, and the word-level
# `segmentss[i]` is saved next to phone-level intervals — confirm whether
# `phones` was meant to be stored instead.
filtered_grids = []
n_outlier = 0
for i, utt_id in enumerate(tqdm(df['id'])):
    grid_path = os.path.join(save_dir, 'textgrids/{}.TextGrid'.format(utt_id))
    if not os.path.exists(grid_path):
        continue

    grid = textgrids.TextGrid(grid_path)
    phone_tier = grid['phones']

    # Empty labels are replaced by the '<empty>' placeholder.
    phones = ['<empty>' if phone.text == '' else phone.text for phone in phone_tier]

    audio_path = os.path.join(save_dir, '{}.wav'.format(utt_id))
    info = torchaudio.info(audio_path)
    duration = info.num_frames / info.sample_rate
    interval = np.array([(phone.xmin, phone.xmax) for phone in phone_tier]) / duration

    th.save([segmentss[i], interval], os.path.join(save_dir, '{}.phone.pt'.format(utt_id)))

In [24]:
# Read the phone inventory (one phone per line, blank lines ignored) and map
# each phone to its position in the list.
with open('/mnt/data/siqiouyang/datasets/must-c-v1.0/phone.txt', 'r') as r:
    stripped_lines = (line.strip() for line in r)
    all_phones = [phone for phone in stripped_lines if phone != '']
    phone_dict = dict(zip(all_phones, range(len(all_phones))))

In [5]:
# Grapheme-to-phoneme converter and the phoneme dictionary loaded from
# '<root>/phonemes.txt'; both are used by the phoneme-annotation cell below.
g2p = G2p()
src_dict = PhonemeDictionary.load(os.path.join(root, 'phonemes.txt'))

In [6]:
# Add a 'src_phoneme' column to every matching TSV manifest under `root`.
# Each manifest is rewritten IN PLACE.
for fn in os.listdir(root):
    # Require the .tsv suffix as well as the 'ls' marker: a bare substring
    # test also matches unrelated files or directories whose name happens to
    # contain "ls", which would then be parsed and overwritten.
    if not (fn.endswith('.tsv') and 'ls' in fn):
        continue

    df = load_df_from_tsv(os.path.join(root, fn))
    list_of_phonemes = []
    for src_text in tqdm(df['src_text'], desc=fn):
        raw_phonemes = g2p(src_text)
        phonemes = []
        # Keep only symbols known to `src_dict`. A kept symbol gets the '▁'
        # prefix when it is the first symbol or follows a symbol that is not
        # in the dictionary (presumably a word separator or punctuation —
        # confirm against the g2p output).
        prev_in_dict = False
        for raw in raw_phonemes:
            in_dict = raw in src_dict
            if in_dict:
                phonemes.append(raw if prev_in_dict else '▁' + raw)
            prev_in_dict = in_dict
        list_of_phonemes.append(' '.join(phonemes))
    df['src_phoneme'] = list_of_phonemes
    save_df_to_tsv(df, os.path.join(root, fn))

train_ls960_asr.tsv: 100%|██████████| 281241/281241 [10:31<00:00, 445.27it/s]
dev_ls960_asr.tsv: 100%|██████████| 5567/5567 [00:07<00:00, 721.19it/s]


In [4]:
# For Librispeech: replace `df` with the combined dev manifest so the
# tokenisation and alignment code below runs on it.

df = load_df_from_tsv('/mnt/data/siqiouyang/datasets/must-c-v1.0/dev_ls960_asr.tsv')

def covered(s, punctuation):
    """Tell whether `s` consists solely of characters found in `punctuation`.

    Returns True for an empty `s`.
    """
    return all(c in punctuation for c in s)

# Group the SentencePiece pieces of every source sentence into word-level
# segments: `segmentss` collects (first_piece, last_piece) index pairs and
# `tokenized_sentences` the matching word strings. In this variant the
# apostrophe is removed from the punctuation set, so it does not act as a
# word boundary.
space = '▁'
tokenized_sentences = []
segmentss = []
punctuation += '—’'
punctuation = punctuation.replace("'", '')
for sent in tqdm(df['src_text'].tolist()):
    pieces = spm.EncodeAsPieces(sent)
    n_pieces = len(pieces)
    segments = []
    word_start = -1  # stays -1 until the first boundary piece has been seen
    for pos, piece in enumerate(pieces):
        is_punct = covered(piece, punctuation)
        # Pieces that neither start with the space marker nor consist purely
        # of punctuation just extend the running word.
        if not (piece.startswith(space) or is_punct):
            continue
        if word_start != -1 and word_start < pos:
            segments.append((word_start, pos - 1))
        # A bare space marker, pure punctuation, or marker + punctuation
        # carries no word text, so the next word starts one piece later.
        carries_no_text = (
            piece == space
            or is_punct
            or (piece.startswith(space) and len(piece) > 1 and covered(piece[1:], punctuation))
        )
        word_start = pos + 1 if carries_no_text else pos

    if word_start < n_pieces:
        segments.append((word_start, n_pieces - 1))

    tokenized_sentence = []
    for first, final in segments:
        word = ''.join(pieces[first : final + 1]).replace(space, '')
        without_commas = word.replace(',', '')
        # Numbers written with thousands separators are stored without the commas.
        tokenized_sentence.append(without_commas if without_commas.isnumeric() else word)

    tokenized_sentences.append(tokenized_sentence)
    segmentss.append(segments)

100%|██████████| 5567/5567 [00:00<00:00, 18686.26it/s]


In [5]:
# Match each utterance's TextGrid words against its tokenised sentence and, on
# success, save [piece segments, normalised word intervals] as '<id>.pt' in the
# speaker's directory. Utterances that cannot be matched are counted in
# `n_outlier`; utterances without a TextGrid file are skipped silently.
filtered_grids = []
n_outlier = 0
iterator = tqdm(df['id'])
for i, id in enumerate(iterator):
    grid_path = os.path.join('/mnt/data/siqiouyang/datasets/librispeech/LibriSpeech/librispeech_mfa/textgrids/{}'.format(df['speaker'][i]), '{}.TextGrid'.format(id))
    if os.path.exists(grid_path):
        grid = textgrids.TextGrid(grid_path)
        # Keep only the entries of the 'words' tier that carry text.
        filtered_grid = [tok for tok in grid['words'] if tok.text != '']

        # Two-pointer walk: u indexes the TextGrid words, v the tokenised words.
        u = v = 0
        intervals = []
        fail = False
        while u < len(filtered_grid) and v < len(tokenized_sentences[i]):
            if filtered_grid[u].text == tokenized_sentences[i][v]:
                # Exact match: take the word's interval as-is.
                intervals.append((filtered_grid[u].xmin, filtered_grid[u].xmax))
                u += 1
                v += 1
            elif tokenized_sentences[i][v].startswith(filtered_grid[u].text):
                # The token may have been split into two consecutive TextGrid
                # words; if their concatenation equals the token, merge them
                # into one interval spanning both.
                if u < len(filtered_grid) - 1 and tokenized_sentences[i][v] == filtered_grid[u].text + filtered_grid[u + 1].text:
                    intervals.append((filtered_grid[u].xmin, filtered_grid[u + 1].xmax))
                    u += 2
                    v += 1
                else:
                    fail = True
                    break
            else:
                fail = True
                break
        
        # Leftover words on either side also count as a failed match.
        if u < len(filtered_grid) or v < len(tokenized_sentences[i]):
            fail = True

        # Progress-bar label: outlier fraction so far (updated before the
        # current utterance is counted).
        iterator.set_description('{:.2f}'.format(n_outlier / (i + 1)))

        if fail:
            # print(i, [w.text for w in filtered_grid], tokenized_sentences[i], sep='\n')
            # break
            n_outlier += 1
            continue
            

        interval = np.array(intervals)
        
        audio_path = os.path.join('/mnt/data/siqiouyang/datasets/librispeech', df['audio'][i])
        info = torchaudio.info(audio_path)
        duration = info.num_frames / info.sample_rate
        # Divide by the utterance duration (frames / sample rate); the result
        # is a fraction of the utterance if xmin/xmax are in seconds — TODO confirm.
        interval = interval / duration

        # One interval per word-level segment is required.
        assert len(segmentss[i]) == len(interval)
        th.save([segmentss[i], interval], os.path.join('/mnt/data/siqiouyang/datasets/librispeech/LibriSpeech/librispeech_mfa/{}'.format(df['speaker'][i]), '{}.pt'.format(id)))
# Overall outlier fraction relative to all rows in `df`.
print(n_outlier / len(df))

0.00: 100%|██████████| 5567/5567 [00:06<00:00, 895.10it/s] 

0.0014370396982216634





In [8]:
# Inspect whatever `interval` currently holds (last assigned inside the alignment loop).
interval

array([[0.03123891, 0.05679801],
       [0.05679801, 0.07880724],
       [0.10365637, 0.12992545],
       [0.12992545, 0.15264466],
       [0.15264466, 0.17607384],
       [0.18033369, 0.19027334],
       [0.19027334, 0.23358182],
       [0.3542776 , 0.38125666],
       [0.38125666, 0.40397586],
       [0.40397586, 0.43237487],
       [0.44373447, 0.46077387],
       [0.46645367, 0.49272275],
       [0.49272275, 0.50479233],
       [0.50479233, 0.51828186],
       [0.51828186, 0.53319134],
       [0.53319134, 0.58146965],
       [0.58146965, 0.60773873],
       [0.62193823, 0.63968761],
       [0.64820731, 0.66950657],
       [0.66950657, 0.68512602],
       [0.68512602, 0.68938587],
       [0.68938587, 0.70642528],
       [0.70642528, 0.73837416],
       [0.76038339, 0.79233227],
       [0.79233227, 0.80085197],
       [0.80085197, 0.8342208 ],
       [0.8342208 , 0.8455804 ],
       [0.8455804 , 0.87042953],
       [0.87042953, 0.90876819],
       [0.90876819, 0.91444799],
       [0.

# Combine Librispeech

In [65]:
# Directory holding the per-split TSV manifests, and the split names that are
# merged into the combined train / dev manifests below.
root = '/mnt/data/siqiouyang/datasets/must-c-v1.0'
train_splits = ["train-clean-100", "train-clean-360", "train-other-500"]
dev_splits = ["dev-clean", "dev-other"]

In [67]:
# Concatenate the per-split training manifests into 'train_ls960_asr.tsv'.
# Note: the loop rebinds the notebook-level names `split` and `df`.
dfs = []
for split in train_splits:
    split_tsv = os.path.join(root, split + '.tsv')
    df = load_df_from_tsv(split_tsv)
    dfs.append(df)

combined_df = pd.concat(dfs)
combined_tsv = os.path.join(root, 'train_ls960_asr.tsv')
save_df_to_tsv(combined_df, combined_tsv)

In [72]:
# Concatenate the per-split dev manifests into 'dev_ls960_asr.tsv'.
# Note: the loop rebinds the notebook-level names `split` and `df`.
dfs = []
for split in dev_splits:
    split_tsv = os.path.join(root, split + '.tsv')
    df = load_df_from_tsv(split_tsv)
    dfs.append(df)

combined_df = pd.concat(dfs)
combined_tsv = os.path.join(root, 'dev_ls960_asr.tsv')
save_df_to_tsv(combined_df, combined_tsv)

# Produce Low Resource TSV

In [5]:
# Reset `split` (the combine loops above reuse the name as their loop variable).
split = 'train'

In [6]:
# Reload the manifest for the selected split and language.
df = load_df_from_tsv(os.path.join(root, '{}_st_{}.tsv'.format(split, lang)))

In [7]:
# Split `df` into an ST subset of roughly `duration` hours and an ASR subset
# containing the remaining rows, then keep only the ASR rows that have an
# alignment ('.pt') file. With duration == 0 nothing is selected for ST and
# only the filtered ASR manifest is written.
SUBSET_SEED = 0  # fixed seed so the ST/ASR partition is reproducible across runs
for duration in [0]: # in hours
    # Frame budget; the factor 16000 assumes 16 kHz audio.
    limit = duration * 60 * 60 * 16000
    sel_mask = np.zeros((df.shape[0]), dtype=bool)
    if duration > 0:
        indices = list(range(df.shape[0]))
        # A private generator keeps the selection reproducible without
        # touching the global `random` state.
        random.Random(SUBSET_SEED).shuffle(indices)
        # Positional access, consistent with the positional mask below.
        frames_by_pos = df['n_frames'].to_numpy()
        total = 0
        for idx in indices:
            total += frames_by_pos[idx]
            sel_mask[idx] = True
            # The row that crosses the budget is still included.
            if total > limit:
                break
    st_df = df.iloc[sel_mask]
    asr_df = df.iloc[~sel_mask]

    # filter those without .pt files
    align_dir = os.path.join(root, 'en-{}'.format(lang), 'data/{}/align'.format(split))
    has_alignment = np.array(
        [os.path.exists(os.path.join(align_dir, '{}.pt'.format(utt_id))) for utt_id in asr_df['id']],
        dtype=bool,
    )
    asr_df = asr_df.iloc[has_alignment]

    if duration > 0:
        save_df_to_tsv(st_df, os.path.join(root, 'train-{}h_st.tsv'.format(duration)))
    save_df_to_tsv(asr_df, os.path.join(root, 'train-{}h_asr.tsv'.format(duration)))

In [8]:
# Size of the full manifest.
df.shape

(225271, 8)

In [9]:
# Size of the ASR subset after dropping rows without an alignment file.
asr_df.shape

(215748, 8)

In [3]:
# Load the 'train-1h_asr.tsv' manifest; the next cell subsamples it.
df = load_df_from_tsv(os.path.join(root, 'train-1h_asr.tsv'))

In [5]:
# Randomly pick rows of `df` until their total length just exceeds 100 hours
# (the factor 16000 assumes 16 kHz audio) and keep them as `asr_df`.
limit = 100 * 60 * 60 * 16000
sel_mask = np.zeros((df.shape[0]), dtype=bool)
indices = list(range(df.shape[0]))
# A seeded private generator makes the subset reproducible across runs and
# leaves the global `random` state untouched.
random.Random(0).shuffle(indices)
# Positional access, consistent with the positional mask.
frames_by_pos = df['n_frames'].to_numpy()
total = 0
for idx in indices:
    total += frames_by_pos[idx]
    sel_mask[idx] = True
    # The row that crosses the budget is still included.
    if total > limit:
        break
asr_df = df.iloc[sel_mask]

In [6]:
# Length of the selected subset in hours, assuming 16 kHz audio.
asr_df['n_frames'].sum() / 16000 / 3600

100.00037677083333

In [7]:
# Write the selected subset to 'train-1h_asr_100h.tsv' under `root`.
save_df_to_tsv(asr_df, os.path.join(root, 'train-1h_asr_100h.tsv'))