In [1]:
import os
import re
import random
from argparse import Namespace
from string import punctuation
from collections import defaultdict

import IPython

import torch as th
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader

from tqdm import tqdm
import torchaudio
import numpy as np

import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
import sentencepiece as spm

import textgrids
import faiss

from sentence_transformers import SentenceTransformer
from transformers import MBartForConditionalGeneration, MBart50TokenizerFast

from fairseq.data import Dictionary
from fairseq.models.mST.w2v2_phone_transformer import W2V2Transformer
from fairseq.data.audio.multilingual_triplet_v2_phone_dataset import (
    MultilingualTripletDataConfig,
    MultilingualTripletDataset,
    MultilingualTripletDatasetCreator
)
from fairseq.data.audio.speech_to_text_dataset import get_features_or_waveform
from examples.speech_to_text.data_utils import load_df_from_tsv, save_df_to_tsv
from fairseq.checkpoint_utils import load_checkpoint_to_cpu
from fairseq.data.encoders.sentencepiece_bpe import SentencepieceBPE, SentencepieceConfig

from fairseq.models.mST.w2v2_phone_transformer import W2V2Transformer
from fairseq.data.audio.multilingual_triplet_v2_phone_dataset import (
    MultilingualTripletDataConfig,
    MultilingualTripletDataset,
    MultilingualTripletDatasetCreator
)

# Build Code-Switch for CV 9.0 Language

In [2]:
root = '/mnt/data/siqiouyang/datasets/cv-corpus-9.0-2022-04-27/'

# Tokenization

In [3]:
tokenizer = spm.SentencePieceProcessor()
tokenizer.Load('/mnt/data/siqiouyang/runs/mST/pretrained/mbart50.ft.n1/sentence.bpe.model')

True

In [4]:
lang = 'de'
lang_root = os.path.join(root, lang)

In [5]:
lang_df = load_df_from_tsv(os.path.join(lang_root, 'validated.tsv'))

In [6]:
# Optionally resample the audio to 16kHz
# os.makedirs(os.path.join(dict_lang_root, '16kHz'))
# for path in tqdm(dict_lang_df['path']):
#     audio_path = os.path.join(dict_lang_root, 'clips/{}'.format(path))
#     waveform, sample_rate = torchaudio.load(audio_path)
#     resampled_waveform = torchaudio.functional.resample(waveform, sample_rate, 16000)
#     torchaudio.save(os.path.join(dict_lang_root, '16kHz/{}.wav'.format(path[:-4])), resampled_waveform, sample_rate=16000)

In [7]:
sentences = lang_df['sentence'].tolist()
sentences

['Zieht euch bitte draußen die Schuhe aus.',
 'Es gibt auch mehrere Campingplätze.',
 'Es kommt zum Showdown in Gstaad.',
 'Ihre Fotostrecken erschienen in Modemagazinen wie der Vogue, Harper’s Bazaar und Marie Claire.',
 'Aber weißt du, wer den Stein wirklich ins Rollen gebracht hat?',
 'Felipe hat eine auch für Monarchen ungewöhnlich lange Titelliste.',
 'Was solls, ich bin bereit.',
 'Sein Atelier diente nebenher zur Tarnung unerlaubter politischer Aktivitäten.',
 'Das Internet besteht aus vielen Computern, die miteinander verbunden sind.',
 'Jörg van Essen ist stets über die Landesliste Nordrhein-Westfalen in den Deutschen Bundestag eingezogen.',
 'Der Uranus ist der siebente Planet in unserem Sonnensystem.',
 'Sie gewannen das Match mit sechs Löchern Vorsprung.',
 'Die Wagen erhielten ein einheitliches Erscheinungsbild in weiß mit rotem Fensterband.',
 'Seinen Vornamen erhielt er in Gedenken an seinen früh verstorbenen Onkel.',
 'Mit den Senators nahm er an drei All-Star-Spielen t

In [8]:
tokenized_sentences = []
segmentss = []
space = tokenizer.IdToPiece(5)

# specifically for zh-CN
# pattern = ",|\?|・|\u3002|\uff1f|\uff01|\uff0c|\u3001|\uff1b|\uff1a|\u201c|\u201d|\u2018|\u2019|\uff08|\uff09|\u300a|\u300b|\u3008|\u3009|\u3010|\u3011|\u300e|\u300f|\u300c|\u300d|\ufe43|\ufe44|\u3014|\u3015|\u2026|\u2014|\uff5e|\ufe4f|\uffe5" + punctuation
# for sent in sentences:
#     tokens = tokenizer.EncodeAsPieces(sent)
#     segments = []
#     for idx, token in enumerate(tokens):
#         if token != space and token not in pattern:
#             segments.append((idx, idx))
#     tokenized_sentence = []
#     for seg in segments:
#         tokenized_sentence.append(''.join(tokens[seg[0] : seg[1] + 1]).replace(space, ''))

#     segmentss.append(segments)
#     tokenized_sentences.append(tokenized_sentence)

# for other languages
for sent in tqdm(sentences):
    tokens = tokenizer.EncodeAsPieces(sent)
    segments = []
    last = -1
    for idx, token in enumerate(tokens):
        if token.startswith(space) or token in punctuation:
            if last != -1 and last <= idx - 1:
                segments.append((last, idx - 1))
            last = idx + (token in punctuation)
    if last < len(tokens):
        segments.append((last, len(tokens) - 1))

    tokenized_sentence = []
    for seg in segments:
        tokenized_sentence.append(''.join(tokens[seg[0] : seg[1] + 1]).replace(space, ''))

    tokenized_sentences.append(tokenized_sentence)
    segmentss.append(segments)

100%|██████████| 765790/765790 [00:21<00:00, 35349.43it/s]


In [9]:
tokenized_sentences

[['Zieht', 'euch', 'bitte', 'draußen', 'die', 'Schuhe', 'aus'],
 ['Es', 'gibt', 'auch', 'mehrere', 'Campingplätze'],
 ['Es', 'kommt', 'zum', 'Showdown', 'in', 'Gstaad'],
 ['Ihre',
  'Fotostrecken',
  'erschienen',
  'in',
  'Modemagazinen',
  'wie',
  'der',
  'Vogue',
  'Harper’s',
  'Bazaar',
  'und',
  'Marie',
  'Claire'],
 ['Aber',
  'weißt',
  'du',
  'wer',
  'den',
  'Stein',
  'wirklich',
  'ins',
  'Rollen',
  'gebracht',
  'hat'],
 ['Felipe',
  'hat',
  'eine',
  'auch',
  'für',
  'Monarchen',
  'ungewöhnlich',
  'lange',
  'Titelliste'],
 ['Was', 'solls', 'ich', 'bin', 'bereit'],
 ['Sein',
  'Atelier',
  'diente',
  'nebenher',
  'zur',
  'Tarnung',
  'unerlaubter',
  'politischer',
  'Aktivitäten'],
 ['Das',
  'Internet',
  'besteht',
  'aus',
  'vielen',
  'Computern',
  'die',
  'miteinander',
  'verbunden',
  'sind'],
 ['Jörg',
  'van',
  'Essen',
  'ist',
  'stets',
  'über',
  'die',
  'Landesliste',
  'Nordrhein',
  'Westfalen',
  'in',
  'den',
  'Deutschen',
  'Bu

In [13]:
for idx, path in enumerate(tqdm(lang_df['path'])):
    id = path[:-4]
    with open(os.path.join(lang_root, '16kHz', '{}.txt'.format(id)), 'w') as w:
        w.write(' '.join(tokenized_sentences[idx]))

100%|██████████| 765790/765790 [01:08<00:00, 11104.13it/s]


In [15]:
filtered_grids = []
n_outlier = 0
for i, path in enumerate(tqdm(lang_df['path'])):
    id = path[:-4]
    # grid_path = os.path.join(dict_lang_root, '16kHz/align_wmseg/{}.TextGrid'.format(id))
    grid_path = os.path.join(lang_root, '16kHz/align_spm/{}.TextGrid'.format(id))
    if os.path.exists(grid_path):
        grid = textgrids.TextGrid(grid_path)
        filtered_grid = [tok for tok in grid['words'] if tok.text != '']

        if len(filtered_grid) != len(tokenized_sentences[i]):
            # print(filtered_grid, tokenized_sentences[i], sep='\n')
            continue

        interval = np.array([(word.xmin, word.xmax) for word in filtered_grid])
        audio_path = os.path.join(lang_root, '16kHz/{}.wav'.format(id))
        info = torchaudio.info(audio_path)
        duration = info.num_frames / info.sample_rate
        interval = interval / duration

        th.save([segmentss[i], interval], os.path.join(lang_root, '16kHz', '{}.pt'.format(id)))

100%|██████████| 765790/765790 [07:33<00:00, 1689.58it/s]


In [16]:
df = load_df_from_tsv(os.path.join(lang_root, 'train_asr_{}-cv.tsv'.format(lang)))

In [17]:
mask = np.ones(len(df), dtype=bool)

In [18]:
for i, id in enumerate(df['id']):
    if not os.path.exists(os.path.join(lang_root, '16kHz', '{}.pt'.format(id))):
        mask[i] = False

In [19]:
save_df_to_tsv(df[mask], os.path.join(lang_root, 'train_asr_{}-cv.tsv'.format(lang)))

# Remove data points in valid and test

In [7]:
root = '/mnt/data/siqiouyang/datasets/covost2/'

In [29]:
lang = 'zh-CN'
cv2_lang_root = os.path.join(root, lang)
cv9_lang_root = os.path.join(root, lang + '-cv')

In [30]:
cv2_lang_root, cv9_lang_root

('/mnt/data/siqiouyang/datasets/covost2/zh-CN',
 '/mnt/data/siqiouyang/datasets/covost2/zh-CN-cv')

In [31]:
dev_df = load_df_from_tsv(os.path.join(cv2_lang_root, 'dev_st_{}_en.tsv'.format(lang)))
test_df = load_df_from_tsv(os.path.join(cv2_lang_root, 'test_st_{}_en.tsv'.format(lang)))

In [32]:
to_remove = set(dev_df['id'].tolist() + test_df['id'].tolist())

In [33]:
len(to_remove)

9741

In [34]:
def clean(name):
    tsv_path = os.path.join(cv9_lang_root, name)
    train_df = load_df_from_tsv(tsv_path)
    mask = [True] * len(train_df)
    for i in range(len(train_df)):
        if train_df['id'][i] in to_remove:
            mask[i] = False
    train_df = train_df[mask]
    save_df_to_tsv(train_df, tsv_path)

In [35]:
clean('train_st_{}-cv_en.tsv'.format(lang))

In [36]:
clean('train_asr_{}-cv.tsv'.format(lang))

In [37]:
clean('train_asr_{}-cv_full.tsv'.format(lang))