In [1]:
import os
import re
import random
from argparse import Namespace
from string import punctuation
from collections import defaultdict

import IPython

import torch as th
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader

from tqdm import tqdm
import torchaudio
import numpy as np

import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
import sentencepiece as spm

import textgrids
import faiss

from sentence_transformers import SentenceTransformer
from transformers import MBartForConditionalGeneration, MBart50TokenizerFast

from fairseq.data import Dictionary
from fairseq.models.mST.w2v2_phone_transformer import W2V2Transformer
from fairseq.data.audio.multilingual_triplet_v2_phone_dataset import (
    MultilingualTripletDataConfig,
    MultilingualTripletDataset,
    MultilingualTripletDatasetCreator
)
from fairseq.data.audio.speech_to_text_dataset import get_features_or_waveform
from examples.speech_to_text.data_utils import load_df_from_tsv, save_df_to_tsv
from fairseq.checkpoint_utils import load_checkpoint_to_cpu
from fairseq.data.encoders.sentencepiece_bpe import SentencepieceBPE, SentencepieceConfig

from fairseq.models.mST.w2v2_phone_transformer import W2V2Transformer
from fairseq.data.audio.multilingual_triplet_v2_phone_dataset import (
    MultilingualTripletDataConfig,
    MultilingualTripletDataset,
    MultilingualTripletDatasetCreator
)

# Build Code-Switch for CV 9.0 Language

In [2]:
# Root directory of the CoVoST 2 data. The original absolute path stays the default;
# set the COVOST2_ROOT environment variable to run the notebook on another machine.
root = os.environ.get('COVOST2_ROOT', '/mnt/data/siqiouyang/datasets/covost2/')

# Tokenization

In [3]:
# SentencePiece model used to split the source sentences into sub-word pieces.
spm_model_path = '/mnt/data/siqiouyang/runs/mST/pretrained/mbart50.ft.n1/sentence.bpe.model'

tokenizer = spm.SentencePieceProcessor()
# Left as the last expression so the cell displays the return value of the load call.
tokenizer.Load(spm_model_path)

True

In [12]:
# Language codes considered by this notebook.
# NOTE(review): entries wrapped in dashes (e.g. '-fa-') look like languages that are
# deliberately disabled, since they match no real code — confirm before relying on it.
langs = [
    'de', 'fr', 'es', 'it', 'ru', 'zh-CN', 'pt',
    '-fa-', '-et-', '-mn-',
    'nl', 'tr',
    '-ar-',
    'sv-SE',
    '-lv-', '-sl-',
    'ta',
    '-ja-', '-id-',
]

In [34]:
# Language processed by the cells below, and its data directory under `root`.
lang = 'fr'
lang_root = os.path.join(root, lang)

In [35]:
# Training manifest for the `lang` -> en direction, loaded from the language directory.
train_tsv_name = 'train_st_{}_en.tsv'.format(lang)
train_tsv_path = os.path.join(lang_root, train_tsv_name)
lang_df = load_df_from_tsv(train_tsv_path)

In [6]:
# Optionally resample the audio to 16kHz
# NOTE(review): this snippet uses `dict_lang_root` and `dict_lang_df`, which are not
# defined in the cells shown here — define them (or adapt the names) before uncommenting.
# os.makedirs(os.path.join(dict_lang_root, '16kHz'))
# for path in tqdm(dict_lang_df['path']):
#     audio_path = os.path.join(dict_lang_root, 'clips/{}'.format(path))
#     waveform, sample_rate = torchaudio.load(audio_path)
#     resampled_waveform = torchaudio.functional.resample(waveform, sample_rate, 16000)
#     torchaudio.save(os.path.join(dict_lang_root, '16kHz/{}.wav'.format(path[:-4])), resampled_waveform, sample_rate=16000)

In [36]:
# Preview the loaded manifest (a bare last expression is rendered by the notebook).
lang_df

Unnamed: 0,id,audio,n_frames,src_text,tgt_text,speaker,src_lang,tgt_lang
0,common_voice_fr_17732749,common_voice_fr_17732749.wav,62976,Madame la baronne Pfeffers.,Madam Pfeffers the baroness.,796d9ae74694eadea3f343a7e0dfe35f89a6a321d53484...,fr,en
1,common_voice_fr_17732750,common_voice_fr_17732750.wav,112896,Vous savez aussi bien que moi que de nombreuse...,You know as well as I do that many new molecul...,796d9ae74694eadea3f343a7e0dfe35f89a6a321d53484...,fr,en
2,common_voice_fr_17732751,common_voice_fr_17732751.wav,64512,"Oh ! parce que maintenant, quand on parle de b...","Oh! Because now, when we talk about drinking, ...",796d9ae74694eadea3f343a7e0dfe35f89a6a321d53484...,fr,en
3,common_voice_fr_17732752,common_voice_fr_17732752.wav,87936,Les questions sanitaires placent l’enfant au c...,The sanitary issues put the child at the heart...,796d9ae74694eadea3f343a7e0dfe35f89a6a321d53484...,fr,en
4,common_voice_fr_17732753,common_voice_fr_17732753.wav,90240,J’ai au moins une satisfaction personnelle : j...,I;ve got at least one personal satisfaction: I...,796d9ae74694eadea3f343a7e0dfe35f89a6a321d53484...,fr,en
...,...,...,...,...,...,...,...,...
207367,common_voice_fr_20001537,common_voice_fr_20001537.wav,41088,"L’année suivante, il se maria.",He got married the following year.,55e311cdba34f70be4760c1af7f0bcf66afb2bf5b94c35...,fr,en
207368,common_voice_fr_20001539,common_voice_fr_20001539.wav,62592,"Pour les amateurs de rouge, un vin de Bourgogn...","For lovers of red wine, a Burgundy wine, such ...",55e311cdba34f70be4760c1af7f0bcf66afb2bf5b94c35...,fr,en
207369,common_voice_fr_20001541,common_voice_fr_20001541.wav,90240,"""Elle borde les côtés sud et ouest de l'hôpita...",It borders the south and west sides of the Geo...,55e311cdba34f70be4760c1af7f0bcf66afb2bf5b94c35...,fr,en
207370,common_voice_fr_20001542,common_voice_fr_20001542.wav,85632,Le drapeau brésilien est alors porté par la fo...,The Brazilian flag is then carried by cross-co...,55e311cdba34f70be4760c1af7f0bcf66afb2bf5b94c35...,fr,en


In [37]:
# Source-language transcripts, one string per manifest row (same order as `lang_df`).
sentences = lang_df['src_text'].tolist()
# Show only a small sample: echoing the full list floods the notebook output.
sentences[:15]

['Madame la baronne Pfeffers.',
 'Vous savez aussi bien que moi que de nombreuses molécules innovantes ont malheureusement déçu.',
 'Oh ! parce que maintenant, quand on parle de boire, je m’en vais !',
 'Les questions sanitaires placent l’enfant au cœur de la problématique de l’évolution humaine.',
 'J’ai au moins une satisfaction personnelle : j’ai ému Monsieur Piron.',
 'Avenue du Louvre au numéro trente-huit',
 'Devant la seconde tour du Louvre… à l’heure du couvre-feu.',
 'Cette décision déclenche la colère des colons, puis des campagnes de presse enflammées.',
 'Ce dispositif joue un rôle très important dans la réduction de bruit.',
 'Il est organisé autour de plusieurs allées.',
 'Ailleurs ils sont presque transparents.',
 '"Ces retrouvailles sont très liées à toute l\'affaire des vols."',
 '"Cette collaboration s\'est poursuivie tout au long de sa vie."',
 '"Deafheaven attire aussi l\'attention en jouant à plusieurs festivals."',
 'C’est pourquoi elle a débouté les appelants.',


In [38]:
def segment_pieces(pieces, boundary):
    """Group sub-word pieces into word-level spans.

    A new word starts at every piece that begins with `boundary`. A piece for which
    `piece in punctuation` holds (a substring test against `string.punctuation`)
    closes the current word and is itself excluded from every span.

    Args:
        pieces: list of piece strings for one sentence.
        boundary: the piece prefix that marks the start of a word.

    Returns:
        List of inclusive `(start, end)` index pairs into `pieces`. Empty when
        `pieces` is empty.
    """
    spans = []
    # Start at 0 rather than a -1 sentinel: with -1 the trailing check below also
    # fired for "no word opened yet", producing a bogus (-1, ...) span for empty
    # input and dropping pieces that precede the first boundary.
    start = 0
    for idx, piece in enumerate(pieces):
        is_punct = piece in punctuation
        if piece.startswith(boundary) or is_punct:
            if start <= idx - 1:
                spans.append((start, idx - 1))
            # A punctuation piece is skipped; a boundary piece opens the next word.
            start = idx + 1 if is_punct else idx
    if start < len(pieces):
        spans.append((start, len(pieces) - 1))
    return spans


tokenized_sentences = []
segmentss = []
# NOTE(review): assumes piece id 5 is the bare word-boundary marker of this
# SentencePiece model — confirm if the model file changes.
space = tokenizer.IdToPiece(5)

# specifically for zh-CN
# pattern = ",|\?|・|\u3002|\uff1f|\uff01|\uff0c|\u3001|\uff1b|\uff1a|\u201c|\u201d|\u2018|\u2019|\uff08|\uff09|\u300a|\u300b|\u3008|\u3009|\u3010|\u3011|\u300e|\u300f|\u300c|\u300d|\ufe43|\ufe44|\u3014|\u3015|\u2026|\u2014|\uff5e|\ufe4f|\uffe5" + punctuation
# for sent in sentences:
#     tokens = tokenizer.EncodeAsPieces(sent)
#     segments = []
#     for idx, token in enumerate(tokens):
#         if token != space and token not in pattern:
#             segments.append((idx, idx))
#     tokenized_sentence = []
#     for seg in segments:
#         tokenized_sentence.append(''.join(tokens[seg[0] : seg[1] + 1]).replace(space, ''))

#     segmentss.append(segments)
#     tokenized_sentences.append(tokenized_sentence)

# for other languages
for sent in tqdm(sentences):
    tokens = tokenizer.EncodeAsPieces(sent)
    segments = segment_pieces(tokens, space)

    # Rebuild each word by concatenating its pieces and dropping the boundary marker.
    tokenized_sentence = [
        ''.join(tokens[start : end + 1]).replace(space, '')
        for start, end in segments
    ]

    tokenized_sentences.append(tokenized_sentence)
    segmentss.append(segments)

100%|██████████| 207372/207372 [00:05<00:00, 35612.91it/s]


In [50]:
# Write one space-separated transcript per utterance into the `16kHz` directory.
# Rows of `lang_df` and entries of `tokenized_sentences` are matched by position,
# so fail loudly if they come from different runs.
assert len(tokenized_sentences) == len(lang_df), \
    'tokenized_sentences is out of sync with lang_df; re-run the tokenization cell'

for utt_id, words in zip(tqdm(lang_df['id']), tokenized_sentences):
    transcript_path = os.path.join(lang_root, '16kHz', '{}.txt'.format(utt_id))
    # Explicit UTF-8: the transcripts contain non-ASCII characters and the default
    # text encoding of open() depends on the locale.
    with open(transcript_path, 'w', encoding='utf-8') as w:
        w.write(' '.join(words))

100%|██████████| 207372/207372 [00:08<00:00, 23581.16it/s]


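Run MFA (Montreal Forced Aligner) on the transcripts written above before continuing; the next cell expects its TextGrid output under `16kHz/align_spm/`.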

In [27]:
# For every utterance that has a TextGrid alignment, save its token segments together
# with the word intervals normalised by the clip duration.
filtered_grids = []  # kept for compatibility; this cell does not populate it
n_outlier = 0  # utterances skipped because alignment and tokenization disagree

# `segmentss` / `tokenized_sentences` are indexed by row position of `lang_df`,
# so fail loudly if they come from a different run (e.g. another language).
assert len(tokenized_sentences) == len(segmentss) == len(lang_df), \
    'tokenization results are out of sync with lang_df; re-run the tokenization cell'

for i, utt_id in enumerate(tqdm(lang_df['id'])):
    # grid_path = os.path.join(dict_lang_root, '16kHz/align_wmseg/{}.TextGrid'.format(id))
    grid_path = os.path.join(lang_root, '16kHz/align_spm/{}.TextGrid'.format(utt_id))
    if not os.path.exists(grid_path):
        # No alignment was produced for this utterance.
        continue

    grid = textgrids.TextGrid(grid_path)
    # Drop intervals with empty text.
    filtered_grid = [tok for tok in grid['words'] if tok.text != '']

    if len(filtered_grid) != len(tokenized_sentences[i]):
        # The aligned words cannot be matched one-to-one with the tokenized words.
        # print(filtered_grid, tokenized_sentences[i], sep='\n')
        n_outlier += 1
        continue

    interval = np.array([(word.xmin, word.xmax) for word in filtered_grid])
    audio_path = os.path.join(lang_root, '16kHz/{}.wav'.format(utt_id))
    info = torchaudio.info(audio_path)
    duration = info.num_frames / info.sample_rate
    # Express word boundaries relative to the clip length
    # (assumes xmin/xmax are in seconds — TODO confirm).
    interval = interval / duration

    # Saved payload: [list of (start, end) token index pairs, array of (xmin, xmax) rows].
    th.save([segmentss[i], interval], os.path.join(lang_root, '16kHz', '{}.pt'.format(utt_id)))

100%|██████████| 127824/127824 [01:13<00:00, 1734.18it/s]
