In [201]:
import os
import re
import random
from string import punctuation
from argparse import Namespace
from collections import defaultdict

import IPython

import torch as th
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader

from tqdm import tqdm
import torchaudio
import numpy as np

import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
import sentencepiece as spm

import textgrids
import faiss

from sentence_transformers import SentenceTransformer
from transformers import MBartForConditionalGeneration, MBart50TokenizerFast

from fairseq.data import Dictionary
from fairseq.models.mST.w2v2_phone_transformer import W2V2Transformer
from fairseq.data.audio.multilingual_triplet_v2_phone_dataset import (
    MultilingualTripletDataConfig,
    MultilingualTripletDataset,
    MultilingualTripletDatasetCreator
)
from fairseq.data.audio.speech_to_text_dataset import get_features_or_waveform
from examples.speech_to_text.data_utils import load_df_from_tsv, save_df_to_tsv
from fairseq.checkpoint_utils import load_checkpoint_to_cpu
from fairseq.data.encoders.sentencepiece_bpe import SentencepieceBPE, SentencepieceConfig

from fairseq.models.mST.w2v2_phone_transformer import W2V2Transformer
from fairseq.data.audio.multilingual_triplet_v2_phone_dataset import (
    MultilingualTripletDataConfig,
    MultilingualTripletDataset,
    MultilingualTripletDatasetCreator
)

# Build Code-Switch for CV 9.0 Language

In [5]:
# Root directory of the CoVoST 2 data; each language has its own
# sub-directory underneath it. The COVOST2_ROOT environment variable, when
# set, overrides the default so the notebook can run on another machine
# without editing this cell.
root = os.environ.get('COVOST2_ROOT', '/mnt/data/siqiouyang/datasets/covost2/')

# Tokenization

In [6]:
# SentencePiece processor used to split source sentences into sub-word pieces.
tokenizer = spm.SentencePieceProcessor()
# Load the BPE model stored with the pretrained mbart50 run. `Load` is the
# last expression of the cell, so its return value is shown as the cell output.
tokenizer.Load('/mnt/data/siqiouyang/runs/mST/pretrained/mbart50.ft.n1/sentence.bpe.model')

True

In [7]:
# Language codes of interest. Several entries are wrapped in dashes
# (e.g. '-it-'). NOTE(review): presumably this marks languages that are
# excluded or not processed yet -- confirm with the author.
langs = [
    'de', 'fr', 'es', '-it-', 'ru', 'zh-CN', 'pt', '-fa-', '-et-', '-mn-',
    'nl', 'tr', '-ar-', 'sv-SE', '-lv-', '-sl-', '-ta-', '-ja-', '-id-',
]

In [254]:
# Language code to process in this run; it selects the sub-directory of `root`.
# The tokenization cell keeps a separate (commented-out) branch for 'zh-CN'.
lang = 'pt'
lang_root = os.path.join(root, lang)

In [255]:
# Training manifest of the selected language (file name pattern
# train_st_<lang>_en.tsv), read into a DataFrame.
manifest_name = 'train_st_{}_en.tsv'.format(lang)
lang_df = load_df_from_tsv(os.path.join(lang_root, manifest_name))

In [256]:
lang_df

Unnamed: 0,id,audio,n_frames,src_text,tgt_text,speaker,src_lang,tgt_lang
0,common_voice_pt_19277034,common_voice_pt_19277034.wav,68352,Altere o dispositivo de saída de áudio.,Change the audio output device.,939991dfcf12d8250739f3b4e5ed47e0dc4bf8ffb69810...,pt,en
1,common_voice_pt_19277037,common_voice_pt_19277037.wav,95232,"Até a data-limite, prevista no tratado, vários...","By the deadline shown in the treaty, various m...",939991dfcf12d8250739f3b4e5ed47e0dc4bf8ffb69810...,pt,en
2,common_voice_pt_19277041,common_voice_pt_19277041.wav,43392,Eu balancei a cabeça.,I shook my head.,939991dfcf12d8250739f3b4e5ed47e0dc4bf8ffb69810...,pt,en
3,common_voice_pt_19277042,common_voice_pt_19277042.wav,74880,Adicione The Cranberries Zombie a playlist de ...,Add The Cranberries Zombie to the classics pla...,939991dfcf12d8250739f3b4e5ed47e0dc4bf8ffb69810...,pt,en
4,common_voice_pt_19277044,common_voice_pt_19277044.wav,69120,Um cinegrafista filmando um jogo de basquete.,A cameraman filming a basketball game.,939991dfcf12d8250739f3b4e5ed47e0dc4bf8ffb69810...,pt,en
...,...,...,...,...,...,...,...,...
9153,common_voice_pt_19860863,common_voice_pt_19860863.wav,71040,Existe alguma coisa que eu possa fazer por você?,Is there something I can do for you?,9d91497cb25399ad6cf6a4d88548fa84ee3c3687363015...,pt,en
9154,common_voice_pt_19860882,common_voice_pt_19860882.wav,59520,Poesia e verdade.,Poetry and truth.,9d91497cb25399ad6cf6a4d88548fa84ee3c3687363015...,pt,en
9155,common_voice_pt_19860892,common_voice_pt_19860892.wav,89472,Um surfista está fazendo cache de uma onda.,A surfer is riding a wave.,9d91497cb25399ad6cf6a4d88548fa84ee3c3687363015...,pt,en
9156,common_voice_pt_19860899,common_voice_pt_19860899.wav,130560,Dois homens vestindo roupas escocesas tradicio...,Two men in traditional Scottish clothes are ho...,9d91497cb25399ad6cf6a4d88548fa84ee3c3687363015...,pt,en


In [257]:
# Source-language transcripts, in the same row order as `lang_df`.
sentences = lang_df['src_text'].tolist()
# Display only a short preview: echoing the full list floods the notebook
# output with thousands of lines.
sentences[:20]

['Altere o dispositivo de saída de áudio.',
 'Até a data-limite, prevista no tratado, vários mísseis foram destruídos.',
 'Eu balancei a cabeça.',
 'Adicione The Cranberries Zombie a playlist de clássicas.',
 'Um cinegrafista filmando um jogo de basquete.',
 'Ajude meu filho',
 'Embora o sol seja muito grande',
 'Cerca de um em vinte.',
 'O Facebook tem muitos problemas de privacidade.',
 'Você acabou de dar um tapinha nele.',
 'Não pode mudar de carro a cada ano',
 'A mulher com cabelo loiro mantém seus óculos escuros.',
 'Isso me impressiona!',
 '"O termo ""apenas meus dois centavos"" é sobre a opinião? não sobre dinheiro."',
 'Diga-me a previsão do tempo para a França',
 'Jornal diário desportivo que segue toda a atualidade desportiva com independência e rigor jornalístico.',
 'O garoto recordou o brilho que havia notado no dia anterior.',
 'Quanto vale o real no momento?',
 'Isso não poderia ter esperado até a manhã?',
 'A areia flutua sobre as soleiras da velha casa.',
 '"""Não de

In [247]:
# Per-sentence results filled by the segmentation loop further down in this
# cell: the word strings, and the (first, last) piece-index span of each word.
tokenized_sentences, segmentss = [], []
# Marker piece used to detect word starts, looked up by id in the loaded model.
# NOTE(review): assumes id 5 is the SentencePiece word-boundary piece in this
# vocabulary -- confirm when switching to a different model.
space = tokenizer.IdToPiece(5)

# specifically for zh-CN
# pattern = ",|\?|・|\u3002|\uff1f|\uff01|\uff0c|\u3001|\uff1b|\uff1a|\u201c|\u201d|\u2018|\u2019|\uff08|\uff09|\u300a|\u300b|\u3008|\u3009|\u3010|\u3011|\u300e|\u300f|\u300c|\u300d|\ufe43|\ufe44|\u3014|\u3015|\u2026|\u2014|\uff5e|\ufe4f|\uffe5" + punctuation
# for sent in sentences:
#     tokens = tokenizer.EncodeAsPieces(sent)
#     segments = []
#     for idx, token in enumerate(tokens):
#         if token != space and token not in pattern:
#             segments.append((idx, idx))
#     tokenized_sentence = []
#     for seg in segments:
#         tokenized_sentence.append(''.join(tokens[seg[0] : seg[1] + 1]).replace(space, ''))

#     segmentss.append(segments)
#     tokenized_sentences.append(tokenized_sentence)

# for other languages
def skip(token, punctuation, space):
    """Return 1 if `token` carries no word content, otherwise 0.

    A token is skippable when, after a leading word-boundary marker is
    removed, nothing is left or every remaining character is punctuation.

    Args:
        token: A single tokenizer piece (string).
        punctuation: Container of punctuation characters, e.g.
            ``string.punctuation``.
        space: Word-boundary marker string that may prefix a piece.

    Returns:
        int: 1 when the token should be skipped, 0 when it should be kept.
    """
    # Remove the whole marker instead of a hard-coded single character, and
    # leave the token untouched when the marker is empty (every string
    # "starts with" the empty string, which previously dropped a real character).
    if space and token.startswith(space):
        token = token[len(space):]
    # all() over an empty token is True, so a bare marker is skipped as well.
    return int(all(c in punctuation for c in token))

# punctuation = '!"#$%&,.:;?'
# Group the SentencePiece pieces of every sentence into word-level spans.
# A new word starts at each piece that begins with the `space` marker; a piece
# that is itself punctuation closes the current word and is left out.
# NOTE: `token in punctuation` is a substring test against the punctuation
# string, so it matches single punctuation characters (or a contiguous run of
# that string), not arbitrary multi-character punctuation pieces.
for sent in tqdm(sentences):
    tokens = tokenizer.EncodeAsPieces(sent)
    segments = []
    # Index of the first piece of the word being built. Starting at 0 instead
    # of a -1 sentinel keeps leading pieces that carry no marker and avoids a
    # bogus (-1, -1) span (and a [''] word) when `tokens` is empty.
    word_start = 0
    for idx, token in enumerate(tokens):
        is_punct = token in punctuation
        if token.startswith(space) or is_punct:
            # Close the pending word, if it contains at least one piece.
            if word_start < idx:
                segments.append((word_start, idx - 1))
            # A punctuation piece is excluded from the next word.
            word_start = idx + 1 if is_punct else idx
    # Flush the trailing word (absent when the sentence ends in punctuation).
    if word_start < len(tokens):
        segments.append((word_start, len(tokens) - 1))

    # Surface form of each word: its pieces joined, boundary markers removed.
    tokenized_sentence = [
        ''.join(tokens[start:end + 1]).replace(space, '')
        for start, end in segments
    ]

    tokenized_sentences.append(tokenized_sentence)
    segmentss.append(segments)

In [248]:
print(*tokenized_sentences[10:20], sep='\n')

['收到', '许多人', '指控', '政府', '死', '者', '与', '被', '拘留', '者', '家属', '提出', '抗议', '并', '威胁', '该', '国', '的一些', '人', '权', '护', '卫', '者']
['为了', '迈', '向', '环球', '小姐', '而', '努力', '不懈', '日', '夜', '都', '十分', '憧', '憬', '的', '女', '优', '幸', '田', '真理', '子']
['果', '敢', '的', '杨', '氏', '土', '司', '原', '为', '兴', '达', '户', '土', '目']
['然而', '告知', '旁', '人均', '不', '被', '重视']
['该', '网站', '的公司', '还', '开拓', '研究', '服务', '并', '预测', '电影', '的', '可能', '收入']
['默', '特', '尔', '敦', '是', '位于', '美国', '加', '利', '福', '尼亚', '州', '洪', '堡', '县', '的一个', '人口', '普', '查', '指定', '地区']
['查', '茨', '沃', '思', '是一个', '位于', '美国', '乔治', '亚', '州', '莫', '雷', '县', '的城市']
['印', '第', '安', '松', '是', '位于', '美国', '亚', '利', '桑', '那', '州', '纳', '瓦', '霍', '县', '的一个', '非', '建', '制', '地区']
['按', '省', '道', '养', '护', '管理', '权', '限', '的不同', '上海', '的', '省', '道', '可以', '分为', '市', '管', '省', '道', '和', '区', '管', '省', '道']
['龚', '明', '鑫', '中华', '民', '国', '政治', '人物', '现', '任', '行政院', '政务', '委员']


In [249]:
print(*sentences[10:20], sep='\n')

收到许多人指控政府死者与被拘留者家属提出抗议，并威胁该国的一些人权护卫者。
为了迈向环球小姐而努力不懈，日夜都十分憧憬的女优：幸田真理子。
果敢的杨氏土司原为兴达户土目。
然而告知旁人均不被重视。
该网站的公司还开拓研究服务并预测电影的可能收入。
默特尔敦是位于美国加利福尼亚州洪堡县的一个人口普查指定地区。
查茨沃思是一个位于美国乔治亚州莫雷县的城市。
印第安松是位于美国亚利桑那州纳瓦霍县的一个非建制地区。
按省道养护管理权限的不同，上海的省道可以分为市管省道和区管省道。
龚明鑫，中华民国政治人物，现任行政院政务委员。


In [250]:
# for i, id in enumerate(tqdm(lang_df['id'])):
#     with open(os.path.join(lang_root, '16kHz', '{}.txt'.format(id)), 'w') as w:
#         w.write(' '.join(tokenized_sentences[i]))

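Doing MFA (Montreal Forced Aligner) alignment on the per-utterance transcripts; the next cell expects the resulting TextGrid files under `16kHz/align_spm/`.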

In [251]:
# For every utterance with a forced-alignment TextGrid, pair the word-level
# piece spans with the aligned word intervals (normalised to [0, 1] by the
# audio duration) and save both next to the audio as <id>.pt.
filtered_grids = []  # NOTE(review): never filled in this cell; kept in case other cells read it
n_outlier = 0  # utterances whose alignment cannot be used
n_missing = 0  # utterances that have no TextGrid file at all

# Guard against stale kernel state: the tokenization results are indexed by
# row position, so they must come from the same `lang_df` that is loaded now.
assert len(tokenized_sentences) == len(segmentss) == len(lang_df), (
    'tokenized_sentences/segmentss do not match lang_df; '
    're-run the tokenization cell for the current language'
)

for i, utt_id in enumerate(tqdm(lang_df['id'])):
    # grid_path = os.path.join(dict_lang_root, '16kHz/align_wmseg/{}.TextGrid'.format(utt_id))
    grid_path = os.path.join(lang_root, '16kHz/align_spm/{}.TextGrid'.format(utt_id))
    if not os.path.exists(grid_path):
        n_missing += 1
        continue

    grid = textgrids.TextGrid(grid_path)
    # Drop the empty-text intervals of the 'words' tier.
    filtered_grid = [tok for tok in grid['words'] if tok.text != '']

    # The aligner must have produced exactly one interval per word.
    if len(filtered_grid) != len(tokenized_sentences[i]):
        # print([w.text for w in filtered_grid], tokenized_sentences[i], sep='\n')
        n_outlier += 1
        continue

    audio_path = os.path.join(lang_root, '16kHz/{}.wav'.format(utt_id))
    info = torchaudio.info(audio_path)
    duration = info.num_frames / info.sample_rate
    if duration <= 0:
        # Zero-length audio cannot be normalised; count it as an outlier
        # rather than dividing by zero.
        n_outlier += 1
        continue

    # One (xmin, xmax) row per word; reshape keeps the array 2-D even when
    # there are no words.
    interval = np.array(
        [(word.xmin, word.xmax) for word in filtered_grid], dtype=float
    ).reshape(-1, 2)
    interval = interval / duration

    th.save([segmentss[i], interval], os.path.join(lang_root, '16kHz', '{}.pt'.format(utt_id)))

100%|██████████| 7085/7085 [00:03<00:00, 2248.74it/s]


In [252]:
n_outlier / len(lang_df)

0.009738884968242767

In [253]:
n_outlier

69