In [1]:
import os
import csv

import torch as th
from tqdm import tqdm
import pandas as pd
import numpy as np
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM

from TTS.tts.configs.shared_configs import BaseDatasetConfig
from TTS.tts.datasets import load_tts_samples

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
def load_df_from_tsv(path: str):
    return pd.read_csv(
        path,
        sep="\t",
        header=0,
        encoding="utf-8",
        escapechar="\\",
        quoting=csv.QUOTE_NONE,
        na_filter=False,
    )

In [3]:
device = 'cuda:0'

# Generate German texts translated from English references in train_st_sv-SE_en

In [4]:
tokenizer = AutoTokenizer.from_pretrained("Helsinki-NLP/opus-mt-en-de")
model = AutoModelForSeq2SeqLM.from_pretrained("Helsinki-NLP/opus-mt-en-de").to(device)

In [5]:
input = ["Hello world.", "This is good!"]
tokenized_input = tokenizer(input, padding=True, return_tensors="pt").to(device)
output = model.generate(**tokenized_input)
translation = tokenizer.batch_decode(output, skip_special_tokens=True)

In [6]:
translation

['Hallo Welt.', 'Das ist gut!']

In [7]:
path = '/mnt/raid0/siqi/datasets/covost2/sv-SE/train_st_sv-SE_en.tsv'
sv_df = load_df_from_tsv(path)
en_ref = sv_df['tgt_text'].tolist()

In [8]:
batch_size = 500
de_trans = []
for idx in tqdm(range(0, len(en_ref), batch_size)):
    input = en_ref[idx : idx + batch_size]
    tokenized_input = tokenizer(input, padding=True, return_tensors="pt").to(device)
    output = model.generate(**tokenized_input)
    translation = tokenizer.batch_decode(output, skip_special_tokens=True)
    de_trans.extend(translation)

100%|██████████| 5/5 [00:11<00:00,  2.32s/it]


# Train TTS model on German texts

## Build TTS Dataset (TODO)

In [10]:
path = '/mnt/raid0/siqi/datasets/covost2/de/train_st_de_en.tsv'
df = load_df_from_tsv(path)
de_transcript = df['src_text'].tolist()
de_audio = df['audio'].tolist()

In [12]:
metadata_path = '/mnt/raid0/siqi/datasets/covost2/de/train_tts_de.txt'
with open(metadata_path, 'w') as w:
    for transcript, audio in zip(de_transcript, de_audio):
        w.write('{}|{}\n'.format(audio, transcript))

In [18]:
de_root = '/mnt/raid0/siqi/datasets/covost2/de'
dataset_config = BaseDatasetConfig(
    name="ljspeech", meta_file_train='train_tts_de.txt', language="de", path=de_root
)

In [20]:
def formatter(root_path, manifest_file, **kwargs):  # pylint: disable=unused-argument
    """Assumes each line as ```<filename>|<transcription>```
    """
    txt_file = os.path.join(root_path, manifest_file)
    items = []
    speaker_name = "my_speaker"
    with open(txt_file, "r", encoding="utf-8") as ttf:
        for line in ttf:
            cols = line.split("|")
            wav_file = os.path.join(root_path, "16kHz", cols[0])
            text = cols[1]
            items.append({"text":text, "audio_file":wav_file, "speaker_name":speaker_name})
    return items

In [21]:
train_samples, eval_samples = load_tts_samples(dataset_config, eval_split=True, formatter=formatter)

 | > Found 127824 files in /mnt/raid0/siqi/datasets/covost2/de


In [23]:
len(train_samples), len(eval_samples)

(126546, 1278)

## Inference with TTS Model

In [9]:
dest_dir = '/mnt/raid0/siqi/datasets/covost2/sv-SE/16kHz_de'
sv_audio = sv_df['audio'].tolist()
for de, audio in tqdm(zip(de_trans, sv_audio), total=len(de_trans)):
    os.system('tts --text "{}" --model_name tts_models/de/thorsten/tacotron2-DCA --out_path {}'.format(
        de, os.path.join(dest_dir, audio)
    ))

  0%|          | 0/2160 [00:00<?, ?it/s]usage: tts [-h] [--list_models [LIST_MODELS]] [--text TEXT]
           [--model_name MODEL_NAME] [--vocoder_name VOCODER_NAME]
           [--config_path CONFIG_PATH] [--model_path MODEL_PATH]
           [--out_path OUT_PATH] [--use_cuda USE_CUDA]
           [--vocoder_path VOCODER_PATH]
           [--vocoder_config_path VOCODER_CONFIG_PATH]
           [--encoder_path ENCODER_PATH]
           [--encoder_config_path ENCODER_CONFIG_PATH]
           [--speakers_file_path SPEAKERS_FILE_PATH]
           [--language_ids_file_path LANGUAGE_IDS_FILE_PATH]
           [--speaker_idx SPEAKER_IDX] [--language_idx LANGUAGE_IDX]
           [--speaker_wav SPEAKER_WAV [SPEAKER_WAV ...]]
           [--gst_style GST_STYLE] [--list_speaker_idxs [LIST_SPEAKER_IDXS]]
           [--list_language_idxs [LIST_LANGUAGE_IDXS]]
           [--save_spectogram SAVE_SPECTOGRAM]
tts: error: unrecognized arguments: solltest nicht fliegen!
  0%|          | 1/2160 [00:02<1:35:11,  2