Merge branch 'dev'

mozilla · Nov 25, 2020 · f6c96b0 · f6c96b0
2 parents cfbc660 + e3b7157
commit f6c96b0
Show file tree

Hide file tree

Showing 71 changed files with 5,002 additions and 420 deletions.
diff --git a/.compute b/.compute
@@ -1,14 +1,14 @@
 #!/bin/bash
 yes | apt-get install sox
 yes | apt-get install ffmpeg
-yes | apt-get install espeak 
+yes | apt-get install espeak
 yes | apt-get install tmux
 yes | apt-get install zsh
 sh -c "$(curl -fsSL https://raw.githubusercontent.com/robbyrussell/oh-my-zsh/master/tools/install.sh)"
 pip3 install https://download.pytorch.org/whl/cu100/torch-1.3.0%2Bcu100-cp36-cp36m-linux_x86_64.whl
 sudo sh install.sh
-pip install pytorch==1.3.0+cu100
-python3 setup.py develop
+# pip install pytorch==1.7.0+cu100
+# python3 setup.py develop
 # python3 distribute.py --config_path config.json  --data_path /data/ro/shared/data/keithito/LJSpeech-1.1/
 # cp -R ${USER_DIR}/Mozilla_22050 ../tmp/
 # python3 distribute.py --config_path config_tacotron_gst.json  --data_path ../tmp/Mozilla_22050/

diff --git a/.travis/script b/.travis/script
@@ -17,5 +17,6 @@ fi
 if [[ "$TEST_SUITE" == "testscripts" ]]; then
    # test model training scripts
   ./tests/test_tts_train.sh
-  ./tests/test_vocoder_train.sh
+  ./tests/test_vocoder_gan_train.sh
+  ./tests/test_vocoder_wavernn_train.sh
 fi
diff --git a/README.md b/README.md
@@ -26,7 +26,7 @@ TTS paper collection: https://github.com/erogol/TTS-papers
 ## TTS Performance
 <p align="center"><img src="https://discourse-prod-uploads-81679984178418.s3.dualstack.us-west-2.amazonaws.com/optimized/3X/6/4/6428f980e9ec751c248e591460895f7881aec0c6_2_1035x591.png" width="800" /></p>
 
-"Mozilla*" and "Judy*" are our models. 
+"Mozilla*" and "Judy*" are our models.
 [Details...](https://github.com/mozilla/TTS/wiki/Mean-Opinion-Score-Results)
 
 ## Provided Models and Methods
@@ -47,7 +47,10 @@ Speaker Encoder:
 Vocoders:
 - MelGAN: [paper](https://arxiv.org/abs/1710.10467)
 - MultiBandMelGAN: [paper](https://arxiv.org/abs/2005.05106)
+- ParallelWaveGAN: [paper](https://arxiv.org/abs/1910.11480)
 - GAN-TTS discriminators: [paper](https://arxiv.org/abs/1909.11646)
+- WaveRNN: [origin](https://github.com/fatchord/WaveRNN/)
+- WaveGrad: [paper](https://arxiv.org/abs/2009.00713)
 
 You can also help us implement more models. Some TTS related work can be found [here](https://github.com/erogol/TTS-papers).
 
@@ -70,8 +73,8 @@ You can also help us implement more models. Some TTS related work can be found [
 ## Main Requirements and Installation
 Highly recommended to use [miniconda](https://conda.io/miniconda.html) for easier installation.
   * python>=3.6
-  * pytorch>=1.4.1
-  * tensorflow>=2.2
+  * pytorch>=1.5.0
+  * tensorflow>=2.3
   * librosa
   * tensorboard
   * tensorboardX
@@ -149,23 +152,25 @@ head -n 12000 metadata_shuf.csv > metadata_train.csv
 tail -n 1100 metadata_shuf.csv > metadata_val.csv
 ```
 
-To train a new model, you need to define your own ```config.json``` file (check the example) and call with the command below. You also set the model architecture in  ```config.json```.
+To train a new model, you need to write your own ```config.json``` defining the model details, training configuration and more (check the examples). Then call the corresponding train script.
 
-```python TTS/bin/train_tts.py --config_path TTS/tts/configs/config.json```
+For instance, in order to train a tacotron or tacotron2 model on LJSpeech dataset, follow these steps.
+
+```python TTS/bin/train_tacotron.py --config_path TTS/tts/configs/config.json```
 
 To fine-tune a model, use ```--restore_path```.
 
-```python TTS/bin/train_tts.py --config_path TTS/tts/configs/config.json --restore_path /path/to/your/model.pth.tar```
+```python TTS/bin/train_tacotron.py --config_path TTS/tts/configs/config.json --restore_path /path/to/your/model.pth.tar```
 
 To continue an old training run, use ```--continue_path```.
 
-```python TTS/bin/train_tts.py --continue_path /path/to/your/run_folder/```
+```python TTS/bin/train_tacotron.py --continue_path /path/to/your/run_folder/```
 
-For multi-GPU training use ```distribute.py```. It enables process based multi-GPU training where each process uses a single GPU.
+For multi-GPU training, call ```distribute.py```. It runs any provided train script in multi-GPU setting.
 
-```CUDA_VISIBLE_DEVICES="0,1,4" TTS/bin/distribute.py --config_path TTS/tts/configs/config.json```
+```CUDA_VISIBLE_DEVICES="0,1,4" python TTS/bin/distribute.py --script train_tacotron.py --config_path TTS/tts/configs/config.json```
 
-Each run creates a new output folder and ```config.json``` is copied under this folder.
+Each run creates a new output folder containing the ```config.json``` used, the model checkpoints and the tensorboard logs.
 
 In case of any error or intercepted execution, if there is no checkpoint yet under the output folder, the whole folder is going to be removed.
 
@@ -199,7 +204,7 @@ If you like to use TTS to try a new idea and like to share your experiments with
 - [x] Train TTS with r=1 successfully.
 - [x] Enable process based distributed training. Similar to (https://github.com/fastai/imagenet-fast/).
 - [x] Adapting Neural Vocoder. TTS works with WaveRNN and ParallelWaveGAN (https://github.com/erogol/WaveRNN and https://github.com/erogol/ParallelWaveGAN)
-- [ ] Multi-speaker embedding.
+- [x] Multi-speaker embedding.
 - [x] Model optimization (model export, model pruning etc.)
 
 <!--## References
@@ -218,3 +223,4 @@ If you like to use TTS to try a new idea and like to share your experiments with
 - https://github.com/r9y9/tacotron_pytorch (Initial Tacotron architecture)
 - https://github.com/kan-bayashi/ParallelWaveGAN (vocoder library)
 - https://github.com/jaywalnut310/glow-tts (Original Glow-TTS implementation)
+- https://github.com/fatchord/WaveRNN/ (Original WaveRNN implementation)
diff --git a/TTS/bin/compute_embeddings.py b/TTS/bin/compute_embeddings.py
@@ -0,0 +1,130 @@
+import argparse
+import glob
+import os
+
+import numpy as np
+from tqdm import tqdm
+
+import torch
+from TTS.speaker_encoder.model import SpeakerEncoder
+from TTS.utils.audio import AudioProcessor
+from TTS.utils.io import load_config
+from TTS.tts.utils.speakers import save_speaker_mapping
+from TTS.tts.datasets.preprocess import load_meta_data
+
+parser = argparse.ArgumentParser(
+    description='Compute embedding vectors for each wav file in a dataset. If "target_dataset" is defined, it generates "speakers.json" necessary for training a multi-speaker model.')
+parser.add_argument(
+    'model_path',
+    type=str,
+    help='Path to model outputs (checkpoint, tensorboard etc.).')
+parser.add_argument(
+    'config_path',
+    type=str,
+    help='Path to config file for training.',
+)
+parser.add_argument(
+    'data_path',
+    type=str,
+    help='Data path for wav files - directory or CSV file')
+parser.add_argument(
+    'output_path',
+    type=str,
+    help='path for training outputs.')
+parser.add_argument(
+    '--target_dataset',
+    type=str,
+    default='',
+    help='Target dataset to pick a processor from TTS.tts.dataset.preprocess. Necessary to create a speakers.json file.'
+)
+# argparse's type=bool is True for any non-empty string, so "--use_cuda False" used to enable CUDA.
+parser.add_argument(
+    '--use_cuda', type=lambda v: v.lower() in ('true', '1', 'yes'), help='flag to set cuda.', default=False
+)
+parser.add_argument(
+    '--separator', type=str, help='Separator used in file if CSV is passed for data_path', default='|'
+)
+args = parser.parse_args()
+
+
+c = load_config(args.config_path)
+ap = AudioProcessor(**c['audio'])
+
+data_path = args.data_path
+is_csv = os.path.splitext(data_path)[1].lower() == '.csv'
+sep = args.separator
+data_root = data_path  # folder the wav paths are relative to; the CSV branch overrides it
+
+if args.target_dataset != '':
+    # if target dataset is defined
+    dataset_config = [
+        {
+            "name": args.target_dataset,
+            "path": args.data_path,
+            "meta_file_train": None,
+            "meta_file_val": None
+        },
+    ]
+    wav_files, _ = load_meta_data(dataset_config, eval_split=False)
+else:
+    # if target dataset is not defined
+    if is_csv:
+        # Parse CSV: a line needs exactly two separator-delimited fields; the first names "<CSV folder>/wavs/<field>.wav"
+        print(f'CSV file: {data_path}')
+        data_root = os.path.dirname(data_path)
+        with open(data_path) as f:
+            wav_path = os.path.join(data_root, 'wavs')
+            wav_files = []
+            print(f'Separator is: {sep}')
+            for line in f:
+                components = line.split(sep)
+                if len(components) != 2:
+                    print("Invalid line")
+                    continue
+                wav_file = os.path.join(wav_path, components[0] + '.wav')
+                if os.path.exists(wav_file):
+                    wav_files.append(wav_file)
+        print(f'Count of wavs imported: {len(wav_files)}')
+    else:
+        # Parse all wav files in data_path
+        wav_files = glob.glob(data_path + '/**/*.wav', recursive=True)
+
+# Mirror every wav path (relative to data_root) under output_path with a .npy extension. This is done
+# for all input modes: the CSV branch used to leave output_files undefined (NameError right below).
+output_files = [
+    os.path.join(args.output_path, os.path.splitext(os.path.relpath(
+        wav_file[1] if isinstance(wav_file, list) else wav_file, data_root))[0] + '.npy')
+    for wav_file in wav_files]
+for output_file in output_files:
+    os.makedirs(os.path.dirname(output_file), exist_ok=True)
+
+# define Encoder model
+model = SpeakerEncoder(**c.model)
+model.load_state_dict(torch.load(args.model_path)['model'])
+model.eval()
+if args.use_cuda:
+    model.cuda()
+
+# compute speaker embeddings
+speaker_mapping = {}
+for idx, wav_file in enumerate(tqdm(wav_files)):
+    if isinstance(wav_file, list):
+        speaker_name = wav_file[2]
+        wav_file = wav_file[1]
+
+    mel_spec = ap.melspectrogram(ap.load_wav(wav_file, sr=ap.sample_rate)).T
+    mel_spec = torch.FloatTensor(mel_spec[None, :, :])
+    if args.use_cuda:
+        mel_spec = mel_spec.cuda()
+    embedd = model.compute_embedding(mel_spec)
+    embedd = embedd.detach().cpu().numpy()
+    np.save(output_files[idx], embedd)
+
+    if args.target_dataset != '':
+        # create speaker_mapping if target dataset is defined
+        speaker_mapping[os.path.basename(wav_file)] = {
+            'name': speaker_name, 'embedding': embedd.flatten().tolist()}
+
+if args.target_dataset != '':
+    # save speaker_mapping if target dataset is defined
+    save_speaker_mapping(args.output_path, speaker_mapping)
diff --git a/TTS/bin/compute_statistics.py b/TTS/bin/compute_statistics.py
@@ -2,6 +2,7 @@
 # -*- coding: utf-8 -*-
 
 import os
+import glob
 import argparse
 
 import numpy as np
@@ -11,6 +12,7 @@
 from TTS.utils.io import load_config
 from TTS.utils.audio import AudioProcessor
 
+
 def main():
     """Run preprocessing process."""
     parser = argparse.ArgumentParser(
@@ -30,7 +32,10 @@ def main():
     ap = AudioProcessor(**CONFIG.audio)
 
     # load the meta data of target dataset
-    dataset_items = load_meta_data(CONFIG.datasets)[0]  # take only train data
+    if 'data_path' in CONFIG.keys():
+        dataset_items = glob.glob(os.path.join(CONFIG.data_path, '**', '*.wav'), recursive=True)
+    else:
+        dataset_items = load_meta_data(CONFIG.datasets)[0]  # take only train data
     print(f" > There are {len(dataset_items)} files.")
 
     mel_sum = 0
@@ -40,7 +45,7 @@ def main():
     N = 0
     for item in tqdm(dataset_items):
         # compute features
-        wav = ap.load_wav(item[1])
+        wav = ap.load_wav(item if isinstance(item, str) else item[1])
         linear = ap.spectrogram(wav)
         mel = ap.melspectrogram(wav)
 
@@ -56,7 +61,7 @@ def main():
     linear_mean = linear_sum / N
     linear_scale = np.sqrt(linear_square_sum / N - linear_mean ** 2)
 
-    output_file_path = os.path.join(args.out_path, "scale_stats.npy")
+    output_file_path = args.out_path
     stats = {}
     stats['mel_mean'] = mel_mean
     stats['mel_std'] = mel_scale
@@ -78,7 +83,7 @@ def main():
     del CONFIG.audio['clip_norm']
     stats['audio_config'] = CONFIG.audio
     np.save(output_file_path, stats, allow_pickle=True)
-    print(f' > scale_stats.npy is saved to {output_file_path}')
+    print(f' > stats saved to {output_file_path}')
 
 
 if __name__ == "__main__":

diff --git a/TTS/bin/synthesize.py b/TTS/bin/synthesize.py
@@ -10,7 +10,7 @@
 
 import torch
 
-from TTS.tts.utils.generic_utils import setup_model
+from TTS.tts.utils.generic_utils import setup_model, is_tacotron
 from TTS.tts.utils.synthesis import synthesis
 from TTS.tts.utils.text.symbols import make_symbols, phonemes, symbols
 from TTS.utils.audio import AudioProcessor
@@ -125,7 +125,8 @@ def tts(model, vocoder_model, text, CONFIG, use_cuda, ap, use_gl, speaker_fileid
     model.eval()
     if args.use_cuda:
         model.cuda()
-    model.decoder.set_r(cp['r'])
+    if is_tacotron(C):
+        model.decoder.set_r(cp['r'])
 
     # load vocoder model
     if args.vocoder_path != "":
@@ -153,7 +154,10 @@ def tts(model, vocoder_model, text, CONFIG, use_cuda, ap, use_gl, speaker_fileid
         args.speaker_fileid = None
 
     if args.gst_style is None:
-        gst_style = C.gst['gst_style_input']
+        if is_tacotron(C):
+            gst_style = C.gst['gst_style_input']
+        else:
+            gst_style = None
     else:
         # check if gst_style string is a dict, if is dict convert  else use string
         try:

diff --git a/TTS/bin/train_encoder.py b/TTS/bin/train_encoder.py
@@ -35,7 +35,7 @@
 print(" > Number of GPUs: ", num_gpus)
 
 
-def setup_loader(ap, is_val=False, verbose=False):
+def setup_loader(ap: AudioProcessor, is_val: bool=False, verbose: bool=False):
     if is_val:
         loader = None
     else:
@@ -212,6 +212,7 @@ def main(args):  # pylint: disable=redefined-outer-name
     parser.add_argument(
         '--config_path',
         type=str,
+        required=True,
         help='Path to config file for training.',
     )
     parser.add_argument('--debug',