openvpi · yqzhishen · Feb 25, 2024 · Feb 6, 2024 · Feb 6, 2024 · Feb 7, 2024
diff --git a/augmentation/spec_stretch.py b/augmentation/spec_stretch.py
@@ -38,7 +38,7 @@ def process_item(self, item: dict, key_shift=0., speed=1., replace_spk_id=None)
 
         aug_item['mel'] = mel
 
-        if speed != 1. or hparams.get('use_speed_embed', False):
+        if speed != 1. or hparams['use_speed_embed']:
             aug_item['length'] = mel.shape[0]
             aug_item['speed'] = int(np.round(hparams['hop_size'] * speed)) / hparams['hop_size']  # real speed
             aug_item['seconds'] /= aug_item['speed']
@@ -50,7 +50,7 @@ def process_item(self, item: dict, key_shift=0., speed=1., replace_spk_id=None)
             f0, _ = self.pe.get_pitch(
                 wav, samplerate=hparams['audio_sample_rate'], length=aug_item['length'],
                 hop_size=hparams['hop_size'], f0_min=hparams['f0_min'], f0_max=hparams['f0_max'],
-                speed=speed, interp_uv=hparams['interp_uv']
+                speed=speed, interp_uv=True
             )
             aug_item['f0'] = f0.astype(np.float32)
 
@@ -83,7 +83,7 @@ def process_item(self, item: dict, key_shift=0., speed=1., replace_spk_id=None)
                         align_length=aug_item['length']
                     )
 
-        if key_shift != 0. or hparams.get('use_key_shift_embed', False):
+        if key_shift != 0. or hparams['use_key_shift_embed']:
             if replace_spk_id is None:
                 aug_item['key_shift'] = key_shift
             else:

diff --git a/basics/base_binarizer.py b/basics/base_binarizer.py
@@ -173,7 +173,6 @@ def process(self):
         self._train_item_names, self._valid_item_names = self.split_train_valid_set(self.item_names)
 
         if self.binarization_args['shuffle']:
-            random.seed(hparams['seed'])
             random.shuffle(self.item_names)
 
         self.binary_data_dir.mkdir(parents=True, exist_ok=True)

diff --git a/basics/base_task.py b/basics/base_task.py
@@ -3,7 +3,6 @@
 import pathlib
 import shutil
 import sys
-from datetime import datetime
 from typing import Dict
 
 import matplotlib
@@ -16,7 +15,6 @@
 import torch.utils.data
 from torchmetrics import Metric, MeanMetric
 import lightning.pytorch as pl
-from lightning.pytorch.callbacks import LearningRateMonitor
 from lightning.pytorch.utilities.rank_zero import rank_zero_debug, rank_zero_info, rank_zero_only
 
 from basics.base_module import CategorizedModule
@@ -87,8 +85,8 @@ def _finish_init(self):
     # Training, validation and testing
     ###########
     def setup(self, stage):
-        self.train_dataset = self.dataset_cls(hparams['train_set_name'])
-        self.valid_dataset = self.dataset_cls(hparams['valid_set_name'])
+        self.train_dataset = self.dataset_cls('train')
+        self.valid_dataset = self.dataset_cls('valid')
         self.num_replicas = (self.trainer.distributed_sampler_kwargs or {}).get('num_replicas', 1)
 
     def get_need_freeze_state_dict_key(self, model_state_dict) -> list:
@@ -130,9 +128,9 @@ def load_finetune_ckpt(
         self.load_state_dict(state_dict, strict=False)
 
     def load_pre_train_model(self):
-        pre_train_ckpt_path = hparams.get('finetune_ckpt_path')
-        blacklist = hparams.get('finetune_ignored_params')
-        # whitelist=hparams.get('pre_train_whitelist')
+        pre_train_ckpt_path = hparams['finetune_ckpt_path']
+        blacklist = hparams['finetune_ignored_params']
+        # whitelist=hparams['pre_train_whitelist']
         if blacklist is None:
             blacklist = []
         # if whitelist is  None:
@@ -344,8 +342,7 @@ def train_dataloader(self):
             size_reversed=True,
             required_batch_count_multiple=hparams['accumulate_grad_batches'],
             shuffle_sample=True,
-            shuffle_batch=True,
-            seed=hparams['seed']
+            shuffle_batch=True
         )
         return torch.utils.data.DataLoader(
             self.train_dataset,
@@ -396,7 +393,6 @@ def on_test_end(self):
 
     @classmethod
     def start(cls):
-        pl.seed_everything(hparams['seed'], workers=True)
         task = cls()
 
         # if pre_train is not None:
@@ -451,14 +447,6 @@ def start(cls):
         if not hparams['infer']:  # train
             @rank_zero_only
             def train_payload_copy():
-                # copy_code = input(f'{hparams["save_codes"]} code backup? y/n: ') == 'y'
-                copy_code = True  # backup code every time
-                if copy_code:
-                    code_dir = work_dir / 'codes' / datetime.now().strftime('%Y%m%d%H%M%S')
-                    code_dir.mkdir(exist_ok=True, parents=True)
-                    for c in hparams['save_codes']:
-                        shutil.copytree(c, code_dir / c, dirs_exist_ok=True)
-                    print(f'| Copied codes to {code_dir}.')
                 # Copy spk_map.json and dictionary.txt to work dir
                 binary_dir = pathlib.Path(hparams['binary_data_dir'])
                 spk_map = work_dir / 'spk_map.json'

diff --git a/configs/acoustic.yaml b/configs/acoustic.yaml
@@ -39,26 +39,22 @@ augmentation_args:
   random_time_stretching:
     enabled: false
     range: [0.5, 2.]
-    domain: log  # or linear
     scale: 0.75
 
 raw_data_dir: 'data/opencpop/raw'
 binary_data_dir: 'data/opencpop/binary'
 binarizer_cls: preprocessing.acoustic_binarizer.AcousticBinarizer
 dictionary: dictionaries/opencpop-extension.txt
-num_pad_tokens: 1
 spec_min: [-5]
 spec_max: [0]
 mel_vmin: -6. #-6.
 mel_vmax: 1.5
-interp_uv: true
 energy_smooth_width: 0.12
 breathiness_smooth_width: 0.12
 voicing_smooth_width: 0.12
 tension_smooth_width: 0.12
 
 use_spk_id: false
-f0_embed_type: continuous
 use_energy_embed: false
 use_breathiness_embed: false
 use_voicing_embed: false
@@ -70,7 +66,7 @@ timesteps: 1000
 max_beta: 0.02
 rel_pos: true
 diff_accelerator: ddim
-pndm_speedup: 10
+diff_speedup: 10
 hidden_size: 256
 residual_layers: 20
 residual_channels: 512

diff --git a/configs/base.yaml b/configs/base.yaml
@@ -1,11 +1,5 @@
 # task
 task_cls: null
-seed: 1234
-save_codes:
-  - configs
-  - modules
-  - training
-  - utils
 
 #############
 # dataset
@@ -36,7 +30,6 @@ enc_layers: 4
 num_heads: 2
 enc_ffn_kernel_size: 9
 ffn_act: gelu
-ffn_padding: 'SAME'
 use_spk_id: false
 
 ###########
@@ -67,8 +60,6 @@ max_batch_frames: 32000
 max_batch_size: 100000
 max_val_batch_frames: 60000
 max_val_batch_size: 1
-train_set_name: 'train'
-valid_set_name: 'valid'
 pe: 'parselmouth'
 pe_ckpt: ''
 f0_min: 65

diff --git a/configs/templates/config_acoustic.yaml b/configs/templates/config_acoustic.yaml
@@ -43,7 +43,6 @@ augmentation_args:
   random_time_stretching:
     enabled: true
     range: [0.5, 2.]
-    domain: log  # or linear
     scale: 0.75
 
 residual_channels: 512

diff --git a/configs/variance.yaml b/configs/variance.yaml
@@ -29,7 +29,6 @@ raw_data_dir: 'data/opencpop_variance/raw'
 binary_data_dir: 'data/opencpop_variance/binary'
 binarizer_cls: preprocessing.variance_binarizer.VarianceBinarizer
 dictionary: dictionaries/opencpop-extension.txt
-num_pad_tokens: 1
 
 use_spk_id: false
 
@@ -80,10 +79,6 @@ energy_smooth_width: 0.12
 breathiness_db_min: -96.0
 breathiness_db_max: -20.0
 breathiness_smooth_width: 0.12
-tension_logit_min: -10.0
-tension_logit_max: 10.0
-tension_smooth_width: 0.12
-
 voicing_db_min: -96.0
 voicing_db_max: -12.0
 voicing_smooth_width: 0.12
@@ -109,7 +104,7 @@ max_beta: 0.02
 diff_decoder_type: 'wavenet'
 diff_loss_type: l2
 diff_accelerator: ddim
-pndm_speedup: 10
+diff_speedup: 10
 
 # train and eval
 num_sanity_val_steps: 1

diff --git a/deployment/exporters/acoustic_exporter.py b/deployment/exporters/acoustic_exporter.py
@@ -58,7 +58,7 @@ def __init__(
             if hparams['use_spk_id'] else None
         self.export_spk: List[Tuple[str, Dict[str, float]]] = export_spk \
             if hparams['use_spk_id'] and export_spk is not None else []
-        if hparams.get('use_key_shift_embed', False) and not self.expose_gender:
+        if hparams['use_key_shift_embed'] and not self.expose_gender:
             shift_min, shift_max = hparams['augmentation_args']['random_pitch_shifting']['range']
             key_shift = freeze_gender * shift_max if freeze_gender >= 0. else freeze_gender * abs(shift_min)
             key_shift = max(min(key_shift, shift_max), shift_min)  # clip key shift
@@ -143,14 +143,14 @@ def _torch_export_model(self):
                 for v_name in self.model.fs2.variance_embed_list
             }
         }
-        if hparams.get('use_key_shift_embed', False):
+        if hparams['use_key_shift_embed']:
             if self.expose_gender:
                 kwargs['gender'] = torch.rand((1, n_frames), dtype=torch.float32, device=self.device)
                 input_names.append('gender')
                 dynamix_axes['gender'] = {
                     1: 'n_frames'
                 }
-        if hparams.get('use_speed_embed', False):
+        if hparams['use_speed_embed']:
             if self.expose_velocity:
                 kwargs['velocity'] = torch.rand((1, n_frames), dtype=torch.float32, device=self.device)
                 input_names.append('velocity')

diff --git a/deployment/modules/fastspeech2.py b/deployment/modules/fastspeech2.py
@@ -1,17 +1,22 @@
 import copy
 
+import numpy as np
 import torch
 import torch.nn as nn
 import torch.nn.functional as F
 
+from modules.commons.common_layers import NormalInitEmbedding as Embedding
 from modules.fastspeech.acoustic_encoder import FastSpeech2Acoustic
 from modules.fastspeech.variance_encoder import FastSpeech2Variance
 from utils.hparams import hparams
-from utils.pitch_utils import (
-    f0_bin, f0_mel_min, f0_mel_max
-)
 from utils.text_encoder import PAD_INDEX
 
+f0_bin = 256
+f0_max = 1100.0
+f0_min = 50.0
+f0_mel_min = 1127 * np.log(1 + f0_min / 700)
+f0_mel_max = 1127 * np.log(1 + f0_max / 700)
+
 
 def f0_to_coarse(f0):
     f0_mel = 1127 * (1 + f0 / 700).log()
@@ -38,10 +43,16 @@ def forward(self, dur):
 class FastSpeech2AcousticONNX(FastSpeech2Acoustic):
     def __init__(self, vocab_size):
         super().__init__(vocab_size=vocab_size)
+
+        # for temporary compatibility; will be completely removed in the future
+        self.f0_embed_type = hparams['f0_embed_type']
+        if self.f0_embed_type == 'discrete':
+            self.pitch_embed = Embedding(300, hparams['hidden_size'], PAD_INDEX)
+
         self.lr = LengthRegulator()
-        if hparams.get('use_key_shift_embed', False):
+        if hparams['use_key_shift_embed']:
             self.shift_min, self.shift_max = hparams['augmentation_args']['random_pitch_shifting']['range']
-        if hparams.get('use_speed_embed', False):
+        if hparams['use_speed_embed']:
             self.speed_min, self.speed_max = hparams['augmentation_args']['random_time_stretching']['range']
 
     # noinspection PyMethodOverriding
@@ -71,7 +82,7 @@ def forward(self, tokens, durations, f0, variances: dict, gender=None, velocity=
             ], dim=-1).sum(-1)
             condition += variance_embeds
 
-        if hparams.get('use_key_shift_embed', False):
+        if hparams['use_key_shift_embed']:
             if hasattr(self, 'frozen_key_shift'):
                 key_shift_embed = self.key_shift_embed(self.frozen_key_shift[:, None, None])
             else:
@@ -81,7 +92,7 @@ def forward(self, tokens, durations, f0, variances: dict, gender=None, velocity=
                 key_shift_embed = self.key_shift_embed(key_shift[:, :, None])
             condition += key_shift_embed
 
-        if hparams.get('use_speed_embed', False):
+        if hparams['use_speed_embed']:
             if velocity is not None:
                 velocity = torch.clip(velocity, min=self.speed_min, max=self.speed_max)
                 speed_embed = self.speed_embed(velocity[:, :, None])

diff --git a/docs/BestPractices.md b/docs/BestPractices.md
@@ -132,11 +132,7 @@ Once the coverage checks passed, a phoneme distribution summary will be saved in
 
 ![phoneme-distribution](resources/phoneme-distribution.jpg)
 
-During the binarization process, each phoneme will be assigned with a unique phoneme ID according the order of their names. By default, there are one padding index before all real phonemes IDs. You may edit the number of padding indices, but it is not recommended to do so:
-
-```yaml
-num_pad_tokens: 1
-```
+During the binarization process, each phoneme will be assigned a unique phoneme ID according to the order of their names. There is one padding index (marked as `<PAD>`) before all real phoneme IDs.
 
 The dictionary used to binarize the dataset will be copied to the binary data directory by the binarizer, and will be copied again to the experiment directory by the trainer. When exported to ONNX, the dictionary and the phoneme sequence ordered by IDs will be saved to the artifact directory. You do not need to carry the original dictionary file for training and inference.