Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Drop support for some old features and behaviors #172

Merged
merged 16 commits into from
Feb 25, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 3 additions & 3 deletions augmentation/spec_stretch.py
Original file line number Diff line number Diff line change
Expand Up @@ -38,7 +38,7 @@ def process_item(self, item: dict, key_shift=0., speed=1., replace_spk_id=None)

aug_item['mel'] = mel

if speed != 1. or hparams.get('use_speed_embed', False):
if speed != 1. or hparams['use_speed_embed']:
aug_item['length'] = mel.shape[0]
aug_item['speed'] = int(np.round(hparams['hop_size'] * speed)) / hparams['hop_size'] # real speed
aug_item['seconds'] /= aug_item['speed']
Expand All @@ -50,7 +50,7 @@ def process_item(self, item: dict, key_shift=0., speed=1., replace_spk_id=None)
f0, _ = self.pe.get_pitch(
wav, samplerate=hparams['audio_sample_rate'], length=aug_item['length'],
hop_size=hparams['hop_size'], f0_min=hparams['f0_min'], f0_max=hparams['f0_max'],
speed=speed, interp_uv=hparams['interp_uv']
speed=speed, interp_uv=True
)
aug_item['f0'] = f0.astype(np.float32)

Expand Down Expand Up @@ -83,7 +83,7 @@ def process_item(self, item: dict, key_shift=0., speed=1., replace_spk_id=None)
align_length=aug_item['length']
)

if key_shift != 0. or hparams.get('use_key_shift_embed', False):
if key_shift != 0. or hparams['use_key_shift_embed']:
if replace_spk_id is None:
aug_item['key_shift'] = key_shift
else:
Expand Down
1 change: 0 additions & 1 deletion basics/base_binarizer.py
Original file line number Diff line number Diff line change
Expand Up @@ -173,7 +173,6 @@ def process(self):
self._train_item_names, self._valid_item_names = self.split_train_valid_set(self.item_names)

if self.binarization_args['shuffle']:
random.seed(hparams['seed'])
random.shuffle(self.item_names)

self.binary_data_dir.mkdir(parents=True, exist_ok=True)
Expand Down
24 changes: 6 additions & 18 deletions basics/base_task.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,6 @@
import pathlib
import shutil
import sys
from datetime import datetime
from typing import Dict

import matplotlib
Expand All @@ -16,7 +15,6 @@
import torch.utils.data
from torchmetrics import Metric, MeanMetric
import lightning.pytorch as pl
from lightning.pytorch.callbacks import LearningRateMonitor
from lightning.pytorch.utilities.rank_zero import rank_zero_debug, rank_zero_info, rank_zero_only

from basics.base_module import CategorizedModule
Expand Down Expand Up @@ -87,8 +85,8 @@ def _finish_init(self):
# Training, validation and testing
###########
def setup(self, stage):
self.train_dataset = self.dataset_cls(hparams['train_set_name'])
self.valid_dataset = self.dataset_cls(hparams['valid_set_name'])
self.train_dataset = self.dataset_cls('train')
self.valid_dataset = self.dataset_cls('valid')
self.num_replicas = (self.trainer.distributed_sampler_kwargs or {}).get('num_replicas', 1)

def get_need_freeze_state_dict_key(self, model_state_dict) -> list:
Expand Down Expand Up @@ -130,9 +128,9 @@ def load_finetune_ckpt(
self.load_state_dict(state_dict, strict=False)

def load_pre_train_model(self):
pre_train_ckpt_path = hparams.get('finetune_ckpt_path')
blacklist = hparams.get('finetune_ignored_params')
# whitelist=hparams.get('pre_train_whitelist')
pre_train_ckpt_path = hparams['finetune_ckpt_path']
blacklist = hparams['finetune_ignored_params']
# whitelist=hparams['pre_train_whitelist']
if blacklist is None:
blacklist = []
# if whitelist is None:
Expand Down Expand Up @@ -344,8 +342,7 @@ def train_dataloader(self):
size_reversed=True,
required_batch_count_multiple=hparams['accumulate_grad_batches'],
shuffle_sample=True,
shuffle_batch=True,
seed=hparams['seed']
shuffle_batch=True
)
return torch.utils.data.DataLoader(
self.train_dataset,
Expand Down Expand Up @@ -396,7 +393,6 @@ def on_test_end(self):

@classmethod
def start(cls):
pl.seed_everything(hparams['seed'], workers=True)
task = cls()

# if pre_train is not None:
Expand Down Expand Up @@ -451,14 +447,6 @@ def start(cls):
if not hparams['infer']: # train
@rank_zero_only
def train_payload_copy():
# copy_code = input(f'{hparams["save_codes"]} code backup? y/n: ') == 'y'
copy_code = True # backup code every time
if copy_code:
code_dir = work_dir / 'codes' / datetime.now().strftime('%Y%m%d%H%M%S')
code_dir.mkdir(exist_ok=True, parents=True)
for c in hparams['save_codes']:
shutil.copytree(c, code_dir / c, dirs_exist_ok=True)
print(f'| Copied codes to {code_dir}.')
# Copy spk_map.json and dictionary.txt to work dir
binary_dir = pathlib.Path(hparams['binary_data_dir'])
spk_map = work_dir / 'spk_map.json'
Expand Down
6 changes: 1 addition & 5 deletions configs/acoustic.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -39,26 +39,22 @@ augmentation_args:
random_time_stretching:
enabled: false
range: [0.5, 2.]
domain: log # or linear
scale: 0.75

raw_data_dir: 'data/opencpop/raw'
binary_data_dir: 'data/opencpop/binary'
binarizer_cls: preprocessing.acoustic_binarizer.AcousticBinarizer
dictionary: dictionaries/opencpop-extension.txt
num_pad_tokens: 1
spec_min: [-5]
spec_max: [0]
mel_vmin: -6. #-6.
mel_vmax: 1.5
interp_uv: true
energy_smooth_width: 0.12
breathiness_smooth_width: 0.12
voicing_smooth_width: 0.12
tension_smooth_width: 0.12

use_spk_id: false
f0_embed_type: continuous
use_energy_embed: false
use_breathiness_embed: false
use_voicing_embed: false
Expand All @@ -70,7 +66,7 @@ timesteps: 1000
max_beta: 0.02
rel_pos: true
diff_accelerator: ddim
pndm_speedup: 10
diff_speedup: 10
hidden_size: 256
residual_layers: 20
residual_channels: 512
Expand Down
9 changes: 0 additions & 9 deletions configs/base.yaml
Original file line number Diff line number Diff line change
@@ -1,11 +1,5 @@
# task
task_cls: null
seed: 1234
save_codes:
- configs
- modules
- training
- utils

#############
# dataset
Expand Down Expand Up @@ -36,7 +30,6 @@ enc_layers: 4
num_heads: 2
enc_ffn_kernel_size: 9
ffn_act: gelu
ffn_padding: 'SAME'
use_spk_id: false

###########
Expand Down Expand Up @@ -67,8 +60,6 @@ max_batch_frames: 32000
max_batch_size: 100000
max_val_batch_frames: 60000
max_val_batch_size: 1
train_set_name: 'train'
valid_set_name: 'valid'
pe: 'parselmouth'
pe_ckpt: ''
f0_min: 65
Expand Down
1 change: 0 additions & 1 deletion configs/templates/config_acoustic.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -43,7 +43,6 @@ augmentation_args:
random_time_stretching:
enabled: true
range: [0.5, 2.]
domain: log # or linear
scale: 0.75

residual_channels: 512
Expand Down
7 changes: 1 addition & 6 deletions configs/variance.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -29,7 +29,6 @@ raw_data_dir: 'data/opencpop_variance/raw'
binary_data_dir: 'data/opencpop_variance/binary'
binarizer_cls: preprocessing.variance_binarizer.VarianceBinarizer
dictionary: dictionaries/opencpop-extension.txt
num_pad_tokens: 1

use_spk_id: false

Expand Down Expand Up @@ -80,10 +79,6 @@ energy_smooth_width: 0.12
breathiness_db_min: -96.0
breathiness_db_max: -20.0
breathiness_smooth_width: 0.12
tension_logit_min: -10.0
tension_logit_max: 10.0
tension_smooth_width: 0.12

voicing_db_min: -96.0
voicing_db_max: -12.0
voicing_smooth_width: 0.12
Expand All @@ -109,7 +104,7 @@ max_beta: 0.02
diff_decoder_type: 'wavenet'
diff_loss_type: l2
diff_accelerator: ddim
pndm_speedup: 10
diff_speedup: 10

# train and eval
num_sanity_val_steps: 1
Expand Down
6 changes: 3 additions & 3 deletions deployment/exporters/acoustic_exporter.py
Original file line number Diff line number Diff line change
Expand Up @@ -58,7 +58,7 @@ def __init__(
if hparams['use_spk_id'] else None
self.export_spk: List[Tuple[str, Dict[str, float]]] = export_spk \
if hparams['use_spk_id'] and export_spk is not None else []
if hparams.get('use_key_shift_embed', False) and not self.expose_gender:
if hparams['use_key_shift_embed'] and not self.expose_gender:
shift_min, shift_max = hparams['augmentation_args']['random_pitch_shifting']['range']
key_shift = freeze_gender * shift_max if freeze_gender >= 0. else freeze_gender * abs(shift_min)
key_shift = max(min(key_shift, shift_max), shift_min) # clip key shift
Expand Down Expand Up @@ -143,14 +143,14 @@ def _torch_export_model(self):
for v_name in self.model.fs2.variance_embed_list
}
}
if hparams.get('use_key_shift_embed', False):
if hparams['use_key_shift_embed']:
if self.expose_gender:
kwargs['gender'] = torch.rand((1, n_frames), dtype=torch.float32, device=self.device)
input_names.append('gender')
dynamix_axes['gender'] = {
1: 'n_frames'
}
if hparams.get('use_speed_embed', False):
if hparams['use_speed_embed']:
if self.expose_velocity:
kwargs['velocity'] = torch.rand((1, n_frames), dtype=torch.float32, device=self.device)
input_names.append('velocity')
Expand Down
25 changes: 18 additions & 7 deletions deployment/modules/fastspeech2.py
Original file line number Diff line number Diff line change
@@ -1,17 +1,22 @@
import copy

import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F

from modules.commons.common_layers import NormalInitEmbedding as Embedding
from modules.fastspeech.acoustic_encoder import FastSpeech2Acoustic
from modules.fastspeech.variance_encoder import FastSpeech2Variance
from utils.hparams import hparams
from utils.pitch_utils import (
f0_bin, f0_mel_min, f0_mel_max
)
from utils.text_encoder import PAD_INDEX

f0_bin = 256
f0_max = 1100.0
f0_min = 50.0
f0_mel_min = 1127 * np.log(1 + f0_min / 700)
f0_mel_max = 1127 * np.log(1 + f0_max / 700)


def f0_to_coarse(f0):
f0_mel = 1127 * (1 + f0 / 700).log()
Expand All @@ -38,10 +43,16 @@ def forward(self, dur):
class FastSpeech2AcousticONNX(FastSpeech2Acoustic):
def __init__(self, vocab_size):
super().__init__(vocab_size=vocab_size)

# for temporary compatibility; will be completely removed in the future
self.f0_embed_type = hparams['f0_embed_type']
if self.f0_embed_type == 'discrete':
self.pitch_embed = Embedding(300, hparams['hidden_size'], PAD_INDEX)

self.lr = LengthRegulator()
if hparams.get('use_key_shift_embed', False):
if hparams['use_key_shift_embed']:
self.shift_min, self.shift_max = hparams['augmentation_args']['random_pitch_shifting']['range']
if hparams.get('use_speed_embed', False):
if hparams['use_speed_embed']:
self.speed_min, self.speed_max = hparams['augmentation_args']['random_time_stretching']['range']

# noinspection PyMethodOverriding
Expand Down Expand Up @@ -71,7 +82,7 @@ def forward(self, tokens, durations, f0, variances: dict, gender=None, velocity=
], dim=-1).sum(-1)
condition += variance_embeds

if hparams.get('use_key_shift_embed', False):
if hparams['use_key_shift_embed']:
if hasattr(self, 'frozen_key_shift'):
key_shift_embed = self.key_shift_embed(self.frozen_key_shift[:, None, None])
else:
Expand All @@ -81,7 +92,7 @@ def forward(self, tokens, durations, f0, variances: dict, gender=None, velocity=
key_shift_embed = self.key_shift_embed(key_shift[:, :, None])
condition += key_shift_embed

if hparams.get('use_speed_embed', False):
if hparams['use_speed_embed']:
if velocity is not None:
velocity = torch.clip(velocity, min=self.speed_min, max=self.speed_max)
speed_embed = self.speed_embed(velocity[:, :, None])
Expand Down
6 changes: 1 addition & 5 deletions docs/BestPractices.md
Original file line number Diff line number Diff line change
Expand Up @@ -132,11 +132,7 @@ Once the coverage checks passed, a phoneme distribution summary will be saved in

![phoneme-distribution](resources/phoneme-distribution.jpg)

During the binarization process, each phoneme will be assigned with a unique phoneme ID according the order of their names. By default, there are one padding index before all real phonemes IDs. You may edit the number of padding indices, but it is not recommended to do so:

```yaml
num_pad_tokens: 1
```
During the binarization process, each phoneme will be assigned a unique phoneme ID according to the order of their names. There is one padding index (marked as `<PAD>`) before all real phoneme IDs.

The dictionary used to binarize the dataset will be copied to the binary data directory by the binarizer, and will be copied again to the experiment directory by the trainer. When exported to ONNX, the dictionary and the phoneme sequence ordered by IDs will be saved to the artifact directory. You do not need to carry the original dictionary file for training and inference.

Expand Down
Loading