Skip to content

Commit

Permalink
Merge pull request #151 from nnsvs/icassp2023-exp
Browse files Browse the repository at this point in the history
Add recipes for reproducing experiments reported in our NNSVS paper
  • Loading branch information
r9y9 committed Nov 1, 2022
2 parents b0d240c + 8bb081a commit 992a4e3
Show file tree
Hide file tree
Showing 66 changed files with 3,407 additions and 0 deletions.
109 changes: 109 additions & 0 deletions recipes/_common/hed/jp_icassp2023.hed
Original file line number Diff line number Diff line change
@@ -0,0 +1,109 @@
# Feature dim: 86 for acoustic model, 82 for duration/timelag
# in_rest_idx: 0
# in_lf0_idx: 51

QS "C-Phone_Muon" {*-sil+*,*-pau+*}

QS "C-VUV_Voiced" {*-a+*,*-i+*,*-u+*,*-e+*,*-o+*,*-v+*,*-b+*,*-by+*,*-m+*,*-my+*,*-w+*,*-z+*,*-j+*,*-d+*,*-dy+*,*-n+*,*-ny+*,*-N+*,*-r+*,*-ry+*,*-g+*,*-gy+*,*-y+*}
# NOTE: rename "C-NOFIX_VUV_Unvoiced" to "C-VUV_Unvoiced" to enable auto-correction of unvoiced phonemes
QS "C-NOFIX_VUV_Unvoiced" {*-A+*,*-I+*,*-U+*,*-E+*,*-O+*,*-f+*,*-p+*,*-py+*,*-s+*,*-sh+*,*-ts+*,*-ch+*,*-t+*,*-ty+*,*-k+*,*-ky+*,*-h+*,*-hy+*}

QS "C-Phone_sil" {*-sil+*}
QS "C-Phone_pau" {*-pau+*}
QS "C-Phone_A" {*-A+*}
QS "C-Phone_E" {*-E+*}
QS "C-Phone_I" {*-I+*}
QS "C-Phone_N" {*-N+*}
QS "C-Phone_O" {*-O+*}
QS "C-Phone_U" {*-U+*}
QS "C-Phone_a" {*-a+*}
QS "C-Phone_b" {*-b+*}
QS "C-Phone_br" {*-br+*}
QS "C-Phone_by" {*-by+*}
QS "C-Phone_ch" {*-ch+*}
QS "C-Phone_cl" {*-cl+*}
QS "C-Phone_d" {*-d+*}
QS "C-Phone_dy" {*-dy+*}
QS "C-Phone_e" {*-e+*}
QS "C-Phone_f" {*-f+*}
QS "C-Phone_g" {*-g+*}
QS "C-Phone_gy" {*-gy+*}
QS "C-Phone_h" {*-h+*}
QS "C-Phone_hy" {*-hy+*}
QS "C-Phone_i" {*-i+*}
QS "C-Phone_j" {*-j+*}
QS "C-Phone_k" {*-k+*}
QS "C-Phone_ky" {*-ky+*}
QS "C-Phone_m" {*-m+*}
QS "C-Phone_my" {*-my+*}
QS "C-Phone_n" {*-n+*}
QS "C-Phone_ny" {*-ny+*}
QS "C-Phone_o" {*-o+*}
QS "C-Phone_p" {*-p+*}
QS "C-Phone_py" {*-py+*}
QS "C-Phone_r" {*-r+*}
QS "C-Phone_ry" {*-ry+*}
QS "C-Phone_s" {*-s+*}
QS "C-Phone_sh" {*-sh+*}
QS "C-Phone_t" {*-t+*}
QS "C-Phone_ts" {*-ts+*}
QS "C-Phone_ty" {*-ty+*}
QS "C-Phone_u" {*-u+*}
QS "C-Phone_v" {*-v+*}
QS "C-Phone_w" {*-w+*}
QS "C-Phone_y" {*-y+*}
QS "C-Phone_z" {*-z+*}
QS "C-Phone_GlottalStop" {*-GlottalStop+*}
QS "C-Phone_Edge" {*-Edge+*}

# absolute pitch (L/C/R)
CQS "d1" {/D:(\NOTE)!}
CQS "e1" {/E:(\NOTE)]}
CQS "f1" {/F:(\NOTE)#}

# relative pitch (C)
CQS "e2" {](\d+)^}

# phoneme-level positional features (C)
CQS "p12" {-(\d+)!}
CQS "p13" {!(\d+)[}

# distance between consonant and vowel
CQS "p14" {[(\d+)$}
CQS "p15" {$(\d+)]}

# number of phonemes in a syllable (C)
CQS "b1" {/B:(\d+)_}

# syllable positional features (C)
CQS "b2" {_(\d+)_}
CQS "b3" {_(\d+)@}

# length of current note (C)
CQS "e6" {!(\d+)@}
CQS "e7" {@(\d+)#}
CQS "e8" {#(\d+)+}

# note-level positional features in measures (C)
CQS "e10_position_by_note_in_measure" {](\d+)$}
CQS "e11_position_by_note_in_measure" {$(\d+)|}
CQS "e12_position_by_10ms_in_measure" {|(\d+)[}
CQS "e13_position_by_10ms_in_measure" {[(\d+)&}
CQS "e14_position_by_96th_note_in_measure" {&(\d+)]}
CQS "e15_position_by_96th_note_in_measure" {](\d+)=}
CQS "e16_position_by_percent_in_measure" {=(\d+)^}
CQS "e17_position_by_percent_in_measure" {^(\d+)~}

# note-level positional features in phrase (C)
CQS "e18_position_by_note" {~(\d+)#}
CQS "e19_position_by_note" {#(\d+)_}
CQS "e20_position_by_10ms" {_(\d+);}
CQS "e21_position_by_10ms" {;(\d+)$}
CQS "e22_position_by_96th_note" {$(\d+)&}
CQS "e23_position_by_96th_note" {&(\d+)%}
CQS "e24_position_by_percent" {%(\d+)[}
CQS "e25_position_by_percent" {[(\d+)|}

# pitch diff
CQS "e57" {~([pm]\d+)+}
CQS "e58" {+([pm]\d+)!}
Original file line number Diff line number Diff line change
@@ -0,0 +1,139 @@
### General settings.
## The name of singer
spk: "namine_ritsu"

## Experiment tag (for managing experiments)
tag:

## Directory of Unzipped singing voice database
# PLEASE CHANGE THE PATH BASED ON YOUR ENVIRONMENT
db_root: "~/data/「波音リツ」歌声データベースVer2"

## Output directory
# All the generated labels, intermediate files, and segmented wav files
# will be saved in the following directory
out_dir: "./data"

## Songs to be excluded from training.
exclude_songs: []

### Utaupy related settings.
## Utaupy table path
# This singing voice database contains the unvoiced vowels "I" and "U".
# To enable the unvoiced vowels, a modified version of the Sinsy dictionary
# files is needed.
utaupy_table_path: "../../_common/no2/utaupy/kana2phonemes_002_oto2lab.table"

## HTS-style question file used for extracting musical/linguistic context from MusicXML files
question_path: "../../_common/hed/jp_icassp2023.hed"

### Data preparation related settings.
## Song segmentation by silence durations.
# Split song by silences (in sec)
segmentation_threshold: 0.1
# Min duration for a segment
# note: there could be some exceptions (e.g., the last segment of a song)
segment_min_duration: 5.0
# Force split segments if long silence is found regardless of min_duration
force_split_threshold: 5.0
# Offset correction
# If True, the offset is computed over the entire song;
# otherwise, the offset is computed separately for each segment.
global_offset_correction: False
offset_correction_threshold: 0.01
# Time-lag constraints to filter outliers
timelag_allowed_range: [-20, 19]
timelag_allowed_range_rest: [-40, 39]

suppress_start_end_pau: True
start_end_pau_suppression_ratio: 0.2

# Audio sampling rate
# CAUTION: Changing sample_rate may affect the dimension number of acoustic features.
# DO NOT CHANGE this unless you know the relationship between the dim of bap and sample_rate.
sample_rate: 24000

gain_normalize: False

###########################################################
# FEATURE EXTRACTION SETTING #
###########################################################

timelag_features: defaults
duration_features: defaults
acoustic_features: nnsvs_contrib_melf0_24k_diffsinger_compat

# Parameter trajectory smoothing
# Ref: The NAIST Text-to-Speech System for the Blizzard Challenge 2015
trajectory_smoothing: false
trajectory_smoothing_cutoff: 50

###########################################################
# TRAINING SETTING #
###########################################################

# Models
# To customize, add your own config (or edit an existing one) in
# conf/train/{timelag,duration,acoustic}/ and
# specify the config name below
# NOTE: *_model: model definition, *_train: general train configs,
# *_data: data configs (e.g., batch size)

timelag_model: timelag_vp_mdn
timelag_train: myconfig
timelag_data: myconfig

duration_model: duration_vp_mdn
duration_train: myconfig
duration_data: myconfig

acoustic_model: acoustic_nnsvs_ar
acoustic_train: myconfig
acoustic_data: melf0

postfilter_model: postfilter_mgc
postfilter_train: mgc
postfilter_data: myconfig

# Pretrained model dir (leave empty to disable)
pretrained_expdir:

# Advanced settings for hyperparameter search with Hydra and Optuna.
# https://hydra.cc/docs/plugins/optuna_sweeper/
# NOTE: Don't put spaces inside a search space configuration.
# OK: data.batch_size=range(1,16)
# NG: data.batch_size=range(1, 16)
# Example 1: data.batch_size=range(1,16) model.netG.hidden_dim=choice(32,64,128)
# Example 2: train.optim.optimizer.params.lr=interval(0.0001,0.01)
timelag_hydra_optuna_sweeper_args:
timelag_hydra_optuna_sweeper_n_trials: 100
duration_hydra_optuna_sweeper_args:
duration_hydra_optuna_sweeper_n_trials: 100
acoustic_hydra_optuna_sweeper_args:
acoustic_hydra_optuna_sweeper_n_trials: 100

###########################################################
# SYNTHESIS SETTING #
###########################################################
# conf/synthesis/synthesis/${synthesis}
synthesis: world_gv

# latest.pth or best.pth
timelag_eval_checkpoint: latest.pth
duration_eval_checkpoint: latest.pth
acoustic_eval_checkpoint: latest.pth
postfilter_eval_checkpoint: latest.pth

###########################################################
# VOCODER SETTING #
###########################################################

# NOTE: conf/parallel_wavegan/${vocoder_model}.yaml must exist.
vocoder_model:
# Pretrained checkpoint path for the vocoder model
# NOTE: if you want to try fine-tuning, please specify the path here
pretrained_vocoder_checkpoint:
# absolute/relative path to the checkpoint
# NOTE: the checkpoint is used for synthesis and packing
# This doesn't have any effect on training
vocoder_eval_checkpoint:
1 change: 1 addition & 0 deletions recipes/namine_ritsu_utagoe_db/icassp2023-24k-mel/conf

0 comments on commit 992a4e3

Please sign in to comment.