Merge pull request #151 from nnsvs/icassp2023-exp

Add recipes for reproducing experiments reported in our NNSVS paper
nnsvs · Nov 1, 2022 · 992a4e3 · 992a4e3
2 parents b0d240c + 8bb081a
commit 992a4e3
Show file tree

Hide file tree

Showing 66 changed files with 3,407 additions and 0 deletions.
diff --git a/recipes/_common/hed/jp_icassp2023.hed b/recipes/_common/hed/jp_icassp2023.hed
@@ -0,0 +1,109 @@
+# Feature dim: 86 for acoustic model, 82 for duration/timelag
+# in_rest_idx: 0
+# in_lf0_idx: 51
+
+QS "C-Phone_Muon"     {*-sil+*,*-pau+*}
+
+QS "C-VUV_Voiced" {*-a+*,*-i+*,*-u+*,*-e+*,*-o+*,*-v+*,*-b+*,*-by+*,*-m+*,*-my+*,*-w+*,*-z+*,*-j+*,*-d+*,*-dy+*,*-n+*,*-ny+*,*-N+*,*-r+*,*-ry+*,*-g+*,*-gy+*,*-y+*}
+# NOTE: rename "C-NOFIX_VUV_Unvoiced" to "C-VUV_Unvoiced" to enable automatic correction of unvoiced phonemes
+QS "C-NOFIX_VUV_Unvoiced"  {*-A+*,*-I+*,*-U+*,*-E+*,*-O+*,*-f+*,*-p+*,*-py+*,*-s+*,*-sh+*,*-ts+*,*-ch+*,*-t+*,*-ty+*,*-k+*,*-ky+*,*-h+*,*-hy+*}
+
+QS "C-Phone_sil" {*-sil+*}
+QS "C-Phone_pau" {*-pau+*}
+QS "C-Phone_A"   {*-A+*}
+QS "C-Phone_E"   {*-E+*}
+QS "C-Phone_I"   {*-I+*}
+QS "C-Phone_N"   {*-N+*}
+QS "C-Phone_O"   {*-O+*}
+QS "C-Phone_U"   {*-U+*}
+QS "C-Phone_a"   {*-a+*}
+QS "C-Phone_b"   {*-b+*}
+QS "C-Phone_br"  {*-br+*}
+QS "C-Phone_by"  {*-by+*}
+QS "C-Phone_ch"  {*-ch+*}
+QS "C-Phone_cl"  {*-cl+*}
+QS "C-Phone_d"   {*-d+*}
+QS "C-Phone_dy"  {*-dy+*}
+QS "C-Phone_e"   {*-e+*}
+QS "C-Phone_f"   {*-f+*}
+QS "C-Phone_g"   {*-g+*}
+QS "C-Phone_gy"  {*-gy+*}
+QS "C-Phone_h"   {*-h+*}
+QS "C-Phone_hy"  {*-hy+*}
+QS "C-Phone_i"   {*-i+*}
+QS "C-Phone_j"   {*-j+*}
+QS "C-Phone_k"   {*-k+*}
+QS "C-Phone_ky"  {*-ky+*}
+QS "C-Phone_m"   {*-m+*}
+QS "C-Phone_my"  {*-my+*}
+QS "C-Phone_n"   {*-n+*}
+QS "C-Phone_ny"  {*-ny+*}
+QS "C-Phone_o"   {*-o+*}
+QS "C-Phone_p"   {*-p+*}
+QS "C-Phone_py"  {*-py+*}
+QS "C-Phone_r"   {*-r+*}
+QS "C-Phone_ry"  {*-ry+*}
+QS "C-Phone_s"   {*-s+*}
+QS "C-Phone_sh"  {*-sh+*}
+QS "C-Phone_t"   {*-t+*}
+QS "C-Phone_ts"  {*-ts+*}
+QS "C-Phone_ty"  {*-ty+*}
+QS "C-Phone_u"   {*-u+*}
+QS "C-Phone_v"   {*-v+*}
+QS "C-Phone_w"   {*-w+*}
+QS "C-Phone_y"   {*-y+*}
+QS "C-Phone_z"   {*-z+*}
+QS "C-Phone_GlottalStop"   {*-GlottalStop+*}
+QS "C-Phone_Edge"          {*-Edge+*}
+
+# absolute pitch (L/C/R)
+CQS "d1" {/D:(\NOTE)!}
+CQS "e1" {/E:(\NOTE)]}
+CQS "f1" {/F:(\NOTE)#}
+
+# relative pitch (C)
+CQS "e2" {](\d+)^}
+
+# phoneme-level positional features (C)
+CQS "p12" {-(\d+)!}
+CQS "p13" {!(\d+)[}
+
+# distance between consonant and vowel
+CQS "p14" {[(\d+)$}
+CQS "p15" {$(\d+)]}
+
+# number of phonemes in a syllable (C)
+CQS "b1" {/B:(\d+)_}
+
+# syllable positional features (C)
+CQS "b2" {_(\d+)_}
+CQS "b3" {_(\d+)@}
+
+# length of current note (C)
+CQS "e6" {!(\d+)@}
+CQS "e7" {@(\d+)#}
+CQS "e8" {#(\d+)+}
+
+# note-level positional features in measures (C)
+CQS "e10_position_by_note_in_measure"      {](\d+)$}
+CQS "e11_position_by_note_in_measure"      {$(\d+)|}
+CQS "e12_position_by_10ms_in_measure"      {|(\d+)[}
+CQS "e13_position_by_10ms_in_measure"      {[(\d+)&}
+CQS "e14_position_by_96th_note_in_measure" {&(\d+)]}
+CQS "e15_position_by_96th_note_in_measure" {](\d+)=}
+CQS "e16_position_by_percent_in_measure"   {=(\d+)^}
+CQS "e17_position_by_percent_in_measure"   {^(\d+)~}
+
+# note-level positional features in phrase (C)
+CQS "e18_position_by_note"      {~(\d+)#}
+CQS "e19_position_by_note"      {#(\d+)_}
+CQS "e20_position_by_10ms"      {_(\d+);}
+CQS "e21_position_by_10ms"      {;(\d+)$}
+CQS "e22_position_by_96th_note" {$(\d+)&}
+CQS "e23_position_by_96th_note" {&(\d+)%}
+CQS "e24_position_by_percent"   {%(\d+)[}
+CQS "e25_position_by_percent"   {[(\d+)|}
+
+# pitch diff
+CQS "e57" {~([pm]\d+)+}
+CQS "e58" {+([pm]\d+)!}
diff --git a/recipes/namine_ritsu_utagoe_db/icassp2023-24k-mel-diffsinger-compat/README.md b/recipes/namine_ritsu_utagoe_db/icassp2023-24k-mel-diffsinger-compat/README.md
@@ -0,0 +1 @@
+../icassp2023-24k-world/README.md
diff --git a/recipes/namine_ritsu_utagoe_db/icassp2023-24k-mel-diffsinger-compat/anasyn_icassp2023.sh b/recipes/namine_ritsu_utagoe_db/icassp2023-24k-mel-diffsinger-compat/anasyn_icassp2023.sh
@@ -0,0 +1 @@
+../icassp2023-24k-world/anasyn_icassp2023.sh
diff --git a/recipes/namine_ritsu_utagoe_db/icassp2023-24k-mel-diffsinger-compat/conf b/recipes/namine_ritsu_utagoe_db/icassp2023-24k-mel-diffsinger-compat/conf
@@ -0,0 +1 @@
+../icassp2023-24k-world/conf
diff --git a/recipes/namine_ritsu_utagoe_db/icassp2023-24k-mel-diffsinger-compat/config.yaml b/recipes/namine_ritsu_utagoe_db/icassp2023-24k-mel-diffsinger-compat/config.yaml
@@ -0,0 +1,139 @@
+### General settings.
+## The name of singer
+spk: "namine_ritsu"
+
+## Experiment tag (for managing experiments)
+tag:
+
+## Directory of Unzipped singing voice database
+# PLEASE CHANGE THE PATH BASED ON YOUR ENVIRONMENT
+db_root: "~/data/「波音リツ」歌声データベースVer2"
+
+## Output directory
+# All the generated labels, intermediate files, and segmented wav files
+# will be saved in the following directory
+out_dir: "./data"
+
+## Songs to be excluded from training.
+exclude_songs: []
+
+### Utaupy related settings.
+## Utaupy table path
+# This singing voice database contains the unvoiced vowels "I" and "U".
+# To enable the unvoiced vowels, a modified version of the Sinsy dictionary
+# files is needed.
+utaupy_table_path: "../../_common/no2/utaupy/kana2phonemes_002_oto2lab.table"
+
+## HTS-style question used for extracting musical/linguistic context from musicxml files
+question_path: "../../_common/hed/jp_icassp2023.hed"
+
+### Data preparation related settings.
+## Song segmentation by silence durations.
+# Split song by silences (in sec)
+segmentation_threshold: 0.1
+# Min duration for a segment
+# note: there could be some exceptions (e.g., the last segment of a song)
+segment_min_duration: 5.0
+# Force split segments if long silence is found regardless of min_duration
+force_split_threshold: 5.0
+# Offset correction
+# If True, the offset is computed over the entire song;
+# otherwise, the offset is computed separately for each segment.
+global_offset_correction: False
+offset_correction_threshold: 0.01
+# Time-lag constraints to filter outliers
+timelag_allowed_range: [-20, 19]
+timelag_allowed_range_rest: [-40, 39]
+
+suppress_start_end_pau: True
+start_end_pau_suppression_ratio: 0.2
+
+# Audio sampling rate
+# CAUTION: Changing sample_rate may affect the dimension number of acoustic features.
+# DO NOT CHANGE this unless you know the relationship between the dim of bap and sample_rate.
+sample_rate: 24000
+
+gain_normalize: False
+
+###########################################################
+#                FEATURE EXTRACTION SETTING               #
+###########################################################
+
+timelag_features: defaults
+duration_features: defaults
+acoustic_features: nnsvs_contrib_melf0_24k_diffsinger_compat
+
+# Parameter trajectory smoothing
+# Ref: The NAIST Text-to-Speech System for the Blizzard Challenge 2015
+trajectory_smoothing: false
+trajectory_smoothing_cutoff: 50
+
+###########################################################
+#                TRAINING SETTING                         #
+###########################################################
+
+# Models
+# To customize, put your config or change ones in
+# conf/train/{timelag,duration,acoustic}/ and
+# specify the config name below
+# NOTE: *_model: model definition, *_train: general train configs,
+# *_data: data configs (e.g., batch size)
+
+timelag_model: timelag_vp_mdn
+timelag_train: myconfig
+timelag_data: myconfig
+
+duration_model: duration_vp_mdn
+duration_train: myconfig
+duration_data: myconfig
+
+acoustic_model: acoustic_nnsvs_ar
+acoustic_train: myconfig
+acoustic_data: melf0
+
+postfilter_model: postfilter_mgc
+postfilter_train: mgc
+postfilter_data: myconfig
+
+# Pretrained model dir (leave empty to disable)
+pretrained_expdir:
+
+# Advanced settings for hyperparameter search with Hydra and Optuna.
+# https://hydra.cc/docs/plugins/optuna_sweeper/
+# NOTE: Don't put spaces inside a search-space specification.
+# OK: data.batch_size=range(1,16)
+# NG: data.batch_size=range(1, 16)
+# Example 1: data.batch_size=range(1,16) model.netG.hidden_dim=choice(32,64,128)
+# Example 2: train.optim.optimizer.params.lr=interval(0.0001,0.01)
+timelag_hydra_optuna_sweeper_args:
+timelag_hydra_optuna_sweeper_n_trials: 100
+duration_hydra_optuna_sweeper_args:
+duration_hydra_optuna_sweeper_n_trials: 100
+acoustic_hydra_optuna_sweeper_args:
+acoustic_hydra_optuna_sweeper_n_trials: 100
+
+###########################################################
+#                SYNTHESIS SETTING                        #
+###########################################################
+# conf/synthesis/synthesis/${synthesis}
+synthesis: world_gv
+
+# latest.pth or best.pth
+timelag_eval_checkpoint: latest.pth
+duration_eval_checkpoint: latest.pth
+acoustic_eval_checkpoint: latest.pth
+postfilter_eval_checkpoint: latest.pth
+
+###########################################################
+#                VOCODER SETTING                          #
+###########################################################
+
+# NOTE: conf/parallel_wavegan/${vocoder_model}.yaml must exist.
+vocoder_model:
+# Pretrained checkpoint path for the vocoder model
+# NOTE: if you want to try fine-tuning, please specify the path here
+pretrained_vocoder_checkpoint:
+# absolute/relative path to the checkpoint
+# NOTE: the checkpoint is used for synthesis and packing
+# This doesn't have any effect on training
+vocoder_eval_checkpoint:
diff --git a/recipes/namine_ritsu_utagoe_db/icassp2023-24k-mel-diffsinger-compat/run.sh b/recipes/namine_ritsu_utagoe_db/icassp2023-24k-mel-diffsinger-compat/run.sh
@@ -0,0 +1 @@
+../icassp2023-24k-world/run.sh
diff --git a/recipes/namine_ritsu_utagoe_db/icassp2023-24k-mel-diffsinger-compat/synthesis_icassp2023.sh b/recipes/namine_ritsu_utagoe_db/icassp2023-24k-mel-diffsinger-compat/synthesis_icassp2023.sh
@@ -0,0 +1 @@
+../icassp2023-24k-world/synthesis_icassp2023.sh
diff --git a/recipes/namine_ritsu_utagoe_db/icassp2023-24k-mel/README.md b/recipes/namine_ritsu_utagoe_db/icassp2023-24k-mel/README.md
@@ -0,0 +1 @@
+../icassp2023-24k-world/README.md
diff --git a/recipes/namine_ritsu_utagoe_db/icassp2023-24k-mel/anasyn_icassp2023.sh b/recipes/namine_ritsu_utagoe_db/icassp2023-24k-mel/anasyn_icassp2023.sh
@@ -0,0 +1 @@
+../icassp2023-24k-world/anasyn_icassp2023.sh
diff --git a/recipes/namine_ritsu_utagoe_db/icassp2023-24k-mel/conf b/recipes/namine_ritsu_utagoe_db/icassp2023-24k-mel/conf
@@ -0,0 +1 @@
+../icassp2023-24k-world/conf