-
Notifications
You must be signed in to change notification settings - Fork 81
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Merge pull request #44 from r9y9/mdn-improvements
Dim-wise MDN: attempt to improve MDN-based models
- Loading branch information
Showing
21 changed files
with
534 additions
and
83 deletions.
There are no files selected for viewing
Empty file.
16 changes: 16 additions & 0 deletions
16
egs/nit-song070/svs-world-conv-mdn/conf/train/acoustic/data/myconfig.yaml
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,16 @@ | ||
# @package _group_ | ||
|
||
# training set | ||
train_no_dev: | ||
in_dir: | ||
out_dir: | ||
|
||
# development set | ||
dev: | ||
in_dir: | ||
out_dir: | ||
|
||
# data loader | ||
num_workers: 2 | ||
batch_size: 2 | ||
pin_memory: true |
17 changes: 17 additions & 0 deletions
17
egs/nit-song070/svs-world-conv-mdn/conf/train/acoustic/model/acoustic_baseline.yaml
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,17 @@ | ||
# @package _group_ | ||
# (mgc, lf0, vuv, bap) | ||
stream_sizes: [180, 3, 1, 15] | ||
has_dynamic_features: [true, true, false, true] | ||
num_windows: 3 | ||
# If None, automatically set based on stream sizes | ||
stream_weights: | ||
|
||
netG: | ||
_target_: nnsvs.model.Conv1dResnetMDN | ||
in_dim: 299 | ||
out_dim: 199 | ||
hidden_dim: 128 | ||
num_layers: 6 | ||
dropout: 0.1 | ||
num_gaussians: 4 | ||
dim_wise: false |
17 changes: 17 additions & 0 deletions
17
egs/nit-song070/svs-world-conv-mdn/conf/train/acoustic/model/acoustic_mdn.yaml
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,17 @@ | ||
# @package _group_ | ||
# (mgc, lf0, vuv, bap) | ||
stream_sizes: [180, 3, 1, 15] | ||
has_dynamic_features: [true, true, false, true] | ||
num_windows: 3 | ||
# If None, automatically set based on stream sizes | ||
stream_weights: | ||
|
||
netG: | ||
_target_: nnsvs.model.Conv1dResnetMDN | ||
in_dim: 299 | ||
out_dim: 199 | ||
hidden_dim: 128 | ||
num_layers: 6 | ||
dropout: 0.1 | ||
num_gaussians: 4 | ||
dim_wise: true |
29 changes: 29 additions & 0 deletions
29
egs/nit-song070/svs-world-conv-mdn/conf/train/acoustic/train/myconfig.yaml
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,29 @@ | ||
# @package _group_ | ||
|
||
out_dir: exp | ||
nepochs: 200 | ||
checkpoint_epoch_interval: 20 | ||
|
||
stream_wise_loss: false | ||
use_detect_anomaly: true | ||
|
||
optim: | ||
optimizer: | ||
name: Adam | ||
params: | ||
lr: 0.001 | ||
betas: [0.9, 0.999] | ||
weight_decay: 0.0 | ||
lr_scheduler: | ||
name: StepLR | ||
params: | ||
step_size: 50 | ||
gamma: 0.5 | ||
|
||
resume: | ||
checkpoint: | ||
load_optimizer: false | ||
|
||
cudnn: | ||
benchmark: false | ||
deterministic: true |
16 changes: 16 additions & 0 deletions
16
egs/nit-song070/svs-world-conv-mdn/conf/train/duration/data/myconfig.yaml
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,16 @@ | ||
# @package _group_ | ||
|
||
# training set | ||
train_no_dev: | ||
in_dir: | ||
out_dir: | ||
|
||
# development set | ||
dev: | ||
in_dir: | ||
out_dir: | ||
|
||
# data loader | ||
num_workers: 2 | ||
batch_size: 2 | ||
pin_memory: true |
14 changes: 14 additions & 0 deletions
14
egs/nit-song070/svs-world-conv-mdn/conf/train/duration/model/duration_mdn.yaml
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,14 @@ | ||
# @package _group_ | ||
|
||
stream_sizes: [1] | ||
has_dynamic_features: [false] | ||
stream_weights: [1] | ||
|
||
netG: | ||
_target_: nnsvs.model.MDN | ||
in_dim: 295 | ||
out_dim: 1 | ||
hidden_dim: 1024 | ||
num_layers: 4 | ||
dropout: 0.5 | ||
num_gaussians: 4 |
29 changes: 29 additions & 0 deletions
29
egs/nit-song070/svs-world-conv-mdn/conf/train/duration/train/myconfig.yaml
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,29 @@ | ||
# @package _group_ | ||
|
||
out_dir: exp | ||
nepochs: 50 | ||
checkpoint_epoch_interval: 20 | ||
|
||
stream_wise_loss: false | ||
use_detect_anomaly: true | ||
|
||
optim: | ||
optimizer: | ||
name: Adam | ||
params: | ||
lr: 0.001 | ||
betas: [0.9, 0.999] | ||
weight_decay: 0.0 | ||
lr_scheduler: | ||
name: StepLR | ||
params: | ||
step_size: 20 | ||
gamma: 0.5 | ||
|
||
resume: | ||
checkpoint: | ||
load_optimizer: false | ||
|
||
cudnn: | ||
benchmark: false | ||
deterministic: true |
16 changes: 16 additions & 0 deletions
16
egs/nit-song070/svs-world-conv-mdn/conf/train/timelag/data/myconfig.yaml
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,16 @@ | ||
# @package _group_ | ||
|
||
# training set | ||
train_no_dev: | ||
in_dir: | ||
out_dir: | ||
|
||
# development set | ||
dev: | ||
in_dir: | ||
out_dir: | ||
|
||
# data loader | ||
num_workers: 2 | ||
batch_size: 2 | ||
pin_memory: true |
14 changes: 14 additions & 0 deletions
14
egs/nit-song070/svs-world-conv-mdn/conf/train/timelag/model/timelag_mdn.yaml
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,14 @@ | ||
# @package _group_ | ||
|
||
stream_sizes: [1] | ||
has_dynamic_features: [false] | ||
stream_weights: [1] | ||
|
||
netG: | ||
_target_: nnsvs.model.MDN | ||
in_dim: 295 | ||
out_dim: 1 | ||
hidden_dim: 1024 | ||
num_layers: 4 | ||
dropout: 0.5 | ||
num_gaussians: 4 |
29 changes: 29 additions & 0 deletions
29
egs/nit-song070/svs-world-conv-mdn/conf/train/timelag/train/myconfig.yaml
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,29 @@ | ||
# @package _group_ | ||
|
||
out_dir: exp | ||
nepochs: 50 | ||
checkpoint_epoch_interval: 20 | ||
|
||
stream_wise_loss: false | ||
use_detect_anomaly: true | ||
|
||
optim: | ||
optimizer: | ||
name: Adam | ||
params: | ||
lr: 0.001 | ||
betas: [0.9, 0.999] | ||
weight_decay: 0.0 | ||
lr_scheduler: | ||
name: StepLR | ||
params: | ||
step_size: 20 | ||
gamma: 0.5 | ||
|
||
resume: | ||
checkpoint: | ||
load_optimizer: false | ||
|
||
cudnn: | ||
benchmark: false | ||
deterministic: true |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,65 @@ | ||
# General settings. | ||
spk: "yoko" | ||
|
||
# exp tag (for managing experiments) | ||
tag: | ||
|
||
########################################################### | ||
# DATA PREPARATION SETTING # | ||
########################################################### | ||
|
||
# Directory of Unzipped singing voice database | ||
# PLEASE CHANGE THE PATH BASED ON YOUR ENVIRONMENT | ||
db_root: "downloads/HTS-demo_NIT-SONG070-F001" | ||
|
||
# Output directory | ||
out_dir: "./data" | ||
|
||
########################################################### | ||
# FEATURE EXTRACTION SETTING # | ||
########################################################### | ||
|
||
# HTS-style question used for extracting musical/linguistic context from musicxml files | ||
question_path: "../../_common/hed/jp_qst003_nnsvs.hed" | ||
|
||
timelag_features: defaults | ||
duration_features: defaults | ||
acoustic_features: static_deltadelta | ||
|
||
########################################################### | ||
# TRAINING SETTING # | ||
########################################################### | ||
|
||
# Models | ||
# To customize, put your config or change ones in | ||
# conf/train/{timelag,duration,acoustic}/ and | ||
# specify the config name below | ||
# NOTE: *_model: model definition, *_train: general train configs, | ||
# *_data: data configs (e.g., batch size) | ||
|
||
timelag_model: timelag_mdn | ||
timelag_train: myconfig | ||
timelag_data: myconfig | ||
|
||
duration_model: duration_mdn | ||
duration_train: myconfig | ||
duration_data: myconfig | ||
|
||
acoustic_model: acoustic_mdn | ||
acoustic_train: myconfig | ||
acoustic_data: myconfig | ||
|
||
# Pretrained model dir (leave empty to disable) | ||
pretrained_expdir: | ||
|
||
########################################################### | ||
# SYNTHESIS SETTING # | ||
########################################################### | ||
timelag_synthesis: defaults | ||
duration_synthesis: defaults | ||
acoustic_synthesis: defaults | ||
|
||
# latest.pth or best.pth | ||
timelag_eval_checkpoint: latest.pth | ||
duration_eval_checkpoint: latest.pth | ||
acoustic_eval_checkpoint: latest.pth |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1 @@ | ||
../svs-world-conv |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,100 @@ | ||
#!/bin/bash | ||
|
||
# Set bash to strict mode; the script will exit on: | ||
# -e 'error', -u 'undefined variable', -o pipefail 'error in pipeline'. | ||
# Command tracing (-x 'print commands') is only turned on inside xrun. | ||
set -e | ||
set -u | ||
set -o pipefail | ||
|
||
# Run a command with shell tracing switched on, so the exact invocation is
# echoed to stderr, then switch tracing off again.
# Arguments: the command to run followed by its arguments.
# Returns:   the status of `set +x` (the command's own status is not
#            propagated; under `set -e` a failing command aborts the script).
function xrun () {
    set -x
    # Quoted so that arguments containing whitespace or glob characters reach
    # the command unchanged instead of being re-split/expanded by the shell.
    "$@"
    set +x
}
|
||
# Resolve the directory that contains this script. `&&` (rather than `;`)
# makes a failed `cd` surface as a failure instead of silently reporting the
# caller's working directory. Expansions are quoted so a checkout path that
# contains whitespace still works.
script_dir=$(cd "$(dirname "${BASH_SOURCE:-$0}")" && pwd)
NNSVS_ROOT=$script_dir/../../../
NNSVS_COMMON_ROOT=$NNSVS_ROOT/egs/_common/spsvs
. "$NNSVS_ROOT/utils/yaml_parser.sh" || exit 1

# Evaluate the output of parse_yaml for ./config.yaml with an empty prefix.
# NOTE(review): presumably this defines one shell variable per config key
# (e.g. spk, tag) -- confirm against utils/yaml_parser.sh.
eval $(parse_yaml "./config.yaml" "")

train_set="train_no_dev"
dev_set="dev"
eval_set="eval"
datasets=("$train_set" "$dev_set" "$eval_set")
testsets=("$dev_set" "$eval_set")

dumpdir=dump

dump_org_dir=$dumpdir/$spk/org
dump_norm_dir=$dumpdir/$spk/norm

stage=0
stop_stage=0

# NOTE(review): presumably lets command-line options override the variables
# set above (stage, stop_stage, ...) -- confirm against utils/parse_options.sh.
. "$NNSVS_ROOT/utils/parse_options.sh" || exit 1

# exp name: "<spk>" when no tag is given, "<spk>_<tag>" otherwise.
# ${tag:=} assigns the empty string when tag is unset so the test is safe
# under `set -u`; the quotes keep the test well-formed for any tag value.
if [ -z "${tag:=}" ]; then
    expname=${spk}
else
    expname=${spk}_${tag}
fi
expdir=exp/$expname
|
||
# Stage -1: download and unpack the HTS NIT-SONG070 demo database into
# ./downloads, unless the unpacked directory already exists.
if [ "${stage}" -le -1 ] && [ "${stop_stage}" -ge -1 ]; then
    if [ ! -e downloads/HTS-demo_NIT-SONG070-F001 ]; then
        echo "stage -1: Downloading data"
        mkdir -p downloads
        cd downloads
        # -f: fail on HTTP errors instead of saving the server's error page as
        # the archive, which would only surface later as a confusing tar error.
        curl -fLO http://hts.sp.nitech.ac.jp/archives/2.3/HTS-demo_NIT-SONG070-F001.tar.bz2
        tar jxvf HTS-demo_NIT-SONG070-F001.tar.bz2
        # Return to the recipe directory; quoted so paths with spaces work.
        cd "$script_dir"
    fi
fi
|
||
# Stage 0: prepare the data directories and split utterances into
# train/dev/eval lists.
if [ "${stage}" -le 0 ] && [ "${stop_stage}" -ge 0 ]; then
    echo "stage 0: Data preparation"
    # the following three directories will be created
    # 1) data/timelag 2) data/duration 3) data/acoustic
    # The database location is read from db_root (the config key for the
    # unzipped database directory). The previously referenced hts_demo_root is
    # not defined anywhere visible and aborts the script under `set -u`.
    python local/data_prep.py "$db_root" ./data --gain-normalize

    echo "train/dev/eval split"
    mkdir -p data/list
    # One utterance id per line: the basename of every wav file, sorted.
    find data/acoustic/ -type f -name "*.wav" -exec basename {} .wav \; \
        | sort > data/list/utt_list.txt
    # Ids containing _003 form the eval set, _004 the dev set, and everything
    # else the training set.
    grep _003 data/list/utt_list.txt > "data/list/$eval_set.list"
    grep _004 data/list/utt_list.txt > "data/list/$dev_set.list"
    grep -v _003 data/list/utt_list.txt | grep -v _004 > "data/list/$train_set.list"
fi
|
||
# Stages 1-6: each stage runs when stage <= N <= stop_stage and sources the
# corresponding shared recipe script from $NNSVS_COMMON_ROOT into this shell,
# so the sourced script sees the variables defined earlier in this file.
# Paths are quoted so a checkout location containing whitespace still works.
if [ "${stage}" -le 1 ] && [ "${stop_stage}" -ge 1 ]; then
    echo "stage 1: Feature generation"
    . "$NNSVS_COMMON_ROOT/feature_generation.sh"
fi

if [ "${stage}" -le 2 ] && [ "${stop_stage}" -ge 2 ]; then
    echo "stage 2: Training time-lag model"
    . "$NNSVS_COMMON_ROOT/train_timelag.sh"
fi

if [ "${stage}" -le 3 ] && [ "${stop_stage}" -ge 3 ]; then
    echo "stage 3: Training duration model"
    . "$NNSVS_COMMON_ROOT/train_duration.sh"
fi

if [ "${stage}" -le 4 ] && [ "${stop_stage}" -ge 4 ]; then
    echo "stage 4: Training acoustic model"
    . "$NNSVS_COMMON_ROOT/train_acoustic.sh"
fi

if [ "${stage}" -le 5 ] && [ "${stop_stage}" -ge 5 ]; then
    echo "stage 5: Generate features from timelag/duration/acoustic models"
    . "$NNSVS_COMMON_ROOT/generate.sh"
fi

if [ "${stage}" -le 6 ] && [ "${stop_stage}" -ge 6 ]; then
    echo "stage 6: Synthesis waveforms"
    . "$NNSVS_COMMON_ROOT/synthesis.sh"
fi
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -8,3 +8,4 @@ defaults: | |
- data: defaults | ||
|
||
verbose: 100 | ||
seed: 773 |
Oops, something went wrong.