Skip to content

Commit

Permalink
Merge pull request #123 from r9y9/correct-vuv
Browse files Browse the repository at this point in the history
Add correct_vuv option for data pre-processing
  • Loading branch information
r9y9 committed Jun 26, 2022
2 parents f7e0449 + 767e1a6 commit 18aa4f7
Show file tree
Hide file tree
Showing 7 changed files with 52 additions and 15 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -46,3 +46,7 @@ vibrato_mode: none
# Ref: The NAIST Text-to-Speech System for the Blizzard Challenge 2015
trajectory_smoothing: false
trajectory_smoothing_cutoff: 50

# Correct V/UV based on the musical score.
# This prevents spurious F0 estimates in silence regions.
correct_vuv: false
Original file line number Diff line number Diff line change
Expand Up @@ -46,3 +46,7 @@ vibrato_mode: diff
# Ref: The NAIST Text-to-Speech System for the Blizzard Challenge 2015
trajectory_smoothing: false
trajectory_smoothing_cutoff: 50

# Correct V/UV based on the musical score.
# This prevents spurious F0 estimates in silence regions.
correct_vuv: false
Original file line number Diff line number Diff line change
Expand Up @@ -46,3 +46,7 @@ vibrato_mode: sine
# Ref: The NAIST Text-to-Speech System for the Blizzard Challenge 2015
trajectory_smoothing: false
trajectory_smoothing_cutoff: 50

# Correct V/UV based on the musical score.
# This prevents spurious F0 estimates in silence regions.
correct_vuv: false
4 changes: 4 additions & 0 deletions nnsvs/bin/conf/prepare_features/acoustic/static_only.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -46,3 +46,7 @@ vibrato_mode: none
# Ref: The NAIST Text-to-Speech System for the Blizzard Challenge 2015
trajectory_smoothing: false
trajectory_smoothing_cutoff: 50

# Correct V/UV based on the musical score.
# This prevents spurious F0 estimates in silence regions.
correct_vuv: false
Original file line number Diff line number Diff line change
Expand Up @@ -46,3 +46,7 @@ vibrato_mode: sine
# Ref: The NAIST Text-to-Speech System for the Blizzard Challenge 2015
trajectory_smoothing: false
trajectory_smoothing_cutoff: 50

# Correct V/UV based on the musical score.
# This prevents spurious F0 estimates in silence regions.
correct_vuv: false
1 change: 1 addition & 0 deletions nnsvs/bin/prepare_features.py
Original file line number Diff line number Diff line change
Expand Up @@ -160,6 +160,7 @@ def my_app(config: DictConfig) -> None:
d4c_threshold=config.acoustic.d4c_threshold,
trajectory_smoothing=config.acoustic.trajectory_smoothing,
trajectory_smoothing_cutoff=config.acoustic.trajectory_smoothing_cutoff,
correct_vuv=config.acoustic.correct_vuv,
)
in_acoustic = FileSourceDataset(in_acoustic_source)
out_acoustic = FileSourceDataset(out_acoustic_source)
Expand Down
46 changes: 31 additions & 15 deletions nnsvs/data/data_source.py
Original file line number Diff line number Diff line change
Expand Up @@ -134,6 +134,7 @@ def __init__(
d4c_threshold=0.85,
trajectory_smoothing=False,
trajectory_smoothing_cutoff=50,
correct_vuv=False,
):
self.utt_list = utt_list
self.wav_root = wav_root
Expand All @@ -155,6 +156,7 @@ def __init__(
self.d4c_threshold = d4c_threshold
self.trajectory_smoothing = trajectory_smoothing
self.trajectory_smoothing_cutoff = trajectory_smoothing_cutoff
self.correct_vuv = correct_vuv

def collect_files(self):
wav_paths = _collect_files(self.wav_root, self.utt_list, ".wav")
Expand Down Expand Up @@ -205,6 +207,34 @@ def collect_features(self, wav_path, label_path):
# Workaround for https://github.com/r9y9/nnsvs/issues/7
f0 = np.maximum(f0, 0)

# Correct V/UV (and F0) based on the musical score information:
# frames where no musical note is assigned are treated as unvoiced.
if self.correct_vuv:
# Use a smoothed (moving-average) mask so that F0 movements such as
# overshoot around the start/end of notes are not masked out.
# The window is 0.1 sec. long (could be tuned for better results).
win_length = int(0.1 / (self.frame_period * 0.001))
mask = np.convolve(f0_score, np.ones(win_length) / win_length, "same")
if len(f0) > len(mask):
mask = np.pad(mask, (0, len(f0) - len(mask)), "constant")
f0 = f0 * np.sign(mask)

spectrogram = pyworld.cheaptrick(x, f0, timeaxis, fs, f0_floor=min_f0)
aperiodicity = pyworld.d4c(x, f0, timeaxis, fs, threshold=self.d4c_threshold)

f0 = f0[:, None]
lf0 = f0.copy()
nonzero_indices = np.nonzero(f0)
lf0[nonzero_indices] = np.log(f0[nonzero_indices])
if self.use_harvest:
# https://github.com/mmorise/World/issues/35#issuecomment-306521887
vuv = (aperiodicity[:, 0] < 0.5).astype(np.float32)[:, None]
else:
vuv = (lf0 != 0).astype(np.float32)

# F0 -> continuous F0
lf0 = interp1d(lf0, kind="slinear")

# Vibrato parameter extraction
sr_f0 = int(1 / (self.frame_period * 0.001))
if self.vibrato_mode == "sine":
Expand Down Expand Up @@ -250,25 +280,11 @@ def collect_features(self, wav_path, label_path):
else:
raise RuntimeError("Unknown vibrato mode: {}".format(self.vibrato_mode))

spectrogram = pyworld.cheaptrick(x, f0, timeaxis, fs, f0_floor=min_f0)
aperiodicity = pyworld.d4c(x, f0, timeaxis, fs, threshold=self.d4c_threshold)

mgc = pysptk.sp2mc(
spectrogram, order=self.mgc_order, alpha=pysptk.util.mcepalpha(fs)
)
# F0 of speech
f0 = f0[:, None]
lf0 = f0.copy()
nonzero_indices = np.nonzero(f0)
lf0[nonzero_indices] = np.log(f0[nonzero_indices])
if self.use_harvest:
# https://github.com/mmorise/World/issues/35#issuecomment-306521887
vuv = (aperiodicity[:, 0] < 0.5).astype(np.float32)[:, None]
else:
vuv = (lf0 != 0).astype(np.float32)
lf0 = interp1d(lf0, kind="slinear")

# Aperiodicity
# Post-processing for aperiodicity
# ref: https://github.com/MTG/WGANSing/blob/mtg/vocoder.py
if self.interp_unvoiced_aperiodicity:
is_voiced = (vuv > 0).reshape(-1)
Expand Down

0 comments on commit 18aa4f7

Please sign in to comment.