Merge pull request #123 from r9y9/correct-vuv

Add correct_vuv option for data pre-processing
nnsvs · Jun 26, 2022 · 18aa4f7 · 18aa4f7
2 parents f7e0449 + 767e1a6
commit 18aa4f7
Show file tree

Hide file tree

Showing 7 changed files with 52 additions and 15 deletions.
diff --git a/nnsvs/bin/conf/prepare_features/acoustic/static_deltadelta.yaml b/nnsvs/bin/conf/prepare_features/acoustic/static_deltadelta.yaml
@@ -46,3 +46,7 @@ vibrato_mode: none
 # Ref: The NAIST Text-to-Speech System for the Blizzard Challenge 2015
 trajectory_smoothing: false
 trajectory_smoothing_cutoff: 50
+
+# Correct V/UV based on the musical score.
+# This is to prevent unwanted F0 estimation failures on silence regions.
+correct_vuv: false
diff --git a/nnsvs/bin/conf/prepare_features/acoustic/static_deltadelta_diffvib.yaml b/nnsvs/bin/conf/prepare_features/acoustic/static_deltadelta_diffvib.yaml
@@ -46,3 +46,7 @@ vibrato_mode: diff
 # Ref: The NAIST Text-to-Speech System for the Blizzard Challenge 2015
 trajectory_smoothing: false
 trajectory_smoothing_cutoff: 50
+
+# Correct V/UV based on the musical score.
+# This is to prevent unwanted F0 estimation failures on silence regions.
+correct_vuv: false
diff --git a/nnsvs/bin/conf/prepare_features/acoustic/static_deltadelta_sinevib.yaml b/nnsvs/bin/conf/prepare_features/acoustic/static_deltadelta_sinevib.yaml
@@ -46,3 +46,7 @@ vibrato_mode: sine
 # Ref: The NAIST Text-to-Speech System for the Blizzard Challenge 2015
 trajectory_smoothing: false
 trajectory_smoothing_cutoff: 50
+
+# Correct V/UV based on the musical score.
+# This is to prevent unwanted F0 estimation failures on silence regions.
+correct_vuv: false
diff --git a/nnsvs/bin/conf/prepare_features/acoustic/static_only.yaml b/nnsvs/bin/conf/prepare_features/acoustic/static_only.yaml
@@ -46,3 +46,7 @@ vibrato_mode: none
 # Ref: The NAIST Text-to-Speech System for the Blizzard Challenge 2015
 trajectory_smoothing: false
 trajectory_smoothing_cutoff: 50
+
+# Correct V/UV based on the musical score.
+# This is to prevent unwanted F0 estimation failures on silence regions.
+correct_vuv: false
diff --git a/nnsvs/bin/conf/prepare_features/acoustic/static_only_sinevib.yaml b/nnsvs/bin/conf/prepare_features/acoustic/static_only_sinevib.yaml
@@ -46,3 +46,7 @@ vibrato_mode: sine
 # Ref: The NAIST Text-to-Speech System for the Blizzard Challenge 2015
 trajectory_smoothing: false
 trajectory_smoothing_cutoff: 50
+
+# Correct V/UV based on the musical score.
+# This is to prevent unwanted F0 estimation failures on silence regions.
+correct_vuv: false
diff --git a/nnsvs/bin/prepare_features.py b/nnsvs/bin/prepare_features.py
@@ -160,6 +160,7 @@ def my_app(config: DictConfig) -> None:
         d4c_threshold=config.acoustic.d4c_threshold,
         trajectory_smoothing=config.acoustic.trajectory_smoothing,
         trajectory_smoothing_cutoff=config.acoustic.trajectory_smoothing_cutoff,
+        correct_vuv=config.acoustic.correct_vuv,
     )
     in_acoustic = FileSourceDataset(in_acoustic_source)
     out_acoustic = FileSourceDataset(out_acoustic_source)

diff --git a/nnsvs/data/data_source.py b/nnsvs/data/data_source.py
@@ -134,6 +134,7 @@ def __init__(
         d4c_threshold=0.85,
         trajectory_smoothing=False,
         trajectory_smoothing_cutoff=50,
+        correct_vuv=False,
     ):
         self.utt_list = utt_list
         self.wav_root = wav_root
@@ -155,6 +156,7 @@ def __init__(
         self.d4c_threshold = d4c_threshold
         self.trajectory_smoothing = trajectory_smoothing
         self.trajectory_smoothing_cutoff = trajectory_smoothing_cutoff
+        self.correct_vuv = correct_vuv
 
     def collect_files(self):
         wav_paths = _collect_files(self.wav_root, self.utt_list, ".wav")
@@ -205,6 +207,34 @@ def collect_features(self, wav_path, label_path):
         # Workaround for https://github.com/r9y9/nnsvs/issues/7
         f0 = np.maximum(f0, 0)
 
+        # Correct V/UV (and F0) based on the musical score information:
+        # treat frames where no musical note is assigned as unvoiced.
+        if self.correct_vuv:
+            # Use a smoothed mask so that F0 overshoot or similar transitions
+            # at the start/end of notes are not masked out.
+            # The smoothing window is 0.1 sec. (could be tuned for better results)
+            win_length = int(0.1 / (self.frame_period * 0.001))
+            mask = np.convolve(f0_score, np.ones(win_length) / win_length, "same")
+            if len(f0) > len(mask):
+                mask = np.pad(mask, (0, len(f0) - len(mask)), "constant")
+            f0 = f0 * np.sign(mask)
+
+        spectrogram = pyworld.cheaptrick(x, f0, timeaxis, fs, f0_floor=min_f0)
+        aperiodicity = pyworld.d4c(x, f0, timeaxis, fs, threshold=self.d4c_threshold)
+
+        f0 = f0[:, None]
+        lf0 = f0.copy()
+        nonzero_indices = np.nonzero(f0)
+        lf0[nonzero_indices] = np.log(f0[nonzero_indices])
+        if self.use_harvest:
+            # https://github.com/mmorise/World/issues/35#issuecomment-306521887
+            vuv = (aperiodicity[:, 0] < 0.5).astype(np.float32)[:, None]
+        else:
+            vuv = (lf0 != 0).astype(np.float32)
+
+        # F0 -> continuous F0
+        lf0 = interp1d(lf0, kind="slinear")
+
         # Vibrato parameter extraction
         sr_f0 = int(1 / (self.frame_period * 0.001))
         if self.vibrato_mode == "sine":
@@ -250,25 +280,11 @@ def collect_features(self, wav_path, label_path):
         else:
             raise RuntimeError("Unknown vibrato mode: {}".format(self.vibrato_mode))
 
-        spectrogram = pyworld.cheaptrick(x, f0, timeaxis, fs, f0_floor=min_f0)
-        aperiodicity = pyworld.d4c(x, f0, timeaxis, fs, threshold=self.d4c_threshold)
-
         mgc = pysptk.sp2mc(
             spectrogram, order=self.mgc_order, alpha=pysptk.util.mcepalpha(fs)
         )
-        # F0 of speech
-        f0 = f0[:, None]
-        lf0 = f0.copy()
-        nonzero_indices = np.nonzero(f0)
-        lf0[nonzero_indices] = np.log(f0[nonzero_indices])
-        if self.use_harvest:
-            # https://github.com/mmorise/World/issues/35#issuecomment-306521887
-            vuv = (aperiodicity[:, 0] < 0.5).astype(np.float32)[:, None]
-        else:
-            vuv = (lf0 != 0).astype(np.float32)
-        lf0 = interp1d(lf0, kind="slinear")
 
-        # Aperiodicy
+        # Post-processing for aperiodicity
         # ref: https://github.com/MTG/WGANSing/blob/mtg/vocoder.py
         if self.interp_unvoiced_aperiodicity:
             is_voiced = (vuv > 0).reshape(-1)