Skip to content

Commit

Permalink
New variance parameter: voicing (#170)
Browse files Browse the repository at this point in the history
* Support voicing in acoustic and variance models (experimental)

* Fix KeyError
  • Loading branch information
yqzhishen committed Feb 23, 2024
1 parent b19a515 commit e5c79c0
Show file tree
Hide file tree
Showing 12 changed files with 141 additions and 9 deletions.
2 changes: 2 additions & 0 deletions configs/acoustic.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -54,12 +54,14 @@ mel_vmax: 1.5
interp_uv: true
energy_smooth_width: 0.12
breathiness_smooth_width: 0.12
voicing_smooth_width: 0.12
tension_smooth_width: 0.12

use_spk_id: false
f0_embed_type: continuous
use_energy_embed: false
use_breathiness_embed: false
use_voicing_embed: false
use_tension_embed: false
use_key_shift_embed: false
use_speed_embed: false
Expand Down
2 changes: 2 additions & 0 deletions configs/templates/config_acoustic.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,8 @@ use_spk_id: false
num_spk: 1
use_energy_embed: false
use_breathiness_embed: false
use_voicing_embed: false
use_tension_embed: false
use_key_shift_embed: true
use_speed_embed: true

Expand Down
19 changes: 14 additions & 5 deletions configs/templates/config_variance.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -19,17 +19,26 @@ binary_data_dir: data/xxx/binary
binarization_args:
num_workers: 0

energy_db_min: -96.0
energy_db_max: -12.0
breathiness_db_min: -96.0
breathiness_db_max: -20.0

use_spk_id: false
num_spk: 1
predict_dur: true
predict_pitch: true
predict_energy: false
predict_breathiness: false
predict_voicing: false
predict_tension: false

energy_db_min: -96.0
energy_db_max: -12.0

breathiness_db_min: -96.0
breathiness_db_max: -20.0

voicing_db_min: -96.0
voicing_db_max: -12.0

tension_logit_min: -10.0
tension_logit_max: 10.0

hidden_size: 256
dur_prediction_args:
Expand Down
10 changes: 10 additions & 0 deletions configs/variance.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -40,6 +40,7 @@ predict_dur: true
predict_pitch: true
predict_energy: false
predict_breathiness: false
predict_voicing: false
predict_tension: false

dur_prediction_args:
Expand Down Expand Up @@ -75,13 +76,22 @@ pitch_prediction_args:
energy_db_min: -96.0
energy_db_max: -12.0
energy_smooth_width: 0.12

breathiness_db_min: -96.0
breathiness_db_max: -20.0
breathiness_smooth_width: 0.12
tension_logit_min: -10.0
tension_logit_max: 10.0
tension_smooth_width: 0.12

voicing_db_min: -96.0
voicing_db_max: -12.0
voicing_smooth_width: 0.12

tension_logit_min: -10.0
tension_logit_max: 10.0
tension_smooth_width: 0.12

variances_prediction_args:
total_repeat_bins: 48
residual_layers: 10
Expand Down
2 changes: 2 additions & 0 deletions inference/ds_acoustic.py
Original file line number Diff line number Diff line change
Expand Up @@ -32,6 +32,8 @@ def __init__(self, device=None, load_model=True, load_vocoder=True, ckpt_steps=N
self.variances_to_embed.add('energy')
if hparams.get('use_breathiness_embed', False):
self.variances_to_embed.add('breathiness')
if hparams.get('use_voicing_embed', False):
self.variances_to_embed.add('voicing')
if hparams.get('use_tension_embed', False):
self.variances_to_embed.add('tension')

Expand Down
3 changes: 3 additions & 0 deletions modules/fastspeech/acoustic_encoder.py
Original file line number Diff line number Diff line change
Expand Up @@ -36,11 +36,14 @@ def __init__(self, vocab_size):
self.variance_embed_list = []
self.use_energy_embed = hparams.get('use_energy_embed', False)
self.use_breathiness_embed = hparams.get('use_breathiness_embed', False)
self.use_voicing_embed = hparams.get('use_voicing_embed', False)
self.use_tension_embed = hparams.get('use_tension_embed', False)
if self.use_energy_embed:
self.variance_embed_list.append('energy')
if self.use_breathiness_embed:
self.variance_embed_list.append('breathiness')
if self.use_voicing_embed:
self.variance_embed_list.append('voicing')
if self.use_tension_embed:
self.variance_embed_list.append('tension')

Expand Down
12 changes: 11 additions & 1 deletion modules/fastspeech/param_adaptor.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@
from modules.diffusion.ddpm import MultiVarianceDiffusion
from utils.hparams import hparams

VARIANCE_CHECKLIST = ['energy', 'breathiness', 'tension']
VARIANCE_CHECKLIST = ['energy', 'breathiness', 'voicing', 'tension']


class ParameterAdaptorModule(torch.nn.Module):
Expand All @@ -14,11 +14,14 @@ def __init__(self):
self.variance_prediction_list = []
self.predict_energy = hparams.get('predict_energy', False)
self.predict_breathiness = hparams.get('predict_breathiness', False)
self.predict_voicing = hparams.get('predict_voicing', False)
self.predict_tension = hparams.get('predict_tension', False)
if self.predict_energy:
self.variance_prediction_list.append('energy')
if self.predict_breathiness:
self.variance_prediction_list.append('breathiness')
if self.predict_voicing:
self.variance_prediction_list.append('voicing')
if self.predict_tension:
self.variance_prediction_list.append('tension')
self.predict_variances = len(self.variance_prediction_list) > 0
Expand All @@ -41,6 +44,13 @@ def build_adaptor(self, cls=MultiVarianceDiffusion):
))
clamps.append((hparams['breathiness_db_min'], 0.))

if self.predict_voicing:
ranges.append((
hparams['voicing_db_min'],
hparams['voicing_db_max']
))
clamps.append((hparams['voicing_db_min'], 0.))

if self.predict_tension:
ranges.append((
hparams['tension_logit_min'],
Expand Down
19 changes: 19 additions & 0 deletions preprocessing/acoustic_binarizer.py
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,7 @@
get_mel2ph_torch,
get_energy_librosa,
get_breathiness_pyworld,
get_voicing_pyworld,
get_tension_base_harmonic,
)
from utils.hparams import hparams
Expand All @@ -39,6 +40,7 @@
'f0',
'energy',
'breathiness',
'voicing',
'tension',
'key_shift',
'speed',
Expand All @@ -47,6 +49,7 @@
pitch_extractor: BasePE = None
energy_smooth: SinusoidalSmoothingConv1d = None
breathiness_smooth: SinusoidalSmoothingConv1d = None
voicing_smooth: SinusoidalSmoothingConv1d = None
tension_smooth: SinusoidalSmoothingConv1d = None


Expand All @@ -56,6 +59,7 @@ def __init__(self):
self.lr = LengthRegulator()
self.need_energy = hparams['use_energy_embed']
self.need_breathiness = hparams['use_breathiness_embed']
self.need_voicing = hparams['use_voicing_embed']
self.need_tension = hparams['use_tension_embed']

def load_meta_data(self, raw_data_dir: pathlib.Path, ds_id, spk_id):
Expand Down Expand Up @@ -158,6 +162,21 @@ def process_item(self, item_name, meta_data, binarization_args):

processed_input['breathiness'] = breathiness.cpu().numpy()

if self.need_voicing:
# get ground truth voicing
voicing = get_voicing_pyworld(
dec_waveform, None, None, length=length
)

global voicing_smooth
if voicing_smooth is None:
voicing_smooth = SinusoidalSmoothingConv1d(
round(hparams['voicing_smooth_width'] / self.timestep)
).eval().to(self.device)
voicing = voicing_smooth(torch.from_numpy(voicing).to(self.device)[None])[0]

processed_input['voicing'] = voicing.cpu().numpy()

if self.need_tension:
# get ground truth tension
tension = get_tension_base_harmonic(
Expand Down
37 changes: 36 additions & 1 deletion preprocessing/variance_binarizer.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,7 @@
get_mel2ph_torch,
get_energy_librosa,
get_breathiness_pyworld,
get_voicing_pyworld,
get_tension_base_harmonic,
)
from utils.hparams import hparams
Expand All @@ -44,6 +45,7 @@
'uv', # unvoiced masks (only for objective evaluation metrics), bool[T_s,]
'energy', # frame-level RMS (dB), float32[T_s,]
'breathiness', # frame-level RMS of aperiodic parts (dB), float32[T_s,]
'voicing', # frame-level RMS of harmonic parts (dB), float32[T_s,]
'tension', # frame-level tension (logit), float32[T_s,]
]
DS_INDEX_SEP = '#'
Expand All @@ -54,6 +56,7 @@
midi_smooth: SinusoidalSmoothingConv1d = None
energy_smooth: SinusoidalSmoothingConv1d = None
breathiness_smooth: SinusoidalSmoothingConv1d = None
voicing_smooth: SinusoidalSmoothingConv1d = None
tension_smooth: SinusoidalSmoothingConv1d = None


Expand All @@ -74,8 +77,9 @@ def __init__(self):

predict_energy = hparams['predict_energy']
predict_breathiness = hparams['predict_breathiness']
predict_voicing = hparams['predict_voicing']
predict_tension = hparams['predict_tension']
self.predict_variances = predict_energy or predict_breathiness or predict_tension
self.predict_variances = predict_energy or predict_breathiness or predict_voicing or predict_tension
self.lr = LengthRegulator().to(self.device)
self.prefer_ds = self.binarization_args['prefer_ds']
self.cached_ds = {}
Expand Down Expand Up @@ -417,6 +421,37 @@ def process_item(self, item_name, meta_data, binarization_args):

processed_input['breathiness'] = breathiness

# Below: extract voicing
if hparams['predict_voicing']:
voicing = None
voicing_from_wav = False
if self.prefer_ds:
voicing_seq = self.load_attr_from_ds(ds_id, name, 'voicing', idx=ds_seg_idx)
if voicing_seq is not None:
voicing = resample_align_curve(
np.array(voicing_seq.split(), np.float32),
original_timestep=float(self.load_attr_from_ds(
ds_id, name, 'voicing_timestep', idx=ds_seg_idx
)),
target_timestep=self.timestep,
align_length=length
)
if voicing is None:
voicing = get_voicing_pyworld(
dec_waveform, None, None, length=length
)
voicing_from_wav = True

if voicing_from_wav:
global voicing_smooth
if voicing_smooth is None:
voicing_smooth = SinusoidalSmoothingConv1d(
round(hparams['voicing_smooth_width'] / self.timestep)
).eval().to(self.device)
voicing = voicing_smooth(torch.from_numpy(voicing).to(self.device)[None])[0].cpu().numpy()

processed_input['voicing'] = voicing

# Below: extract tension
if hparams['predict_tension']:
tension = None
Expand Down
4 changes: 4 additions & 0 deletions training/acoustic_task.py
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,8 @@ def __init__(self, prefix, preload=False):
self.required_variances['energy'] = 0.0
if hparams['use_breathiness_embed']:
self.required_variances['breathiness'] = 0.0
if hparams['use_voicing_embed']:
self.required_variances['voicing'] = 0.0
if hparams['use_tension_embed']:
self.required_variances['tension'] = 0.0

Expand Down Expand Up @@ -80,6 +82,8 @@ def __init__(self):
self.required_variances.append('energy')
if hparams['use_breathiness_embed']:
self.required_variances.append('breathiness')
if hparams['use_voicing_embed']:
self.required_variances.append('voicing')
if hparams['use_tension_embed']:
self.required_variances.append('tension')
super()._finish_init()
Expand Down
11 changes: 9 additions & 2 deletions training/variance_task.py
Original file line number Diff line number Diff line change
Expand Up @@ -24,8 +24,9 @@ def __init__(self, prefix, preload=False):
super(VarianceDataset, self).__init__(prefix, hparams['dataset_size_key'], preload)
need_energy = hparams['predict_energy']
need_breathiness = hparams['predict_breathiness']
need_voicing = hparams['predict_voicing']
need_tension = hparams['predict_tension']
self.predict_variances = need_energy or need_breathiness or need_tension
self.predict_variances = need_energy or need_breathiness or need_voicing or need_tension

def collater(self, samples):
batch = super().collater(samples)
Expand Down Expand Up @@ -60,6 +61,8 @@ def collater(self, samples):
batch['energy'] = utils.collate_nd([s['energy'] for s in samples], 0)
if hparams['predict_breathiness']:
batch['breathiness'] = utils.collate_nd([s['breathiness'] for s in samples], 0)
if hparams['predict_voicing']:
batch['voicing'] = utils.collate_nd([s['voicing'] for s in samples], 0)
if hparams['predict_tension']:
batch['tension'] = utils.collate_nd([s['tension'] for s in samples], 0)

Expand Down Expand Up @@ -92,12 +95,15 @@ def __init__(self):

predict_energy = hparams['predict_energy']
predict_breathiness = hparams['predict_breathiness']
predict_voicing = hparams['predict_voicing']
predict_tension = hparams['predict_tension']
self.variance_prediction_list = []
if predict_energy:
self.variance_prediction_list.append('energy')
if predict_breathiness:
self.variance_prediction_list.append('breathiness')
if predict_voicing:
self.variance_prediction_list.append('voicing')
if predict_tension:
self.variance_prediction_list.append('tension')
self.predict_variances = len(self.variance_prediction_list) > 0
Expand Down Expand Up @@ -153,6 +159,7 @@ def run_model(self, sample, infer=False):
pitch = sample.get('pitch') # [B, T_s]
energy = sample.get('energy') # [B, T_s]
breathiness = sample.get('breathiness') # [B, T_s]
voicing = sample.get('voicing') # [B, T_s]
tension = sample.get('tension') # [B, T_s]

pitch_retake = variance_retake = None
Expand All @@ -175,7 +182,7 @@ def run_model(self, sample, infer=False):
note_midi=note_midi, note_rest=note_rest,
note_dur=note_dur, note_glide=note_glide, mel2note=mel2note,
base_pitch=base_pitch, pitch=pitch,
energy=energy, breathiness=breathiness, tension=tension,
energy=energy, breathiness=breathiness, voicing=voicing, tension=tension,
pitch_retake=pitch_retake, variance_retake=variance_retake,
spk_id=spk_ids, infer=infer
)
Expand Down
29 changes: 29 additions & 0 deletions utils/binarizer_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -279,6 +279,35 @@ def get_breathiness_pyworld(
return breathiness


def get_voicing_pyworld(
        waveform: Union[np.ndarray, DecomposedWaveform],
        samplerate, f0, length,
        *, hop_size=None, fft_size=None, win_size=None
):
    """
    Compute voicing, defined as the RMS of the harmonic part in dB representation.

    :param waveform: Either raw samples or an already decomposed waveform. When a
        DecomposedWaveform is given it is used as-is, and samplerate, f0, hop_size,
        fft_size and win_size are all ignored.
    :param samplerate: sampling rate (only used to decompose a raw waveform)
    :param f0: reference f0 (only used to decompose a raw waveform)
    :param length: Expected number of frames
    :param hop_size: Frame width, in number of samples
    :param fft_size: Number of fft bins
    :param win_size: Window size, in number of samples
    :return: voicing
    """
    if isinstance(waveform, DecomposedWaveform):
        # Reuse the caller's decomposition; its own hop/window sizes apply below.
        decomposed = waveform
    else:
        decomposed = DecomposedWaveform(
            waveform=waveform, samplerate=samplerate, f0=f0,
            hop_size=hop_size, fft_size=fft_size, win_size=win_size
        )
    # Voicing is the energy of the harmonic component only, measured with the
    # analysis sizes held by the decomposition (not the keyword arguments).
    harmonic_part = decomposed.harmonic()
    return get_energy_librosa(
        harmonic_part, length=length,
        hop_size=decomposed.hop_size, win_size=decomposed.win_size
    )


def get_tension_base_harmonic(
waveform: Union[np.ndarray, DecomposedWaveform],
samplerate, f0, length,
Expand Down

0 comments on commit e5c79c0

Please sign in to comment.