Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

New variance parameter: voicing #170

Merged
merged 13 commits into from
Feb 23, 2024
Merged
2 changes: 2 additions & 0 deletions configs/acoustic.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -54,12 +54,14 @@ mel_vmax: 1.5
interp_uv: true
energy_smooth_width: 0.12
breathiness_smooth_width: 0.12
voicing_smooth_width: 0.12
tension_smooth_width: 0.12

use_spk_id: false
f0_embed_type: continuous
use_energy_embed: false
use_breathiness_embed: false
use_voicing_embed: false
use_tension_embed: false
use_key_shift_embed: false
use_speed_embed: false
Expand Down
2 changes: 2 additions & 0 deletions configs/templates/config_acoustic.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,8 @@ use_spk_id: false
num_spk: 1
use_energy_embed: false
use_breathiness_embed: false
use_voicing_embed: false
use_tension_embed: false
use_key_shift_embed: true
use_speed_embed: true

Expand Down
19 changes: 14 additions & 5 deletions configs/templates/config_variance.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -19,17 +19,26 @@ binary_data_dir: data/xxx/binary
binarization_args:
num_workers: 0

energy_db_min: -96.0
energy_db_max: -12.0
breathiness_db_min: -96.0
breathiness_db_max: -20.0

use_spk_id: false
num_spk: 1
predict_dur: true
predict_pitch: true
predict_energy: false
predict_breathiness: false
predict_voicing: false
predict_tension: false

energy_db_min: -96.0
energy_db_max: -12.0

breathiness_db_min: -96.0
breathiness_db_max: -20.0

voicing_db_min: -96.0
voicing_db_max: -12.0

tension_logit_min: -10.0
tension_logit_max: 10.0

hidden_size: 256
dur_prediction_args:
Expand Down
10 changes: 10 additions & 0 deletions configs/variance.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -40,6 +40,7 @@ predict_dur: true
predict_pitch: true
predict_energy: false
predict_breathiness: false
predict_voicing: false
predict_tension: false

dur_prediction_args:
Expand Down Expand Up @@ -75,13 +76,22 @@ pitch_prediction_args:
energy_db_min: -96.0
energy_db_max: -12.0
energy_smooth_width: 0.12

breathiness_db_min: -96.0
breathiness_db_max: -20.0
breathiness_smooth_width: 0.12
tension_logit_min: -10.0
tension_logit_max: 10.0
tension_smooth_width: 0.12

voicing_db_min: -96.0
voicing_db_max: -12.0
voicing_smooth_width: 0.12

tension_logit_min: -10.0
tension_logit_max: 10.0
tension_smooth_width: 0.12

variances_prediction_args:
total_repeat_bins: 48
residual_layers: 10
Expand Down
2 changes: 2 additions & 0 deletions inference/ds_acoustic.py
Original file line number Diff line number Diff line change
Expand Up @@ -32,6 +32,8 @@ def __init__(self, device=None, load_model=True, load_vocoder=True, ckpt_steps=N
self.variances_to_embed.add('energy')
if hparams.get('use_breathiness_embed', False):
self.variances_to_embed.add('breathiness')
if hparams.get('use_voicing_embed', False):
self.variances_to_embed.add('voicing')
if hparams.get('use_tension_embed', False):
self.variances_to_embed.add('tension')

Expand Down
3 changes: 3 additions & 0 deletions modules/fastspeech/acoustic_encoder.py
Original file line number Diff line number Diff line change
Expand Up @@ -36,11 +36,14 @@ def __init__(self, vocab_size):
self.variance_embed_list = []
self.use_energy_embed = hparams.get('use_energy_embed', False)
self.use_breathiness_embed = hparams.get('use_breathiness_embed', False)
self.use_voicing_embed = hparams.get('use_voicing_embed', False)
self.use_tension_embed = hparams.get('use_tension_embed', False)
if self.use_energy_embed:
self.variance_embed_list.append('energy')
if self.use_breathiness_embed:
self.variance_embed_list.append('breathiness')
if self.use_voicing_embed:
self.variance_embed_list.append('voicing')
if self.use_tension_embed:
self.variance_embed_list.append('tension')

Expand Down
12 changes: 11 additions & 1 deletion modules/fastspeech/param_adaptor.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@
from modules.diffusion.ddpm import MultiVarianceDiffusion
from utils.hparams import hparams

VARIANCE_CHECKLIST = ['energy', 'breathiness', 'tension']
VARIANCE_CHECKLIST = ['energy', 'breathiness', 'voicing', 'tension']


class ParameterAdaptorModule(torch.nn.Module):
Expand All @@ -14,11 +14,14 @@ def __init__(self):
self.variance_prediction_list = []
self.predict_energy = hparams.get('predict_energy', False)
self.predict_breathiness = hparams.get('predict_breathiness', False)
self.predict_voicing = hparams.get('predict_voicing', False)
self.predict_tension = hparams.get('predict_tension', False)
if self.predict_energy:
self.variance_prediction_list.append('energy')
if self.predict_breathiness:
self.variance_prediction_list.append('breathiness')
if self.predict_voicing:
self.variance_prediction_list.append('voicing')
if self.predict_tension:
self.variance_prediction_list.append('tension')
self.predict_variances = len(self.variance_prediction_list) > 0
Expand All @@ -41,6 +44,13 @@ def build_adaptor(self, cls=MultiVarianceDiffusion):
))
clamps.append((hparams['breathiness_db_min'], 0.))

if self.predict_voicing:
ranges.append((
hparams['voicing_db_min'],
hparams['voicing_db_max']
))
clamps.append((hparams['voicing_db_min'], 0.))

if self.predict_tension:
ranges.append((
hparams['tension_logit_min'],
Expand Down
19 changes: 19 additions & 0 deletions preprocessing/acoustic_binarizer.py
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,7 @@
get_mel2ph_torch,
get_energy_librosa,
get_breathiness_pyworld,
get_voicing_pyworld,
get_tension_base_harmonic,
)
from utils.hparams import hparams
Expand All @@ -39,6 +40,7 @@
'f0',
'energy',
'breathiness',
'voicing',
'tension',
'key_shift',
'speed',
Expand All @@ -47,6 +49,7 @@
pitch_extractor: BasePE = None
energy_smooth: SinusoidalSmoothingConv1d = None
breathiness_smooth: SinusoidalSmoothingConv1d = None
voicing_smooth: SinusoidalSmoothingConv1d = None
tension_smooth: SinusoidalSmoothingConv1d = None


Expand All @@ -56,6 +59,7 @@ def __init__(self):
self.lr = LengthRegulator()
self.need_energy = hparams['use_energy_embed']
self.need_breathiness = hparams['use_breathiness_embed']
self.need_voicing = hparams['use_voicing_embed']
self.need_tension = hparams['use_tension_embed']

def load_meta_data(self, raw_data_dir: pathlib.Path, ds_id, spk_id):
Expand Down Expand Up @@ -158,6 +162,21 @@ def process_item(self, item_name, meta_data, binarization_args):

processed_input['breathiness'] = breathiness.cpu().numpy()

if self.need_voicing:
# get ground truth voicing
voicing = get_voicing_pyworld(
dec_waveform, None, None, length=length
)

global voicing_smooth
if voicing_smooth is None:
voicing_smooth = SinusoidalSmoothingConv1d(
round(hparams['voicing_smooth_width'] / self.timestep)
).eval().to(self.device)
voicing = voicing_smooth(torch.from_numpy(voicing).to(self.device)[None])[0]

processed_input['voicing'] = voicing.cpu().numpy()

if self.need_tension:
# get ground truth tension
tension = get_tension_base_harmonic(
Expand Down
37 changes: 36 additions & 1 deletion preprocessing/variance_binarizer.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,7 @@
get_mel2ph_torch,
get_energy_librosa,
get_breathiness_pyworld,
get_voicing_pyworld,
get_tension_base_harmonic,
)
from utils.hparams import hparams
Expand All @@ -44,6 +45,7 @@
'uv', # unvoiced masks (only for objective evaluation metrics), bool[T_s,]
'energy', # frame-level RMS (dB), float32[T_s,]
'breathiness', # frame-level RMS of aperiodic parts (dB), float32[T_s,]
'voicing', # frame-level RMS of harmonic parts (dB), float32[T_s,]
'tension', # frame-level tension (logit), float32[T_s,]
]
DS_INDEX_SEP = '#'
Expand All @@ -54,6 +56,7 @@
midi_smooth: SinusoidalSmoothingConv1d = None
energy_smooth: SinusoidalSmoothingConv1d = None
breathiness_smooth: SinusoidalSmoothingConv1d = None
voicing_smooth: SinusoidalSmoothingConv1d = None
tension_smooth: SinusoidalSmoothingConv1d = None


Expand All @@ -74,8 +77,9 @@ def __init__(self):

predict_energy = hparams['predict_energy']
predict_breathiness = hparams['predict_breathiness']
predict_voicing = hparams['predict_voicing']
predict_tension = hparams['predict_tension']
self.predict_variances = predict_energy or predict_breathiness or predict_tension
self.predict_variances = predict_energy or predict_breathiness or predict_voicing or predict_tension
self.lr = LengthRegulator().to(self.device)
self.prefer_ds = self.binarization_args['prefer_ds']
self.cached_ds = {}
Expand Down Expand Up @@ -417,6 +421,37 @@ def process_item(self, item_name, meta_data, binarization_args):

processed_input['breathiness'] = breathiness

# Below: extract voicing
if hparams['predict_voicing']:
voicing = None
voicing_from_wav = False
if self.prefer_ds:
voicing_seq = self.load_attr_from_ds(ds_id, name, 'voicing', idx=ds_seg_idx)
if voicing_seq is not None:
voicing = resample_align_curve(
np.array(voicing_seq.split(), np.float32),
original_timestep=float(self.load_attr_from_ds(
ds_id, name, 'voicing_timestep', idx=ds_seg_idx
)),
target_timestep=self.timestep,
align_length=length
)
if voicing is None:
voicing = get_voicing_pyworld(
dec_waveform, None, None, length=length
)
voicing_from_wav = True

if voicing_from_wav:
global voicing_smooth
if voicing_smooth is None:
voicing_smooth = SinusoidalSmoothingConv1d(
round(hparams['voicing_smooth_width'] / self.timestep)
).eval().to(self.device)
voicing = voicing_smooth(torch.from_numpy(voicing).to(self.device)[None])[0].cpu().numpy()

processed_input['voicing'] = voicing

# Below: extract tension
if hparams['predict_tension']:
tension = None
Expand Down
4 changes: 4 additions & 0 deletions training/acoustic_task.py
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,8 @@ def __init__(self, prefix, preload=False):
self.required_variances['energy'] = 0.0
if hparams['use_breathiness_embed']:
self.required_variances['breathiness'] = 0.0
if hparams['use_voicing_embed']:
self.required_variances['voicing'] = 0.0
if hparams['use_tension_embed']:
self.required_variances['tension'] = 0.0

Expand Down Expand Up @@ -80,6 +82,8 @@ def __init__(self):
self.required_variances.append('energy')
if hparams['use_breathiness_embed']:
self.required_variances.append('breathiness')
if hparams['use_voicing_embed']:
self.required_variances.append('voicing')
if hparams['use_tension_embed']:
self.required_variances.append('tension')
super()._finish_init()
Expand Down
11 changes: 9 additions & 2 deletions training/variance_task.py
Original file line number Diff line number Diff line change
Expand Up @@ -24,8 +24,9 @@ def __init__(self, prefix, preload=False):
super(VarianceDataset, self).__init__(prefix, hparams['dataset_size_key'], preload)
need_energy = hparams['predict_energy']
need_breathiness = hparams['predict_breathiness']
need_voicing = hparams['predict_voicing']
need_tension = hparams['predict_tension']
self.predict_variances = need_energy or need_breathiness or need_tension
self.predict_variances = need_energy or need_breathiness or need_voicing or need_tension

def collater(self, samples):
batch = super().collater(samples)
Expand Down Expand Up @@ -60,6 +61,8 @@ def collater(self, samples):
batch['energy'] = utils.collate_nd([s['energy'] for s in samples], 0)
if hparams['predict_breathiness']:
batch['breathiness'] = utils.collate_nd([s['breathiness'] for s in samples], 0)
if hparams['predict_voicing']:
batch['voicing'] = utils.collate_nd([s['voicing'] for s in samples], 0)
if hparams['predict_tension']:
batch['tension'] = utils.collate_nd([s['tension'] for s in samples], 0)

Expand Down Expand Up @@ -92,12 +95,15 @@ def __init__(self):

predict_energy = hparams['predict_energy']
predict_breathiness = hparams['predict_breathiness']
predict_voicing = hparams['predict_voicing']
predict_tension = hparams['predict_tension']
self.variance_prediction_list = []
if predict_energy:
self.variance_prediction_list.append('energy')
if predict_breathiness:
self.variance_prediction_list.append('breathiness')
if predict_voicing:
self.variance_prediction_list.append('voicing')
if predict_tension:
self.variance_prediction_list.append('tension')
self.predict_variances = len(self.variance_prediction_list) > 0
Expand Down Expand Up @@ -153,6 +159,7 @@ def run_model(self, sample, infer=False):
pitch = sample.get('pitch') # [B, T_s]
energy = sample.get('energy') # [B, T_s]
breathiness = sample.get('breathiness') # [B, T_s]
voicing = sample.get('voicing') # [B, T_s]
tension = sample.get('tension') # [B, T_s]

pitch_retake = variance_retake = None
Expand All @@ -175,7 +182,7 @@ def run_model(self, sample, infer=False):
note_midi=note_midi, note_rest=note_rest,
note_dur=note_dur, note_glide=note_glide, mel2note=mel2note,
base_pitch=base_pitch, pitch=pitch,
energy=energy, breathiness=breathiness, tension=tension,
energy=energy, breathiness=breathiness, voicing=voicing, tension=tension,
pitch_retake=pitch_retake, variance_retake=variance_retake,
spk_id=spk_ids, infer=infer
)
Expand Down
29 changes: 29 additions & 0 deletions utils/binarizer_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -279,6 +279,35 @@ def get_breathiness_pyworld(
return breathiness


def get_voicing_pyworld(
        waveform: Union[np.ndarray, DecomposedWaveform],
        samplerate, f0, length,
        *, hop_size=None, fft_size=None, win_size=None
):
    """
    Compute voicing, defined as the RMS of the harmonic part in dB representation.
    :param waveform: Raw waveform, or an already built DecomposedWaveform. When a
        DecomposedWaveform is given, samplerate, f0, hop_size, fft_size and win_size
        are not used; the hop and window sizes are read from the given object instead
    :param samplerate: sampling rate
    :param f0: reference f0
    :param length: Expected number of frames
    :param hop_size: Frame width, in number of samples
    :param fft_size: Number of fft bins
    :param win_size: Window size, in number of samples
    :return: voicing
    """
    if isinstance(waveform, DecomposedWaveform):
        # Reuse the caller's decomposition as-is.
        decomposed = waveform
    else:
        decomposed = DecomposedWaveform(
            waveform=waveform, samplerate=samplerate, f0=f0,
            hop_size=hop_size, fft_size=fft_size, win_size=win_size
        )
    harmonic_part = decomposed.harmonic()
    # Framing parameters always come from the decomposition so the energy
    # curve uses the same hop/window as the analysis that produced it.
    return get_energy_librosa(
        harmonic_part, length=length,
        hop_size=decomposed.hop_size, win_size=decomposed.win_size
    )


def get_tension_base_harmonic(
waveform: Union[np.ndarray, DecomposedWaveform],
samplerate, f0, length,
Expand Down