Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
23 changes: 12 additions & 11 deletions diffsynth/pipelines/ace_step.py
Original file line number Diff line number Diff line change
Expand Up @@ -448,28 +448,27 @@ def pad_src_audio(self, pipe, src_audio, task_type, repainting_ranges):
return src_audio, repainting_ranges, None, None
min_left = min([start for start, end in repainting_ranges])
max_right = max([end for start, end in repainting_ranges])
total_length = src_audio.shape[-1] // pipe.vae.sampling_rate
pad_left = max(0, -min_left)
pad_right = max(0, max_right - total_length)
if pad_left > 0 or pad_right > 0:
padding_frames_left, padding_frames_right = pad_left * pipe.vae.sampling_rate, pad_right * pipe.vae.sampling_rate
padding_frames_left = int(pad_left * pipe.vae.sampling_rate)
padding_frames_right = max(int(max_right * pipe.vae.sampling_rate) - src_audio.shape[-1], 0)
Comment on lines +452 to +453
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

medium

Using int() to convert seconds to audio frames can lead to off-by-one errors due to floating-point precision: a product that should be an exact integer can land just below it (e.g., in Python 0.29 * 100 evaluates to 28.999999999999996, which int() truncates to 28). Since this PR aims to fix float duration issues, using round() is more appropriate and consistent with the changes made elsewhere in this file.

Suggested change
padding_frames_left = int(pad_left * pipe.vae.sampling_rate)
padding_frames_right = max(int(max_right * pipe.vae.sampling_rate) - src_audio.shape[-1], 0)
padding_frames_left = round(pad_left * pipe.vae.sampling_rate)
padding_frames_right = max(round(max_right * pipe.vae.sampling_rate) - src_audio.shape[-1], 0)

if padding_frames_left > 0 or padding_frames_right > 0:
src_audio = F.pad(src_audio, (padding_frames_left, padding_frames_right), value=0.0)
repainting_ranges = [(start + pad_left, end + pad_left) for start, end in repainting_ranges]
return src_audio, repainting_ranges, pad_left, pad_right
return src_audio, repainting_ranges, padding_frames_left, padding_frames_right

def parse_repaint_masks(self, pipe, src_latents, task_type, repainting_ranges, repainting_strength, pad_left, pad_right):
def parse_repaint_masks(self, pipe, src_latents, task_type, repainting_ranges, repainting_strength, padding_frames_left, padding_frames_right):
if task_type != "repaint" or repainting_ranges is None:
return None, src_latents
# let repainting area be repainting_strength, non-repainting area be 0.0, and blend at the boundary with cf_frames.
max_latent_length = src_latents.shape[1]
denoise_mask = torch.zeros((1, max_latent_length, 1), dtype=pipe.torch_dtype, device=pipe.device)
for start, end in repainting_ranges:
start_frame = start * pipe.vae.sampling_rate // 1920
end_frame = end * pipe.vae.sampling_rate // 1920
start_frame = int(start * pipe.vae.sampling_rate / 1920)
end_frame = int(end * pipe.vae.sampling_rate / 1920)
Comment on lines +466 to +467
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

medium

For consistency with the calculation of max_latent_length (line 513), these frame indices should be calculated using round() instead of int(). This ensures that float durations are mapped to the same latent frames across different parts of the pipeline.

Suggested change
start_frame = int(start * pipe.vae.sampling_rate / 1920)
end_frame = int(end * pipe.vae.sampling_rate / 1920)
start_frame = round(start * pipe.vae.sampling_rate / 1920)
end_frame = round(end * pipe.vae.sampling_rate / 1920)

denoise_mask[:, start_frame:end_frame, :] = repainting_strength
# set padding areas to 1.0 (full repaint) to avoid artifacts at the boundaries caused by padding
pad_left_frames = pad_left * pipe.vae.sampling_rate // 1920
pad_right_frames = pad_right * pipe.vae.sampling_rate // 1920
pad_left_frames = int(padding_frames_left / 1920)
pad_right_frames = int(padding_frames_right / 1920)
Comment on lines +470 to +471
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

medium

Using round() here is more consistent with the mapping of audio duration to latent frames used in the rest of the pipeline. This helps avoid potential artifacts at the boundaries when dealing with float durations.

Suggested change
pad_left_frames = int(padding_frames_left / 1920)
pad_right_frames = int(padding_frames_right / 1920)
pad_left_frames = round(padding_frames_left / 1920)
pad_right_frames = round(padding_frames_right / 1920)

denoise_mask[:, :pad_left_frames, :] = 1
denoise_mask[:, max_latent_length - pad_right_frames:, :] = 1

Expand Down Expand Up @@ -506,10 +505,12 @@ def process(self, pipe, duration, src_audio, audio_code_string, task_type=None,
if task_type == "cover":
lm_hints_5Hz = self.tokenize(pipe.tokenizer_model.tokenizer, src_latents, pipe.silence_latent, pipe.tokenizer_model.tokenizer.pool_window_size)
src_latents = pipe.tokenizer_model.detokenizer(lm_hints_5Hz)
if src_latents.shape[1] > source_latents.shape[1]:
source_latents = torch.cat([source_latents, src_latents[:, source_latents.shape[1]:]], dim=1)
Comment on lines +508 to +509
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

high

If the cover task is initiated using audio_code_string instead of src_audio, source_latents will be None. Accessing .shape[1] on a None object will raise an AttributeError. A check should be added to ensure source_latents is not None before performing the shape comparison and concatenation.

Suggested change
if src_latents.shape[1] > source_latents.shape[1]:
source_latents = torch.cat([source_latents, src_latents[:, source_latents.shape[1]:]], dim=1)
if source_latents is not None and src_latents.shape[1] > source_latents.shape[1]:
source_latents = torch.cat([source_latents, src_latents[:, source_latents.shape[1]:]], dim=1)

max_latent_length = src_latents.shape[1]
else:
# use silence latents.
max_latent_length = int(duration * pipe.sample_rate // 1920)
max_latent_length = round(duration * pipe.sample_rate / 1920)
src_latents = self._get_silence_latent_slice(pipe, max_latent_length).unsqueeze(0)
chunk_masks = torch.ones((1, max_latent_length, src_latents.shape[-1]), dtype=torch.bool, device=pipe.device)
attention_mask = torch.ones((1, max_latent_length), device=src_latents.device, dtype=pipe.torch_dtype)
Expand Down
2 changes: 1 addition & 1 deletion docs/en/Model_Details/ACE-Step.md
Original file line number Diff line number Diff line change
Expand Up @@ -105,7 +105,7 @@ The input parameters for `AceStepPipeline` inference include:
* `seed`: Random seed.
* `rand_device`: Device for noise generation, defaults to "cpu".
* `num_inference_steps`: Number of inference steps, defaults to 8.
* `shift`: Timestep shift parameter for the scheduler, defaults to 1.0.
* `shift`: Timestep shift parameter for the scheduler, defaults to 3.0.

## Model Training

Expand Down
2 changes: 1 addition & 1 deletion docs/zh/Model_Details/ACE-Step.md
Original file line number Diff line number Diff line change
Expand Up @@ -105,7 +105,7 @@ save_audio(audio, pipe.vae.sampling_rate, "acestep-v15-turbo.wav")
* `seed`: 随机种子。
* `rand_device`: 噪声生成设备,默认为 "cpu"。
* `num_inference_steps`: 推理步数,默认为 8。
* `shift`: 调度器时间偏移参数,默认为 1.0。
* `shift`: 调度器时间偏移参数,默认为 3.0。

## 模型训练

Expand Down