Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
16 changes: 10 additions & 6 deletions diffsynth/core/data/operators.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,8 +2,6 @@
import torch, torchvision, imageio, os
import imageio.v3 as iio
from PIL import Image
import torchaudio
from diffsynth.utils.data.audio import read_audio


class DataProcessingPipeline:
Expand Down Expand Up @@ -249,23 +247,27 @@ def __call__(self, data):
class LoadAudio(DataProcessingOperator):
def __init__(self, sr=16000):
self.sr = sr
def __call__(self, data: str):
import librosa
input_audio, sample_rate = librosa.load(data, sr=self.sr)
self.audio_loader = librosa.load

def __call__(self, data: str):
input_audio, sample_rate = self.audio_loader(data, sr=self.sr)
return input_audio
Comment on lines +254 to 255
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

medium

The `LoadAudio` operator returns a NumPy array (from `librosa.load`), whereas the other audio operators in this file (such as `LoadAudioWithTorchaudio` and `LoadPureAudioWithTorchaudio`) return torch tensors. This inconsistency can cause issues in data pipelines that expect a uniform tensor format. Additionally, `LoadAudio` returns only the waveform, while the others return a `(waveform, sample_rate)` tuple. Consider standardizing the output format across all audio operators; note that the suggestion below addresses only the array-to-tensor conversion, not the return shape.

Suggested change
input_audio, sample_rate = self.audio_loader(data, sr=self.sr)
return input_audio
input_audio, sample_rate = self.audio_loader(data, sr=self.sr)
return torch.from_numpy(input_audio)



class LoadAudioWithTorchaudio(DataProcessingOperator, FrameSamplerByRateMixin):

def __init__(self, num_frames=121, time_division_factor=8, time_division_remainder=1, frame_rate=24, fix_frame_rate=True):
FrameSamplerByRateMixin.__init__(self, num_frames, time_division_factor, time_division_remainder, frame_rate, fix_frame_rate)
import torchaudio
self.audio_loader = torchaudio.load

def __call__(self, data: str):
try:
reader = self.get_reader(data)
num_frames = self.get_num_frames(reader)
duration = num_frames / self.frame_rate
waveform, sample_rate = torchaudio.load(data)
waveform, sample_rate = self.audio_loader(data)
Comment on lines 267 to +270
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

high

There is a resource leak here: the imageio reader created by `self.get_reader(data)` is never closed. This can lead to an accumulation of open file handles, especially when processing large datasets. Furthermore, the file is opened twice (once by imageio and once by `self.audio_loader`). Wrapping the reader in a context manager ensures it is closed as soon as the necessary metadata has been extracted.

Suggested change
reader = self.get_reader(data)
num_frames = self.get_num_frames(reader)
duration = num_frames / self.frame_rate
waveform, sample_rate = torchaudio.load(data)
waveform, sample_rate = self.audio_loader(data)
with self.get_reader(data) as reader:
num_frames = self.get_num_frames(reader)
duration = num_frames / self.frame_rate
waveform, sample_rate = self.audio_loader(data)

target_samples = int(duration * sample_rate)
current_samples = waveform.shape[-1]
if current_samples > target_samples:
Expand All @@ -285,10 +287,12 @@ def __init__(self, target_sample_rate=None, target_duration=None):
self.target_sample_rate = target_sample_rate
self.target_duration = target_duration
self.resample = True if target_sample_rate is not None else False
from diffsynth.utils.data.audio import read_audio
self.audio_loader = read_audio

def __call__(self, data: str):
try:
waveform, sample_rate = read_audio(data, resample=self.resample, resample_rate=self.target_sample_rate)
waveform, sample_rate = self.audio_loader(data, resample=self.resample, resample_rate=self.target_sample_rate)
if self.target_duration is not None:
target_samples = int(self.target_duration * sample_rate)
current_samples = waveform.shape[-1]
Expand Down
11 changes: 10 additions & 1 deletion pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -48,8 +48,17 @@ npu = [
"torchvision==0.22.1+cpu"
]
audio = [
"av",
"torchaudio",
"torchcodec"
"torchcodec",
"librosa"
]
all = [
"av",
"torchaudio",
"torchcodec",
"librosa",
"streamlit"
]

[tool.setuptools]
Expand Down