Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
37 commits
Select commit Hold shift + click to select a range
52d624b
Add num_channels parameter to AudioEncoder
NicolasHug May 21, 2025
aad9c7d
Merge branch 'main' of github.com:pytorch/torchcodec into encoding_nu…
NicolasHug May 22, 2025
2d76a7b
Add validation for num_channels
NicolasHug May 22, 2025
7d643f2
Fix FFmpeg 5.X?
NicolasHug May 22, 2025
5d9eb54
Migrate encoder tests to public Python APIs
NicolasHug May 22, 2025
c40deef
Add output sample rate, WIP
NicolasHug May 22, 2025
96e5e60
Merge branch 'main' of github.com:pytorch/torchcodec into migrate_enc…
NicolasHug May 22, 2025
952af0f
Re-remove
NicolasHug May 22, 2025
88a87c4
Merge branch 'migrate_encoding_test' into encoding_sample_rate_lezzzgo
NicolasHug May 22, 2025
e0ba0c5
Merge branch 'main' of github.com:pytorch/torchcodec into encoding_sa…
NicolasHug May 22, 2025
2c559b2
Use 'output' more consistently
NicolasHug May 23, 2025
70ae1a1
Use AudioStreamOptions in AudioEncoder
NicolasHug May 23, 2025
75e23b9
Merge branch 'main' of github.com:pytorch/torchcodec into use_audioSt…
NicolasHug May 27, 2025
b6e3c27
Merge branch 'use_audioStreamOptions' into encoding_sample_rate_lezzzgo
NicolasHug May 27, 2025
387328a
Merge branch 'main' of github.com:pytorch/torchcodec into encoding_sa…
NicolasHug May 27, 2025
823e7f0
WIP
NicolasHug May 27, 2025
4be2953
Add flushing logic for swresample buffers
NicolasHug May 27, 2025
639d5ab
More tests
NicolasHug May 27, 2025
3ce4612
WIP
NicolasHug May 28, 2025
6c91450
Refactor audio sample conversion in encoder
NicolasHug May 28, 2025
b2eed2f
Merge branch 'move-conversion-out' into encoding_sample_rate_lezzzgo
NicolasHug May 29, 2025
8fdb6ed
wav tests pass
NicolasHug May 29, 2025
3399b34
Merge branch 'main' of github.com:pytorch/torchcodec into encoding_sa…
NicolasHug May 29, 2025
6d7908f
Use intermediate FIFO, WIP
NicolasHug May 30, 2025
f30d0ff
mostly works
NicolasHug May 30, 2025
ef1b461
WIP
NicolasHug May 30, 2025
6d2aef1
Merge branch 'main' of github.com:pytorch/torchcodec into encoding_sa…
NicolasHug Jul 4, 2025
17cd1d8
Fix bug where we would encode too many samples
NicolasHug Jul 4, 2025
17340a6
Properly set frames pts
NicolasHug Jul 4, 2025
e74d72d
Nits
NicolasHug Jul 4, 2025
5ef60d7
Fix FFmpeg4 compilation
NicolasHug Jul 4, 2025
51e80a3
Add comments
NicolasHug Jul 4, 2025
b6f8478
Relax check on macos?
NicolasHug Jul 6, 2025
0af41b7
fix
NicolasHug Jul 6, 2025
00ca07e
Merge branch 'main' of github.com:pytorch/torchcodec into encoding_sa…
NicolasHug Jul 8, 2025
fa0856a
Add comments, address review
NicolasHug Jul 8, 2025
61bbe4f
More comments
NicolasHug Jul 8, 2025
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
8 changes: 5 additions & 3 deletions examples/encoding/audio_encoding.py
Original file line number Diff line number Diff line change
Expand Up @@ -78,13 +78,15 @@ def make_sinewave() -> tuple[torch.Tensor, int]:
# %%
# The encoder supports some encoding options that allow you to change how the
# data is encoded. For example, we can decide to encode our mono data (1
# channel) into stereo data (2 channels):
encoded_samples = encoder.to_tensor(format="wav", num_channels=2)
# channel) into stereo data (2 channels), and to specify an output sample rate:

desired_sample_rate = 32000
encoded_samples = encoder.to_tensor(format="wav", num_channels=2, sample_rate=desired_sample_rate)

stereo_samples_back = AudioDecoder(encoded_samples).get_all_samples()

print(stereo_samples_back)
play_audio(stereo_samples_back.data, rate=stereo_samples_back.sample_rate)
play_audio(stereo_samples_back.data, rate=desired_sample_rate)

# %%
# Check the docstring of the encoding methods to learn about the different
Expand Down
187 changes: 143 additions & 44 deletions src/torchcodec/_core/Encoder.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -109,7 +109,7 @@ AudioEncoder::AudioEncoder(
int sampleRate,
std::string_view fileName,
const AudioStreamOptions& audioStreamOptions)
: samples_(validateSamples(samples)) {
: samples_(validateSamples(samples)), inSampleRate_(sampleRate) {
setFFmpegLogLevel();
AVFormatContext* avFormatContext = nullptr;
int status = avformat_alloc_output_context2(
Expand All @@ -132,7 +132,7 @@ AudioEncoder::AudioEncoder(
", make sure it's a valid path? ",
getFFMPEGErrorStringFromErrorCode(status));

initializeEncoder(sampleRate, audioStreamOptions);
initializeEncoder(audioStreamOptions);
}

AudioEncoder::AudioEncoder(
Expand All @@ -142,6 +142,7 @@ AudioEncoder::AudioEncoder(
std::unique_ptr<AVIOToTensorContext> avioContextHolder,
const AudioStreamOptions& audioStreamOptions)
: samples_(validateSamples(samples)),
inSampleRate_(sampleRate),
avioContextHolder_(std::move(avioContextHolder)) {
setFFmpegLogLevel();
AVFormatContext* avFormatContext = nullptr;
Expand All @@ -159,11 +160,10 @@ AudioEncoder::AudioEncoder(

avFormatContext_->pb = avioContextHolder_->getAVIOContext();

initializeEncoder(sampleRate, audioStreamOptions);
initializeEncoder(audioStreamOptions);
}

void AudioEncoder::initializeEncoder(
int sampleRate,
const AudioStreamOptions& audioStreamOptions) {
// We use the AVFormatContext's default codec for that
// specific format/container.
Expand Down Expand Up @@ -191,8 +191,9 @@ void AudioEncoder::initializeEncoder(
// not related to the input samples.
setDefaultChannelLayout(avCodecContext_, outNumChannels_);

validateSampleRate(*avCodec, sampleRate);
avCodecContext_->sample_rate = sampleRate;
outSampleRate_ = audioStreamOptions.sampleRate.value_or(inSampleRate_);
validateSampleRate(*avCodec, outSampleRate_);
avCodecContext_->sample_rate = outSampleRate_;

// Input samples are expected to be FLTP. Not all encoders support FLTP, so we
// may need to convert the samples into a supported output sample format,
Expand All @@ -217,6 +218,21 @@ void AudioEncoder::initializeEncoder(
"avcodec_parameters_from_context failed: ",
getFFMPEGErrorStringFromErrorCode(status));
streamIndex_ = avStream->index;

// If sample rate conversion is needed and the encoder doesn't support
// variable frame size, we need to create an intermediate FIFO. See
// [Encoding loop, sample rate conversion and FIFO].
if (((avCodec->capabilities & AV_CODEC_CAP_VARIABLE_FRAME_SIZE) == 0) &&
(inSampleRate_ != outSampleRate_)) {
// frame_size * 2 is a decent default size. FFmpeg automatically
// re-allocates the fifo if more space is needed.
auto avAudioFifo = av_audio_fifo_alloc(
avCodecContext_->sample_fmt,
outNumChannels_,
avCodecContext_->frame_size * 2);
TORCH_CHECK(avAudioFifo != nullptr, "Couldn't create AVAudioFifo.");
avAudioFifo_.reset(avAudioFifo);
}
}

torch::Tensor AudioEncoder::encodeToTensor() {
Expand All @@ -234,24 +250,15 @@ void AudioEncoder::encode() {
TORCH_CHECK(!encodeWasCalled_, "Cannot call encode() twice.");
encodeWasCalled_ = true;

UniqueAVFrame avFrame(av_frame_alloc());
TORCH_CHECK(avFrame != nullptr, "Couldn't allocate AVFrame.");
// Default to 256 like in torchaudio
int numSamplesAllocatedPerFrame =
avCodecContext_->frame_size > 0 ? avCodecContext_->frame_size : 256;
avFrame->nb_samples = numSamplesAllocatedPerFrame;
avFrame->format = AV_SAMPLE_FMT_FLTP;
avFrame->sample_rate = avCodecContext_->sample_rate;
UniqueAVFrame avFrame = allocateAVFrame(
numSamplesAllocatedPerFrame,
inSampleRate_,
static_cast<int>(samples_.sizes()[0]),
AV_SAMPLE_FMT_FLTP);
avFrame->pts = 0;
// We set the channel layout of the frame to the default layout corresponding
// to the input samples' number of channels
setDefaultChannelLayout(avFrame, static_cast<int>(samples_.sizes()[0]));

auto status = av_frame_get_buffer(avFrame.get(), 0);
TORCH_CHECK(
status == AVSUCCESS,
"Couldn't allocate avFrame's buffers: ",
getFFMPEGErrorStringFromErrorCode(status));

AutoAVPacket autoAVPacket;

Expand All @@ -261,19 +268,13 @@ void AudioEncoder::encode() {
int numBytesPerSample = static_cast<int>(samples_.element_size());
int numBytesPerChannel = numSamples * numBytesPerSample;

status = avformat_write_header(avFormatContext_.get(), nullptr);
auto status = avformat_write_header(avFormatContext_.get(), nullptr);
TORCH_CHECK(
status == AVSUCCESS,
"Error in avformat_write_header: ",
getFFMPEGErrorStringFromErrorCode(status));

while (numEncodedSamples < numSamples) {
status = av_frame_make_writable(avFrame.get());
TORCH_CHECK(
status == AVSUCCESS,
"Couldn't make AVFrame writable: ",
getFFMPEGErrorStringFromErrorCode(status));

int numSamplesToEncode =
std::min(numSamplesAllocatedPerFrame, numSamples - numEncodedSamples);
int numBytesToEncode = numSamplesToEncode * numBytesPerSample;
Expand All @@ -294,10 +295,9 @@ void AudioEncoder::encode() {
avFrame->nb_samples = numSamplesToEncode;

UniqueAVFrame convertedAVFrame = maybeConvertAVFrame(avFrame);
encodeInnerLoop(autoAVPacket, convertedAVFrame);
encodeFrameThroughFifo(autoAVPacket, convertedAVFrame);

numEncodedSamples += numSamplesToEncode;
avFrame->pts += static_cast<int64_t>(numSamplesToEncode);
}
TORCH_CHECK(numEncodedSamples == numSamples, "Hmmmmmm something went wrong.");

Expand All @@ -313,7 +313,8 @@ void AudioEncoder::encode() {
UniqueAVFrame AudioEncoder::maybeConvertAVFrame(const UniqueAVFrame& avFrame) {
if (static_cast<AVSampleFormat>(avFrame->format) ==
avCodecContext_->sample_fmt &&
getNumChannels(avFrame) == outNumChannels_) {
getNumChannels(avFrame) == outNumChannels_ &&
avFrame->sample_rate == outSampleRate_) {
// Note: the clone references the same underlying data, it's a cheap copy.
return UniqueAVFrame(av_frame_clone(avFrame.get()));
}
Expand All @@ -322,31 +323,99 @@ UniqueAVFrame AudioEncoder::maybeConvertAVFrame(const UniqueAVFrame& avFrame) {
swrContext_.reset(createSwrContext(
static_cast<AVSampleFormat>(avFrame->format),
avCodecContext_->sample_fmt,
avFrame->sample_rate, // No sample rate conversion
avFrame->sample_rate,
outSampleRate_,
avFrame,
outNumChannels_));
}
UniqueAVFrame convertedAVFrame = convertAudioAVFrameSamples(
swrContext_,
avFrame,
avCodecContext_->sample_fmt,
avFrame->sample_rate, // No sample rate conversion
outSampleRate_,
outNumChannels_);

if (avFrame->sample_rate == outSampleRate_) {
TORCH_CHECK(
convertedAVFrame->nb_samples == avFrame->nb_samples,
"convertedAVFrame->nb_samples=",
convertedAVFrame->nb_samples,
" differs from ",
"avFrame->nb_samples=",
avFrame->nb_samples,
"This is unexpected, please report on the TorchCodec bug tracker.");
}
return convertedAVFrame;
}

// Sends `avFrame` to the encoder, going through the intermediate FIFO when one
// exists.
//
// - If avAudioFifo_ is null, the frame is passed directly to encodeFrame().
// - Otherwise all of the frame's samples are written into the FIFO, and frames
//   of up to `frame_size` samples are read back out of it and passed to
//   encodeFrame().
//
// Parameters:
// - autoAVPacket: forwarded as-is to encodeFrame().
// - avFrame: the frame whose samples are to be encoded.
// - flushFifo: when true, the FIFO is drained down to its last sample.
//   NOTE(review): encode() calls this with only two arguments, so the
//   declaration presumably defaults flushFifo to false - confirm in the header.
void AudioEncoder::encodeFrameThroughFifo(
AutoAVPacket& autoAVPacket,
const UniqueAVFrame& avFrame,
// flushFifo is only set to true in maybeFlushSwrBuffers(), i.e. at the very
// end of the encoding process when we're flushing buffers. We also want to
// flush the FIFO so as to not leave any remaining samples in it.
bool flushFifo) {
// No FIFO was created: hand the frame straight to the encoder.
if (avAudioFifo_ == nullptr) {
encodeFrame(autoAVPacket, avFrame);
return;
}
// Queue every sample of the incoming frame in the FIFO.
int numSamplesWritten = av_audio_fifo_write(
avAudioFifo_.get(),
reinterpret_cast<void**>(avFrame->data),
avFrame->nb_samples);
TORCH_CHECK(
convertedAVFrame->nb_samples == avFrame->nb_samples,
"convertedAVFrame->nb_samples=",
convertedAVFrame->nb_samples,
" differs from ",
"avFrame->nb_samples=",
numSamplesWritten == avFrame->nb_samples,
"Tried to write ",
avFrame->nb_samples,
"This is unexpected, please report on the TorchCodec bug tracker.");
return convertedAVFrame;
" samples, but only wrote ",
numSamplesWritten);

// Scratch frame, sized to `frame_size` samples, that FIFO samples are read
// into before being sent to encodeFrame().
UniqueAVFrame newavFrame = allocateAVFrame(
avCodecContext_->frame_size,
outSampleRate_,
outNumChannels_,
avCodecContext_->sample_fmt);

// Explaining the while bound:
// - if we're not flushing the FIFO, i.e. in most cases, we want to pull
// exactly `frame_size` samples from the FIFO, so we have to stop before it
// contains less than `frame_size` samples.
// - if we're flushing the FIFO, we want to read from the FIFO until the very
// last sample it contains.
//
// In both cases, for as long as we can, we're trying to pull exactly
// `frame_size` samples from the FIFO and send each `frame_size`-sized avFrame
// to encodeFrame(). Only the very last avFrame of the encoding process is
// allowed to contain less than frame_size samples. That only happens when
// flushFifo is true.
while (av_audio_fifo_size(avAudioFifo_.get()) >=
(flushFifo ? 1 : avCodecContext_->frame_size)) {
// Never ask for more samples than the FIFO currently holds.
int samplesToRead = std::min(
av_audio_fifo_size(avAudioFifo_.get()), newavFrame->nb_samples);
int numSamplesRead = av_audio_fifo_read(
avAudioFifo_.get(),
reinterpret_cast<void**>(newavFrame->data),
samplesToRead);
TORCH_CHECK(
numSamplesRead == samplesToRead,
"Tried to read ",
samplesToRead,
" samples, but only read ",
numSamplesRead);

// Record how many samples this frame actually carries before encoding it.
newavFrame->nb_samples = numSamplesRead;
encodeFrame(autoAVPacket, newavFrame);
}
}

void AudioEncoder::encodeInnerLoop(
void AudioEncoder::encodeFrame(
AutoAVPacket& autoAVPacket,
const UniqueAVFrame& avFrame) {
if (avFrame != nullptr) {
avFrame->pts = lastEncodedAVFramePts_;
lastEncodedAVFramePts_ += avFrame->nb_samples;
}

auto status = avcodec_send_frame(avCodecContext_.get(), avFrame.get());
TORCH_CHECK(
status == AVSUCCESS,
Expand Down Expand Up @@ -385,11 +454,41 @@ void AudioEncoder::encodeInnerLoop(
}
}

// Flushes any samples still buffered inside libswresample and sends them to
// the encoder (through the intermediate FIFO if there is one, which is then
// flushed as well).
//
// Similar to the decoder's method with the same name, but for encoding this
// time. That is, when sample conversion is involved, libswresample may have
// buffered some samples that we now need to flush and send to the encoder.
//
// Parameters:
// - autoAVPacket: forwarded to encodeFrameThroughFifo().
//
// Fails (via TORCH_CHECK) if sample rate conversion is needed but no
// SwrContext exists, or if libswresample reports an error while flushing.
void AudioEncoder::maybeFlushSwrBuffers(AutoAVPacket& autoAVPacket) {
  // Nothing was ever converted, and nothing needed to be: nothing to flush.
  if (swrContext_ == nullptr && inSampleRate_ == outSampleRate_) {
    return;
  }
  TORCH_CHECK(
      swrContext_ != nullptr,
      "swrContext is null, but sample rate conversion is needed. ",
      "This is unexpected, please report on the TorchCodec bug tracker.");

  int numRemainingSamples = // this is an upper bound
      swr_get_out_samples(swrContext_.get(), 0);
  // swr_get_out_samples() reports errors as negative values. Without this
  // check a negative value would be used as a frame size below.
  TORCH_CHECK(
      numRemainingSamples >= 0,
      "swr_get_out_samples failed with error code ",
      numRemainingSamples);
  if (numRemainingSamples == 0) {
    return;
  }

  UniqueAVFrame avFrame = allocateAVFrame(
      numRemainingSamples,
      outSampleRate_,
      outNumChannels_,
      avCodecContext_->sample_fmt);
  // Passing a null input with 0 input samples asks libswresample to output
  // whatever it still has buffered.
  int actualNumRemainingSamples = swr_convert(
      swrContext_.get(), avFrame->data, avFrame->nb_samples, nullptr, 0);
  // swr_convert() returns a negative value on error: never let that become
  // the frame's sample count.
  TORCH_CHECK(
      actualNumRemainingSamples >= 0,
      "swr_convert failed while flushing, error code ",
      actualNumRemainingSamples);
  avFrame->nb_samples = actualNumRemainingSamples;

  // We're potentially sending avFrame through the FIFO (if it exists), in which
  // case we also want to flush the FIFO itself.
  encodeFrameThroughFifo(autoAVPacket, avFrame, /*flushFifo=*/true);
}

void AudioEncoder::flushBuffers() {
// We flush the main FFmpeg buffers, but not swresample buffers. Flushing
// swresample is only necessary when converting sample rates, which we don't
// do for encoding.
AutoAVPacket autoAVPacket;
encodeInnerLoop(autoAVPacket, UniqueAVFrame(nullptr));
maybeFlushSwrBuffers(autoAVPacket);
encodeFrame(autoAVPacket, UniqueAVFrame(nullptr));
}
} // namespace facebook::torchcodec
Loading
Loading