From f5f9cc0709c40609969689b2310417a6eca29f68 Mon Sep 17 00:00:00 2001 From: Ettore Di Giacinto Date: Mon, 25 May 2026 19:56:26 +0000 Subject: [PATCH] feat(stablediffusion-ggml): mux LTX-2 audio into output MP4 sd.cpp's generate_video now returns a sd_audio_t* alongside the video frames for models with an audio VAE (LTX-2.3). Our gosd wrapper was already collecting that pointer but immediately freed it without ever muxing it into the output, so LTX-2 generations landed as silent MP4s even though the audio VAE decode succeeded. Stage the planar float32 waveform to a temp WAV (IEEE float, header hand-built; samples interleaved on the fly), then add it as a second ffmpeg input with -c:a aac -map 0:v:0 -map 1:a:0 -shortest. The temp WAV is cleaned up unconditionally after ffmpeg exits, including on the write/waitpid error paths. Non-LTX models (Wan i2v / FLF2V) keep their current behaviour: audio arg is nullptr, the audio-related ffmpeg flags are not added, and no temp file is created. Assisted-by: Claude:claude-opus-4-7 --- backend/go/stablediffusion-ggml/cpp/gosd.cpp | 172 ++++++++++++++++--- 1 file changed, 145 insertions(+), 27 deletions(-) diff --git a/backend/go/stablediffusion-ggml/cpp/gosd.cpp b/backend/go/stablediffusion-ggml/cpp/gosd.cpp index 8a7c187ffbb1..3fa0f7063f72 100644 --- a/backend/go/stablediffusion-ggml/cpp/gosd.cpp +++ b/backend/go/stablediffusion-ggml/cpp/gosd.cpp @@ -27,6 +27,7 @@ #include #include #include +#include #include #include #include @@ -1075,9 +1076,71 @@ static uint8_t* load_and_resize_image(const char* path, int target_width, int ta return buf; } +// Write sd.cpp's audio buffer to a temp WAV file (IEEE float, interleaved). +// sd_audio_t.data is planar (all channel 0 samples, then channel 1, etc.) — we +// interleave on the fly so ffmpeg's standard wav demuxer can read it directly. +// Returns 0 on success and fills wav_path (must be at least 64 bytes). +static int write_planar_float_wav(const sd_audio_t* a, char* wav_path, size_t wav_path_sz) { + if (!a || !a->data || a->sample_count == 0 || a->channels == 0 || a->sample_rate == 0) { + return -1; + } + + snprintf(wav_path, wav_path_sz, "/tmp/gosd-audio-XXXXXX.wav"); + int fd = mkstemps(wav_path, 4); + if (fd < 0) { perror("mkstemps wav"); return -1; } + FILE* f = fdopen(fd, "wb"); + if (!f) { perror("fdopen wav"); close(fd); return -1; } + + uint64_t frames = a->sample_count; + uint32_t channels = a->channels; + uint32_t sample_rate = a->sample_rate; + uint64_t total_samples64 = frames * (uint64_t)channels; + uint64_t data_bytes64 = total_samples64 * sizeof(float); + if (data_bytes64 > 0xFFFFFFFFull - 44) { + fprintf(stderr, "audio too large for 32-bit WAV (%" PRIu64 " bytes)\n", data_bytes64); + fclose(f); + unlink(wav_path); + return -1; + } + uint32_t data_bytes = (uint32_t)data_bytes64; + uint32_t riff_size = 36 + data_bytes; + uint16_t fmt_code = 3; // WAVE_FORMAT_IEEE_FLOAT + uint16_t bits_per_sample = 32; + uint16_t block_align = (uint16_t)(channels * sizeof(float)); + uint32_t byte_rate = sample_rate * block_align; + uint16_t ch16 = (uint16_t)channels; + uint32_t fmt_size = 16; + + fwrite("RIFF", 1, 4, f); + fwrite(&riff_size, 4, 1, f); + fwrite("WAVEfmt ", 1, 8, f); + fwrite(&fmt_size, 4, 1, f); + fwrite(&fmt_code, 2, 1, f); + fwrite(&ch16, 2, 1, f); + fwrite(&sample_rate, 4, 1, f); + fwrite(&byte_rate, 4, 1, f); + fwrite(&block_align, 2, 1, f); + fwrite(&bits_per_sample, 2, 1, f); + fwrite("data", 1, 4, f); + fwrite(&data_bytes, 4, 1, f); + + // Interleave planar [ch0_samples..., ch1_samples...] → [ch0_s0, ch1_s0, ...] + for (uint64_t s = 0; s < frames; s++) { + for (uint32_t c = 0; c < channels; c++) { + float v = a->data[(size_t)c * frames + s]; + fwrite(&v, sizeof(float), 1, f); + } + } + fclose(f); + return 0; +} + // Pipe raw RGB/RGBA frames to ffmpeg stdin and let it produce an MP4 at dst. -// Uses fork+execvp to avoid shell interpretation of dst. -static int ffmpeg_mux_raw_to_mp4(sd_image_t* frames, int num_frames, int fps, const char* dst) { +// Uses fork+execvp to avoid shell interpretation of dst. When `audio` is +// non-null, the audio waveform is staged to a temp WAV and added as a second +// ffmpeg input so the final MP4 contains both video and AAC audio. +static int ffmpeg_mux_raw_to_mp4(sd_image_t* frames, int num_frames, int fps, + const sd_audio_t* audio, const char* dst) { if (num_frames <= 0 || !frames || !frames[0].data) { fprintf(stderr, "ffmpeg_mux: empty frames\n"); return 1; @@ -1092,38 +1155,87 @@ static int ffmpeg_mux_raw_to_mp4(sd_image_t* frames, int num_frames, int fps, co snprintf(size_str, sizeof(size_str), "%dx%d", width, height); snprintf(fps_str, sizeof(fps_str), "%d", fps); + // Optional audio: write a temp WAV file if the model produced audio. + char wav_path[64] = {0}; + bool have_audio = false; + if (audio && audio->data && audio->sample_count > 0 && audio->channels > 0 && audio->sample_rate > 0) { + if (write_planar_float_wav(audio, wav_path, sizeof(wav_path)) == 0) { + have_audio = true; + fprintf(stderr, "ffmpeg_mux: audio %u Hz × %u ch × %" PRIu64 " frames → %s\n", + audio->sample_rate, audio->channels, audio->sample_count, wav_path); + } else { + fprintf(stderr, "ffmpeg_mux: failed to stage audio; producing silent video\n"); + } + } + int pipefd[2]; - if (pipe(pipefd) != 0) { perror("pipe"); return 1; } + if (pipe(pipefd) != 0) { + perror("pipe"); + if (have_audio) unlink(wav_path); + return 1; + } pid_t pid = fork(); - if (pid < 0) { perror("fork"); close(pipefd[0]); close(pipefd[1]); return 1; } + if (pid < 0) { + perror("fork"); + close(pipefd[0]); close(pipefd[1]); + if (have_audio) unlink(wav_path); + return 1; + } if (pid == 0) { // child close(pipefd[1]); if (dup2(pipefd[0], STDIN_FILENO) < 0) { perror("dup2"); _exit(127); } close(pipefd[0]); - std::vector argv = { - const_cast("ffmpeg"), - const_cast("-y"), - const_cast("-hide_banner"), - const_cast("-loglevel"), const_cast("warning"), - const_cast("-f"), const_cast("rawvideo"), - const_cast("-pix_fmt"), const_cast(pix_fmt_in), - const_cast("-s"), size_str, - const_cast("-framerate"), fps_str, - const_cast("-i"), const_cast("-"), - const_cast("-c:v"), const_cast("libx264"), - const_cast("-pix_fmt"), const_cast("yuv420p"), - const_cast("-movflags"), const_cast("+faststart"), - // Force MP4 container. Distributed LocalAI hands us a staging - // path (e.g. /staging/localai-output-NNN.tmp) with a non-standard - // extension; relying on filename suffix makes ffmpeg bail with - // "Unable to choose an output format". - const_cast("-f"), const_cast("mp4"), - const_cast(dst), - nullptr - }; + std::vector argv; + argv.push_back(const_cast("ffmpeg")); + argv.push_back(const_cast("-y")); + argv.push_back(const_cast("-hide_banner")); + argv.push_back(const_cast("-loglevel")); + argv.push_back(const_cast("warning")); + // Input 0: raw video from stdin + argv.push_back(const_cast("-f")); + argv.push_back(const_cast("rawvideo")); + argv.push_back(const_cast("-pix_fmt")); + argv.push_back(const_cast(pix_fmt_in)); + argv.push_back(const_cast("-s")); + argv.push_back(size_str); + argv.push_back(const_cast("-framerate")); + argv.push_back(fps_str); + argv.push_back(const_cast("-i")); + argv.push_back(const_cast("-")); + // Input 1: optional audio WAV + if (have_audio) { + argv.push_back(const_cast("-i")); + argv.push_back(wav_path); + argv.push_back(const_cast("-map")); + argv.push_back(const_cast("0:v:0")); + argv.push_back(const_cast("-map")); + argv.push_back(const_cast("1:a:0")); + argv.push_back(const_cast("-c:a")); + argv.push_back(const_cast("aac")); + argv.push_back(const_cast("-b:a")); + argv.push_back(const_cast("192k")); + // -shortest so the final clip ends with the shorter of the two + // streams — guards against an audio buffer that overshoots the + // video duration (or vice versa) on certain LTX variants. + argv.push_back(const_cast("-shortest")); + } + argv.push_back(const_cast("-c:v")); + argv.push_back(const_cast("libx264")); + argv.push_back(const_cast("-pix_fmt")); + argv.push_back(const_cast("yuv420p")); + argv.push_back(const_cast("-movflags")); + argv.push_back(const_cast("+faststart")); + // Force MP4 container. Distributed LocalAI hands us a staging + // path (e.g. /staging/localai-output-NNN.tmp) with a non-standard + // extension; relying on filename suffix makes ffmpeg bail with + // "Unable to choose an output format". + argv.push_back(const_cast("-f")); + argv.push_back(const_cast("mp4")); + argv.push_back(const_cast(dst)); + argv.push_back(nullptr); execvp(argv[0], argv.data()); perror("execvp ffmpeg"); _exit(127); @@ -1148,6 +1260,7 @@ static int ffmpeg_mux_raw_to_mp4(sd_image_t* frames, int num_frames, int fps, co close(pipefd[1]); int status; waitpid(pid, &status, 0); + if (have_audio) unlink(wav_path); return 1; } p += n; @@ -1158,8 +1271,13 @@ static int ffmpeg_mux_raw_to_mp4(sd_image_t* frames, int num_frames, int fps, co int status = 0; while (waitpid(pid, &status, 0) < 0) { - if (errno != EINTR) { perror("waitpid"); return 1; } + if (errno != EINTR) { + perror("waitpid"); + if (have_audio) unlink(wav_path); + return 1; + } } + if (have_audio) unlink(wav_path); if (!WIFEXITED(status) || WEXITSTATUS(status) != 0) { fprintf(stderr, "ffmpeg exited with status %d\n", status); return 1; @@ -1234,7 +1352,7 @@ int gen_video(sd_vid_gen_params_t *p, int steps, char *dst, float cfg_scale, int fprintf(stderr, "Generated %d frames, muxing to %s via ffmpeg\n", num_frames_out, dst); - int rc = ffmpeg_mux_raw_to_mp4(frames, num_frames_out, fps, dst); + int rc = ffmpeg_mux_raw_to_mp4(frames, num_frames_out, fps, audio, dst); for (int i = 0; i < num_frames_out; i++) { if (frames[i].data) free(frames[i].data);