Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
17 changes: 17 additions & 0 deletions .github/workflows/test-extra.yml
Original file line number Diff line number Diff line change
Expand Up @@ -485,6 +485,23 @@ jobs:
- name: Build llama-cpp backend image and run gRPC e2e tests
run: |
make test-extra-backend-llama-cpp
tests-llama-cpp-grpc-transcription:
needs: detect-changes
if: needs.detect-changes.outputs.llama-cpp == 'true' || needs.detect-changes.outputs.run-all == 'true'
runs-on: ubuntu-latest
timeout-minutes: 90
steps:
- name: Clone
uses: actions/checkout@v6
with:
submodules: true
- name: Setup Go
uses: actions/setup-go@v5
with:
go-version: '1.25.4'
- name: Build llama-cpp backend image and run audio transcription gRPC e2e tests
run: |
make test-extra-backend-llama-cpp-transcription
tests-ik-llama-cpp-grpc:
needs: detect-changes
if: needs.detect-changes.outputs.ik-llama-cpp == 'true' || needs.detect-changes.outputs.run-all == 'true'
Expand Down
17 changes: 17 additions & 0 deletions Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -493,6 +493,10 @@ test-extra-backend: protogen-go
BACKEND_TEST_MODEL_URL="$${BACKEND_TEST_MODEL_URL:-$(BACKEND_TEST_MODEL_URL)}" \
BACKEND_TEST_MODEL_FILE="$$BACKEND_TEST_MODEL_FILE" \
BACKEND_TEST_MODEL_NAME="$$BACKEND_TEST_MODEL_NAME" \
BACKEND_TEST_MMPROJ_URL="$$BACKEND_TEST_MMPROJ_URL" \
BACKEND_TEST_MMPROJ_FILE="$$BACKEND_TEST_MMPROJ_FILE" \
BACKEND_TEST_AUDIO_URL="$$BACKEND_TEST_AUDIO_URL" \
BACKEND_TEST_AUDIO_FILE="$$BACKEND_TEST_AUDIO_FILE" \
BACKEND_TEST_CAPS="$$BACKEND_TEST_CAPS" \
BACKEND_TEST_PROMPT="$$BACKEND_TEST_PROMPT" \
BACKEND_TEST_OPTIONS="$$BACKEND_TEST_OPTIONS" \
Expand All @@ -507,6 +511,19 @@ test-extra-backend-llama-cpp: docker-build-llama-cpp
test-extra-backend-ik-llama-cpp: docker-build-ik-llama-cpp
BACKEND_IMAGE=local-ai-backend:ik-llama-cpp $(MAKE) test-extra-backend

## Audio transcription wrapper for the llama-cpp backend.
## Drives the new AudioTranscription / AudioTranscriptionStream RPCs against
## ggml-org/Qwen3-ASR-0.6B-GGUF (a small ASR model that requires its mmproj
## audio encoder companion). The audio fixture is a short public-domain
## "jfk.wav" clip ggml-org bundles with whisper.cpp's CI assets.
test-extra-backend-llama-cpp-transcription: docker-build-llama-cpp
BACKEND_IMAGE=local-ai-backend:llama-cpp \
BACKEND_TEST_MODEL_URL=https://huggingface.co/ggml-org/Qwen3-ASR-0.6B-GGUF/resolve/main/Qwen3-ASR-0.6B-Q8_0.gguf \
BACKEND_TEST_MMPROJ_URL=https://huggingface.co/ggml-org/Qwen3-ASR-0.6B-GGUF/resolve/main/mmproj-Qwen3-ASR-0.6B-Q8_0.gguf \
BACKEND_TEST_AUDIO_URL=https://github.com/ggml-org/whisper.cpp/raw/master/samples/jfk.wav \
BACKEND_TEST_CAPS=health,load,transcription \
$(MAKE) test-extra-backend

## vllm is resolved from a HuggingFace model id (no file download) and
## exercises Predict + streaming + tool-call extraction via the hermes parser.
## Requires a host CPU with the SIMD instructions the prebuilt vllm CPU
Expand Down
11 changes: 11 additions & 0 deletions backend/backend.proto
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,7 @@ service Backend {
rpc GenerateImage(GenerateImageRequest) returns (Result) {}
rpc GenerateVideo(GenerateVideoRequest) returns (Result) {}
rpc AudioTranscription(TranscriptRequest) returns (TranscriptResult) {}
rpc AudioTranscriptionStream(TranscriptRequest) returns (stream TranscriptStreamResponse) {}
rpc TTS(TTSRequest) returns (Result) {}
rpc TTSStream(TTSRequest) returns (stream Reply) {}
rpc SoundGeneration(SoundGenerationRequest) returns (Result) {}
Expand Down Expand Up @@ -322,11 +323,21 @@ message TranscriptRequest {
bool translate = 5;
bool diarize = 6;
string prompt = 7;
float temperature = 8;
repeated string timestamp_granularities = 9;
bool stream = 10;
}

message TranscriptResult {
repeated TranscriptSegment segments = 1;
string text = 2;
string language = 3;
float duration = 4;
}

message TranscriptStreamResponse {
string delta = 1;
TranscriptResult final_result = 2;
}

message TranscriptSegment {
Expand Down
2 changes: 1 addition & 1 deletion backend/cpp/llama-cpp/Makefile
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@

LLAMA_VERSION?=e97492369888f5311e4d1f3beb325a36bbed70e9
LLAMA_VERSION?=6a6780a232b73fe44799b0c0d5f01c61612f1b79
LLAMA_REPO?=https://github.com/ggerganov/llama.cpp

CMAKE_ARGS?=
Expand Down
136 changes: 136 additions & 0 deletions backend/cpp/llama-cpp/grpc-server.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,8 @@
#include <regex>
#include <atomic>
#include <cstdlib>
#include <fstream>
#include <iterator>
#include <mutex>
#include <signal.h>
#include <thread>
Expand Down Expand Up @@ -76,6 +78,27 @@ static grpc::Status checkAuth(grpc::ServerContext* context) {
return grpc::Status(grpc::StatusCode::UNAUTHENTICATED, "invalid token");
}

// Minimal base64 encoder. The C++ backend already pulls in base64_decode from
// llama.cpp's server-common.cpp, but no encoder is exposed — and we need one to
// hand audio bytes to the existing PredictOptions.audios path (which expects
// base64-encoded strings, just like images).
static std::string base64_encode_bytes(const unsigned char* data, size_t len) {
static const char tbl[] =
"ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/";
std::string out;
out.reserve(((len + 2) / 3) * 4);
for (size_t i = 0; i < len; i += 3) {
uint32_t triple = (uint32_t(data[i]) << 16);
if (i + 1 < len) triple |= (uint32_t(data[i + 1]) << 8);
if (i + 2 < len) triple |= uint32_t(data[i + 2]);
out.push_back(tbl[(triple >> 18) & 0x3F]);
out.push_back(tbl[(triple >> 12) & 0x3F]);
out.push_back(i + 1 < len ? tbl[(triple >> 6) & 0x3F] : '=');
out.push_back(i + 2 < len ? tbl[triple & 0x3F] : '=');
}
return out;
}

// END LocalAI


Expand Down Expand Up @@ -2931,6 +2954,119 @@ class BackendServiceImpl final : public backend::Backend::Service {

return grpc::Status::OK;
}

// runTranscriptionAsCompletion implements OAI /v1/audio/transcriptions on
// top of the existing chat-completion + multimodal-audio pipeline, exactly
// the way upstream llama.cpp's server does it (see
// tools/server/server-context.cpp post_transcriptions_oai → forwards into
// handle_completions_impl with a single user message attaching the audio
// file via the mtmd marker).
//
// We synthesize a backend::PredictOptions with one user message
// ("Transcribe audio to text" + optional language hint) and the audio
// bytes attached via the existing PredictOptions.audios field, then
// delegate to our own Predict() handler. This keeps every multimodal
// codepath identical to the chat path and avoids duplicating ~700 lines
// of task-construction logic.
grpc::Status runTranscriptionAsCompletion(grpc::ServerContext* context,
const backend::TranscriptRequest* request,
backend::Reply* out_reply) {
if (params_base.model.path.empty()) {
return grpc::Status(grpc::StatusCode::FAILED_PRECONDITION, "Model not loaded");
}
if (request->dst().empty()) {
return grpc::Status(grpc::StatusCode::INVALID_ARGUMENT, "dst (audio file path) is required");
}

// Read audio bytes from the path LocalAI's HTTP layer wrote.
std::ifstream f(request->dst(), std::ios::binary);
if (!f.is_open()) {
return grpc::Status(grpc::StatusCode::INVALID_ARGUMENT, "failed to open audio file: " + request->dst());
}
std::vector<unsigned char> bytes((std::istreambuf_iterator<char>(f)),
std::istreambuf_iterator<char>());
f.close();
if (bytes.empty()) {
return grpc::Status(grpc::StatusCode::INVALID_ARGUMENT, "audio file is empty: " + request->dst());
}

std::string b64 = base64_encode_bytes(bytes.data(), bytes.size());

// Build the same prompt upstream uses in convert_transcriptions_to_chatcmpl.
std::string user_prompt = "Transcribe audio to text";
if (!request->language().empty()) {
user_prompt += " (language: " + request->language() + ")";
}
if (!request->prompt().empty()) {
// Optional context hint from the caller.
user_prompt += "\n" + request->prompt();
}

backend::PredictOptions synthetic;
synthetic.set_usetokenizertemplate(true);
synthetic.set_temperature(request->temperature());
// Generation length: leave at 0 so parse_options uses -1 (model default).
// The model's stop tokens / EOS handle termination naturally for ASR.
backend::Message* msg = synthetic.add_messages();
msg->set_role("user");
msg->set_content(user_prompt);
synthetic.add_audios(b64);

return Predict(context, &synthetic, out_reply);
}

grpc::Status AudioTranscription(ServerContext* context,
const backend::TranscriptRequest* request,
backend::TranscriptResult* response) override {
auto auth = checkAuth(context);
if (!auth.ok()) return auth;

backend::Reply reply;
grpc::Status st = runTranscriptionAsCompletion(context, request, &reply);
if (!st.ok()) {
return st;
}
response->set_text(reply.message());
if (!request->language().empty()) {
response->set_language(request->language());
}
return grpc::Status::OK;
}

grpc::Status AudioTranscriptionStream(ServerContext* context,
const backend::TranscriptRequest* request,
grpc::ServerWriter<backend::TranscriptStreamResponse>* writer) override {
auto auth = checkAuth(context);
if (!auth.ok()) return auth;

// Buffered streaming: run the transcription as a normal chat
// completion, then emit one delta + one final event. Real
// token-by-token streaming would require refactoring PredictStream's
// 700-line writer-coupled body; the HTTP/SSE contract is identical
// either way, and clients that only consume the assembled text don't
// notice the difference.
backend::Reply reply;
grpc::Status st = runTranscriptionAsCompletion(context, request, &reply);
if (!st.ok()) {
return st;
}

const std::string& text = reply.message();
if (!text.empty()) {
backend::TranscriptStreamResponse delta_chunk;
delta_chunk.set_delta(text);
writer->Write(delta_chunk);
}

backend::TranscriptStreamResponse final_chunk;
backend::TranscriptResult* final_result = final_chunk.mutable_final_result();
final_result->set_text(text);
if (!request->language().empty()) {
final_result->set_language(request->language());
}
writer->Write(final_chunk);
return grpc::Status::OK;
}
};


Expand Down
1 change: 1 addition & 0 deletions backend/go/voxtral/govoxtral.go
Original file line number Diff line number Diff line change
Expand Up @@ -56,5 +56,6 @@ func (v *Voxtral) AudioTranscription(opts *pb.TranscriptRequest) (pb.TranscriptR
return pb.TranscriptResult{
Segments: segments,
Text: text,
Language: opts.Language,
}, nil
}
8 changes: 8 additions & 0 deletions backend/go/whisper/gowhisper.go
Original file line number Diff line number Diff line change
Expand Up @@ -120,6 +120,12 @@ func (w *Whisper) AudioTranscription(opts *pb.TranscriptRequest) (pb.TranscriptR
}

data := buf.AsFloat32Buffer().Data
// whisper.cpp resamples to 16 kHz internally; this matches buf.Format.SampleRate
// for the converted file produced by AudioToWav above.
var duration float32
if buf.Format != nil && buf.Format.SampleRate > 0 {
duration = float32(len(data)) / float32(buf.Format.SampleRate)
}
segsLen := uintptr(0xdeadbeef)
segsLenPtr := unsafe.Pointer(&segsLen)

Expand Down Expand Up @@ -158,5 +164,7 @@ func (w *Whisper) AudioTranscription(opts *pb.TranscriptRequest) (pb.TranscriptR
return pb.TranscriptResult{
Segments: segments,
Text: strings.TrimSpace(text),
Language: opts.Language,
Duration: duration,
}, nil
}
Loading
Loading