From b2b76a8b7642e743cd514170a99a33531474fb60 Mon Sep 17 00:00:00 2001 From: Ettore Di Giacinto Date: Tue, 9 Sep 2025 14:38:34 +0200 Subject: [PATCH 1/4] feat(chatterbox): support multilingual Signed-off-by: Ettore Di Giacinto --- backend/python/chatterbox/backend.py | 53 +++++++++++++++++++++++----- 1 file changed, 45 insertions(+), 8 deletions(-) diff --git a/backend/python/chatterbox/backend.py b/backend/python/chatterbox/backend.py index 0944202b9457..40dac9bd31d6 100644 --- a/backend/python/chatterbox/backend.py +++ b/backend/python/chatterbox/backend.py @@ -14,9 +14,15 @@ import torch import torchaudio as ta from chatterbox.tts import ChatterboxTTS - +from chatterbox.mtl_tts import ChatterboxMultilingualTTS import grpc +def is_float(s): + try: + float(s) + return True + except ValueError: + return False _ONE_DAY_IN_SECONDS = 60 * 60 * 24 @@ -47,6 +53,27 @@ def LoadModel(self, request, context): if not torch.cuda.is_available() and request.CUDA: return backend_pb2.Result(success=False, message="CUDA is not available") + + options = request.Options + + # empty dict + self.options = {} + + # The options are a list of strings in this form optname:optvalue + # We are storing all the options in a dict so we can use it later when + # generating the audio + for opt in options: + if ":" not in opt: + continue + key, value = opt.split(":") + # if value is a number, convert it to the appropriate type + if is_float(value): + if value.is_integer(): + value = int(value) + else: + value = float(value) + self.options[key] = value + self.AudioPath = None if os.path.isabs(request.AudioPath): @@ -56,10 +83,14 @@ def LoadModel(self, request, context): modelFileBase = os.path.dirname(request.ModelFile) # modify LoraAdapter to be relative to modelFileBase self.AudioPath = os.path.join(modelFileBase, request.AudioPath) - try: print("Preparing models, please wait", file=sys.stderr) - self.model = ChatterboxTTS.from_pretrained(device=device) + if "multilingual" in self.options: + # 
remove key from options + del self.options["multilingual"] + self.model = ChatterboxMultilingualTTS.from_pretrained(device=device) + else: + self.model = ChatterboxTTS.from_pretrained(device=device) except Exception as err: return backend_pb2.Result(success=False, message=f"Unexpected {err=}, {type(err)=}") # Implement your logic here for the LoadModel service @@ -68,12 +99,18 @@ def LoadModel(self, request, context): def TTS(self, request, context): try: - # Generate audio using ChatterboxTTS + kwargs = {} + + if "language" in self.options: + kwargs["language_id"] = self.options["language"] if self.AudioPath is not None: - wav = self.model.generate(request.text, audio_prompt_path=self.AudioPath) - else: - wav = self.model.generate(request.text) - + kwargs["audio_prompt_path"] = self.AudioPath + + # add options to kwargs + kwargs.update(self.options) + + # Generate audio using ChatterboxTTS + wav = self.model.generate(request.text, **kwargs) # Save the generated audio ta.save(request.dst, wav, self.model.sr) From 88aea3d3b3524d551468dc2f9b46f3bb214ae3cf Mon Sep 17 00:00:00 2001 From: Ettore Di Giacinto Date: Wed, 24 Sep 2025 18:05:15 +0200 Subject: [PATCH 2/4] Add l4t support Signed-off-by: Ettore Di Giacinto --- .github/workflows/backend.yml | 12 ++++++++++++ backend/index.yaml | 12 ++++++++++++ backend/python/chatterbox/install.sh | 1 + backend/python/chatterbox/requirements-l4t.txt | 6 ++++++ 4 files changed, 31 insertions(+) create mode 100644 backend/python/chatterbox/requirements-l4t.txt diff --git a/.github/workflows/backend.yml b/.github/workflows/backend.yml index b303de1f420d..048e9a47b955 100644 --- a/.github/workflows/backend.yml +++ b/.github/workflows/backend.yml @@ -955,6 +955,18 @@ jobs: backend: "exllama2" dockerfile: "./backend/Dockerfile.python" context: "./backend" + - build-type: 'cublas' + cuda-major-version: "12" + cuda-minor-version: "0" + platforms: 'linux/arm64' + skip-drivers: 'true' + tag-latest: 'auto' + tag-suffix: 
'-nvidia-l4t-arm64-chatterbox' + base-image: "nvcr.io/nvidia/l4t-jetpack:r36.4.0" + runs-on: 'ubuntu-24.04-arm' + backend: "chatterbox" + dockerfile: "./backend/Dockerfile.python" + context: "./backend" # runs out of space on the runner # - build-type: 'hipblas' # cuda-major-version: "" diff --git a/backend/index.yaml b/backend/index.yaml index e078391d65c1..c55df26368ee 100644 --- a/backend/index.yaml +++ b/backend/index.yaml @@ -353,6 +353,7 @@ nvidia: "cuda12-chatterbox" metal: "metal-chatterbox" default: "cpu-chatterbox" + nvidia-l4t: "nvidia-l4t-arm64-chatterbox" - &piper name: "piper" uri: "quay.io/go-skynet/local-ai-backends:latest-piper" @@ -1239,6 +1240,7 @@ nvidia: "cuda12-chatterbox-development" metal: "metal-chatterbox-development" default: "cpu-chatterbox-development" + nvidia-l4t: "nvidia-l4t-arm64-chatterbox" - !!merge <<: *chatterbox name: "cpu-chatterbox" uri: "quay.io/go-skynet/local-ai-backends:latest-cpu-chatterbox" @@ -1249,6 +1251,16 @@ uri: "quay.io/go-skynet/local-ai-backends:master-cpu-chatterbox" mirrors: - localai/localai-backends:master-cpu-chatterbox +- !!merge <<: *chatterbox + name: "nvidia-l4t-arm64-chatterbox" + uri: "quay.io/go-skynet/local-ai-backends:latest-gpu-nvidia-l4t-arm64-chatterbox" + mirrors: + - localai/localai-backends:latest-gpu-nvidia-l4t-arm64-chatterbox +- !!merge <<: *chatterbox + name: "nvidia-l4t-arm64-chatterbox-development" + uri: "quay.io/go-skynet/local-ai-backends:master-gpu-nvidia-l4t-arm64-chatterbox" + mirrors: + - localai/localai-backends:master-gpu-nvidia-l4t-arm64-chatterbox - !!merge <<: *chatterbox name: "metal-chatterbox" uri: "quay.io/go-skynet/local-ai-backends:latest-metal-darwin-arm64-chatterbox" diff --git a/backend/python/chatterbox/install.sh b/backend/python/chatterbox/install.sh index 32befa8e6c03..8f607485baac 100755 --- a/backend/python/chatterbox/install.sh +++ b/backend/python/chatterbox/install.sh @@ -15,5 +15,6 @@ fi if [ "x${BUILD_PROFILE}" == "xintel" ]; then 
EXTRA_PIP_INSTALL_FLAGS+=" --upgrade --index-strategy=unsafe-first-match" fi +EXTRA_PIP_INSTALL_FLAGS+=" --no-build-isolation" installRequirements diff --git a/backend/python/chatterbox/requirements-l4t.txt b/backend/python/chatterbox/requirements-l4t.txt new file mode 100644 index 000000000000..6f90be031d2f --- /dev/null +++ b/backend/python/chatterbox/requirements-l4t.txt @@ -0,0 +1,6 @@ +--extra-index-url https://pypi.jetson-ai-lab.io/jp6/cu126/ +torch +torchaudio +transformers +chatterbox-tts@git+https://git@github.com/mudler/chatterbox.git@faster +accelerate From 16a39ea321035de54c85399eebf1a1125fdf9878 Mon Sep 17 00:00:00 2001 From: Ettore Di Giacinto Date: Wed, 24 Sep 2025 18:05:26 +0200 Subject: [PATCH 3/4] Fixups Signed-off-by: Ettore Di Giacinto --- Makefile | 3 +++ backend/python/chatterbox/backend.py | 17 +++++++++++++---- 2 files changed, 16 insertions(+), 4 deletions(-) diff --git a/Makefile b/Makefile index 24502b57be9e..5d32926aed18 100644 --- a/Makefile +++ b/Makefile @@ -429,6 +429,9 @@ docker-build-kitten-tts: docker-save-kitten-tts: backend-images docker save local-ai-backend:kitten-tts -o backend-images/kitten-tts.tar +docker-save-chatterbox: backend-images + docker save local-ai-backend:chatterbox -o backend-images/chatterbox.tar + docker-build-kokoro: docker build --build-arg BUILD_TYPE=$(BUILD_TYPE) --build-arg BASE_IMAGE=$(BASE_IMAGE) -t local-ai-backend:kokoro -f backend/Dockerfile.python --build-arg BACKEND=kokoro ./backend diff --git a/backend/python/chatterbox/backend.py b/backend/python/chatterbox/backend.py index 40dac9bd31d6..4cc45b7c75b9 100644 --- a/backend/python/chatterbox/backend.py +++ b/backend/python/chatterbox/backend.py @@ -18,11 +18,19 @@ import grpc def is_float(s): + """Check if a string can be converted to float.""" try: float(s) return True except ValueError: return False +def is_int(s): + """Check if a string can be converted to int.""" + try: + int(s) + return True + except ValueError: + return False 
_ONE_DAY_IN_SECONDS = 60 * 60 * 24 @@ -68,10 +76,11 @@ def LoadModel(self, request, context): key, value = opt.split(":") # if value is a number, convert it to the appropriate type if is_float(value): - if value.is_integer(): - value = int(value) - else: - value = float(value) + value = float(value) + elif is_int(value): + value = int(value) + elif value.lower() in ["true", "false"]: + value = value.lower() == "true" self.options[key] = value self.AudioPath = None From 2610e07150585436b5adbdd70d9ae90fc46d49a4 Mon Sep 17 00:00:00 2001 From: Ettore Di Giacinto Date: Wed, 24 Sep 2025 18:05:46 +0200 Subject: [PATCH 4/4] fix: switch to fork Until https://github.com/resemble-ai/chatterbox/pull/295 is merged Signed-off-by: Ettore Di Giacinto --- backend/python/chatterbox/requirements-cpu.txt | 10 ++++++---- backend/python/chatterbox/requirements-cublas11.txt | 3 ++- backend/python/chatterbox/requirements-cublas12.txt | 9 +++++---- backend/python/chatterbox/requirements-hipblas.txt | 5 +++-- backend/python/chatterbox/requirements-intel.txt | 5 +++-- 5 files changed, 19 insertions(+), 13 deletions(-) diff --git a/backend/python/chatterbox/requirements-cpu.txt b/backend/python/chatterbox/requirements-cpu.txt index 4d9cf55cbf60..625d5a50958e 100644 --- a/backend/python/chatterbox/requirements-cpu.txt +++ b/backend/python/chatterbox/requirements-cpu.txt @@ -1,6 +1,8 @@ --extra-index-url https://download.pytorch.org/whl/cpu accelerate -torch==2.6.0 -torchaudio==2.6.0 -transformers==4.46.3 -chatterbox-tts==0.1.2 \ No newline at end of file +torch +torchaudio +transformers +# https://github.com/mudler/LocalAI/pull/6240#issuecomment-3329518289 +chatterbox-tts@git+https://git@github.com/mudler/chatterbox.git@faster +#chatterbox-tts==0.1.4 \ No newline at end of file diff --git a/backend/python/chatterbox/requirements-cublas11.txt b/backend/python/chatterbox/requirements-cublas11.txt index 1d5f08e2de95..6dbeb19ec862 100644 --- a/backend/python/chatterbox/requirements-cublas11.txt 
+++ b/backend/python/chatterbox/requirements-cublas11.txt @@ -2,5 +2,6 @@ torch==2.6.0+cu118 torchaudio==2.6.0+cu118 transformers==4.46.3 -chatterbox-tts==0.1.2 +# https://github.com/mudler/LocalAI/pull/6240#issuecomment-3329518289 +chatterbox-tts@git+https://git@github.com/mudler/chatterbox.git@faster accelerate \ No newline at end of file diff --git a/backend/python/chatterbox/requirements-cublas12.txt b/backend/python/chatterbox/requirements-cublas12.txt index 3e97fda28c2c..84b9b6f80f89 100644 --- a/backend/python/chatterbox/requirements-cublas12.txt +++ b/backend/python/chatterbox/requirements-cublas12.txt @@ -1,5 +1,6 @@ -torch==2.6.0 -torchaudio==2.6.0 -transformers==4.46.3 -chatterbox-tts==0.1.2 +torch +torchaudio +transformers +# https://github.com/mudler/LocalAI/pull/6240#issuecomment-3329518289 +chatterbox-tts@git+https://git@github.com/mudler/chatterbox.git@faster accelerate diff --git a/backend/python/chatterbox/requirements-hipblas.txt b/backend/python/chatterbox/requirements-hipblas.txt index 9086928d7d06..458ad44f47af 100644 --- a/backend/python/chatterbox/requirements-hipblas.txt +++ b/backend/python/chatterbox/requirements-hipblas.txt @@ -1,6 +1,7 @@ --extra-index-url https://download.pytorch.org/whl/rocm6.0 torch==2.6.0+rocm6.1 torchaudio==2.6.0+rocm6.1 -transformers==4.46.3 -chatterbox-tts==0.1.2 +transformers +# https://github.com/mudler/LocalAI/pull/6240#issuecomment-3329518289 +chatterbox-tts@git+https://git@github.com/mudler/chatterbox.git@faster accelerate diff --git a/backend/python/chatterbox/requirements-intel.txt b/backend/python/chatterbox/requirements-intel.txt index d4cb498482bf..b011a20c3d96 100644 --- a/backend/python/chatterbox/requirements-intel.txt +++ b/backend/python/chatterbox/requirements-intel.txt @@ -2,8 +2,9 @@ intel-extension-for-pytorch==2.3.110+xpu torch==2.3.1+cxx11.abi torchaudio==2.3.1+cxx11.abi -transformers==4.46.3 -chatterbox-tts==0.1.2 +transformers +# 
https://github.com/mudler/LocalAI/pull/6240#issuecomment-3329518289 +chatterbox-tts@git+https://git@github.com/mudler/chatterbox.git@faster accelerate oneccl_bind_pt==2.3.100+xpu optimum[openvino]