Skip to content

Commit

Permalink
TTS API improvements (#2308)
Browse files Browse the repository at this point in the history
* update doc on COQUI_LANGUAGE env variable

Signed-off-by: blob42 <contact@blob42.xyz>

* return errors from tts gRPC backend

Signed-off-by: blob42 <contact@blob42.xyz>

* handle speaker_id and language in coqui TTS backend

Signed-off-by: blob42 <contact@blob42.xyz>

* TTS endpoint: add optional language parameter

Signed-off-by: blob42 <contact@blob42.xyz>

* tts fix: empty language string breaks non-multilingual models

Signed-off-by: blob42 <contact@blob42.xyz>

* allow tts param definition in config file

- consolidate TTS options under `tts` config entry

Signed-off-by: blob42 <contact@blob42.xyz>

* tts: update doc

Signed-off-by: blob42 <contact@blob42.xyz>

---------

Signed-off-by: blob42 <contact@blob42.xyz>
Co-authored-by: Ettore Di Giacinto <mudler@users.noreply.github.com>
  • Loading branch information
blob42 and mudler committed Jun 1, 2024
1 parent 95c65d6 commit b99182c
Show file tree
Hide file tree
Showing 10 changed files with 166 additions and 78 deletions.
2 changes: 1 addition & 1 deletion Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -447,7 +447,7 @@ protogen-clean: protogen-go-clean protogen-python-clean
.PHONY: protogen-go
protogen-go:
mkdir -p pkg/grpc/proto
protoc -Ibackend/ --go_out=pkg/grpc/proto/ --go_opt=paths=source_relative --go-grpc_out=pkg/grpc/proto/ --go-grpc_opt=paths=source_relative \
protoc --experimental_allow_proto3_optional -Ibackend/ --go_out=pkg/grpc/proto/ --go_opt=paths=source_relative --go-grpc_out=pkg/grpc/proto/ --go-grpc_opt=paths=source_relative \
backend/backend.proto

.PHONY: protogen-go-clean
Expand Down
1 change: 1 addition & 0 deletions backend/backend.proto
Original file line number Diff line number Diff line change
Expand Up @@ -266,6 +266,7 @@ message TTSRequest {
string model = 2;
string dst = 3;
string voice = 4;
optional string language = 5;
}

message TokenizationResponse {
Expand Down
16 changes: 15 additions & 1 deletion backend/python/coqui/backend.py
Original file line number Diff line number Diff line change
Expand Up @@ -66,7 +66,21 @@ def LoadModel(self, request, context):

def TTS(self, request, context):
try:
self.tts.tts_to_file(text=request.text, speaker_wav=self.AudioPath, language=COQUI_LANGUAGE, file_path=request.dst)
# if model is multilangual add language from request or env as fallback
lang = request.language or COQUI_LANGUAGE
if lang == "":
lang = None
if self.tts.is_multi_lingual and lang is None:
return backend_pb2.Result(success=False, message=f"Model is multi-lingual, but no language was provided")

# if model is multi-speaker, use speaker_wav or the speaker_id from request.voice
if self.tts.is_multi_speaker and self.AudioPath is None and request.voice is None:
return backend_pb2.Result(success=False, message=f"Model is multi-speaker, but no speaker was provided")

if self.tts.is_multi_speaker and request.voice is not None:
self.tts.tts_to_file(text=request.text, speaker=request.voice, language=lang, file_path=request.dst)
else:
self.tts.tts_to_file(text=request.text, speaker_wav=self.AudioPath, language=lang, file_path=request.dst)
except Exception as err:
return backend_pb2.Result(success=False, message=f"Unexpected {err=}, {type(err)=}")
return backend_pb2.Result(success=True)
Expand Down
17 changes: 16 additions & 1 deletion core/backend/tts.go
Original file line number Diff line number Diff line change
Expand Up @@ -29,7 +29,16 @@ func generateUniqueFileName(dir, baseName, ext string) string {
}
}

func ModelTTS(backend, text, modelFile, voice string, loader *model.ModelLoader, appConfig *config.ApplicationConfig, backendConfig config.BackendConfig) (string, *proto.Result, error) {
func ModelTTS(
backend,
text,
modelFile,
voice ,
language string,
loader *model.ModelLoader,
appConfig *config.ApplicationConfig,
backendConfig config.BackendConfig,
) (string, *proto.Result, error) {
bb := backend
if bb == "" {
bb = model.PiperBackend
Expand Down Expand Up @@ -83,7 +92,13 @@ func ModelTTS(backend, text, modelFile, voice string, loader *model.ModelLoader,
Model: modelPath,
Voice: voice,
Dst: filePath,
Language: &language,
})

// return RPC error if any
if !res.Success {
return "", nil, fmt.Errorf(res.Message)
}

return filePath, res, err
}
3 changes: 2 additions & 1 deletion core/cli/tts.go
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,7 @@ type TTSCMD struct {
Backend string `short:"b" default:"piper" help:"Backend to run the TTS model"`
Model string `short:"m" required:"" help:"Model name to run the TTS"`
Voice string `short:"v" help:"Voice name to run the TTS"`
Language string `short:"l" help:"Language to use with the TTS"`
OutputFile string `short:"o" type:"path" help:"The path to write the output wav file"`
ModelsPath string `env:"LOCALAI_MODELS_PATH,MODELS_PATH" type:"path" default:"${basepath}/models" help:"Path containing models used for inferencing" group:"storage"`
BackendAssetsPath string `env:"LOCALAI_BACKEND_ASSETS_PATH,BACKEND_ASSETS_PATH" type:"path" default:"/tmp/localai/backend_data" help:"Path used to extract libraries that are required by some of the backends in runtime" group:"storage"`
Expand Down Expand Up @@ -52,7 +53,7 @@ func (t *TTSCMD) Run(ctx *cliContext.Context) error {
options := config.BackendConfig{}
options.SetDefaults()

filePath, _, err := backend.ModelTTS(t.Backend, text, t.Model, t.Voice, ml, opts, options)
filePath, _, err := backend.ModelTTS(t.Backend, text, t.Model, t.Voice, t.Language, ml, opts, options)
if err != nil {
return err
}
Expand Down
13 changes: 11 additions & 2 deletions core/config/backend_config.go
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,15 @@ const (
RAND_SEED = -1
)

type TTSConfig struct {

// Voice wav path or id
Voice string `yaml:"voice"`

// Vall-e-x
VallE VallE `yaml:"vall-e"`
}

type BackendConfig struct {
schema.PredictionOptions `yaml:"parameters"`
Name string `yaml:"name"`
Expand Down Expand Up @@ -49,8 +58,8 @@ type BackendConfig struct {
// GRPC Options
GRPC GRPC `yaml:"grpc"`

// Vall-e-x
VallE VallE `yaml:"vall-e"`
// TTS specifics
TTSConfig `yaml:"tts"`

// CUDA
// Explicitly enable CUDA or not (some backends might need it)
Expand Down
2 changes: 1 addition & 1 deletion core/http/endpoints/elevenlabs/tts.go
Original file line number Diff line number Diff line change
Expand Up @@ -52,7 +52,7 @@ func TTSEndpoint(cl *config.BackendConfigLoader, ml *model.ModelLoader, appConfi
}
log.Debug().Msgf("Request for model: %s", modelFile)

filePath, _, err := backend.ModelTTS(cfg.Backend, input.Text, modelFile, voiceID, ml, appConfig, *cfg)
filePath, _, err := backend.ModelTTS(cfg.Backend, input.Text, modelFile, "", voiceID, ml, appConfig, *cfg)
if err != nil {
return err
}
Expand Down
22 changes: 17 additions & 5 deletions core/http/endpoints/localai/tts.go
Original file line number Diff line number Diff line change
Expand Up @@ -12,10 +12,13 @@ import (
)

// TTSEndpoint is the OpenAI Speech API endpoint https://platform.openai.com/docs/api-reference/audio/createSpeech
// @Summary Generates audio from the input text.
// @Param request body schema.TTSRequest true "query params"
// @Success 200 {string} binary "Response"
// @Router /v1/audio/speech [post]
// @Summary Generates audio from the input text.
// @Accept json
// @Produce audio/x-wav
// @Param request body schema.TTSRequest true "query params"
// @Success 200 {string} binary "generated audio/wav file"
// @Router /v1/audio/speech [post]
// @Router /tts [post]
func TTSEndpoint(cl *config.BackendConfigLoader, ml *model.ModelLoader, appConfig *config.ApplicationConfig) func(c *fiber.Ctx) error {
return func(c *fiber.Ctx) error {

Expand All @@ -40,6 +43,7 @@ func TTSEndpoint(cl *config.BackendConfigLoader, ml *model.ModelLoader, appConfi
)

if err != nil {
log.Err(err)
modelFile = input.Model
log.Warn().Msgf("Model not found in context: %s", input.Model)
} else {
Expand All @@ -51,7 +55,15 @@ func TTSEndpoint(cl *config.BackendConfigLoader, ml *model.ModelLoader, appConfi
cfg.Backend = input.Backend
}

filePath, _, err := backend.ModelTTS(cfg.Backend, input.Input, modelFile, input.Voice, ml, appConfig, *cfg)
if input.Language != "" {
cfg.Language = input.Language
}

if input.Voice != "" {
cfg.Voice = input.Voice
}

filePath, _, err := backend.ModelTTS(cfg.Backend, input.Input, modelFile, cfg.Voice, cfg.Language, ml, appConfig, *cfg)
if err != nil {
return err
}
Expand Down
120 changes: 61 additions & 59 deletions core/schema/localai.go
Original file line number Diff line number Diff line change
@@ -1,59 +1,61 @@
package schema

import (
gopsutil "github.com/shirou/gopsutil/v3/process"
)

type BackendMonitorRequest struct {
Model string `json:"model" yaml:"model"`
}

type BackendMonitorResponse struct {
MemoryInfo *gopsutil.MemoryInfoStat
MemoryPercent float32
CPUPercent float64
}

type TTSRequest struct {
Model string `json:"model" yaml:"model"`
Input string `json:"input" yaml:"input"`
Voice string `json:"voice" yaml:"voice"`
Backend string `json:"backend" yaml:"backend"`
}

type StoresSet struct {
Store string `json:"store,omitempty" yaml:"store,omitempty"`

Keys [][]float32 `json:"keys" yaml:"keys"`
Values []string `json:"values" yaml:"values"`
}

type StoresDelete struct {
Store string `json:"store,omitempty" yaml:"store,omitempty"`

Keys [][]float32 `json:"keys"`
}

type StoresGet struct {
Store string `json:"store,omitempty" yaml:"store,omitempty"`

Keys [][]float32 `json:"keys" yaml:"keys"`
}

type StoresGetResponse struct {
Keys [][]float32 `json:"keys" yaml:"keys"`
Values []string `json:"values" yaml:"values"`
}

type StoresFind struct {
Store string `json:"store,omitempty" yaml:"store,omitempty"`

Key []float32 `json:"key" yaml:"key"`
Topk int `json:"topk" yaml:"topk"`
}

type StoresFindResponse struct {
Keys [][]float32 `json:"keys" yaml:"keys"`
Values []string `json:"values" yaml:"values"`
Similarities []float32 `json:"similarities" yaml:"similarities"`
}
package schema

import (
gopsutil "github.com/shirou/gopsutil/v3/process"
)

type BackendMonitorRequest struct {
Model string `json:"model" yaml:"model"`
}

type BackendMonitorResponse struct {
MemoryInfo *gopsutil.MemoryInfoStat
MemoryPercent float32
CPUPercent float64
}

// @Description TTS request body
type TTSRequest struct {
Model string `json:"model" yaml:"model"` // model name or full path
Input string `json:"input" yaml:"input"` // text input
Voice string `json:"voice" yaml:"voice"` // voice audio file or speaker id
Backend string `json:"backend" yaml:"backend"`
Language string `json:"language,omitempty" yaml:"language,omitempty"` // (optional) language to use with TTS model
}

type StoresSet struct {
Store string `json:"store,omitempty" yaml:"store,omitempty"`

Keys [][]float32 `json:"keys" yaml:"keys"`
Values []string `json:"values" yaml:"values"`
}

type StoresDelete struct {
Store string `json:"store,omitempty" yaml:"store,omitempty"`

Keys [][]float32 `json:"keys"`
}

type StoresGet struct {
Store string `json:"store,omitempty" yaml:"store,omitempty"`

Keys [][]float32 `json:"keys" yaml:"keys"`
}

type StoresGetResponse struct {
Keys [][]float32 `json:"keys" yaml:"keys"`
Values []string `json:"values" yaml:"values"`
}

type StoresFind struct {
Store string `json:"store,omitempty" yaml:"store,omitempty"`

Key []float32 `json:"key" yaml:"key"`
Topk int `json:"topk" yaml:"topk"`
}

type StoresFindResponse struct {
Keys [][]float32 `json:"keys" yaml:"keys"`
Values []string `json:"values" yaml:"values"`
Similarities []float32 `json:"similarities" yaml:"similarities"`
}
48 changes: 41 additions & 7 deletions docs/content/docs/features/text-to-audio.md
Original file line number Diff line number Diff line change
Expand Up @@ -46,6 +46,10 @@ Coqui works without any configuration, to test it, you can run the following cur
}'
```

You can use the `COQUI_LANGUAGE` environment variable to set the language used by the coqui backend.

You can also use config files to configure tts models (see section below on how to use config files).

### Bark

[Bark](https://github.com/suno-ai/bark) allows to generate audio from text prompts.
Expand Down Expand Up @@ -148,11 +152,12 @@ name: cloned-voice
backend: vall-e-x
parameters:
model: "cloned-voice"
vall-e:
# The path to the audio file to be cloned
# relative to the models directory
# Max 15s
audio_path: "audio-sample.wav"
tts:
vall-e:
# The path to the audio file to be cloned
# relative to the models directory
# Max 15s
audio_path: "audio-sample.wav"
```
Then you can specify the model name in the requests:
Expand All @@ -164,6 +169,35 @@ curl http://localhost:8080/tts -H "Content-Type: application/json" -d '{
}' | aplay
```

## Parler-tts
### Parler-tts

[`parler-tts`](https://github.com/huggingface/parler-tts) can be installed and configured directly from the model gallery.


## Using config files

You can also use a `config-file` to specify TTS models and their parameters.

In the following example we define a custom config to load the `xtts_v2` model, and specify a voice and language.

```yaml

name: xtts_v2
backend: coqui
parameters:
language: fr
model: tts_models/multilingual/multi-dataset/xtts_v2

tts:
voice: Ana Florence
```
`parler-tts`. It is possible to install and configure the model directly from the gallery. https://github.com/huggingface/parler-tts
With this config, you can now use the following curl command to generate a text-to-speech audio file:
```bash
curl -L http://localhost:8080/tts \
-H "Content-Type: application/json" \
-d '{
"model": "xtts_v2",
"input": "Bonjour, je suis Ana Florence. Comment puis-je vous aider?"
}' | aplay
```

0 comments on commit b99182c

Please sign in to comment.