Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

TTS API improvements #2086

Closed
wants to merge 9 commits into from
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -428,7 +428,7 @@ protogen-clean: protogen-go-clean protogen-python-clean
.PHONY: protogen-go
protogen-go:
mkdir -p pkg/grpc/proto
protoc -Ibackend/ --go_out=pkg/grpc/proto/ --go_opt=paths=source_relative --go-grpc_out=pkg/grpc/proto/ --go-grpc_opt=paths=source_relative \
protoc --experimental_allow_proto3_optional -Ibackend/ --go_out=pkg/grpc/proto/ --go_opt=paths=source_relative --go-grpc_out=pkg/grpc/proto/ --go-grpc_opt=paths=source_relative \
backend/backend.proto

.PHONY: protogen-go-clean
Expand Down
1 change: 1 addition & 0 deletions backend/backend.proto
Original file line number Diff line number Diff line change
Expand Up @@ -263,6 +263,7 @@ message TTSRequest {
string model = 2;
string dst = 3;
string voice = 4;
optional string language = 5;
}

message TokenizationResponse {
Expand Down
16 changes: 15 additions & 1 deletion backend/python/coqui/coqui_server.py
Original file line number Diff line number Diff line change
Expand Up @@ -66,7 +66,21 @@ def LoadModel(self, request, context):

def TTS(self, request, context):
    """Synthesize request.text to a wav file at request.dst.

    Resolves the language (request field, then COQUI_LANGUAGE env fallback)
    and the speaker (request.voice speaker id, then self.AudioPath wav),
    validating that multi-lingual / multi-speaker models get what they need.

    Returns:
        backend_pb2.Result with success=True, or success=False plus a message.
    """
    try:
        # If the model is multilingual, take the language from the request,
        # falling back to the COQUI_LANGUAGE environment variable.
        lang = request.language or COQUI_LANGUAGE
        if lang == "":
            lang = None
        if self.tts.is_multi_lingual and lang is None:
            return backend_pb2.Result(success=False, message="Model is multi-lingual, but no language was provided")

        # proto3 string fields are never None, only empty ("") — so test for
        # emptiness, not identity with None, when checking the speaker id.
        if self.tts.is_multi_speaker and self.AudioPath is None and not request.voice:
            return backend_pb2.Result(success=False, message="Model is multi-speaker, but no speaker was provided")

        # Prefer an explicit speaker id from the request; otherwise fall back
        # to the reference wav (speaker_wav) configured at load time.
        if self.tts.is_multi_speaker and request.voice:
            self.tts.tts_to_file(text=request.text, speaker=request.voice, language=lang, file_path=request.dst)
        else:
            self.tts.tts_to_file(text=request.text, speaker_wav=self.AudioPath, language=lang, file_path=request.dst)
    except Exception as err:
        return backend_pb2.Result(success=False, message=f"Unexpected {err=}, {type(err)=}")
    return backend_pb2.Result(success=True)
Expand Down
17 changes: 16 additions & 1 deletion core/backend/tts.go
Original file line number Diff line number Diff line change
Expand Up @@ -29,7 +29,16 @@ func generateUniqueFileName(dir, baseName, ext string) string {
}
}

func ModelTTS(backend, text, modelFile, voice string, loader *model.ModelLoader, appConfig *config.ApplicationConfig, backendConfig config.BackendConfig) (string, *proto.Result, error) {
func ModelTTS(
backend,
text,
modelFile,
voice ,
language string,
loader *model.ModelLoader,
appConfig *config.ApplicationConfig,
backendConfig config.BackendConfig,
) (string, *proto.Result, error) {
bb := backend
if bb == "" {
bb = model.PiperBackend
Expand Down Expand Up @@ -83,7 +92,13 @@ func ModelTTS(backend, text, modelFile, voice string, loader *model.ModelLoader,
Model: modelPath,
Voice: voice,
Dst: filePath,
Language: &language,
})

// return RPC error if any
if !res.Success {
return "", nil, fmt.Errorf(res.Message)
}

return filePath, res, err
}
3 changes: 2 additions & 1 deletion core/cli/tts.go
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,7 @@ type TTSCMD struct {
Backend string `short:"b" default:"piper" help:"Backend to run the TTS model"`
Model string `short:"m" required:"" help:"Model name to run the TTS"`
Voice string `short:"v" help:"Voice name to run the TTS"`
Language string `short:"l" help:"Language to use with the TTS"`
OutputFile string `short:"o" type:"path" help:"The path to write the output wav file"`
ModelsPath string `env:"LOCALAI_MODELS_PATH,MODELS_PATH" type:"path" default:"${basepath}/models" help:"Path containing models used for inferencing" group:"storage"`
BackendAssetsPath string `env:"LOCALAI_BACKEND_ASSETS_PATH,BACKEND_ASSETS_PATH" type:"path" default:"/tmp/localai/backend_data" help:"Path used to extract libraries that are required by some of the backends in runtime" group:"storage"`
Expand Down Expand Up @@ -45,7 +46,7 @@ func (t *TTSCMD) Run(ctx *Context) error {
options := config.BackendConfig{}
options.SetDefaults()

filePath, _, err := backend.ModelTTS(t.Backend, text, t.Model, t.Voice, ml, opts, options)
filePath, _, err := backend.ModelTTS(t.Backend, text, t.Model, t.Voice, t.Language, ml, opts, options)
if err != nil {
return err
}
Expand Down
15 changes: 13 additions & 2 deletions core/config/backend_config.go
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,15 @@ const (
RAND_SEED = -1
)

// TTSConfig groups text-to-speech specific model settings.
// It is embedded in BackendConfig under the `tts` yaml key.
type TTSConfig struct {

	// Voice wav path or id
	Voice string `yaml:"voice"`

	// Vall-e-x
	VallE VallE `yaml:"vall-e"`
}

type BackendConfig struct {
schema.PredictionOptions `yaml:"parameters"`
Name string `yaml:"name"`
Expand Down Expand Up @@ -56,8 +65,8 @@ type BackendConfig struct {
// GRPC Options
GRPC GRPC `yaml:"grpc"`

// Vall-e-x
VallE VallE `yaml:"vall-e"`
// TTS specifics
TTSConfig `yaml:"tts"`

// CUDA
// Explicitly enable CUDA or not (some backends might need it)
Expand Down Expand Up @@ -373,6 +382,7 @@ func (cl *BackendConfigLoader) LoadBackendConfigFileByName(modelName, modelPath
if err := cl.LoadBackendConfig(
modelConfig, opts...,
); err != nil {
log.Error().Msgf("failed loading model config (%s) %s", modelConfig, err.Error())
return nil, fmt.Errorf("failed loading model config (%s) %s", modelConfig, err.Error())
}
cfgExisting, exists = cl.GetBackendConfig(modelName)
Expand Down Expand Up @@ -445,6 +455,7 @@ func (cl *BackendConfigLoader) LoadBackendConfig(file string, opts ...ConfigLoad
defer cl.Unlock()
c, err := ReadBackendConfig(file, opts...)
if err != nil {
log.Error().Msgf("backend config reading error: %s", err.Error())
return fmt.Errorf("cannot read config file: %w", err)
}

Expand Down
2 changes: 1 addition & 1 deletion core/http/endpoints/elevenlabs/tts.go
Original file line number Diff line number Diff line change
Expand Up @@ -52,7 +52,7 @@ func TTSEndpoint(cl *config.BackendConfigLoader, ml *model.ModelLoader, appConfi
}
log.Debug().Msgf("Request for model: %s", modelFile)

filePath, _, err := backend.ModelTTS(cfg.Backend, input.Text, modelFile, voiceID, ml, appConfig, *cfg)
filePath, _, err := backend.ModelTTS(cfg.Backend, input.Text, modelFile, "", voiceID, ml, appConfig, *cfg)
if err != nil {
return err
}
Expand Down
22 changes: 17 additions & 5 deletions core/http/endpoints/localai/tts.go
Original file line number Diff line number Diff line change
Expand Up @@ -12,10 +12,13 @@ import (
)

// TTSEndpoint is the OpenAI Speech API endpoint https://platform.openai.com/docs/api-reference/audio/createSpeech
// @Summary Generates audio from the input text.
// @Param request body schema.TTSRequest true "query params"
// @Success 200 {string} binary "Response"
// @Router /v1/audio/speech [post]
// @Summary Generates audio from the input text.
// @Accept json
// @Produce audio/x-wav
// @Param request body schema.TTSRequest true "query params"
// @Success 200 {string} binary "generated audio/wav file"
// @Router /v1/audio/speech [post]
// @Router /tts [post]
func TTSEndpoint(cl *config.BackendConfigLoader, ml *model.ModelLoader, appConfig *config.ApplicationConfig) func(c *fiber.Ctx) error {
return func(c *fiber.Ctx) error {

Expand All @@ -40,6 +43,7 @@ func TTSEndpoint(cl *config.BackendConfigLoader, ml *model.ModelLoader, appConfi
)

if err != nil {
log.Err(err)
modelFile = input.Model
log.Warn().Msgf("Model not found in context: %s", input.Model)
} else {
Expand All @@ -51,7 +55,15 @@ func TTSEndpoint(cl *config.BackendConfigLoader, ml *model.ModelLoader, appConfi
cfg.Backend = input.Backend
}

filePath, _, err := backend.ModelTTS(cfg.Backend, input.Input, modelFile, input.Voice, ml, appConfig, *cfg)
if input.Language != "" {
cfg.Language = input.Language
}

if input.Voice != "" {
cfg.Voice = input.Voice
}

filePath, _, err := backend.ModelTTS(cfg.Backend, input.Input, modelFile, cfg.Voice, cfg.Language, ml, appConfig, *cfg)
if err != nil {
return err
}
Expand Down
120 changes: 61 additions & 59 deletions core/schema/localai.go
Original file line number Diff line number Diff line change
@@ -1,59 +1,61 @@
package schema

import (
gopsutil "github.com/shirou/gopsutil/v3/process"
)

type BackendMonitorRequest struct {
Model string `json:"model" yaml:"model"`
}

type BackendMonitorResponse struct {
MemoryInfo *gopsutil.MemoryInfoStat
MemoryPercent float32
CPUPercent float64
}

type TTSRequest struct {
Model string `json:"model" yaml:"model"`
Input string `json:"input" yaml:"input"`
Voice string `json:"voice" yaml:"voice"`
Backend string `json:"backend" yaml:"backend"`
}

type StoresSet struct {
Store string `json:"store,omitempty" yaml:"store,omitempty"`

Keys [][]float32 `json:"keys" yaml:"keys"`
Values []string `json:"values" yaml:"values"`
}

type StoresDelete struct {
Store string `json:"store,omitempty" yaml:"store,omitempty"`

Keys [][]float32 `json:"keys"`
}

type StoresGet struct {
Store string `json:"store,omitempty" yaml:"store,omitempty"`

Keys [][]float32 `json:"keys" yaml:"keys"`
}

type StoresGetResponse struct {
Keys [][]float32 `json:"keys" yaml:"keys"`
Values []string `json:"values" yaml:"values"`
}

type StoresFind struct {
Store string `json:"store,omitempty" yaml:"store,omitempty"`

Key []float32 `json:"key" yaml:"key"`
Topk int `json:"topk" yaml:"topk"`
}

type StoresFindResponse struct {
Keys [][]float32 `json:"keys" yaml:"keys"`
Values []string `json:"values" yaml:"values"`
Similarities []float32 `json:"similarities" yaml:"similarities"`
}
package schema

import (
gopsutil "github.com/shirou/gopsutil/v3/process"
)

// BackendMonitorRequest identifies the model whose backend process
// should be monitored.
type BackendMonitorRequest struct {
	Model string `json:"model" yaml:"model"`
}

// BackendMonitorResponse reports resource usage of a backend process,
// sampled via gopsutil.
type BackendMonitorResponse struct {
	MemoryInfo    *gopsutil.MemoryInfoStat // detailed per-process memory stats
	MemoryPercent float32                  // process memory usage (percent, per gopsutil)
	CPUPercent    float64                  // process CPU usage (percent, per gopsutil)
}

// @Description TTS request body
type TTSRequest struct {
Model string `json:"model" yaml:"model"` // model name or full path
Input string `json:"input" yaml:"input"` // text input
Voice string `json:"voice" yaml:"voice"` // voice audio file or speaker id
Backend string `json:"backend" yaml:"backend"`
Language string `json:"language,omitempty" yaml:"language,omitempty"` // (optional) language to use with TTS model
blob42 marked this conversation as resolved.
Show resolved Hide resolved
}

// StoresSet is the request body for inserting key/value pairs into a store.
// Keys and Values are parallel slices.
type StoresSet struct {
	Store string `json:"store,omitempty" yaml:"store,omitempty"` // target store name (optional)

	Keys   [][]float32 `json:"keys" yaml:"keys"`     // vector keys
	Values []string    `json:"values" yaml:"values"` // value for each key, same order
}

// StoresDelete is the request body for removing entries from a store by key.
type StoresDelete struct {
	Store string `json:"store,omitempty" yaml:"store,omitempty"` // target store name (optional)

	Keys [][]float32 `json:"keys"` // vector keys to delete
}

// StoresGet is the request body for fetching store entries by key.
type StoresGet struct {
	Store string `json:"store,omitempty" yaml:"store,omitempty"` // target store name (optional)

	Keys [][]float32 `json:"keys" yaml:"keys"` // vector keys to look up
}

// StoresGetResponse returns the entries found for a StoresGet request.
// Keys and Values are parallel slices.
type StoresGetResponse struct {
	Keys   [][]float32 `json:"keys" yaml:"keys"`
	Values []string    `json:"values" yaml:"values"`
}

// StoresFind is the request body for a similarity search against a store.
type StoresFind struct {
	Store string `json:"store,omitempty" yaml:"store,omitempty"` // target store name (optional)

	Key  []float32 `json:"key" yaml:"key"`   // query vector
	Topk int       `json:"topk" yaml:"topk"` // number of nearest entries to return
}

// StoresFindResponse returns the nearest entries for a StoresFind request.
// Keys, Values and Similarities are parallel slices.
type StoresFindResponse struct {
	Keys         [][]float32 `json:"keys" yaml:"keys"`
	Values       []string    `json:"values" yaml:"values"`
	Similarities []float32   `json:"similarities" yaml:"similarities"` // similarity score per returned key
}
43 changes: 38 additions & 5 deletions docs/content/docs/features/text-to-audio.md
Original file line number Diff line number Diff line change
Expand Up @@ -46,6 +46,10 @@ Coqui works without any configuration, to test it, you can run the following cur
}'
```

You can use the environment variable `COQUI_LANGUAGE` to set the default language used by the coqui backend.

You can also configure TTS models via config files — see the "Using config files" section below.

### Bark

[Bark](https://github.com/suno-ai/bark) allows to generate audio from text prompts.
Expand Down Expand Up @@ -148,11 +152,12 @@ name: cloned-voice
backend: vall-e-x
parameters:
model: "cloned-voice"
vall-e:
# The path to the audio file to be cloned
# relative to the models directory
# Max 15s
audio_path: "audio-sample.wav"
tts:
vall-e:
# The path to the audio file to be cloned
# relative to the models directory
# Max 15s
audio_path: "audio-sample.wav"
```

Then you can specify the model name in the requests:
Expand All @@ -163,3 +168,31 @@ curl http://localhost:8080/tts -H "Content-Type: application/json" -d '{
"input":"Hello!"
}' | aplay
```

### Using config files

You can also use a `config-file` to specify TTS models and their parameters.

In the following example we define a custom config to load the `xtts_v2` model, and specify a voice and language.

```yaml

name: xtts_v2
backend: coqui
parameters:
language: fr
model: tts_models/multilingual/multi-dataset/xtts_v2

tts:
voice: Ana Florence
```

With this config, you can now use the following curl command to generate a text-to-speech audio file:
```bash
curl -L http://localhost:8080/tts \
-H "Content-Type: application/json" \
-d '{
"model": "xtts_v2",
"input": "Bonjour, je suis Ana Florence. Comment puis-je vous aider?"
}' | aplay
```