From 59b1b2f4333ed514ea3f12c59d0ab818626592bb Mon Sep 17 00:00:00 2001
From: Aleksandr Mokrov
Date: Thu, 13 Jun 2024 17:17:16 +0200
Subject: [PATCH] Parler TTS (#2114)

CVS-141596
---
 .ci/spellcheck/.pyspelling.wordlist.txt       |   2 +
 notebooks/parler-tts-text-to-speech/README.md |  31 +
 .../parler-tts-text-to-speech.ipynb           | 579 ++++++++++++++++++
 selector/src/shared/notebook-tags.js          |   2 +-
 4 files changed, 613 insertions(+), 1 deletion(-)
 create mode 100644 notebooks/parler-tts-text-to-speech/README.md
 create mode 100644 notebooks/parler-tts-text-to-speech/parler-tts-text-to-speech.ipynb

diff --git a/.ci/spellcheck/.pyspelling.wordlist.txt b/.ci/spellcheck/.pyspelling.wordlist.txt
index 9975e98456..0908f02971 100644
--- a/.ci/spellcheck/.pyspelling.wordlist.txt
+++ b/.ci/spellcheck/.pyspelling.wordlist.txt
@@ -401,6 +401,7 @@ LSTM
 LSTMs
 Luo
 LVLM
+Lyth
 macOS
 Magika
 Mahalanobis
@@ -543,6 +544,7 @@ panoptic
 parallelized
 parameterization
 parametrize
+Parler
 parsers
 perceptron
 Patil
diff --git a/notebooks/parler-tts-text-to-speech/README.md b/notebooks/parler-tts-text-to-speech/README.md
new file mode 100644
index 0000000000..40d187393a
--- /dev/null
+++ b/notebooks/parler-tts-text-to-speech/README.md
@@ -0,0 +1,31 @@
+# Text-to-speech (TTS) with Parler-TTS and OpenVINO™
+
+Parler-TTS is a lightweight text-to-speech (TTS) model that can generate high-quality, natural-sounding speech in the style of a given speaker (gender, pitch, speaking style, etc.). It is a reproduction of work from the paper [Natural language guidance of high-fidelity text-to-speech with synthetic annotations](https://www.text-description-to-speech.com/) by Dan Lyth and Simon King, from Stability AI and Edinburgh University respectively.
+
+![](https://images.squarespace-cdn.com/content/v1/657816dfbefe0533e8a69d9a/30c96e25-acc5-4019-acdd-648da6142c4c/architecture_v3.png?format=2500w)
+
+Text-to-speech models trained on large-scale datasets have demonstrated impressive in-context learning capabilities and naturalness. However, control of speaker identity and style in these models typically requires conditioning on reference speech recordings, limiting creative applications. Alternatively, natural language prompting of speaker identity and style has demonstrated promising results and provides an intuitive method of control. However, reliance on human-labeled descriptions prevents scaling to large datasets.
+
+This work bridges the gap between these two approaches. The authors propose a scalable method for labeling various aspects of speaker identity, style, and recording conditions. This method is then applied to a 45k-hour dataset, which is used to train a speech language model. Furthermore, the authors propose simple methods for increasing audio fidelity, significantly outperforming recent work despite relying entirely on found data.
+
+
+[GitHub repository](https://github.com/huggingface/parler-tts)
+
+[HuggingFace page](https://huggingface.co/parler-tts)
+
+
+## Notebook Contents
+
+This notebook demonstrates how to convert and run the Parler-TTS model using OpenVINO.
+
+The notebook contains the following steps:
+1. Load the original model and run inference.
+2. Convert the model to OpenVINO IR.
+3. Compile the models and run inference.
+4. Interactive inference.
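+
+These steps boil down to one small OpenVINO pattern: convert the PyTorch module, save the IR, compile it for a device, and run inference. Below is a minimal sketch of that pattern; the toy module, file name, and device are illustrative placeholders, not the notebook's actual Parler-TTS code:
+
+```python
+import numpy as np
+import openvino as ov
+import torch
+
+
+class Toy(torch.nn.Module):
+    """Tiny stand-in for a Parler-TTS submodel (placeholder for illustration)."""
+
+    def forward(self, x):
+        return x * 2
+
+
+ov_model = ov.convert_model(Toy(), example_input=torch.zeros(1, 3))  # trace PyTorch -> OpenVINO IR
+ov.save_model(ov_model, "toy_ir.xml")                                # writes toy_ir.xml (+ .bin weights)
+compiled = ov.Core().compile_model("toy_ir.xml", "AUTO")             # compile for a device ("CPU", "GPU", "AUTO")
+print(compiled(np.zeros((1, 3), dtype=np.float32))[0])               # run inference on the compiled model
+```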
+
+## Installation instructions
+
+This is a self-contained example that relies solely on its own code.
+We recommend running the notebook in a virtual environment. You only need a Jupyter server to start.
+For details, please refer to [Installation Guide](../../README.md).
diff --git a/notebooks/parler-tts-text-to-speech/parler-tts-text-to-speech.ipynb b/notebooks/parler-tts-text-to-speech/parler-tts-text-to-speech.ipynb
new file mode 100644
index 0000000000..1f43d6e495
--- /dev/null
+++ b/notebooks/parler-tts-text-to-speech/parler-tts-text-to-speech.ipynb
@@ -0,0 +1,579 @@
+{
+ "cells": [
+  {
+   "cell_type": "markdown",
+   "id": "7ff975e6-b25d-4366-adad-4eeda48a9f37",
+   "metadata": {},
+   "source": [
+    "# Text-to-speech (TTS) with Parler-TTS and OpenVINO\n",
+    "\n",
+    "Parler-TTS is a lightweight text-to-speech (TTS) model that can generate high-quality, natural-sounding speech in the style of a given speaker (gender, pitch, speaking style, etc.). It is a reproduction of work from the paper [Natural language guidance of high-fidelity text-to-speech with synthetic annotations](https://www.text-description-to-speech.com/) by Dan Lyth and Simon King, from Stability AI and Edinburgh University respectively.\n",
+    "\n",
+    "![](https://images.squarespace-cdn.com/content/v1/657816dfbefe0533e8a69d9a/30c96e25-acc5-4019-acdd-648da6142c4c/architecture_v3.png?format=2500w)\n",
+    "\n",
+    "Text-to-speech models trained on large-scale datasets have demonstrated impressive in-context learning capabilities and naturalness. However, control of speaker identity and style in these models typically requires conditioning on reference speech recordings, limiting creative applications. Alternatively, natural language prompting of speaker identity and style has demonstrated promising results and provides an intuitive method of control. However, reliance on human-labeled descriptions prevents scaling to large datasets.\n",
+    "\n",
+    "This work bridges the gap between these two approaches. The authors propose a scalable method for labeling various aspects of speaker identity, style, and recording conditions. This method is then applied to a 45k-hour dataset, which is used to train a speech language model. 
Furthermore, the authors propose simple methods for increasing audio fidelity, significantly outperforming recent work despite relying entirely on found data.\n", + "\n", + "\n", + "[GitHub repository](https://github.com/huggingface/parler-tts)\n", + "\n", + "[HuggingFace page](https://huggingface.co/parler-tts)\n", + "\n", + "#### Table of contents:\n", + "- [Prerequisites](#Prerequisites)\n", + "- [Load the original model and inference](#Load-the-original-model-and-inference)\n", + "- [Convert the model to OpenVINO IR](#Convert-the-model-to-OpenVINO-IR)\n", + "- [Compiling models and inference](#Compiling-models-and-inference)\n", + "- [Interactive inference](#Interactive-inference)" + ] + }, + { + "cell_type": "markdown", + "id": "27608f06-e493-436f-9319-e6a181b66ea4", + "metadata": {}, + "source": [ + "## Prerequisites\n", + "[back to top ⬆️](#Table-of-contents:)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "86c51122-d726-427b-bdd5-59af4d210d75", + "metadata": {}, + "outputs": [], + "source": [ + "%pip install --pre -Uq openvino --extra-index-url https://storage.openvinotoolkit.org/simple/wheels/nightly\n", + "%pip install -q git+https://github.com/huggingface/parler-tts.git \"gradio>=4.19\" transformers \"torch>=2.2\" --extra-index-url https://download.pytorch.org/whl/cpu" + ] + }, + { + "cell_type": "markdown", + "id": "592145b2-12b6-4606-9447-cf55048f0548", + "metadata": {}, + "source": [ + "## Load the original model and inference\n", + "[back to top ⬆️](#Table-of-contents:)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "a9b69168-5cf5-4d70-9f2b-00cfc5ab1cda", + "metadata": { + "scrolled": true + }, + "outputs": [], + "source": [ + "import torch\n", + "from parler_tts import ParlerTTSForConditionalGeneration\n", + "from transformers import AutoTokenizer\n", + "import soundfile as sf\n", + "\n", + "device = \"cpu\"\n", + "\n", + "repo_id = \"parler-tts/parler_tts_mini_v0.1\"\n", + "model = ParlerTTSForConditionalGeneration.from_pretrained(repo_id).to(device)\n", + "tokenizer = AutoTokenizer.from_pretrained(repo_id)\n", + "\n", + "prompt = \"Hey, how are you doing today?\"\n", + "description = \"A female speaker with a slightly low-pitched voice delivers her words quite expressively, in a very confined sounding environment with clear audio quality. She speaks very fast.\"\n", + "\n", + "input_ids = tokenizer(description, return_tensors=\"pt\").input_ids.to(device)\n", + "prompt_input_ids = tokenizer(prompt, return_tensors=\"pt\").input_ids.to(device)\n", + "\n", + "generation = model.generate(input_ids=input_ids, prompt_input_ids=prompt_input_ids)\n", + "audio_arr = generation.cpu().numpy().squeeze()\n", + "sf.write(\"parler_tts_out.wav\", audio_arr, model.config.sampling_rate)" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "id": "02b3562b-38cb-4657-861a-0cf9fa4c7b09", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "\n", + " \n", + " " + ], + "text/plain": [ + "" + ] + }, + "execution_count": 3, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "import IPython.display as ipd\n", + "\n", + "ipd.Audio(\"parler_tts_out.wav\")" + ] + }, + { + "cell_type": "markdown", + "id": "ba1f8bc5-406b-469c-90db-1f2e620f37e4", + "metadata": {}, + "source": [ + "## Convert the model to OpenVINO IR\n", + "[back to top ⬆️](#Table-of-contents:)\n", + "\n", + "Let's define the conversion function for PyTorch modules. 
We use `ov.convert_model` function to obtain OpenVINO Intermediate Representation object and `ov.save_model` function to save it as XML file." + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "id": "8e9cc041-3029-424e-9d2b-0e31132a73f6", + "metadata": {}, + "outputs": [], + "source": [ + "import openvino as ov\n", + "\n", + "\n", + "def convert(model: torch.nn.Module, xml_path: str, example_input):\n", + " xml_path = Path(xml_path)\n", + " if not xml_path.exists():\n", + " xml_path.parent.mkdir(parents=True, exist_ok=True)\n", + " with torch.no_grad():\n", + " converted_model = ov.convert_model(model, example_input=example_input)\n", + "\n", + " ov.save_model(converted_model, xml_path)\n", + "\n", + " # cleanup memory\n", + " torch._C._jit_clear_class_registry()\n", + " torch.jit._recursive.concrete_type_store = torch.jit._recursive.ConcreteTypeStore()\n", + " torch.jit._state._clear_class_state()" + ] + }, + { + "cell_type": "markdown", + "id": "d6c3ebe5-a310-43fe-bf12-c53487f0cefe", + "metadata": {}, + "source": [ + "In the pipeline two models are used: Text Encoder (`T5EncoderModel`) and Decoder (`ParlerTTSDecoder`). Lets convert them one by one." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "6f46be1b-2200-40e6-8e1d-fc3a5336be98", + "metadata": {}, + "outputs": [], + "source": [ + "from pathlib import Path\n", + "\n", + "\n", + "TEXT_ENCODER_OV_PATH = Path(\"models/text_encoder_ir.xml\")\n", + "\n", + "\n", + "example_input = {\n", + " \"input_ids\": torch.ones((1, 39), dtype=torch.int64),\n", + "}\n", + "\n", + "text_encoder_ov_model = convert(model.text_encoder, TEXT_ENCODER_OV_PATH, example_input)" + ] + }, + { + "cell_type": "markdown", + "id": "fb981bec-24b7-46ad-a7b2-13c1f3e45092", + "metadata": {}, + "source": [ + "The Decoder Model performs in generation pipeline and we can separate it into two stage. In the first stage the model generates `past_key_values` into output for the second stage. In the second stage the model produces tokens during several runs." 
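A rough, self-contained sketch of this two-stage use of `past_key_values` during generation (the functions below are toy stand-ins, not the exported Parler-TTS decoder; names and values are illustrative only):

```python
from typing import List, Tuple


def decoder_stage_1(token: int) -> Tuple[int, List[int]]:
    # First pass: no cache exists yet, so this call builds the initial past_key_values.
    return token + 1, [token]


def decoder_stage_2(token: int, cache: List[int]) -> Tuple[int, List[int]]:
    # Subsequent passes: reuse the cache and extend it by one step instead of recomputing everything.
    return token + 1, cache + [token]


token, cache = decoder_stage_1(0)
for _ in range(3):
    token, cache = decoder_stage_2(token, cache)
print(token, cache)  # the cache grows by one entry per generated token
```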
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "df163228-0ba4-467b-bbb0-c8bb9f697789", + "metadata": { + "scrolled": true + }, + "outputs": [], + "source": [ + "DECODER_STAGE_1_OV_PATH = Path(\"models/decoder_stage_1_ir.xml\")\n", + "\n", + "\n", + "class DecoderStage1Wrapper(torch.nn.Module):\n", + " def __init__(self, decoder):\n", + " super().__init__()\n", + " self.decoder = decoder\n", + "\n", + " def forward(self, input_ids=None, encoder_hidden_states=None, encoder_attention_mask=None, prompt_hidden_states=None):\n", + " return self.decoder(\n", + " input_ids=input_ids,\n", + " return_dict=False,\n", + " encoder_hidden_states=encoder_hidden_states,\n", + " encoder_attention_mask=encoder_attention_mask,\n", + " prompt_hidden_states=prompt_hidden_states,\n", + " )\n", + "\n", + "\n", + "example_input = {\n", + " \"input_ids\": torch.ones((9, 1), dtype=torch.int64),\n", + " \"encoder_hidden_states\": torch.ones((1, 39, 1024), dtype=torch.float32),\n", + " \"encoder_attention_mask\": torch.ones((1, 39), dtype=torch.int64),\n", + " \"prompt_hidden_states\": torch.ones((1, 9, 1024), dtype=torch.float32),\n", + "}\n", + "\n", + "decoder_1_ov_model = convert(DecoderStage1Wrapper(model.decoder.model.decoder), DECODER_STAGE_1_OV_PATH, example_input)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "19029cfc-84a7-417c-b45a-bb042529fdbc", + "metadata": { + "scrolled": true + }, + "outputs": [], + "source": [ + "DECODER_STAGE_2_OV_PATH = Path(\"models/decoder_stage_2_ir.xml\")\n", + "\n", + "\n", + "class DecoderStage2Wrapper(torch.nn.Module):\n", + " def __init__(self, decoder):\n", + " super().__init__()\n", + " self.decoder = decoder\n", + "\n", + " def forward(self, input_ids=None, encoder_hidden_states=None, encoder_attention_mask=None, past_key_values=None):\n", + " past_key_values = tuple(tuple(past_key_values[i : i + 4]) for i in range(0, len(past_key_values), 4))\n", + " return self.decoder(\n", + " input_ids=input_ids,\n", + " return_dict=False,\n", + " encoder_hidden_states=encoder_hidden_states,\n", + " encoder_attention_mask=encoder_attention_mask,\n", + " past_key_values=past_key_values,\n", + " )\n", + "\n", + "\n", + "example_input = {\n", + " \"input_ids\": torch.ones((9, 1), dtype=torch.int64),\n", + " \"encoder_hidden_states\": torch.ones((1, 39, 1024), dtype=torch.float32),\n", + " \"encoder_attention_mask\": torch.ones((1, 39), dtype=torch.int64),\n", + " \"past_key_values\": (\n", + " (\n", + " torch.ones(1, 16, 10, 64, dtype=torch.float32),\n", + " torch.ones(1, 16, 10, 64, dtype=torch.float32),\n", + " torch.ones(1, 16, 39, 64, dtype=torch.float32),\n", + " torch.ones(1, 16, 39, 64, dtype=torch.float32),\n", + " )\n", + " * 24\n", + " ),\n", + "}\n", + "\n", + "decoder_2_ov_model = convert(DecoderStage2Wrapper(model.decoder.model.decoder), DECODER_STAGE_2_OV_PATH, example_input)" + ] + }, + { + "cell_type": "markdown", + "id": "3ba48d0c-4e19-4829-84d3-619d11028a19", + "metadata": {}, + "source": [ + "## Compiling models and inference\n", + "[back to top ⬆️](#Table-of-contents:)\n", + "\n", + "Select device from dropdown list for running inference using OpenVINO." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 8, + "id": "dd188eac-0ccb-40ea-8418-9e9eb5676f74", + "metadata": {}, + "outputs": [ + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "23cb1bf6b2ff4520bc25981b14751a92", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + "Dropdown(description='Device:', index=4, options=('CPU', 'GPU.0', 'GPU.1', 'GPU.2', 'AUTO'), value='AUTO')" + ] + }, + "execution_count": 8, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "import ipywidgets as widgets\n", + "\n", + "core = ov.Core()\n", + "device = widgets.Dropdown(\n", + " options=core.available_devices + [\"AUTO\"],\n", + " value=\"AUTO\",\n", + " description=\"Device:\",\n", + " disabled=False,\n", + ")\n", + "\n", + "device" + ] + }, + { + "cell_type": "markdown", + "id": "fc1777b2-c1fd-45be-930a-22df04d8cbd7", + "metadata": {}, + "source": [ + "Let's create callable wrapper classes for compiled models to allow interaction with original pipeline. Note that all of wrapper classes return `torch.Tensor`s instead of `np.array`s. In the `DecoderWrapper` we separates the pipeline into two stages." + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "id": "6ec4c90c-48b6-4bdf-a75d-f38719f8edcb", + "metadata": {}, + "outputs": [], + "source": [ + "from collections import namedtuple\n", + "\n", + "EncoderOutput = namedtuple(\"EncoderOutput\", \"last_hidden_state\")\n", + "DecoderOutput = namedtuple(\"DecoderOutput\", (\"last_hidden_state\", \"past_key_values\", \"hidden_states\", \"attentions\", \"cross_attentions\"))\n", + "\n", + "\n", + "class TextEncoderModelWrapper(torch.nn.Module):\n", + " def __init__(self, encoder_ir_path, config):\n", + " self.encoder = core.compile_model(encoder_ir_path, device.value)\n", + " self.config = config\n", + " self.dtype = self.config.torch_dtype\n", + "\n", + " def __call__(self, input_ids, **_):\n", + " last_hidden_state = self.encoder(input_ids)[0]\n", + " return EncoderOutput(torch.from_numpy(last_hidden_state))\n", + "\n", + "\n", + "class DecoderWrapper(torch.nn.Module):\n", + " def __init__(self, decoder_stage_1_ir_path, decoder_stage_2_ir_path):\n", + " super().__init__()\n", + " self.decoder_stage_1 = core.compile_model(decoder_stage_1_ir_path, device.value)\n", + " self.decoder_stage_2 = core.compile_model(decoder_stage_2_ir_path, device.value)\n", + "\n", + " def __call__(self, input_ids=None, encoder_hidden_states=None, encoder_attention_mask=None, past_key_values=None, prompt_hidden_states=None, **kwargs):\n", + " inputs = {}\n", + " if input_ids is not None:\n", + " inputs[\"input_ids\"] = input_ids\n", + " if encoder_hidden_states is not None:\n", + " inputs[\"encoder_hidden_states\"] = encoder_hidden_states\n", + " if encoder_attention_mask is not None:\n", + " inputs[\"encoder_attention_mask\"] = encoder_attention_mask\n", + " if prompt_hidden_states is not None:\n", + " inputs[\"prompt_hidden_states\"] = prompt_hidden_states\n", + " if past_key_values is not None:\n", + " past_key_values = tuple(past_key_value for pkv_per_layer in past_key_values for past_key_value in pkv_per_layer)\n", + " inputs[\"past_key_values\"] = past_key_values\n", + " arguments = (\n", + " input_ids,\n", + " encoder_hidden_states,\n", + " encoder_attention_mask,\n", + " *past_key_values,\n", + " )\n", + " outs = self.decoder_stage_2(arguments)\n", + " else:\n", + " outs = self.decoder_stage_1(inputs)\n", + "\n", + " outs = [torch.from_numpy(out) for out in outs.values()]\n", + " 
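# outs[0] is the decoder's last hidden state; the remaining outputs are the flattened KV cache,
# which the next line regroups into per-layer groups of 4 (self- and cross-attention key/value tensors).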
past_key_values = list(list(outs[i : i + 4]) for i in range(1, len(outs), 4))\n", + "\n", + " return DecoderOutput(outs[0], past_key_values, None, None, None)" + ] + }, + { + "cell_type": "markdown", + "id": "5e4b6148-ce3b-4a8c-85a7-b9a66610bb5d", + "metadata": {}, + "source": [ + "Now we can replace the original models by our wrapped OpenVINO models and run inference. " + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "id": "c64c816d-580e-4359-996d-c97f04158511", + "metadata": {}, + "outputs": [], + "source": [ + "model.text_encoder = TextEncoderModelWrapper(TEXT_ENCODER_OV_PATH, model.text_encoder.config)\n", + "model.decoder.model.decoder = DecoderWrapper(DECODER_STAGE_1_OV_PATH, DECODER_STAGE_2_OV_PATH)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "a314ec16-c1f8-4fcd-a88a-f6152181c271", + "metadata": { + "scrolled": true + }, + "outputs": [], + "source": [ + "generation = model.generate(input_ids=input_ids, prompt_input_ids=prompt_input_ids)\n", + "audio_arr = generation.cpu().numpy().squeeze()\n", + "sf.write(\"parler_tts_out.wav\", audio_arr, model.config.sampling_rate)" + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "id": "ddc1191e-2090-4d8f-b177-40fe97d7a03b", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "\n", + " \n", + " " + ], + "text/plain": [ + "" + ] + }, + "execution_count": 12, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "import IPython.display as ipd\n", + "\n", + "ipd.Audio(\"parler_tts_out.wav\")" + ] + }, + { + "cell_type": "markdown", + "id": "7978bc2e-077d-46cc-acd3-df4f90038730", + "metadata": {}, + "source": [ + "## Interactive inference\n", + "[back to top ⬆️](#Table-of-contents:)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "4b2f2fea-a99b-435b-a286-0bb0378e9747", + "metadata": { + "scrolled": true + }, + "outputs": [], + "source": [ + "import gradio as gr\n", + "import numpy as np\n", + "from transformers import AutoFeatureExtractor, set_seed\n", + "\n", + "\n", + "title = \"Text-to-speech (TTS) with Parler-TTS and OpenVINO\"\n", + "feature_extractor = AutoFeatureExtractor.from_pretrained(repo_id)\n", + "SAMPLE_RATE = feature_extractor.sampling_rate\n", + "\n", + "\n", + "def infer(prompt, description, seed):\n", + " set_seed(seed)\n", + "\n", + " input_ids = tokenizer(description, return_tensors=\"pt\").input_ids.to(\"cpu\")\n", + " prompt_input_ids = tokenizer(prompt, return_tensors=\"pt\").input_ids.to(\"cpu\")\n", + "\n", + " generation = model.generate(input_ids=input_ids, prompt_input_ids=prompt_input_ids)\n", + " audio_arr = generation.cpu().numpy().squeeze()\n", + " sr = SAMPLE_RATE\n", + "\n", + " return sr, audio_arr\n", + "\n", + "\n", + "demo = gr.Interface(\n", + " infer,\n", + " [\n", + " gr.Text(label=\"Prompt\"),\n", + " gr.Text(label=\"Description\"),\n", + " gr.Slider(\n", + " label=\"Seed\",\n", + " value=42,\n", + " step=1,\n", + " minimum=0,\n", + " maximum=np.iinfo(np.int32).max,\n", + " ),\n", + " ],\n", + " gr.Audio(label=\"Output Audio\", type=\"numpy\"),\n", + " title=title,\n", + " description=description,\n", + " examples=[\n", + " [\n", + " \"Hey, how are you doing today?\",\n", + " \"A female speaker with a slightly low-pitched voice delivers her words quite expressively, in a very confined sounding environment with clear audio quality. 
She speaks very fast.\",\n", + " ],\n", + " [\n", + " \"'This is the best time of my life, Bartley,' she said happily.\",\n", + " \"A female speaker with a slightly low-pitched, quite monotone voice delivers her words at a slightly faster-than-average pace in a confined space with very clear audio.\",\n", + " ],\n", + " [\n", + " \"Montrose also, after having experienced still more variety of good and bad fortune, threw down his arms, and retired out of the kingdom.\t\",\n", + " \"A male speaker with a slightly high-pitched voice delivering his words at a slightly slow pace in a small, confined space with a touch of background noise and a quite monotone tone.\",\n", + " ],\n", + " [\n", + " \"montrose also after having experienced still more variety of good and bad fortune threw down his arms and retired out of the kingdom\",\n", + " \"A male speaker with a low-pitched voice delivering his words at a fast pace in a small, confined space with a lot of background noise and an animated tone.\",\n", + " ],\n", + " ],\n", + ")\n", + "\n", + "try:\n", + " demo.queue().launch(debug=True)\n", + "except Exception:\n", + " demo.queue().launch(share=True, debug=True)\n", + "# if you are launching remotely, specify server_name and server_port\n", + "# demo.launch(server_name='your server name', server_port='server port in int')\n", + "# Read more in the docs: https://gradio.app/docs/" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.10.12" + }, + "openvino_notebooks": { + "imageUrl": "https://images.squarespace-cdn.com/content/v1/657816dfbefe0533e8a69d9a/30c96e25-acc5-4019-acdd-648da6142c4c/architecture_v3.png?raw=true", + "tags": { + "categories": [ + "Model Demos" + ], + "libraries": [], + "other": [ + "Transformers" + ], + "tasks": [ + "Text-to-Audio", + "Text-to-Speech" + ] + } + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/selector/src/shared/notebook-tags.js b/selector/src/shared/notebook-tags.js index c6142cff51..2de04ae55d 100644 --- a/selector/src/shared/notebook-tags.js +++ b/selector/src/shared/notebook-tags.js @@ -17,6 +17,7 @@ export const TASKS = /** @type {const} */ ({ TEXT_TO_VIDEO: 'Text-to-Video', VIDEO_TO_TEXT: 'Video-to-Text', TEXT_TO_AUDIO: 'Text-to-Audio', + TEXT_TO_SPEECH: 'Text-to-Speech', AUDIO_TO_TEXT: 'Audio-to-Text', VISUAL_QUESTION_ANSWERING: 'Visual Question Answering', IMAGE_CAPTIONING: "Image Captioning", @@ -61,7 +62,6 @@ export const TASKS = /** @type {const} */ ({ AUDIO_GENERATION: 'Audio Generation', AUDIO_CLASSIFICATION: 'Audio Classification', VOICE_ACTIVITY_DETECTION: 'Voice Activity Detection', - AUDIO_CLASSIFICATION: "Audio Classification", }, OTHER: { KNOWLEDGE_REPRESENTATION: 'Knowledge Representation',