### Preparation

In [1]:
!pip install git+https://github.com/huggingface/parler-tts.git -q

[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
apache-beam 2.46.0 requires cloudpickle~=2.2.1, but you have cloudpickle 3.1.0 which is incompatible.
apache-beam 2.46.0 requires dill<0.3.2,>=0.3.1.1, but you have dill 0.3.8 which is incompatible.
apache-beam 2.46.0 requires numpy<1.25.0,>=1.14.3, but you have numpy 1.26.4 which is incompatible.
apache-beam 2.46.0 requires protobuf<4,>3.12.2, but you have protobuf 4.25.5 which is incompatible.
apache-beam 2.46.0 requires pyarrow<10.0.0,>=3.0.0, but you have pyarrow 17.0.0 which is incompatible.
google-cloud-aiplatform 0.6.0a1 requires google-api-core[grpc]<2.0.0dev,>=1.22.2, but you have google-api-core 2.11.1 which is incompatible.
google-cloud-automl 1.0.1 requires google-api-core[grpc]<2.0.0dev,>=1.14.0, but you have google-api-core 2.11.1 which is incompatible.
google-cloud-bigquery 2.34.4 requires prot

### Demo

In [6]:
!pip install gradio -q

In [7]:
import torch
import gradio as gr
from parler_tts import ParlerTTSForConditionalGeneration
from transformers import AutoTokenizer
import numpy as np
import soundfile as sf
import os

In [8]:
# Set up the model and tokenizer
device = "cuda:0" if torch.cuda.is_available() else "cpu"

model = ParlerTTSForConditionalGeneration.from_pretrained(
    "Amadeus99/parler-tts-mini-v1-yt-v1",
    torch_dtype=torch.float16
).to(device)

tokenizer = AutoTokenizer.from_pretrained("Amadeus99/parler-tts-mini-v1-yt-v1")

config.json:   0%|          | 0.00/7.11k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.76G [00:00<?, ?B/s]

  WeightNorm.apply(module, name, dim)


generation_config.json:   0%|          | 0.00/265 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/20.8k [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/2.42M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/2.54k [00:00<?, ?B/s]

In [9]:
def generate_audio(prompt, description):
    """
    Generate audio from text prompt and voice description

    Args:
        prompt (str): The text to be converted to speech
        description (str): Description of the voice characteristics

    Returns:
        str: Path to the generated audio file
    """
    try:
        # Ensure inputs are on the correct device
        input_ids = tokenizer(description, return_tensors="pt").input_ids.to(device)
        prompt_input_ids = tokenizer(prompt, return_tensors="pt").input_ids.to(device)

        # Generate audio
        generation = model.generate(
            input_ids=input_ids,
            prompt_input_ids=prompt_input_ids
        )

        # Convert to numpy array
        audio_arr = generation.cpu().numpy().squeeze()

        # Ensure output directory exists
        os.makedirs("outputs", exist_ok=True)

        # Save the audio file
        output_path = "outputs/generated_audio.wav"
        sf.write(output_path, audio_arr.astype(np.float32), model.config.sampling_rate)

        return output_path

    except Exception as e:
        return f"Error generating audio: {str(e)}"

In [10]:
# Create Gradio interface
demo = gr.Interface(
    fn=generate_audio,
    inputs=[
        gr.Textbox(label="Text Prompt",
                   placeholder="Enter the text you want to convert to speech"),
        gr.Textbox(label="Voice Description",
                   placeholder="Describe the voice characteristics (e.g., 'A man speaks in a very monotone voice')",
                   value="A man speaks in a very monotone voice")
    ],
    outputs=[
        gr.Audio(label="Generated Speech", type="filepath")
    ],
    title="Parler TTS Audio Generation",
    description="Generate speech using Parler TTS model. Provide a text prompt and voice description.",
    examples=[
        [
            "halo, ini adalah model untuk konversi kalimat ke suara",
            "A man speaks in a very monotone voice"
        ]
    ]
)

In [11]:
demo.launch()

* Running on local URL:  http://127.0.0.1:7860
Kaggle notebooks require sharing enabled. Setting `share=True` (you can turn this off by setting `share=False` in `launch()` explicitly).

* Running on public URL: https://6a67031d1788f46198.gradio.live

This share link expires in 72 hours. For free permanent hosting and GPU upgrades, run `gradio deploy` from the terminal in the working directory to deploy to Hugging Face Spaces (https://huggingface.co/spaces)




The attention mask is not set and cannot be inferred from input because pad token is same as eos token. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
