# Character Voice
This is a bonus. Let's try to give these characters a voice.

TODO: Run the voice cloning in a separate environment to avoid PyPI dependency conflicts.

## Generate new voice
First, we use [Parler-TTS](https://huggingface.co/parler-tts/parler-tts-large-v1) to generate a new voice for each character from a text description of how that voice should sound. We save that voice to a WAV file to use as a reference.

## Voice Cloning
Then, for anything we want a character to say, we clone that reference voice and generate new speech, using either [Fish Speech](https://github.com/fishaudio/fish-speech) or [CoquiTTS](https://huggingface.co/coqui/XTTS-v2).

In [None]:
import settings
from model import Story, Character

# Load the story from the "step_4" directory under settings.STORY_DIR.
story = Story.load_from_directory(f"{settings.STORY_DIR}/step_4")

In [None]:
# Parler-TTS lets us create a voice from a text description of how it sounds.
# transformers is pinned to an exact version before Parler-TTS is installed.
%pip install transformers==4.21.1 
# Original note: "As of 11/2/2024 this breaks (audio is garbled)".
# NOTE(review): the note does not say which install it refers to (the transformers pin
# above or the Parler-TTS commit pinned below) — confirm before changing either version.
%pip install --upgrade git+https://github.com/huggingface/parler-tts.git@dcaed95e1cce6f616e3e1956f8d63f0f3f5dfe5f

In [None]:
import os

# Reference voices are kept under <STORY_DIR>/step_<current_step>/voices/;
# make sure that directory exists before anything is written to it.
current_step = 11
wav_file_dir = "{}/step_{}/voices/".format(settings.STORY_DIR, current_step)
os.makedirs(wav_file_dir, exist_ok=True)

def speech_path(character):
    """Return the path of the reference WAV file for ``character``.

    The file is named ``<character.name>.wav`` and lives in ``wav_file_dir``.
    ``os.path.join`` is used so that the trailing slash already present on
    ``wav_file_dir`` does not produce a doubled ``//`` separator in the path.
    """
    return os.path.join(wav_file_dir, f"{character.name}.wav")

## Generate Reference Speech

### Load the model

In [None]:
import torch
from parler_tts import ParlerTTSForConditionalGeneration
from transformers import AutoTokenizer
import soundfile as sf

model_name = "parler-tts/parler-tts-large-v1"

# Use the first CUDA device when CUDA is available; otherwise run on the CPU.
if torch.cuda.is_available():
    device = "cuda:0"
else:
    device = "cpu"

# The model and its tokenizer are both loaded from the same checkpoint name.
model = ParlerTTSForConditionalGeneration.from_pretrained(model_name)
model = model.to(device)
tokenizer = AutoTokenizer.from_pretrained(model_name)

In [None]:
def create_baseline_speech(character: Character):
    """Generate a reference voice sample for ``character`` and save it as a WAV file.

    A voice description is built from the character's gender, race, age and
    voice_description; the Parler-TTS model then speaks a fixed set of
    sentences in that voice. The audio is written to ``speech_path(character)``
    at the model's configured sampling rate.

    Args:
        character: Character whose attributes describe the voice to generate.

    Returns:
        The path of the WAV file that was written.
    """
    description = f"very clear audio of a {character.gender} {character.race} age around {character.age} {character.voice_description}"
    baseline_speech = "The birch canoe slid on the smooth planks. Glue the sheet to the dark blue background. These days a chicken leg is a rare dish. Rice is often served in round bowls. Help the woman get back to her feet."

    display(description)

    # Keep the whole tokenizer output (not just input_ids) so the attention
    # masks can be handed to generate() explicitly instead of being left unset.
    description_inputs = tokenizer(description, return_tensors="pt").to(device)
    prompt_inputs = tokenizer(baseline_speech, return_tensors="pt").to(device)

    generation = model.generate(
        input_ids=description_inputs.input_ids,
        attention_mask=description_inputs.attention_mask,
        prompt_input_ids=prompt_inputs.input_ids,
        prompt_attention_mask=prompt_inputs.attention_mask,
    )
    audio_arr = generation.cpu().numpy().squeeze()

    # Save the wave file
    wave_file = speech_path(character)
    sf.write(wave_file, audio_arr, model.config.sampling_rate)

    return wave_file

In [None]:
# def create_baseline_speech(character: Character, prompt: str):
#     description = f"A {character.gender} age around {character.age} with a voice like: {character.voice_description}"
#     baseline_speech = "The birch canoe slid on the smooth planks. Glue the sheet to the dark blue background. These days a chicken leg is a rare dish. Rice is often served in round bowls. Help the woman get back to her feet."

#     display(description)

#     input_ids = tokenizer(description, return_tensors="pt").input_ids.to(device)
#     prompt_input_ids = tokenizer(baseline_speech, return_tensors="pt").input_ids.to(device)

#     generation = model.generate(input_ids=input_ids, prompt_input_ids=prompt_input_ids)
#     audio_arr = generation.cpu().numpy().squeeze()
    
#     # Save the wave file
#     wave_file = speech_path(character)
#     sf.write(wave_file, audio_arr, model.config.sampling_rate)
    
#     return wave_file

### Generate Baseline for Each Character


In [None]:
from IPython.display import display, Markdown
import IPython.display as ipd


# For every character: show a short header, generate the baseline voice,
# then embed an audio player for the WAV file that was written.
for character in story.characters:
    header = Markdown(f"""---
## {character.name}
**Voice**: {character.voice_description}
""")
    display(header)

    baseline_wav_path = create_baseline_speech(character)
    display(ipd.Audio(baseline_wav_path))


In [None]:
# raise Exception("Stop here")

# Clone Voice (Fish Speech)

## Install fish-speech

In [None]:
# Clone the fish-speech repository into ./fish-speech. If `git clone` fails
# (for example because the directory already exists from an earlier run),
# the `||` fallback prints a message instead of failing the cell.
!git clone https://github.com/fishaudio/fish-speech.git || echo Already Cloned

In [None]:
# Optional prerequisites, left commented out; uncomment the ones your system needs.
# %pip install torch==2.4.1 torchvision==0.19.1 torchaudio==2.4.1

# (Ubuntu / Debian users) Install sox + ffmpeg
# !sudo apt install -y libsox-dev ffmpeg 

# (Ubuntu / Debian users) Install pyaudio
# !sudo apt install build-essential cmake libasound-dev portaudio19-dev libportaudio2 libportaudiocpp0

# Install fish-speech in editable mode, with its "stable" extra, from the local clone
%pip install -e ./fish-speech[stable]

In [None]:
import locale
# Switch every locale category of this process to US English with UTF-8 encoding.
# NOTE(review): presumably required by the fish-speech tooling used below — confirm.
locale.setlocale(locale.LC_ALL, "en_US.UTF-8")

## Download Weights

In [None]:
# Download the fishaudio/fish-speech-1.4 weights into the local checkpoints directory;
# the fish-speech commands further down pass this directory as --checkpoint-path.
!huggingface-cli download fishaudio/fish-speech-1.4 --local-dir fish-speech/checkpoints/fish-speech-1.4/

In [None]:
import settings
from model import Story, Character

# Load the story from the "step_7" directory under settings.STORY_DIR.
story = Story.load_from_directory(f"{settings.STORY_DIR}/step_7")

In [None]:
import os

# Reference voices are kept under <STORY_DIR>/step_<current_step>/voices/;
# make sure that directory exists before anything is read from or written to it.
current_step = 11
wav_file_dir = "{}/step_{}/voices/".format(settings.STORY_DIR, current_step)
os.makedirs(wav_file_dir, exist_ok=True)

def baseline_wav(character: Character) -> str:
    """Return the path of the reference WAV file for ``character``.

    The file is named ``<character.name>.wav`` and lives in ``wav_file_dir``.
    ``os.path.join`` avoids the doubled ``//`` that plain concatenation would
    produce, because ``wav_file_dir`` already ends with a slash.
    """
    return os.path.join(wav_file_dir, f"{character.name}.wav")

def baseline_npy(character: Character) -> str:
    """Return the path of the fish-speech encoding of ``character``'s reference voice.

    The file is named ``<character.name>.fish.npy`` and lives in ``wav_file_dir``.
    ``os.path.join`` avoids the doubled ``//`` that plain concatenation would
    produce, because ``wav_file_dir`` already ends with a slash.
    """
    return os.path.join(wav_file_dir, f"{character.name}.fish.npy")

def catchphrase_fish(character: Character) -> str:
    """Return the path where the Fish Speech catch-phrase audio for ``character`` is stored.

    The file is named ``<character.name>.catchphrase.fish.wav`` and lives in
    ``wav_file_dir``. ``os.path.join`` avoids the doubled ``//`` that plain
    concatenation would produce, because ``wav_file_dir`` already ends with a slash.
    """
    return os.path.join(wav_file_dir, f"{character.name}.catchphrase.fish.wav")

## Encode Baseline Voice

In [None]:
# Run the fish-speech VQGAN inference script once per character, passing the
# character's reference WAV as input (-i) and its .fish.npy path as output (-o).
for character in story.characters:
    src_audio = baseline_wav(character)  # reference voice recording
    dst_audio = baseline_npy(character)  # where the encoded reference should go
    # IPython substitutes {src_audio} and {dst_audio} into the shell command below.
    !python fish-speech/tools/vqgan/inference.py \
        -i "{src_audio}" \
        -o "{dst_audio}" \
        --checkpoint-path "fish-speech/checkpoints/fish-speech-1.4/firefly-gan-vq-fsq-8x1024-21hz-generator.pth"


## Generate Catchphrase

In [None]:
from IPython.display import display, Markdown
import IPython.display as ipd
from IPython.display import Image as IPImage

# Transcript handed to fish-speech as --prompt-text alongside each character's encoded
# reference voice. It is the same text that the reference voices were generated from
# in the Parler-TTS section of this notebook.
baseline_speech = "The birch canoe slid on the smooth planks. Glue the sheet to the dark blue background. These days a chicken leg is a rare dish. Rice is often served in round bowls. Help the woman get back to her feet."

# For each character: show a summary card and image, synthesize an introduction plus
# catch phrase with Fish Speech, then play the reference and the generated audio.
for character in story.characters:
    display(Markdown(f"""---

## {character.name}

**Gender**: {character.gender}

**Age**: {character.age}

**Voice**: {character.voice_description}

**Catch Phrase**: {character.catch_phrase}
"""))
    # display(character.image)
    image_path = f"{settings.STORY_DIR}/step_7/characters/{character.name}.gif"
    display(IPImage(image_path))

    reference_file = baseline_wav(character)   # reference voice WAV (played back below)
    output_file = catchphrase_fish(character)  # destination for the generated speech
    npy = baseline_npy(character)              # encoded reference voice, used as prompt tokens
    text = f"I'm {character.name}. {character.description}. {character.catch_phrase}"

    # Generate tokens
    # NOTE(review): {text} is substituted inside double quotes on a shell command line, so a
    # double quote, backtick or `$` in a description or catch phrase would alter the command.
    # NOTE(review): --num-samples 2 is requested, but only "codes_0.npy" is decoded below.
    !python fish-speech/tools/llama/generate.py \
        --text "{text}" \
        --prompt-text "{baseline_speech}" \
        --prompt-tokens "{npy}" \
        --checkpoint-path "fish-speech/checkpoints/fish-speech-1.4" \
        --num-samples 2 \
        --compile \
        --device cuda

    # Inference
    # "codes_0.npy" is a relative path, so it is read from the current working directory;
    # presumably it is the first sample written by generate.py above — TODO confirm.
    !python fish-speech/tools/vqgan/inference.py \
        -i "codes_0.npy" \
        -o "{output_file}" \
        --checkpoint-path "fish-speech/checkpoints/fish-speech-1.4/firefly-gan-vq-fsq-8x1024-21hz-generator.pth"\
        --device cuda


    display("Reference file")
    display(ipd.Audio(reference_file))
    
    display("Catch Phrase")
    display(ipd.Audio(output_file))


    




In [None]:
# Deliberate stop: raising here halts a "Run All" before the CoquiTTS section below.
# NOTE(review): presumably because that section's install conflicts with the transformers
# version used by the earlier voice-generation cells (see the note below) — confirm.
raise Exception("Stop here")

## Clone Voice (CoquiTTS)

Let's generate each character's catch phrase based on the previously generated baseline voice.

### NOTE: THIS BREAKS THE ABOVE VOICE CLONING DUE TO TRANSFORMERS VERSION

In [None]:
# CoquiTTS lets us clone the previously generated voice and reuse it for new text-to-speech.
# The `TTS` package is left commented out because it doesn't work for Python 3.12.7:
# %pip install TTS
# The `coqui-tts` package is installed instead.
%pip install --upgrade coqui-tts

## Clone Voice

Let's generate each character's catch phrase based on the previously generated baseline voice.

### Load model

## ALERT: YOU NEED TO RUN THE MODEL-LOADING CODE BELOW IN A TERMINAL FIRST TO ACCEPT THE LICENSE AGREEMENT

In [None]:
import torch
from TTS.api import TTS

# Pick the compute device: CUDA when available, otherwise the CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"

# Show which 🐸TTS models can be loaded
print(TTS().list_models())

# Load the multilingual XTTS v2 model and move it onto the chosen device
tts = TTS("tts_models/multilingual/multi-dataset/xtts_v2")
tts = tts.to(device)

In [None]:
import settings
from model import Story, Character

# Load the story from the "step_7" directory under settings.STORY_DIR.
story = Story.load_from_directory(f"{settings.STORY_DIR}/step_7")

In [None]:
import os

# Reference voices are kept under <STORY_DIR>/step_<current_step>/voices/;
# make sure that directory exists before anything is read from or written to it.
current_step = 11
wav_file_dir = "{}/step_{}/voices/".format(settings.STORY_DIR, current_step)
os.makedirs(wav_file_dir, exist_ok=True)

def speech_path(character):
    """Return the path of the reference WAV file for ``character``.

    The file is named ``<character.name>.wav`` and lives in ``wav_file_dir``.
    ``os.path.join`` is used so that the trailing slash already present on
    ``wav_file_dir`` does not produce a doubled ``//`` separator in the path.
    """
    return os.path.join(wav_file_dir, f"{character.name}.wav")

In [None]:
def speech_path_catchphrase(character):
    """Return the path where the cloned catch-phrase audio for ``character`` is stored.

    The file is named ``<character.name>.catchphrase.wav`` and lives in
    ``wav_file_dir``. ``os.path.join`` is used so that the trailing slash already
    present on ``wav_file_dir`` does not produce a doubled ``//`` separator.
    """
    return os.path.join(wav_file_dir, f"{character.name}.catchphrase.wav")

In [None]:
from IPython.display import display, Markdown
import IPython.display as ipd
from IPython.display import Image as IPImage

# Sentence synthesized in memory at the end of each iteration as a quick cloning check.
test_sentence = "A quick brown fox jumps over the lazy dog."

# For each character: show a summary card and image, play the reference voice, clone it
# to speak an introduction plus catch phrase (saved to disk), then speak the test sentence.
for character in story.characters:
    display(Markdown(f"""---

## {character.name}

**Gender**: {character.gender}

**Age**: {character.age}

**Voice**: {character.voice_description}

**Catch Phrase**: {character.catch_phrase}
"""))
    # display(character.image)
    image_path = f"{settings.STORY_DIR}/step_7/characters/{character.name}.gif"
    display(IPImage(image_path))

    reference_file = speech_path(character)           # voice sample to clone
    output_file = speech_path_catchphrase(character)  # where the cloned speech is saved

    display("Reference file")
    display(ipd.Audio(reference_file))

    display("Catch Phrase")
    text = f"I'm {character.name}. {character.description}. {character.catch_phrase}"
    tts.tts_to_file(text=text, speaker_wav=reference_file, language="en", file_path=output_file)
    display(ipd.Audio(output_file))

    display(test_sentence)
    wav = tts.tts(text=test_sentence, speaker_wav=reference_file, language="en")
    # Play the raw samples at the rate the loaded model actually produces; a hard-coded
    # 22050 Hz would play the audio at the wrong speed and pitch for a model that
    # outputs a different rate.
    display(ipd.Audio(wav, rate=tts.synthesizer.output_sample_rate))

    


In [None]:
from IPython.display import display, Markdown
import IPython.display as ipd
from IPython.display import Image as IPImage

# Sentence synthesized in memory at the end of each iteration as a quick cloning check.
test_sentence = "A quick brown fox jumps over the lazy dog."

# For each character: show a summary card and image, play the reference voice, clone it
# to speak an introduction plus catch phrase (saved to disk), then speak the test sentence.
for character in story.characters:
    display(Markdown(f"""---

## {character.name}

**Gender**: {character.gender}

**Age**: {character.age}

**Voice**: {character.voice_description}

**Catch Phrase**: {character.catch_phrase}
"""))
    # display(character.image)
    image_path = f"{settings.STORY_DIR}/step_7/characters/{character.name}.gif"
    display(IPImage(image_path))

    reference_file = speech_path(character)           # voice sample to clone
    output_file = speech_path_catchphrase(character)  # where the cloned speech is saved

    display("Reference file")
    display(ipd.Audio(reference_file))

    display("Catch Phrase")
    text = f"I'm {character.name}. {character.description}. {character.catch_phrase}"
    tts.tts_to_file(text=text, speaker_wav=reference_file, language="en", file_path=output_file)
    display(ipd.Audio(output_file))

    display(test_sentence)
    wav = tts.tts(text=test_sentence, speaker_wav=reference_file, language="en")
    # Play the raw samples at the rate the loaded model actually produces; a hard-coded
    # 22050 Hz would play the audio at the wrong speed and pitch for a model that
    # outputs a different rate.
    display(ipd.Audio(wav, rate=tts.synthesizer.output_sample_rate))

    
