## Utilize a text-to-speech model to generate an AI-voice-actor with your voice.

In [None]:
#@title This cell downloads and installs all the required software packages.
%%capture
# The scipy version packaged with Colab is not tolerant of misformatted WAV files.
# Install the latest version.
!pip3 install -U scipy
# Python bindings used later in this notebook to pipe the browser recording
# through ffmpeg.
!pip install ffmpeg-python

# Fetch the Tortoise TTS sources and make the checkout the working directory
# so that the requirements file and setup.py below are found.
!git clone https://github.com/jnordberg/tortoise-tts.git
%cd tortoise-tts
!pip3 install -r requirements.txt
# Explicitly pinned versions, installed after requirements.txt.
!pip3 install transformers==4.19.0 einops==0.5.0 rotary_embedding_torch==0.1.5 unidecode==1.3.5
!python3 setup.py install

# NOTE(review): `nn`, `F`, `load_audio` and `load_voices` are not referenced in
# the visible part of this notebook - confirm before removing them.
import torch
import torchaudio
import torch.nn as nn
import torch.nn.functional as F

import IPython

from tortoise.api import TextToSpeech
from tortoise.utils.audio import load_audio, load_voice, load_voices

# This will download all the models used by Tortoise from the HuggingFace hub.
# `tts` is reused by the generation cell at the end of the notebook.
tts = TextToSpeech()

"""
To write this piece of code I took inspiration/code from a lot of places.
It was late night, so I'm not sure how much I created or just copied o.O
Here are some of the possible references:
https://blog.addpipe.com/recording-audio-in-the-browser-using-pure-html5-and-minimal-javascript/
https://stackoverflow.com/a/18650249
https://hacks.mozilla.org/2014/06/easy-audio-capture-with-the-mediarecorder-api/
https://air.ghost.io/recording-to-an-audio-file-using-html5-and-js/
https://stackoverflow.com/a/49019356
"""
from IPython.display import HTML, Audio
from google.colab.output import eval_js
from base64 import b64decode
import numpy as np
from scipy.io.wavfile import read as wav_read
import io
import ffmpeg

# HTML/JavaScript snippet injected into the cell output by get_audio().
#
# It adds a button, asks the browser for microphone access and starts a
# MediaRecorder as soon as access is granted. Clicking the button stops the
# recording. The global JavaScript variable `data` is a Promise that Python
# awaits via eval_js("data"); it resolves with the recording encoded as a
# base64 data URL.
#
# The promise is settled from the FileReader callback instead of after a fixed
# delay, so a slow encode or a click before the recorder has started can no
# longer hand an empty placeholder value back to Python.
AUDIO_HTML = """
<script>
var my_div = document.createElement("DIV");
var my_btn = document.createElement("BUTTON");
var t = document.createTextNode("Klick mich um eine Sprachaufnahme zu starten!");

my_btn.appendChild(t);
my_div.appendChild(my_btn);
document.body.appendChild(my_div);

var base64data = 0;
var reader;
var recorder, gumStream;
var recordButton = my_btn;

function toggleRecording() {
  // Only act while a recording is actually running; earlier clicks are ignored.
  if (recorder && recorder.state == "recording") {
      recorder.stop();
      gumStream.getAudioTracks()[0].stop();
      recordButton.innerText = "Die Aufnahme wird gespeichert, bitte warte kurz!"
  }
}

// Read from Python with eval_js("data").
var data = new Promise(function(resolve, reject) {
  var handleSuccess = function(stream) {
    gumStream = stream;
    recorder = new MediaRecorder(stream);
    recorder.ondataavailable = function(e) {
      // Offer the recording for playback right below the button.
      var url = URL.createObjectURL(e.data);
      var preview = document.createElement('audio');
      preview.controls = true;
      preview.src = url;
      document.body.appendChild(preview);

      reader = new FileReader();
      reader.onloadend = function() {
        if (reader.error || !reader.result) {
          reject(reader.error);
          return;
        }
        base64data = reader.result;
        resolve(base64data.toString());
      };
      reader.readAsDataURL(e.data);
    };
    recorder.start();
    // Announce the running recording only once it has really started.
    recordButton.innerText = "Aufzeichnung läuft... Klick mich nochmal um die Aufnahme zu stoppen.";
  };

  recordButton.onclick = toggleRecording;

  // A denied or unavailable microphone rejects the promise instead of leaving
  // the Python side waiting.
  navigator.mediaDevices.getUserMedia({audio: true}).then(handleSuccess, reject);
});

</script>
"""

def get_audio():
  """Record audio in the browser and return it as decoded WAV samples.

  Displays the AUDIO_HTML recorder, waits for the JavaScript `data` promise,
  decodes the base64 payload of the returned data URL and converts it to WAV
  by piping it through ffmpeg.

  Returns:
    A tuple ``(audio, sr)``: the sample array and the sample rate, as read by
    ``scipy.io.wavfile.read``.

  Raises:
    ValueError: if the browser did not hand back a data URL with a payload.
    RuntimeError: if ffmpeg did not produce RIFF/WAV output; the message
      contains ffmpeg's stderr.
  """
  display(HTML(AUDIO_HTML))
  data = eval_js("data")

  # A data URL looks like "data:<mime>;base64,<payload>".
  if not isinstance(data, str) or ',' not in data:
    raise ValueError("No recording was received from the browser.")
  binary = b64decode(data.split(',')[1])

  process = (ffmpeg
    .input('pipe:0')
    .output('pipe:1', format='wav')
    .run_async(pipe_stdin=True, pipe_stdout=True, pipe_stderr=True, quiet=True, overwrite_output=True)
  )
  output, err = process.communicate(input=binary)

  if output[:4] != b"RIFF":
    raise RuntimeError(
        "ffmpeg did not produce WAV data: "
        + (err or b"").decode(errors="replace")
    )

  # Replace bytes 4:8 of the ffmpeg output with the actual size of the RIFF
  # chunk (total length minus the 8-byte chunk header), little-endian.
  # NOTE(review): presumably ffmpeg cannot seek back on a pipe to fill in the
  # final size itself - confirm.
  riff_chunk_size = len(output) - 8
  riff = output[:4] + riff_chunk_size.to_bytes(4, "little") + output[8:]

  sr, audio = wav_read(io.BytesIO(riff))

  return audio, sr

## For the AI to be able to adopt your voice, you first need to record some examples.

### Record this text as if you were telling the funniest joke in the world!


What do you call a fish wearing a bowtie?

Sofishticated.

In [None]:
# @title Record the funny text here and listen to the result.
import scipy

# get_audio() shows the in-browser recorder and returns the decoded samples
# together with their sample rate.
funny_audio, funny_sr = get_audio()

# Last expression of the cell: the notebook displays the sample array.
funny_audio

array([0, 0, 0, ..., 0, 0, 0], dtype=int16)

In [None]:
# @title If you are happy with your recording, you can save it here.
from pathlib import Path

# Import the submodule explicitly so this cell does not depend on another cell
# having loaded scipy.io.wavfile.
import scipy.io.wavfile

# Folder that holds the reference clips of the custom voice "student".
directory_path = "/content/tortoise-tts/tortoise/voices/student"
Path(directory_path).mkdir(parents=True, exist_ok=True)

# Build the file name from directory_path instead of repeating the folder.
scipy.io.wavfile.write(str(Path(directory_path) / "funny.wav"), funny_sr, funny_audio)

### Record this text with a very angry voice.

Hey, that doesn't belong to you!! Give my phone back!

In [None]:
# @title Record the angry text here and listen to the result again.

# `Path` and `scipy` are not used in this cell itself; the following save cell
# relies on them.
from pathlib import Path
import scipy

# get_audio() shows the in-browser recorder and returns the decoded samples
# together with their sample rate.
angry_audio, angry_sr = get_audio()

# Last expression of the cell: the notebook displays the sample array.
angry_audio

array([ 0,  0,  0, ...,  0, -1, -1], dtype=int16)

In [None]:
# @title If you are happy with your recording, you can save it here.

# Import what this cell needs itself so it does not depend on the imports of
# the recording cell.
from pathlib import Path

import scipy.io.wavfile

# Folder that holds the reference clips of the custom voice "student".
directory_path = "/content/tortoise-tts/tortoise/voices/student"
Path(directory_path).mkdir(parents=True, exist_ok=True)

# Build the file name from directory_path instead of repeating the folder.
scipy.io.wavfile.write(str(Path(directory_path) / "angry.wav"), angry_sr, angry_audio)

### Record this text with a very sad voice.

My cat has been missing since last week. I hope she comes back to me :(

In [None]:
# @title Record the sad text here and listen to the result again.

# `Path` and `scipy` are not used in this cell itself; the following save cell
# relies on them.
from pathlib import Path
import scipy

# get_audio() shows the in-browser recorder and returns the decoded samples
# together with their sample rate.
sad_audio, sad_sr = get_audio()

# Last expression of the cell: the notebook displays the sample array.
sad_audio

array([ 0,  0,  0, ...,  0,  0, -1], dtype=int16)

In [None]:
# @title If you are happy with your recording, you can save it here.

# Import what this cell needs itself so it does not depend on the imports of
# the recording cell.
from pathlib import Path

import scipy.io.wavfile

# Folder that holds the reference clips of the custom voice "student".
directory_path = "/content/tortoise-tts/tortoise/voices/student"
Path(directory_path).mkdir(parents=True, exist_ok=True)

# Build the file name from directory_path instead of repeating the folder.
scipy.io.wavfile.write(str(Path(directory_path) / "sad.wav"), sad_sr, sad_audio)

In [None]:
# @title Enter the text your AI-voice-actor should speak and with which emotion it should do so.

# The sentence the generated voice should say.
text = "" #@param {type:"string"}

# The form offers German labels; emotion_map translates the chosen label into
# the English word that goes into the prompt prefix. The empty label maps to
# None, i.e. no prefix.
emotion = "wütend" #@param ["wütend", "traurig", "glücklich", "lustig"]
emotion_map = dict([
    ("", None),
    ("wütend", "angry"),
    ("traurig", "sad"),
    ("glücklich", "happy"),
    ("lustig", "funny"),
])

# Bracketed prefix naming the emotion, or an empty string if none is mapped.
inflection = f"[I am so {emotion_map[emotion]},] " if emotion_map[emotion] else ""

# Prefix followed by the text to speak.
combined_text = inflection + text

# Quality preset. The Tortoise docs (api.py) list {"ultra_fast", "fast" (default), "standard", "high_quality"}.
preset = "standard" #@param ["ultra_fast", "fast", "standard"]

In [None]:
# @title This cell generates the spoken text of the AI-voice-actor.

CUSTOM_VOICE_NAME = "student"

# Load the custom voice by name.
# NOTE(review): presumably this picks up the clips saved to the
# voices/student folder by the cells above - confirm against load_voice.
voice_samples, conditioning_latents = load_voice(CUSTOM_VOICE_NAME)

# Generate speech with the custom voice. Pass combined_text (emotion prefix
# plus text) rather than the bare text; otherwise the emotion selected in the
# previous cell has no effect on the result.
gen = tts.tts_with_preset(
    combined_text,
    voice_samples=voice_samples,
    conditioning_latents=conditioning_latents,
    preset=preset
)

# Write the result to disk and show an audio player for it.
# NOTE(review): 24000 is the sample rate handed to torchaudio.save; presumably
# it matches the rate of Tortoise's output - confirm.
output_path = f'generated-{CUSTOM_VOICE_NAME}.wav'
torchaudio.save(output_path, gen.squeeze(0).cpu(), 24000)
IPython.display.Audio(output_path)

Generating autoregressive samples..


100%|██████████| 16/16 [00:49<00:00,  3.08s/it]


Computing best candidates using CLVP and CVVP


100%|██████████| 16/16 [00:25<00:00,  1.58s/it]


Transforming autoregressive outputs into audio..


100%|██████████| 200/200 [00:22<00:00,  8.74it/s]
