<a href="https://colab.research.google.com/github/olaviinha/NeuralTextToAudio/blob/main/TTS_voice_cloning_pub.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

#<font face="Trebuchet MS" size="6">Voice cloning TTS<font color="#999" size="4">&nbsp;&nbsp;&nbsp;|&nbsp;&nbsp;&nbsp;</font><font color="#999" size="4">Text-to-audio</font><font color="#999" size="4">&nbsp;&nbsp;&nbsp;|&nbsp;&nbsp;&nbsp;<a href="https://github.com/olaviinha/NeuralTextToAudio" target="_blank"><font color="#999" size="4">Github</font></a></font>

Text-to-speech tool that takes a text input and an audio file of a voice, and produces a new audio file in which the input text is spoken with the voice from the provided audio. Do not expect realistic state-of-the-art outputs. With a good speaker voice some results may be relatively good, but most will be poor.

#### Tips:
- `local_models_dir` is optional but recommended. It will store models in your Google Drive and/or use them from there if already available.
- All directory paths should be relative to your Google Drive root, e.g. `output_dir` should be `music/ai-generated-sounds` if you have a directory called _music_ in your Drive, containing a subdirectory called _ai-generated-sounds_.

In [None]:
#@title #Setup
#@markdown This cell needs to be run only once. It will mount your Google Drive and setup prerequisites.
#@markdown <br><small>Mounting Google Drive is required by this notebook.</small>
# Developer switch, not exposed in the form.
# NOTE(review): the guards in this block only run when force_setup is False,
# whereas the repository clone further down runs when force_setup is True --
# confirm which meaning is intended before flipping it.
force_setup = False
# Extra packages appended to the `pip install import-ipynb` command below.
pip_packages = ''
# Optional models directory, relative to the Drive root (empty = use /content/tmp/).
local_models_dir = "" #@param{type:"string"}
# main_repository = 'https://github.com/CorentinJ/Real-Time-Voice-Cloning.git'
 
import os
from google.colab import output
import warnings
# Suppress all Python warnings for the rest of the session.
warnings.filterwarnings('ignore')
%cd /content/
 
# inhagcutils
# Download the helper notebook (and the import-ipynb package needed to import
# it) only when it is not already present in /content.
if not os.path.isfile('/content/inhagcutils.ipynb') and force_setup == False:
  !pip -q install import-ipynb {pip_packages}
  !curl -s -O https://raw.githubusercontent.com/olaviinha/inhagcutils/master/inhagcutils.ipynb
import import_ipynb
# Star-import: helpers used below that are not defined in this notebook
# (create_dirs, fix_path, op, c, ...) presumably come from here -- verify.
from inhagcutils import *
 
# Mount Drive
if not os.path.isdir('/content/drive') and force_setup == False:
  from google.colab import drive
  drive.mount('/content/drive')
 
# Drive symlink
# Gives the Drive root a path without a space in it.
if not os.path.isdir('/content/mydrive') and force_setup == False:
  os.symlink('/content/drive/My Drive', '/content/mydrive')
  # Only set when the symlink is created in this run; not read anywhere in the
  # visible part of this notebook.
  drive_root_set = True
drive_root = '/content/mydrive/'
 
# Scratch directories for intermediate audio files.
dir_tmp = '/content/tmp/'
dir_source = '/content/tmp/source/'
dir_chops = '/content/tmp/chops/'
dir_clips = '/content/tmp/clips/'
create_dirs([dir_tmp, dir_source, dir_chops, dir_clips])
 
#----

import os
from os.path import exists, join, basename, splitext
 
git_repo_url = 'https://github.com/CorentinJ/Real-Time-Voice-Cloning.git'
project_name = splitext(basename(git_repo_url))[0]
if not exists(project_name) or force_setup == True:
  %cd /content/
  # clone and install
  !git clone -q --recursive {git_repo_url}
  # install dependencies
  !cd {project_name} && pip install -q -r requirements.txt
  !pip install -q gdown
  !apt-get install -qq libportaudio2
  !apt install sox
 
  # download pretrained model
if local_models_dir == "":
  dir_models = dir_tmp
  # !cd {project_name} && wget https://github.com/blue-fish/Real-Time-Voice-Cloning/releases/download/v1.0/pretrained.zip && unzip -o pretrained.zip
else:
  dir_models = drive_root+fix_path(local_models_dir)
  if not os.path.isdir(dir_models):
    os.mkdir(dir_models)
 
%cd /content/
%cd {project_name}

import sys
sys.path.append(project_name)
 
# from IPython.display import display, Audio, clear_output
from IPython.display import Audio 
from IPython.core.display import display

from IPython.utils import io
import ipywidgets as widgets
import numpy as np
 
from synthesizer.inference import Synthesizer
from encoder import inference as encoder
from vocoder import inference as vocoder
from pathlib import Path
import soundfile as sf

if not os.path.isfile(dir_models+"encoder.pt"):
  %cd "{dir_models}"
  !wget https://drive.google.com/uc?export=download&id=1q8mEGwCkFy23KZsinbuvdKAQLqNKbYf1
if not os.path.isfile(dir_models+"synthesizer.pt"):
  %cd "{dir_models}"
  !wget https://drive.google.com/uc?export=download&id=1EqFMIbvxffxtjiVrtykroF6_mUh-5Z3s
if not os.path.isfile(dir_models+"vocoder.pt"):
  %cd "{dir_models}"
  !wget https://drive.google.com/uc?export=download&id=1cf2NO6FtI0jDuy8AV3Xgn6leO6dHjIgu

%cd /content/
%cd {project_name}

encoder.load_model(project_name / Path(dir_models+"encoder.pt"))
synthesizer = Synthesizer(project_name / Path(dir_models+"synthesizer.pt"))
vocoder.load_model(project_name / Path(dir_models+"vocoder.pt"))
 
fade_ms = 3
global_fade = fade_ms/1000

#----
 
def _compute_embedding(audio):
  """Compute the speaker embedding of `audio` into the global `embedding`.

  The audio is preprocessed by the encoder using the global `sample_rate`
  and then embedded. Nothing is returned; the result is published through
  the module-level `embedding` variable.
  """
  global embedding, sample_rate
  # Reset first, so a failure in the encoder leaves `embedding` as None.
  embedding = None
  preprocessed = encoder.preprocess_wav(audio, sample_rate)
  embedding = encoder.embed_utterance(preprocessed)
 
def synthesize(embed, text, save_as='', show_player=False):
  """Synthesize `text` in the voice described by the embedding `embed`.

  Args:
    embed: speaker embedding passed to the synthesizer.
    text: the text to speak.
    save_as: output file path; nothing is written when it is ''.
    show_player: when true, the generated audio is handed to audio_player.
  """
  spectrogram = synthesizer.synthesize_spectrograms([text], [embed])[0]
  waveform = vocoder.infer_waveform(spectrogram)
  out_rate = synthesizer.sample_rate
  # Append `out_rate` zero samples (one second of silence) to the end.
  waveform = np.pad(waveform, (0, out_rate), mode="constant")
  wants_file = save_as != ''
  if wants_file:
    sf.write(save_as, waveform, out_rate, 'PCM_24')
  if show_player:
    audio_player(waveform)
 
#----

def normalize(audio):
  """Rescale `audio` linearly so its minimum maps to -1 and its maximum to 1.

  Args:
    audio: array-like of samples.

  Returns:
    A float array of the same shape. Empty input is returned empty, and a
    constant signal (e.g. pure silence) is returned as all zeros, because a
    zero-width value range has no meaningful scaling.
  """
  audio = np.asarray(audio)
  if audio.size == 0:
    # min()/max() raise on empty arrays; there is nothing to scale anyway.
    return audio.astype(float)
  lowest, highest = audio.min(), audio.max()
  if lowest == highest:
    # Interpolating over a zero-width range would pin every sample to +1.
    return np.zeros(audio.shape, dtype=float)
  return np.interp(audio, (lowest, highest), (-1, 1))
  
def clip_audio(audio_data, start, duration, sr=44100):
  """Cut `duration` seconds out of `audio_data` beginning at `start` seconds.

  `audio_data` is indexed as [channel, sample]. The excerpt is passed through
  fade_audio (with its default fade lengths) before being returned.
  """
  print('clipping to', start, duration)
  first_sample = librosa.time_to_samples(start, sr=sr)
  end_sample = librosa.time_to_samples(start+duration, sr=sr)
  excerpt = audio_data[:, first_sample:end_sample]
  return fade_audio(excerpt, sr=sr)

def fade_audio(audio_data, fade_in=global_fade, fade_out=global_fade, sr=44100):
  """Apply a linear fade-in and fade-out to `audio_data`.

  Args:
    audio_data: array of samples with time on the last axis, e.g.
      (channels, samples).
    fade_in: fade-in length in seconds; 0 or less disables it. The default is
      the value `global_fade` had when this function was defined.
    fade_out: fade-out length in seconds; 0 or less disables it.
    sr: sample rate used to convert the fade lengths to sample counts.

  Returns:
    A new float array with the same shape as the input; the input itself is
    not modified.
  """
  # astype copies, so the caller's array is left untouched.
  faded = np.asarray(audio_data).astype(float)
  total_samples = faded.shape[-1]

  if fade_in > 0:
    # A fade can never be longer than the audio itself.
    in_samples = min(int(fade_in*sr), total_samples)
    if in_samples > 0:
      # Gain ramps 0 -> (n-1)/n; scaling in place keeps every sample, where
      # slicing head/tail separately used to lose one at the boundary.
      faded[..., :in_samples] *= np.arange(in_samples)/in_samples

  if fade_out > 0:
    out_samples = min(int(fade_out*sr), total_samples)
    if out_samples > 0:
      # Gain ramps 1 -> 1/n over the final samples.
      faded[..., total_samples-out_samples:] *= 1 - np.arange(out_samples)/out_samples

  return faded

#---- 

 
# Wipe the output printed by the setup steps above and report completion.
output.clear()
# !nvidia-smi
op(c.ok, 'Setup finished.')

In [None]:
#@title # Synthesize
text = "" #@param {type:"string"}
speaker_voice_wav = "" #@param {type:"string"}
normalize_speaker_volume = True #@param {type:"boolean"}
output_dir = "" #@param {type:"string"}

sample_rate = 44100 #@param {type:"slider", min:22050, max:44100, step:22050}

#@markdown <small>Processing a long text in chunks will produce much better results.</small>
chunk_at = "auto" #@param ["never", "auto", "chunk_division"]
chunk_division = 2 #@param {type:"slider", min:0, max:10, step:1}

#@markdown <small>Save an accompanying txt file with the audio file, containing your `text` input.</small>
save_txt = False #@param {type:"boolean"}

#@markdown <small>Guarantees even more eerie output.</small>
eerie = False #@param {type:"boolean"}





# "auto" is an alias for splitting at both full stops and commas. The 'dots'
# mode is handled by the code but is not offered in the form above.
if chunk_at == 'auto': chunk_at = 'commas_and_dots'
if chunk_at == 'commas_and_dots' or chunk_at == 'dots':
  chunk_division = 0

 
# Maximum length of the speaker audio in seconds (compared against the audio
# duration further down); presumably large enough to never trigger -- confirm.
clip_audio_to = 99999999

# Start every run from empty scratch directories and a fresh run identifier.
clean_dirs([dir_source, dir_chops, dir_clips])
uniq_id = gen_id()

 
SAMPLE_RATE = sample_rate
# Filled in by _compute_embedding(); None means no embedding is available.
embedding = None

# Both paths are given relative to the Drive root.
audio_input = drive_root+speaker_voice_wav
output_dir = fix_path(drive_root+output_dir)

# A directory as speaker input: concatenate its audio files into one wav with
# sox and use that file instead.
# NOTE(review): list_audio, concat_list and sox_q are not defined in this
# notebook; presumably they come from inhagcutils -- verify.
if os.path.isdir(audio_input):
  concat_file = dir_tmp+uniq_id+'_concat.wav'
  wavs = list_audio(audio_input)
  wavs_str = concat_list('-v 1', wavs)
  !sox {sox_q} {wavs_str} "{concat_file}"
  audio_input = concat_file

# 0 words per chunk stands for "no word-count limit".
if chunk_division == 0:
  chunk_division = 999999

# Optionally rescale the speaker audio to the [-1, 1] range and continue with
# the rescaled copy written to the temp directory.
if normalize_speaker_volume == True:
  normalized_file = dir_tmp+uniq_id+'_normalized.wav'
  y, sr = librosa.load(audio_input, sr=sample_rate, mono=True)
  y = normalize(y)
  sf.write(normalized_file, y.T, sr)
  audio_input = normalized_file

#--
 


# In eerie mode the speaker audio is loaded at 44100 Hz while the encoder is
# told it is 22050 Hz (via the global sample_rate); the mismatch is deliberate.
if eerie == True:
  sample_rate = 22050
  librosa_sr = 44100
  chunk_division = 4
else:
  librosa_sr = sample_rate

global_sr = sample_rate

# File-name stem: the purely alphanumeric words among the first ten, joined
# without separators and lower-cased.
title = ''.join(e for e in text.split(' ')[:10] if e.isalnum()).lower()

op(c.title, 'Run ID:', uniq_id, time=True)
print()

# Load the speaker audio and cap its length at clip_audio_to seconds.
audio, sr = librosa.load(audio_input, sr=librosa_sr)
# `y` is passed by keyword: newer librosa releases reject it positionally.
duration = librosa.get_duration(y=audio, sr=sr)
if duration > clip_audio_to:
  last_y = librosa_sr * clip_audio_to
  # Samples are on the last axis for mono (n,) and for multi-channel
  # (channels, n) audio alike, so slice that axis only.
  audio = audio[..., :last_y]

_compute_embedding(audio)

words = text.split(' ')
def divide_chunks(l, n):
  """Yield consecutive slices of `l`, each holding at most `n` items."""
  slice_starts = range(0, len(l), n)
  yield from (l[start:start + n] for start in slice_starts)

if chunk_at == 'chunk_division':
  chunks = [' '.join(e) for e in list(divide_chunks(words, chunk_division))]
elif chunk_at == 'dots' or chunk_at == 'commas_and_dots':
  chunks = text.split('.')
  if chunk_at is 'commas_and_dots':
    new_chunks = []
    for s in chunks:
      if ',' in s:
        sides = s.split(',')
        new_chunks.extend(sides)
      else:
        new_chunks.append(s)
    chunks = new_chunks
else:
  chunks = [text]
total_chunks = len(chunks)
 
if embedding is None:
  print("Error.")
else:
  save_as = output_dir+uniq_id+'_'+title+'.wav' #path_leaf(audio_file)
  if save_txt == True:
    txtfile = save_as.replace('.wav', '.txt')
    append_txt(txtfile, text)
  op(c.title, 'Syntheisizing:', str(total_chunks)+' chunks', time=True)
  print()

  op(c.title, 'Voice source')
  print()
  audio_player(audio, sr=sample_rate)
  print()

  if len(chunks) is 1:
    synthesize(embedding, chunks[0], save_as)
    op(c.ok, 'Synthesized audio saved as', save_as.replace(drive_root, ''), time=True)
    print()
    audio_player(save_as)
    # display(Audio(save_as))
  else:
    for i, txt in enumerate(chunks):
      tit = str(i)+'/'+str(len(chunks))
      print('____________________________________________________________________')
      print()
      op(c.title, tit+' Sythesizing:', txt, time=True)
      save_chop = dir_chops+str(i).zfill(4)+'.wav'
      synthesize(embedding, txt, save_chop)
      
      chops = list_audio(dir_chops)
      for chop in chops:
        chop_data, clip_sr = librosa.load(chop, sr=sample_rate)
        chop_data = np.array([chop_data, chop_data])
        duration = librosa.get_duration(chop_data, sr=sample_rate)
        shortened_duration = duration-1.1
        # clipped = clip_audio(chop_data, 0.01, shortened_duration)
        clipped, idx = librosa.effects.trim(chop_data)
        save_clip = dir_clips + path_leaf(chop)
        sf.write(save_clip, clipped.T, sample_rate)

    clips = list_audio(dir_clips)
    clip_list = concat_list('-v 1', clips)
    !sox {sox_q} {clip_list} "{save_as}"

    output.clear()
    op(c.title, 'Run ID:', uniq_id)
    print()
    op(c.title, 'Voice source')
    print()
    audio_player(audio, sr=sample_rate)
    print()

    if os.path.isfile(save_as):
      op(c.title, 'Synthesized result')
      print()
      audio_player(save_as, sr=sample_rate)
      print()
      op(c.ok, 'Synthesized audio saved as', save_as.replace(drive_root, ''), time=True)
    else:
      op(c.fail, 'Error saving', save_as.replace(drive_root, ''), time=True)
    # display(Audio(save_as))
    
    print('\n\n')

 
 
 
 
 
# for i, filelist in enumerate(filelists):
#   filelist_len = len(filelist)
#   filelist = concat_list('-v '+str(vol), filelist)
#   if number_files == True:
#     output_file_f = path_dir(output_file)+basename(output_file)+'-'+str(i+1)+'.wav'
#   !sox {filelist} "{output_file_f}"
#   print(filelist_len, 'files concatenated to', output_file_f)
# print('Done.')
# if preview is True:
#   Audio(output_file)