## Setup CorentinJ/Real-Time-Voice-Cloning
- Clone the project and install its dependencies

In [1]:
import os
from os.path import exists, join, basename, splitext

# Repository to clone. The local checkout directory name is derived from the
# URL by taking its last path component and stripping the ".git" extension.
git_repo_url = 'https://github.com/CorentinJ/Real-Time-Voice-Cloning.git'
project_name = splitext(basename(git_repo_url))[0]
# Everything below runs only when the checkout directory does not exist yet,
# so re-running this cell after a successful setup does nothing.
# NOTE(review): because the guard only checks for the directory, a failure in
# any install step after a successful clone is NOT retried on re-run.
if not exists(project_name):
    # Clone the repository quietly, including any submodules.
    !git clone -q --recursive {git_repo_url}
    # Install the project's own requirements from inside the checkout.
    !cd {project_name} && pip install -q -r requirements.txt
    # Upgrade gdown (not used by the cells visible here; presumably kept for
    # downloading checkpoints -- confirm whether it is still needed).
    !pip install -q --upgrade gdown
    # System PortAudio library (presumably required by an audio dependency -- confirm).
    !apt-get install -qq libportaudio2
    # Colab helper package; presumably provides the `dl_colab_notebooks`
    # module imported in a later cell -- confirm.
    !pip install -q https://github.com/tugstugi/dl-colab-notebooks/archive/colab_utils.zip

[K     |████████████████████████████████| 11.3 MB 37.7 MB/s 
[K     |████████████████████████████████| 15.4 MB 57.9 MB/s 
[K     |████████████████████████████████| 3.1 MB 48.6 MB/s 
[K     |████████████████████████████████| 8.3 MB 44.5 MB/s 
[K     |████████████████████████████████| 76 kB 5.3 MB/s 
[K     |████████████████████████████████| 86 kB 5.5 MB/s 
[K     |████████████████████████████████| 235 kB 60.0 MB/s 
[K     |████████████████████████████████| 138 kB 46.7 MB/s 
[K     |████████████████████████████████| 676 kB 63.3 MB/s 
[K     |████████████████████████████████| 66 kB 4.8 MB/s 
[K     |████████████████████████████████| 965 kB 66.3 MB/s 
[K     |████████████████████████████████| 59.9 MB 1.2 MB/s 
[K     |████████████████████████████████| 361 kB 54.4 MB/s 
[K     |████████████████████████████████| 1.1 MB 64.9 MB/s 
[K     |████████████████████████████████| 55 kB 4.3 MB/s 
[K     |████████████████████████████████| 62 kB 1.7 MB/s 
[?25h  Building wheel for umap-

# Mount Google Drive to access the pretrained models and the dataset.

In [2]:
from google.colab import drive

# Mount Google Drive at /content/drive; the pretrained model checkpoints
# loaded later in this notebook are read from under this mount point.
drive.mount("/content/drive")

Mounted at /content/drive


## Import libraries, load the pretrained models from Drive, and initialize the voice cloning models

In [4]:
import sys
sys.path.append(project_name)

from IPython.display import display, Audio, clear_output
from IPython.utils import io
import ipywidgets as widgets
import numpy as np
from dl_colab_notebooks.audio import record_audio, upload_audio

from synthesizer.inference import Synthesizer
from encoder import inference as encoder
from vocoder import inference as vocoder
from pathlib import Path

!ls 
encoder.load_model(project_name / Path("/content/drive/MyDrive/real_time_voice_cloning/pretrained_models/encoder.pt"))
synthesizer = Synthesizer(project_name / Path("/content/drive/MyDrive/real_time_voice_cloning/pretrained_models/synthesizer.pt"))
vocoder.load_model(project_name / Path("/content/drive/MyDrive/real_time_voice_cloning/pretrained_models/vocoder.pt"))

drive  encoder	Real-Time-Voice-Cloning  sample_data  synthesizer  vocoder
Loaded encoder "encoder.pt" trained to step 1564501
Synthesizer using device: cpu
Building Wave-RNN
Trainable Parameters: 4.481M
Loading model weights at /content/drive/MyDrive/real_time_voice_cloning/pretrained_models/vocoder.pt


In [15]:
# Sample rate handed to the upload helper, the audio player, and the
# encoder's preprocessing step in this cell.
SAMPLE_RATE = 22050
# Input mode. Only the exact string "Upload" displays the upload button below;
# any other value triggers the upload prompt immediately when the cell runs.
# The trailing list is a leftover of the original option list.
record_or_upload = "Upload" # ["Record", "Upload (.mp3 or .wav)"]
# Recording duration setting, currently disabled (no recording path is wired up in this cell).
#record_seconds =   10#@param {type:"number", min:1, max:10, step:1}

# Speaker embedding of the most recently processed audio; stays None until
# _compute_embedding succeeds. Re-running this cell resets it to None.
embedding = None
def _compute_embedding(audio):
    """Play back ``audio`` and store its speaker embedding in the global ``embedding``.

    The global is reset to None before the encoder runs, so if preprocessing
    or embedding raises, ``embedding`` is left as None rather than holding a
    value from a previous upload.
    """
    global embedding
    # Let the user hear what was uploaded before it is processed.
    display(Audio(audio, rate=SAMPLE_RATE, autoplay=True))
    embedding = None
    preprocessed_wav = encoder.preprocess_wav(audio, SAMPLE_RATE)
    embedding = encoder.embed_utterance(preprocessed_wav)
def _upload_audio(b):
    """Clear the cell output, prompt for an audio file, and compute its embedding.

    ``b`` is whatever the caller passes (the click handler's argument, or ""
    when invoked directly); it is not used.
    """
    clear_output()
    _compute_embedding(upload_audio(sample_rate=SAMPLE_RATE))
# Any mode other than the exact string "Upload" opens the upload prompt right
# away; "Upload" instead shows a button that opens the prompt when clicked.
if record_or_upload != "Upload":
    _upload_audio("")
else:
    button = widgets.Button(description="Upload Voice File")
    button.on_click(_upload_audio)
    display(button)

Saving audio-00.wav to audio-00 (1).wav
Saving audio-01.wav to audio-01 (1).wav
Saving audio-02.wav to audio-02 (2).wav
Saving audio-03.wav to audio-03 (1).wav
Saving audio-04.wav to audio-04 (1).wav


In [16]:
# Sentence to be spoken in the cloned voice; passed to synthesize() below.
text = "Hi, welcome to Interactly, a no coding interactive video creation platform to create the personalized video experiences."

def synthesize(embed, text):
    """Generate speech for ``text`` using speaker embedding ``embed`` and play it.

    Runs the synthesizer on a single (text, embedding) pair, feeds the first
    resulting spectrogram to the vocoder, appends ``synthesizer.sample_rate``
    zero samples (one second at that rate) to the end of the waveform, then
    clears the cell output and displays an auto-playing audio widget.
    Returns nothing.
    """
    print("Synthesizing new audio...")
    spectrograms = synthesizer.synthesize_spectrograms([text], [embed])
    waveform = vocoder.infer_waveform(spectrograms[0])
    output_rate = synthesizer.sample_rate
    # Trailing zero padding so playback does not end abruptly on the last sample.
    padded_waveform = np.pad(waveform, (0, output_rate), mode="constant")
    clear_output()
    display(Audio(padded_waveform, rate=output_rate, autoplay=True))

# Synthesize only once an embedding has been computed by the upload cell.
if embedding is not None:
    synthesize(embedding, text)
else:
    print("first record a voice or upload a voice file!")