<a href="https://colab.research.google.com/github/papercore-dev/perfectly-jogyo/blob/main/vc-run.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Demo zero-shot Voice conversion with YourTTS

## TTS Model setup

### Download and install Coqui TTS


In [17]:
!git clone https://github.com/Edresson/Coqui-TTS -b multilingual-torchaudio-SE TTS
!pip install -q -e TTS/
!pip install -q torchaudio==0.9.0

fatal: destination path 'TTS' already exists and is not an empty directory.
  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
    Preparing wheel metadata ... [?25l[?25hdone


### Download TTS Checkpoint

In [18]:
# TTS checkpoints

# download config  
!gdown --id 1-PfXD66l1ZpsZmJiC-vhL055CDSugLyP
# download language json 
! gdown --id 1_Vb2_XHqcC0OcvRF82F883MTxfTRmerg
# download speakers json
! gdown --id 1SZ9GE0CBM-xGstiXH2-O2QWdmSXsBKdC -O speakers.json
# download checkpoint
! gdown --id 1sgEjHt0lbPSEw9-FSbC_mBoOPwNi87YR -O best_model.pth.tar

Downloading...
From: https://drive.google.com/uc?id=1-PfXD66l1ZpsZmJiC-vhL055CDSugLyP
To: /content/config.json
100% 12.3k/12.3k [00:00<00:00, 9.43MB/s]
Downloading...
From: https://drive.google.com/uc?id=1_Vb2_XHqcC0OcvRF82F883MTxfTRmerg
To: /content/language_ids.json
100% 47.0/47.0 [00:00<00:00, 210kB/s]
Downloading...
From: https://drive.google.com/uc?id=1SZ9GE0CBM-xGstiXH2-O2QWdmSXsBKdC
To: /content/speakers.json
100% 671k/671k [00:00<00:00, 10.6MB/s]
Downloading...
From: https://drive.google.com/uc?id=1sgEjHt0lbPSEw9-FSbC_mBoOPwNi87YR
To: /content/best_model.pth.tar
100% 380M/380M [00:02<00:00, 155MB/s]


### Imports

In [19]:
import sys
TTS_PATH = "TTS/"

# add libraries into environment
sys.path.append(TTS_PATH) # set this if TTS is not installed globally

import os
import string
import time
import argparse
import json

import numpy as np
import IPython
from IPython.display import Audio


import torch

from TTS.tts.utils.synthesis import synthesis
from TTS.tts.utils.text.symbols import make_symbols, phonemes, symbols
# The original try/except retried the exact same import under a bare `except:`,
# which added nothing and would also have swallowed KeyboardInterrupt/SystemExit.
from TTS.utils.audio import AudioProcessor


from TTS.tts.models import setup_model
from TTS.config import load_config
from TTS.tts.models.vits import *

### Paths definition

In [20]:
# Files fetched by the download cell, referenced by the model-restore cell.
MODEL_PATH = "best_model.pth.tar"
CONFIG_PATH = "config.json"
TTS_LANGUAGES = "language_ids.json"
TTS_SPEAKERS = "speakers.json"

# Run on the GPU whenever torch reports one as available.
USE_CUDA = torch.cuda.is_available()

# Output directory; created up front (no error if it already exists).
OUT_PATH = "out/"
os.makedirs(OUT_PATH, exist_ok=True)

### Restore model

In [21]:
# Restore the TTS model: read the config, build the audio processor and the
# model, load the checkpoint weights (minus the speaker-encoder entries), and
# switch the model to eval mode on the selected device.

# load the config
C = load_config(CONFIG_PATH)


# load the audio processor
ap = AudioProcessor(**C.audio)

# Not read by any cell visible in this notebook.
speaker_embedding = None

# Point the model at the downloaded speakers file before it is constructed.
C.model_args['d_vector_file'] = TTS_SPEAKERS
# Disable the speaker-encoder loss option; presumably this means the model is
# built without a speaker-encoder submodule (see the key filtering below) —
# TODO confirm against the TTS source.
C.model_args['use_speaker_encoder_as_loss'] = False

model = setup_model(C)
model.language_manager.set_language_ids_from_file(TTS_LANGUAGES)
# print(model.language_manager.num_languages, model.embedded_language_dim)
# print(model.emb_l)
# Load the checkpoint onto the CPU first; the model is moved to the GPU below.
# NOTE(review): torch.load unpickles the file — only use trusted checkpoints.
cp = torch.load(MODEL_PATH, map_location=torch.device('cpu'))
# remove speaker encoder: work on a copy of the checkpoint's 'model' mapping and
# drop every entry whose key contains "speaker_encoder" before loading.
model_weights = cp['model'].copy()
# Iterate over a snapshot of the keys because entries are deleted in the loop.
for key in list(model_weights.keys()):
  if "speaker_encoder" in key:
    del model_weights[key]

model.load_state_dict(model_weights)


# Inference mode (e.g. disables dropout).
model.eval()

if USE_CUDA:
    model = model.cuda()

# Flag set here but not read by any cell visible in this notebook.
use_griffin_lim = False

 > Setting up Audio Processor...
 | > sample_rate:16000
 | > resample:False
 | > num_mels:80
 | > min_level_db:-100
 | > frame_shift_ms:None
 | > frame_length_ms:None
 | > ref_level_db:20
 | > fft_size:1024
 | > power:1.5
 | > preemphasis:0.0
 | > griffin_lim_iters:60
 | > signal_norm:False
 | > symmetric_norm:True
 | > mel_fmin:0
 | > mel_fmax:None
 | > spec_gain:1.0
 | > stft_pad_mode:reflect
 | > max_norm:4.0
 | > clip_norm:True
 | > do_trim_silence:True
 | > trim_db:45
 | > do_sound_norm:False
 | > do_amp_to_db_linear:False
 | > do_amp_to_db_mel:True
 | > stats_path:None
 | > base:2.718281828459045
 | > hop_length:256
 | > win_length:1024
 > Using model: vits
 > Speaker manager is loaded with 6 speakers: female-en-5, female-en-5
, female-pt-4
, male-en-2, male-en-2
, male-pt-3



## Speaker encoder setup

### Install helper libraries

In [22]:
! pip install -q pydub ffmpeg-normalize

### Paths definition

In [23]:
CONFIG_SE_PATH = "config_se.json"
CHECKPOINT_SE_PATH = "SE_checkpoint.pth.tar"

# download config 
! gdown --id  19cDrhZZ0PfKf2Zhr_ebB-QASRw844Tn1 -O $CONFIG_SE_PATH
# download checkpoint  
! gdown --id   17JsW6h6TIh7-LkU2EvB_gnNrPcdBxt7X -O $CHECKPOINT_SE_PATH

Downloading...
From: https://drive.google.com/uc?id=19cDrhZZ0PfKf2Zhr_ebB-QASRw844Tn1
To: /content/config_se.json
100% 3.49k/3.49k [00:00<00:00, 2.76MB/s]
Downloading...
From: https://drive.google.com/uc?id=17JsW6h6TIh7-LkU2EvB_gnNrPcdBxt7X
To: /content/SE_checkpoint.pth.tar
100% 44.6M/44.6M [00:00<00:00, 142MB/s] 


### Imports

In [24]:
from TTS.tts.utils.speakers import SpeakerManager
from pydub import AudioSegment
from google.colab import files
import librosa

### Load the Speaker encoder

In [25]:
# Speaker manager backed by the downloaded speaker-encoder checkpoint/config;
# used later to compute speaker embeddings from the uploaded clips.
SE_speaker_manager = SpeakerManager(
    encoder_model_path=CHECKPOINT_SE_PATH,
    encoder_config_path=CONFIG_SE_PATH,
    use_cuda=USE_CUDA,
)

 > Setting up Audio Processor...
 | > sample_rate:16000
 | > resample:False
 | > num_mels:64
 | > min_level_db:-100
 | > frame_shift_ms:None
 | > frame_length_ms:None
 | > ref_level_db:20
 | > fft_size:512
 | > power:1.5
 | > preemphasis:0.97
 | > griffin_lim_iters:60
 | > signal_norm:False
 | > symmetric_norm:False
 | > mel_fmin:0
 | > mel_fmax:8000.0
 | > spec_gain:20.0
 | > stft_pad_mode:reflect
 | > max_norm:4.0
 | > clip_norm:False
 | > do_trim_silence:False
 | > trim_db:60
 | > do_sound_norm:False
 | > do_amp_to_db_linear:True
 | > do_amp_to_db_mel:True
 | > stats_path:None
 | > base:10
 | > hop_length:160
 | > win_length:400


### Define helper function

In [26]:
def compute_spec(ref_file):
  """Return the spectrogram of an audio file as a float tensor.

  The file is loaded with librosa at the TTS audio processor's sample rate
  (``ap.sample_rate``), converted with ``ap.spectrogram`` and wrapped in a
  ``torch.FloatTensor`` with one extra leading dimension (``unsqueeze(0)``).

  Args:
    ref_file: path of the audio file to load.

  Returns:
    torch.FloatTensor holding the spectrogram with a leading dimension of 1.
  """
  # The sample rate returned by librosa is not needed here.
  waveform, _ = librosa.load(ref_file, sr=ap.sample_rate)
  return torch.FloatTensor(ap.spectrogram(waveform)).unsqueeze(0)

## Voice conversion

### Upload, normalize and resample your wav files

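Please upload WAV files in three steps: reference clips of the target speaker, reference clips of the driving (source) speaker, and finally the driving audio file whose voice will be converted. Each file is loudness-normalized and resampled to 16 kHz in place.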

In [11]:
print("Select target speaker reference audios files:")
target_files = files.upload()
target_files = list(target_files.keys())
for sample in target_files:
    !ffmpeg-normalize $sample -nt rms -t=-27 -o $sample -ar 16000 -f

Select target speaker reference audios files:


Saving 1.wav to 1.wav
Saving 2.wav to 2.wav
Saving 3.wav to 3.wav
Saving 4.wav to 4.wav
Saving 5.wav to 5.wav
Saving 7.wav to 7.wav
Saving 8.wav to 8.wav
Saving 9.wav to 9.wav
Saving 10.wav to 10.wav
Saving 11.wav to 11.wav
Saving 15.wav to 15.wav
Saving 16.wav to 16.wav
Saving 18.wav to 18.wav
Saving 21.wav to 21.wav
Saving 22.wav to 22.wav
Saving 23.wav to 23.wav
Saving 24.wav to 24.wav
Saving 26.wav to 26.wav
Saving 29.wav to 29.wav
Saving 31.wav to 31.wav
Saving 32.wav to 32.wav
Saving 34.wav to 34.wav
Saving 36.wav to 36.wav
Saving 37.wav to 37.wav
Saving 40.wav to 40.wav
Saving 41.wav to 41.wav
Saving 42.wav to 42.wav
Saving 46.wav to 46.wav
Saving 47.wav to 47.wav
Saving 48.wav to 48.wav
Saving 50.wav to 50.wav
Saving 51.wav to 51.wav
Saving 52.wav to 52.wav
Saving 54.wav to 54.wav
Saving 56.wav to 56.wav
Saving 57.wav to 57.wav
Saving 58.wav to 58.wav
Saving 59.wav to 59.wav
Saving 60.wav to 60.wav
Saving 61.wav to 61.wav
Saving 62.wav to 62.wav
Saving 63.wav to 63.wav
Saving 6

In [12]:
print("Select driving speaker reference audios files:")
driving_files = files.upload()
driving_files = list(driving_files.keys())
for sample in driving_files:
    !ffmpeg-normalize $sample -nt rms -t=-27 -o $sample -ar 16000 -f

Select driving speaker reference audios files:


Saving OPEN-0001.wav to OPEN-0001.wav
Saving OPEN-0002.wav to OPEN-0002.wav
Saving OPEN-0003.wav to OPEN-0003.wav
Saving OPEN-0004.wav to OPEN-0004.wav
Saving OPEN-0005.wav to OPEN-0005.wav
Saving OPEN-0006.wav to OPEN-0006.wav
Saving OPEN-0007.wav to OPEN-0007.wav
Saving OPEN-0008.wav to OPEN-0008.wav
Saving OPEN-0009.wav to OPEN-0009.wav
Saving OPEN-0010.wav to OPEN-0010.wav
Saving OPEN-0011.wav to OPEN-0011.wav
Saving OPEN-0012.wav to OPEN-0012.wav
Saving OPEN-0013.wav to OPEN-0013.wav
Saving OPEN-0014.wav to OPEN-0014.wav
Saving OPEN-0015.wav to OPEN-0015.wav
Saving OPEN-0016.wav to OPEN-0016.wav
Saving OPEN-0017.wav to OPEN-0017.wav
Saving OPEN-0018.wav to OPEN-0018.wav
Saving OPEN-0019.wav to OPEN-0019.wav
Saving OPEN-0020.wav to OPEN-0020.wav
Saving OPEN-0021.wav to OPEN-0021.wav
Saving OPEN-0022.wav to OPEN-0022.wav
Saving OPEN-0023.wav to OPEN-0023.wav
Saving OPEN-0024.wav to OPEN-0024.wav
Saving OPEN-0025.wav to OPEN-0025.wav
Saving OPEN-0026.wav to OPEN-0026.wav
Saving OPEN-

In [34]:
print("Select driving audio file:")
driving_file = files.upload()
driving_file = list(driving_file.keys())
for sample in driving_file:
    !ffmpeg-normalize $sample -nt rms -t=-27 -o $sample -ar 16000 -f
driving_file = driving_file[0]

Select driving audio file:


Saving happy_dance.wav to happy_dance.wav


### Compute embeddings

In [14]:
# Embedding computed by the speaker encoder from the target reference clips,
# wrapped as a float tensor with a leading dimension added by unsqueeze(0).
target_emb = torch.FloatTensor(
    SE_speaker_manager.compute_d_vector_from_clip(target_files)
).unsqueeze(0)

In [15]:
# Embedding computed by the speaker encoder from the driving reference clips,
# wrapped as a float tensor with a leading dimension added by unsqueeze(0).
driving_emb = torch.FloatTensor(
    SE_speaker_manager.compute_d_vector_from_clip(driving_files)
).unsqueeze(0)

### Convert the voice

In [35]:
# Spectrogram of the driving utterance and its size along the last axis,
# passed to voice_conversion as the length argument.
driving_spec = compute_spec(driving_file)
y_lengths = torch.tensor([driving_spec.size(-1)])

# Use the same device the model was moved to in the restore cell, so the call
# below is written once instead of being duplicated per device.
device = torch.device("cuda" if USE_CUDA else "cpu")

# Inference only: no_grad avoids building an autograd graph for the forward pass.
with torch.no_grad():
    ref_wav_voc, _, _ = model.voice_conversion(
        driving_spec.to(device),
        y_lengths.to(device),
        driving_emb.to(device),
        target_emb.to(device),
    )

# Drop singleton dimensions and bring the result back to the CPU as a NumPy
# array for playback (.cpu() is a no-op when already on the CPU).
ref_wav_voc = ref_wav_voc.squeeze().detach().cpu().numpy()

print("Reference Audio after decoder:")
IPython.display.display(Audio(ref_wav_voc, rate=ap.sample_rate))

Reference Audio after decoder:
