# Remember to use GPU runtime (and restart)
Runtime > Change Runtime Type > GPU

# Clone repo and install requirements

In [1]:
%%bash
# Clone only when the checkout is missing, so re-running this cell does not hit
# git's "destination path 'autovc' already exists" error.
[ -d autovc ] || git clone https://github.com/nicolalandro/autovc.git
cd autovc
# Pinned to the release this notebook was recorded with, for reproducible installs.
pip install wavenet_vocoder==0.1.1

Collecting wavenet_vocoder
  Downloading wavenet_vocoder-0.1.1.tar.gz (13 kB)
Building wheels for collected packages: wavenet-vocoder
  Building wheel for wavenet-vocoder (setup.py): started
  Building wheel for wavenet-vocoder (setup.py): finished with status 'done'
  Created wheel for wavenet-vocoder: filename=wavenet_vocoder-0.1.1-py3-none-any.whl size=12679 sha256=51f734c9655aa9a3f48721d69cb0294edc41dedde3dbb3b3db46863b086b4df7
  Stored in directory: /root/.cache/pip/wheels/45/b9/b3/5961fda4d2ba5bc9a8d416844b30d590f597674a690162766f
Successfully built wavenet-vocoder
Installing collected packages: wavenet-vocoder
Successfully installed wavenet-vocoder-0.1.1


Cloning into 'autovc'...


In [2]:
# Work from inside the clone: later cells import model_bl / model_vc / synthesis
# and read "wavs/..." relative to this directory.
%cd autovc

/content/autovc


# Import requirements

In [3]:
import os
import pickle
import torch
import numpy as np
from numpy.random import RandomState
from math import ceil
from collections import OrderedDict

from scipy import signal
from scipy.signal import get_window
import librosa
from librosa.filters import mel
import soundfile as sf
from IPython.display import Audio

import matplotlib.pyplot as plt 

from model_bl import D_VECTOR
from model_vc import Generator
from synthesis import build_model
from synthesis import wavegen

# Read Audio

In [4]:
# path1 is the utterance whose content gets converted; path2 supplies the
# target speaker's voice.
path1 = "wavs/p225/p225_003.wav"
path2 = "wavs/p226/p226_005.wav"

In [5]:
# Audio player for the first input recording.
Audio(path1)

In [6]:
# Audio player for the second input recording.
Audio(path2)

# Compute spectrogram
Generate the spectrogram with a Short-Time Fourier Transform (STFT)

In [7]:
def butter_highpass(cutoff, fs, order=5):
    """Design a digital Butterworth high-pass filter.

    Args:
        cutoff: Cut-off frequency, in the same units as ``fs``.
        fs: Sampling frequency of the signal that will be filtered.
        order: Filter order (defaults to 5).

    Returns:
        The ``(b, a)`` numerator / denominator coefficients returned by
        ``scipy.signal.butter``.
    """
    # scipy takes the critical frequency as a fraction of the Nyquist rate.
    nyquist = 0.5 * fs
    return signal.butter(order, cutoff / nyquist, btype='high', analog=False)
    
    
def pySTFT(x, fft_length=1024, hop_length=256):
    """Magnitude short-time Fourier transform of a signal.

    The signal is reflect-padded by half a window on each side, cut into
    overlapping frames of ``fft_length`` samples taken every ``hop_length``
    samples, multiplied by a periodic Hann window and passed through a real
    FFT.

    Args:
        x: Input samples (the framing runs along the last axis).
        fft_length: Frame / FFT size in samples.
        hop_length: Step between consecutive frames in samples.

    Returns:
        Array of FFT magnitudes, transposed so that for a 1-D input the
        result is indexed as (frequency bin, frame).
    """
    half_window = int(fft_length // 2)
    padded = np.pad(x, half_window, mode='reflect')

    # Build a zero-copy view of the overlapping frames: stepping one frame
    # forward advances hop_length samples in the underlying buffer.
    overlap = fft_length - hop_length
    n_frames = (padded.shape[-1] - overlap) // hop_length
    sample_stride = padded.strides[-1]
    frames = np.lib.stride_tricks.as_strided(
        padded,
        shape=padded.shape[:-1] + (n_frames, fft_length),
        strides=padded.strides[:-1] + (hop_length * sample_stride, sample_stride),
    )

    window = get_window('hann', fft_length, fftbins=True)
    spectrum = np.fft.rfft(window * frames, n=fft_length).T
    return np.abs(spectrum)

In [8]:
# Mel filterbank (80 bands between 90 Hz and 7600 Hz) for 16 kHz audio and a
# 1024-point FFT, transposed so a (frames, fft_bins) magnitude matrix can be
# right-multiplied by it.
# `sr` and `n_fft` are passed by keyword: librosa >= 0.10 makes them
# keyword-only, and older releases accept the keywords as well.
mel_basis = mel(sr=16000, n_fft=1024, fmin=90, fmax=7600, n_mels=80).T
# Amplitude floor equal to 10 ** (-100 / 20), i.e. -100 dB.
min_level = np.exp(-100 / 20 * np.log(10))
# 5th-order high-pass filter at 30 Hz for 16 kHz audio.
b, a = butter_highpass(30, 16000, order=5)

In [9]:
def prepare_spectrogram(path):
    """Turn a wav file into a normalised mel-spectrogram.

    Args:
        path: '/'-separated path whose parent directory name is one
            character followed by an integer (e.g. ``wavs/p225/x.wav``);
            that integer seeds the dither noise.

    Returns:
        ``float32`` array of mel frames with values clipped to ``[0, 1]``
        (shape ``(frames, 80)`` in the recorded run).
    """
    # The sample rate reported by the file is read but not used.
    audio, _sample_rate = sf.read(path)
    filtered = signal.filtfilt(b, a, audio)

    # Seed the noise generator from the speaker directory name ("p225" -> 225)
    # so the tiny dither added below is the same on every run for a given
    # speaker.
    speaker_seed = int(path.split('/')[-2][1:])
    prng = RandomState(speaker_seed)
    dither = (prng.rand(filtered.shape[0]) - 0.5) * 1e-06
    wav = filtered * 0.96 + dither

    magnitudes = pySTFT(wav).T
    mel_magnitudes = np.dot(magnitudes, mel_basis)
    # Convert to dB with a floor at min_level, then map [-100, 0] dB to [0, 1].
    mel_db = 20 * np.log10(np.maximum(min_level, mel_magnitudes)) - 16
    normalised = np.clip((mel_db + 100) / 100, 0, 1)
    return normalised.astype(np.float32)

In [10]:
# Mel-spectrograms of both recordings; each printed shape is (frames, 80).
s1 = prepare_spectrogram(path1)
print(s1.shape)
s2 = prepare_spectrogram(path2)
print(s2.shape)

(376, 80)
(407, 80)


In [11]:
# TODO: plot these spectrograms (matplotlib is already imported as plt).

# Speaker Encoder
Use model_bl to encode the spectrograms

In [12]:
!wget https://github.com/nicolalandro/autovc/releases/download/0.1/3000000-BL.ckpt

--2021-08-03 09:28:49--  https://github.com/nicolalandro/autovc/releases/download/0.1/3000000-BL.ckpt
Resolving github.com (github.com)... 52.69.186.44
Connecting to github.com (github.com)|52.69.186.44|:443... connected.
HTTP request sent, awaiting response... 302 Found
Location: https://github-releases.githubusercontent.com/392250635/29880038-d71a-4f2e-986e-0a1e976793f6?X-Amz-Algorithm=AWS4-HMAC-SHA256&X-Amz-Credential=AKIAIWNJYAX4CSVEH53A%2F20210803%2Fus-east-1%2Fs3%2Faws4_request&X-Amz-Date=20210803T092850Z&X-Amz-Expires=300&X-Amz-Signature=55eef1d877f16866672bd61b2ca1850c62187f91a7e7147784225b800ca85282&X-Amz-SignedHeaders=host&actor_id=0&key_id=0&repo_id=392250635&response-content-disposition=attachment%3B%20filename%3D3000000-BL.ckpt&response-content-type=application%2Foctet-stream [following]
--2021-08-03 09:28:50--  https://github-releases.githubusercontent.com/392250635/29880038-d71a-4f2e-986e-0a1e976793f6?X-Amz-Algorithm=AWS4-HMAC-SHA256&X-Amz-Credential=AKIAIWNJYAX4CSVEH53A

In [13]:
# Speaker encoder in eval mode on the GPU.
C = D_VECTOR(dim_input=80, dim_cell=768, dim_emb=256).eval().cuda()
c_checkpoint = torch.load('3000000-BL.ckpt')

# Drop the first seven characters of every parameter name before loading
# (presumably a "module." prefix left by a DataParallel wrapper - TODO confirm).
new_state_dict = OrderedDict(
    (key[7:], val) for key, val in c_checkpoint['model_b'].items()
)
C.load_state_dict(new_state_dict)

num_uttrs = 10
# Number of spectrogram frames fed to the speaker encoder per crop.
len_crop = 128

In [14]:
def process_speacker(tmp):
    """Embed one randomly positioned ``len_crop``-frame window of a spectrogram.

    Args:
        tmp: 2-D spectrogram array indexed as (frames, mel bins). It must
            contain at least ``len_crop`` frames.

    Returns:
        NumPy array holding the squeezed output of the speaker encoder ``C``
        (shape ``(256,)`` in the recorded run).

    Raises:
        ValueError: If ``tmp`` has fewer than ``len_crop`` frames.
    """
    n_frames = tmp.shape[0]
    if n_frames < len_crop:
        raise ValueError(
            f"spectrogram has {n_frames} frames, need at least {len_crop}"
        )
    # randint's upper bound is exclusive: the +1 makes the last valid start
    # position reachable and lets an utterance of exactly len_crop frames
    # through (the original bound raised "low >= high" in that case).
    left = np.random.randint(0, n_frames - len_crop + 1)
    melsp = torch.from_numpy(tmp[np.newaxis, left:left+len_crop, :]).cuda()
    # Inference only: skip building the autograd graph.
    with torch.no_grad():
        emb = C(melsp)
    return emb.detach().squeeze().cpu().numpy()

In [15]:
# One speaker embedding per recording, each taken from a random crop of its
# spectrogram.
emb1 = process_speacker(s1)
print(emb1.shape)
emb2 = process_speacker(s2)
print(emb2.shape)

(256,)
(256,)


# AutoVC
It uses model_vc to decode from both inputs: the input utterance and the style (target speaker)

In [16]:
!wget https://github.com/nicolalandro/autovc/releases/download/0.1/autovc.ckpt

--2021-08-03 09:29:08--  https://github.com/nicolalandro/autovc/releases/download/0.1/autovc.ckpt
Resolving github.com (github.com)... 13.114.40.48
Connecting to github.com (github.com)|13.114.40.48|:443... connected.
HTTP request sent, awaiting response... 302 Found
Location: https://github-releases.githubusercontent.com/392250635/e92e4b8d-f850-4e55-93bf-243b39fbf1f0?X-Amz-Algorithm=AWS4-HMAC-SHA256&X-Amz-Credential=AKIAIWNJYAX4CSVEH53A%2F20210803%2Fus-east-1%2Fs3%2Faws4_request&X-Amz-Date=20210803T092908Z&X-Amz-Expires=300&X-Amz-Signature=69897e279747afc7d9aec68c03e29d1af07e7cb49aee258bdc916a6d5c75d26b&X-Amz-SignedHeaders=host&actor_id=0&key_id=0&repo_id=392250635&response-content-disposition=attachment%3B%20filename%3Dautovc.ckpt&response-content-type=application%2Foctet-stream [following]
--2021-08-03 09:29:08--  https://github-releases.githubusercontent.com/392250635/e92e4b8d-f850-4e55-93bf-243b39fbf1f0?X-Amz-Algorithm=AWS4-HMAC-SHA256&X-Amz-Credential=AKIAIWNJYAX4CSVEH53A%2F20210

In [17]:
def pad_seq(x, base=32):
    """Zero-pad a 2-D array along its first axis up to a multiple of ``base``.

    Args:
        x: 2-D array; rows are appended at the end of axis 0.
        base: The padded length is the smallest multiple of this value that
            is not below ``x.shape[0]``.

    Returns:
        Tuple ``(padded, len_pad)`` with the padded array and the number of
        rows that were appended.
    """
    n_rows = x.shape[0]
    target_rows = int(base * ceil(float(n_rows) / base))
    len_pad = target_rows - n_rows
    assert len_pad >= 0
    padded = np.pad(x, ((0, len_pad), (0, 0)), 'constant')
    return padded, len_pad

In [18]:
# Device used for the generator and read as a global by prepare_input.
device = 'cuda:0'
# NOTE(review): the four positional hyper-parameters are defined by
# model_vc.Generator - confirm their meaning there.
G = Generator(32,256,512,32).eval().to(device)

# Load the pretrained weights directly onto `device`; the cell output is the
# result of load_state_dict.
g_checkpoint = torch.load('autovc.ckpt', map_location=device)
G.load_state_dict(g_checkpoint['model'])

<All keys matched successfully>

In [19]:
def prepare_input(s1, emb1, emb2):
    """Convert a spectrogram from one speaker embedding to another with ``G``.

    Args:
        s1: 2-D spectrogram of the utterance to convert, indexed as
            (frames, mel bins).
        emb1: 1-D embedding passed to ``G`` as the original speaker.
        emb2: 1-D embedding passed to ``G`` as the target speaker.

    Returns:
        NumPy array with the second output of ``G`` for the single batch
        item, with the frames added by ``pad_seq`` removed again.
    """
    padded_spec, len_pad = pad_seq(s1)

    # Add a leading batch axis to every input and move it to the model device.
    source_batch = torch.from_numpy(padded_spec[np.newaxis, :, :]).to(device)
    source_emb = torch.from_numpy(emb1[np.newaxis, :]).to(device)
    target_emb = torch.from_numpy(emb2[np.newaxis, :]).to(device)

    with torch.no_grad():
        _, converted, _ = G(source_batch, source_emb, target_emb)

    # A stop of None keeps every frame when nothing was padded; otherwise the
    # negative stop drops the trailing padding frames.
    stop = None if len_pad == 0 else -len_pad
    return converted[0, 0, :stop, :].cpu().numpy()

In [20]:
# Convert the first recording's spectrogram (original embedding emb1) towards
# the second speaker's embedding.
spect_vc1 = prepare_input(s1, emb1, emb2)
print(spect_vc1.shape)

(376, 80)


# Vocoder
Generate .wav audio file from generated spectrogram

In [21]:
!wget https://github.com/nicolalandro/autovc/releases/download/0.1/checkpoint_step001000000_ema.pth

--2021-08-03 09:29:26--  https://github.com/nicolalandro/autovc/releases/download/0.1/checkpoint_step001000000_ema.pth
Resolving github.com (github.com)... 52.192.72.89
Connecting to github.com (github.com)|52.192.72.89|:443... connected.
HTTP request sent, awaiting response... 302 Found
Location: https://github-releases.githubusercontent.com/392250635/39f09916-7fba-470c-81eb-5f4e214ce61e?X-Amz-Algorithm=AWS4-HMAC-SHA256&X-Amz-Credential=AKIAIWNJYAX4CSVEH53A%2F20210803%2Fus-east-1%2Fs3%2Faws4_request&X-Amz-Date=20210803T092926Z&X-Amz-Expires=300&X-Amz-Signature=ee19d21083c3cfac91eee06a4de2909fa96893d2a5d045b8a9a45a89078a1120&X-Amz-SignedHeaders=host&actor_id=0&key_id=0&repo_id=392250635&response-content-disposition=attachment%3B%20filename%3Dcheckpoint_step001000000_ema.pth&response-content-type=application%2Foctet-stream [following]
--2021-08-03 09:29:26--  https://github-releases.githubusercontent.com/392250635/39f09916-7fba-470c-81eb-5f4e214ce61e?X-Amz-Algorithm=AWS4-HMAC-SHA256&X-A

In [22]:
# NOTE(review): this rebinds the global `device`, which an earlier cell set to
# the string 'cuda:0' and which prepare_input reads at call time.
device = torch.device("cuda")
model = build_model().to(device)
# NOTE(review): unlike the autovc.ckpt load, no map_location is given here, so
# tensors are restored to the device they were saved from.
checkpoint = torch.load("checkpoint_step001000000_ema.pth")
model.load_state_dict(checkpoint["state_dict"])

<All keys matched successfully>

In [None]:
file_output = "a_b.wav"
# Vocode the converted spectrogram into a waveform. This is the slow step: the
# recorded run shows roughly 74 it/s over 96256 steps.
waveform = wavegen(model, c=spect_vc1)
# Write the result at a 16000 Hz sample rate with the 'PCM_24' subtype.
sf.write(file_output, waveform, 16000, 'PCM_24')

 12%|█▏        | 11786/96256 [02:44<19:03, 73.84it/s]

In [None]:
# Audio player for the converted recording written by the previous cell.
Audio(file_output)