# Remember to use GPU runtime (and restart)
Runtime > Change Runtime Type > GPU

# Clone repo and install requirements

In [29]:
%%bash
git clone https://github.com/nicolalandro/autovc.git
cd autovc
pip install wavenet_vocoder



Cloning into 'autovc'...


In [30]:
%cd autovc

/content/autovc/autovc


# Import requirements

In [31]:
import os
import pickle
import torch
import numpy as np
from numpy.random import RandomState
from math import ceil
from collections import OrderedDict

from scipy import signal
from scipy.signal import get_window
import librosa
from librosa.filters import mel
import soundfile as sf
from IPython.display import Audio

import matplotlib.pyplot as plt 

from model_bl import D_VECTOR
from model_vc import Generator
from synthesis import build_model
from synthesis import wavegen

# Read Audio

In [32]:
path1="wavs/p225/p225_003.wav"
path2="wavs/p226/p226_005.wav"

In [33]:
Audio(path1)

In [34]:
Audio(path2)

# Compute spectrogram
Generate spectrogram by a Short Time Futrie Transform (STFT)

In [35]:
def butter_highpass(cutoff, fs, order=5):
    nyq = 0.5 * fs
    normal_cutoff = cutoff / nyq
    b, a = signal.butter(order, normal_cutoff, btype='high', analog=False)
    return b, a
    
    
def pySTFT(x, fft_length=1024, hop_length=256):
    
    x = np.pad(x, int(fft_length//2), mode='reflect')
    
    noverlap = fft_length - hop_length
    shape = x.shape[:-1]+((x.shape[-1]-noverlap)//hop_length, fft_length)
    strides = x.strides[:-1]+(hop_length*x.strides[-1], x.strides[-1])
    result = np.lib.stride_tricks.as_strided(x, shape=shape,
                                             strides=strides)
    
    fft_window = get_window('hann', fft_length, fftbins=True)
    result = np.fft.rfft(fft_window * result, n=fft_length).T
    
    return np.abs(result)  

In [36]:
mel_basis = mel(16000, 1024, fmin=90, fmax=7600, n_mels=80).T
min_level = np.exp(-100 / 20 * np.log(10))
b, a = butter_highpass(30, 16000, order=5)

In [37]:
def prepare_spectrogram(path, rd_int=None):
    x, fs = sf.read(path)
    y = signal.filtfilt(b, a, x)
    if not rd_int:
      rd_int = int(path.split('/')[-2][1:])
    prng = RandomState(rd_int) # cosa vuol dire?
    wav = y * 0.96 + (prng.rand(y.shape[0])-0.5)*1e-06
    D = pySTFT(wav).T
    D_mel = np.dot(D, mel_basis)
    D_db = 20 * np.log10(np.maximum(min_level, D_mel)) - 16
    S = np.clip((D_db + 100) / 100, 0, 1)  
    S = S.astype(np.float32)
    return S

In [38]:
s1 = prepare_spectrogram(path1, rd_int=255)
print(s1.shape)
s2 = prepare_spectrogram(path2)
print(s2.shape)

(447, 80)
(407, 80)


In [39]:
# we can plot these spectrograms?

# Speaker Encoder
Use model_bl to encode the spectrograms

In [40]:
!wget https://github.com/nicolalandro/autovc/releases/download/0.1/3000000-BL.ckpt

--2021-08-03 09:57:37--  https://github.com/nicolalandro/autovc/releases/download/0.1/3000000-BL.ckpt
Resolving github.com (github.com)... 52.192.72.89
Connecting to github.com (github.com)|52.192.72.89|:443... connected.
HTTP request sent, awaiting response... 302 Found
Location: https://github-releases.githubusercontent.com/392250635/29880038-d71a-4f2e-986e-0a1e976793f6?X-Amz-Algorithm=AWS4-HMAC-SHA256&X-Amz-Credential=AKIAIWNJYAX4CSVEH53A%2F20210803%2Fus-east-1%2Fs3%2Faws4_request&X-Amz-Date=20210803T095738Z&X-Amz-Expires=300&X-Amz-Signature=9c8127d9cc09554a649afc0dfbb18d331759a1773a10f2e73034b5efa915e31b&X-Amz-SignedHeaders=host&actor_id=0&key_id=0&repo_id=392250635&response-content-disposition=attachment%3B%20filename%3D3000000-BL.ckpt&response-content-type=application%2Foctet-stream [following]
--2021-08-03 09:57:38--  https://github-releases.githubusercontent.com/392250635/29880038-d71a-4f2e-986e-0a1e976793f6?X-Amz-Algorithm=AWS4-HMAC-SHA256&X-Amz-Credential=AKIAIWNJYAX4CSVEH53A

In [41]:
C = D_VECTOR(dim_input=80, dim_cell=768, dim_emb=256).eval().cuda()
c_checkpoint = torch.load('3000000-BL.ckpt')
new_state_dict = OrderedDict()
for key, val in c_checkpoint['model_b'].items():
    new_key = key[7:]
    new_state_dict[new_key] = val
C.load_state_dict(new_state_dict)
num_uttrs = 10
len_crop = 128

In [42]:
def process_speacker(tmp):
    left = np.random.randint(0, tmp.shape[0]-len_crop)
    melsp = torch.from_numpy(tmp[np.newaxis, left:left+len_crop, :]).cuda()
    emb = C(melsp)
    return emb.detach().squeeze().cpu().numpy()

In [43]:
emb1 = process_speacker(s1)
print(emb1.shape)
emb2 = process_speacker(s2)
print(emb2.shape)

(256,)
(256,)


# AutoVC
It use the model_vc to decode both data: input and style

In [44]:
!wget https://github.com/nicolalandro/autovc/releases/download/0.1/autovc.ckpt

--2021-08-03 09:57:43--  https://github.com/nicolalandro/autovc/releases/download/0.1/autovc.ckpt
Resolving github.com (github.com)... 13.114.40.48
Connecting to github.com (github.com)|13.114.40.48|:443... connected.
HTTP request sent, awaiting response... 302 Found
Location: https://github-releases.githubusercontent.com/392250635/e92e4b8d-f850-4e55-93bf-243b39fbf1f0?X-Amz-Algorithm=AWS4-HMAC-SHA256&X-Amz-Credential=AKIAIWNJYAX4CSVEH53A%2F20210803%2Fus-east-1%2Fs3%2Faws4_request&X-Amz-Date=20210803T095743Z&X-Amz-Expires=300&X-Amz-Signature=0818f998b4082ab482295153183a8324e7a66b9bb5bc49365f721ce91703c3ce&X-Amz-SignedHeaders=host&actor_id=0&key_id=0&repo_id=392250635&response-content-disposition=attachment%3B%20filename%3Dautovc.ckpt&response-content-type=application%2Foctet-stream [following]
--2021-08-03 09:57:43--  https://github-releases.githubusercontent.com/392250635/e92e4b8d-f850-4e55-93bf-243b39fbf1f0?X-Amz-Algorithm=AWS4-HMAC-SHA256&X-Amz-Credential=AKIAIWNJYAX4CSVEH53A%2F20210

In [45]:
def pad_seq(x, base=32):
    len_out = int(base * ceil(float(x.shape[0])/base))
    len_pad = len_out - x.shape[0]
    assert len_pad >= 0
    return np.pad(x, ((0,len_pad),(0,0)), 'constant'), len_pad

In [46]:
device = 'cuda:0'
G = Generator(32,256,512,32).eval().to(device)

g_checkpoint = torch.load('autovc.ckpt', map_location=device)
G.load_state_dict(g_checkpoint['model'])

<All keys matched successfully>

In [47]:
def prepare_input(s1, emb1, emb2):
    x_org, len_pad = pad_seq(s1)
    uttr_org = torch.from_numpy(x_org[np.newaxis, :, :]).to(device)
    emb_org = torch.from_numpy(emb1[np.newaxis, :]).to(device)
    
    emb_trg = torch.from_numpy(emb2[np.newaxis, :]).to(device)
    
    with torch.no_grad():
            _, x_identic_psnt, _ = G(uttr_org, emb_org, emb_trg)
    if len_pad == 0:
        uttr_trg = x_identic_psnt[0, 0, :, :].cpu().numpy()
    else:
        uttr_trg = x_identic_psnt[0, 0, :-len_pad, :].cpu().numpy()
    return uttr_trg

In [48]:
spect_vc1 = prepare_input(s1, emb1, emb2)
print(spect_vc1.shape)

(447, 80)


# Vocoder
Generate .wav audio file from generated spectrogram

In [49]:
!wget https://github.com/nicolalandro/autovc/releases/download/0.1/checkpoint_step001000000_ema.pth

--2021-08-03 09:58:02--  https://github.com/nicolalandro/autovc/releases/download/0.1/checkpoint_step001000000_ema.pth
Resolving github.com (github.com)... 13.114.40.48
Connecting to github.com (github.com)|13.114.40.48|:443... connected.
HTTP request sent, awaiting response... 302 Found
Location: https://github-releases.githubusercontent.com/392250635/39f09916-7fba-470c-81eb-5f4e214ce61e?X-Amz-Algorithm=AWS4-HMAC-SHA256&X-Amz-Credential=AKIAIWNJYAX4CSVEH53A%2F20210803%2Fus-east-1%2Fs3%2Faws4_request&X-Amz-Date=20210803T095802Z&X-Amz-Expires=300&X-Amz-Signature=201486a2b66cdc326e4bd6421443d71836269168bcab0d52c1cf66a134485c85&X-Amz-SignedHeaders=host&actor_id=0&key_id=0&repo_id=392250635&response-content-disposition=attachment%3B%20filename%3Dcheckpoint_step001000000_ema.pth&response-content-type=application%2Foctet-stream [following]
--2021-08-03 09:58:02--  https://github-releases.githubusercontent.com/392250635/39f09916-7fba-470c-81eb-5f4e214ce61e?X-Amz-Algorithm=AWS4-HMAC-SHA256&X-A

In [50]:
device = torch.device("cuda")
model = build_model().to(device)
checkpoint = torch.load("checkpoint_step001000000_ema.pth")
model.load_state_dict(checkpoint["state_dict"])

<All keys matched successfully>

In [51]:
file_output = "a_b.wav"
waveform = wavegen(model, c=spect_vc1)
sf.write(file_output, waveform, 16000, 'PCM_24')

100%|██████████| 114432/114432 [26:59<00:00, 70.66it/s]


In [52]:
Audio(file_output)