In [None]:
%reset -f 
import os
import pickle
import numpy as np
import soundfile as sf
from scipy import signal
from scipy.signal import get_window
from librosa.filters import mel
from numpy.random import RandomState

In [None]:
def butter_highpass(cutoff, fs, order=5):
    """Design a digital Butterworth high-pass filter.

    Args:
        cutoff: Cutoff frequency, expressed in the same unit as ``fs``.
        fs: Sampling rate of the signal the filter is meant for.
        order: Filter order handed to ``signal.butter`` (default 5).

    Returns:
        The ``(b, a)`` numerator / denominator coefficient arrays.
    """
    # signal.butter takes the cutoff as a fraction of the Nyquist frequency.
    nyquist = 0.5 * fs
    numerator, denominator = signal.butter(
        order, cutoff / nyquist, btype='high', analog=False
    )
    return numerator, denominator

In [None]:
def pySTFT(x, fft_length=1024, hop_length=256):
    """Magnitude short-time Fourier transform with a Hann window.

    The input is reflect-padded by ``fft_length // 2`` on each side, cut into
    overlapping frames of ``fft_length`` samples taken every ``hop_length``
    samples along the last axis, windowed, and transformed with a real FFT.

    Args:
        x: Signal array. ``np.pad`` is called with a scalar width, so every
            axis is padded; the caller in this notebook passes a 1-D waveform.
        fft_length: Frame / FFT size in samples.
        hop_length: Step between the starts of consecutive frames.

    Returns:
        Absolute value of the transposed spectrum; for 1-D input the shape is
        ``(fft_length // 2 + 1, n_frames)``.
    """
    half_frame = int(fft_length // 2)
    padded = np.pad(x, half_frame, mode='reflect')

    # Build a zero-copy view of the overlapping frames: moving one frame
    # forward advances hop_length samples, moving inside a frame advances one.
    overlap = fft_length - hop_length
    n_frames = (padded.shape[-1] - overlap) // hop_length
    sample_stride = padded.strides[-1]
    frames = np.lib.stride_tricks.as_strided(
        padded,
        shape=padded.shape[:-1] + (n_frames, fft_length),
        strides=padded.strides[:-1] + (hop_length * sample_stride, sample_stride),
    )

    window = get_window('hann', fft_length, fftbins=True)
    spectrum = np.fft.rfft(window * frames, n=fft_length)

    return np.abs(spectrum.T)

In [None]:
# Mel filterbank (transposed so a magnitude spectrogram can be right-multiplied
# by it): 80 mel bands between 90 Hz and 7600 Hz for 16 kHz audio and a
# 1024-point FFT. `sr` and `n_fft` are passed by keyword because recent
# librosa releases made them keyword-only; keywords work on older releases too.
mel_basis = mel(sr=16000, n_fft=1024, fmin=90, fmax=7600, n_mels=80).T
# Amplitude floor equal to 10 ** (-100 / 20), i.e. -100 dB; it keeps log10 finite.
min_level = np.exp(-100 / 20 * np.log(10))
# 5th-order high-pass filter with a 30 Hz cutoff at a 16 kHz sampling rate.
b, a = butter_highpass(30, 16000, order=5)

In [None]:

# Directory holding the input audio; the extraction loop below walks its
# immediate sub-directories and reads every file found in each of them.
rootDir = './wavs2'
# Directory the mel-spectrograms are written to (mirrors rootDir's layout).
targetDir = './spmel2'
# Start from a clean slate: remove any previous spectrogram output, then the
# `.ipynb_checkpoints` folders inside the data directories so that os.walk
# does not list them as sub-directories to process.
# NOTE(review): the paths below are hard-coded rather than derived from
# rootDir / targetDir, and the second command is a no-op once `spmel2` itself
# has been deleted by the first one.
! rm -rf spmel2
! rm -rf './spmel2/.ipynb_checkpoints'
! rm -rf './wavs2/.ipynb_checkpoints'

In [None]:
# Only the top level of rootDir is needed: its immediate sub-directories are
# processed one at a time by the extraction loop.
if not os.path.isdir(rootDir):
    # next(os.walk(...)) on a missing directory fails with a bare
    # StopIteration, which hides the real problem; report the path instead.
    raise FileNotFoundError('Audio directory not found: %s' % rootDir)
dirName, subdirList, _ = next(os.walk(rootDir))
print('Found directory: %s' % dirName)

In [None]:
import warnings

from tqdm import tqdm

# Convert every audio file under dirName/<subdir>/ into a normalised
# mel-spectrogram stored as targetDir/<subdir>/<file stem>.npy.
for subdir in tqdm(sorted(subdirList)):
    os.makedirs(os.path.join(targetDir, subdir), exist_ok=True)
    _, _, fileList = next(os.walk(os.path.join(dirName, subdir)))
    # Per-directory noise generator, seeded with the integer formed by the
    # directory name minus its first two characters (so the name must end in
    # digits). NOTE(review): confirm the two-character prefix matches the
    # dataset's naming scheme.
    prng = RandomState(int(subdir[2:]))
    for fileName in sorted(fileList):
        # Read audio file
        x, fs = sf.read(os.path.join(dirName, subdir, fileName))
        if fs != 16000:
            # mel_basis and the high-pass filter (b, a) were designed for
            # 16 kHz audio; other rates yield mis-scaled features.
            warnings.warn('%s is sampled at %d Hz, expected 16000 Hz'
                          % (os.path.join(subdir, fileName), fs))
        if x.ndim > 1:
            # sf.read returns (frames, channels) for multi-channel files;
            # average the channels so the filter runs along the time axis.
            x = x.mean(axis=1)
        # Remove drifting noise
        y = signal.filtfilt(b, a, x)
        # Add a little random noise for model robustness
        wav = y * 0.96 + (prng.rand(y.shape[0]) - 0.5) * 1e-06
        # Compute spect
        D = pySTFT(wav).T
        # Convert to mel and normalize: dB scale, shifted by -16 dB, then the
        # [-100, 0] dB range is mapped linearly onto [0, 1].
        D_mel = np.dot(D, mel_basis)
        D_db = 20 * np.log10(np.maximum(min_level, D_mel)) - 16
        S = np.clip((D_db + 100) / 100, 0, 1)
        # Save spect. splitext (rather than slicing off 4 characters) keeps
        # the stem intact for extensions of any length; np.save adds '.npy'.
        stem = os.path.splitext(fileName)[0]
        np.save(os.path.join(targetDir, subdir, stem),
                S.astype(np.float32), allow_pickle=False)

In [None]:
"""
Generate speaker embeddings and metadata for training
"""
import os
import pickle
from model_bl import D_VECTOR
from collections import OrderedDict
import numpy as np
import torch

# Number of utterances averaged into each speaker embedding, and the number of
# spectrogram frames cropped from each utterance.
num_uttrs = 10
len_crop = 128

# Speaker encoder in eval mode on the default CUDA device.
C = D_VECTOR(dim_input=80, dim_cell=768, dim_emb=256).eval().cuda()
c_checkpoint = torch.load('3000000-BL.ckpt')
# The checkpoint keys carry a 7-character prefix that the bare model does not
# use; drop it while preserving the parameter order.
# NOTE(review): presumably the 'module.' prefix added by DataParallel — confirm.
new_state_dict = OrderedDict(
    (key[7:], val) for key, val in c_checkpoint['model_b'].items()
)
C.load_state_dict(new_state_dict)

# Directory containing mel-spectrograms. Note that this rebinds `rootDir`,
# which pointed at the audio directory earlier in the notebook.
rootDir = './spmel2'
# Drop Jupyter's checkpoint folder so os.walk does not list it among the
# sub-directories (each sub-directory is treated as one speaker below).
! rm -rf './spmel2/.ipynb_checkpoints'
# Only the top level is needed: one sub-directory per speaker.
dirName, subdirList, _ = next(os.walk(rootDir))

# Imported here as well so this cell does not depend on an earlier cell
# having been executed in the same kernel session.
from tqdm import tqdm

# speakers: one list per speaker -> [name, mean embedding, rel. path, rel. path, ...]
# metadata: one list per speaker -> [name, mean embedding, one cropped spectrogram]
speakers = []
metadata = []
for speaker in tqdm(sorted(subdirList)):
    _, _, fileList = next(os.walk(os.path.join(dirName, speaker)))

    # make speaker embedding
    assert len(fileList) >= num_uttrs, \
        'speaker %s has fewer than %d utterances' % (speaker, num_uttrs)
    idx_uttrs = np.random.choice(len(fileList), size=num_uttrs, replace=False)
    embs = []
    for i in range(num_uttrs):
        tmp = np.load(os.path.join(dirName, speaker, fileList[idx_uttrs[i]]))
        # Replacement pool: every file that was not part of the initial draw.
        candidates = np.delete(np.arange(len(fileList)), idx_uttrs)
        # choose another utterance if the current one is too short
        while tmp.shape[0] < len_crop:
            if candidates.size == 0:
                raise ValueError(
                    'speaker %s has no remaining utterance with at least %d frames'
                    % (speaker, len_crop))
            idx_alt = np.random.choice(candidates)
            tmp = np.load(os.path.join(dirName, speaker, fileList[idx_alt]))
            candidates = np.delete(candidates, np.argwhere(candidates == idx_alt))
        # randint's upper bound is exclusive: the +1 makes the last valid
        # offset reachable and avoids randint(0, 0) raising ValueError when
        # the utterance is exactly len_crop frames long.
        left = np.random.randint(0, tmp.shape[0] - len_crop + 1)
        melsp_cpu = tmp[np.newaxis, left:left + len_crop, :]
        melsp = torch.from_numpy(melsp_cpu).cuda()
        # Inference only: skip building the autograd graph.
        with torch.no_grad():
            emb = C(melsp)
        embs.append(emb.detach().squeeze().cpu().numpy())

    # Speaker embedding = mean over the sampled crops.
    mean_emb = np.mean(embs, axis=0)

    # The spectrogram stored in metadata is the last crop processed above,
    # with its leading batch axis removed.
    metadata.append([speaker, mean_emb, melsp_cpu[0]])

    utterances = [speaker, mean_emb]
    for fileName in sorted(fileList):
        utterances.append(os.path.join(speaker, fileName))
    speakers.append(utterances)

with open(os.path.join(rootDir, 'train1.pkl'), 'wb') as handle:
    pickle.dump(speakers, handle)

In [None]:
# Persist the per-speaker metadata list in the working directory; the
# conversion cell further down reads 'metadata1.pkl' back in.
metadata_path = os.path.join('.', 'metadata1.pkl')
with open(metadata_path, 'wb') as handle:
    pickle.dump(metadata, handle)

Train

In [None]:
# Training is not run from this notebook: go to the command line and run `python main.py`.

In [None]:
import os
import pickle
import torch
import numpy as np
from math import ceil
from model_vc import Generator


def pad_seq(x, base=32):
    """Zero-pad a 2-D array along its first axis to a multiple of ``base``.

    Args:
        x: 2-D array; only the end of axis 0 is padded.
        base: The padded length is the smallest multiple of ``base`` that is
            not less than ``x.shape[0]``.

    Returns:
        ``(padded, len_pad)``: the padded array and the number of rows added.
    """
    n_rows = x.shape[0]
    n_blocks = ceil(float(n_rows) / base)
    len_pad = int(base * n_blocks) - n_rows
    assert len_pad >= 0
    padded = np.pad(x, ((0, len_pad), (0, 0)), 'constant')
    return padded, len_pad

# NOTE(review): hard-coded to the second GPU, whereas the other cells use the
# default CUDA device — adjust on single-GPU machines.
device = 'cuda:1'
# G = Generator(32,256,512,32).eval().to(device) # (dim_neck, dim_emb, dim_pre, freq):
G = Generator(16,256,512,32).eval().to(device) # (dim_neck, dim_emb, dim_pre, freq):

# map_location places the checkpoint tensors on `device` regardless of the
# device they were saved from.
# g_checkpoint = torch.load('autovc_orig.ckpt', map_location=device)
g_checkpoint = torch.load('autovc_499999.ckpt', map_location=device)

G.load_state_dict(g_checkpoint['model'])

# metadata1.pkl is produced by this notebook; pickle must not be used on
# files from untrusted sources. The `with` block closes the file handle.
with open('metadata1.pkl', 'rb') as handle:
    metadata = pickle.load(handle)

# Each metadata entry is [speaker name, speaker embedding, spectrogram].
# Every speaker's spectrogram is converted towards every speaker's embedding
# (including its own); results are collected as ('<src>x<trg>', spectrogram).
spect_vc = []

for sbmt_i in metadata:
    # Pad the source spectrogram with pad_seq's default base (a multiple of
    # 32 frames); the padding is trimmed from the output below.
    x_org, len_pad = pad_seq(sbmt_i[2])
    uttr_org = torch.from_numpy(x_org[np.newaxis, :, :]).to(device)
    emb_org = torch.from_numpy(sbmt_i[1][np.newaxis, :]).to(device)
    for sbmt_j in metadata:
        emb_trg = torch.from_numpy(sbmt_j[1][np.newaxis, :]).to(device)
        with torch.no_grad():
            _, x_identic_psnt, _ = G(uttr_org, emb_org, emb_trg)
        # A slice ending at -0 would be empty, hence the explicit zero case.
        if len_pad == 0:
            uttr_trg = x_identic_psnt[0, 0, :, :].cpu().numpy()
        else:
            uttr_trg = x_identic_psnt[0, 0, :-len_pad, :].cpu().numpy()
        spect_vc.append(('{}x{}'.format(sbmt_i[0], sbmt_j[0]), uttr_trg))

with open('results1.pkl', 'wb') as handle:
    pickle.dump(spect_vc, handle)
    
import torch
import librosa
import pickle
from synthesis import build_model
from synthesis import wavegen

# librosa.output.write_wav no longer exists in current librosa releases, so
# the waveforms are written with soundfile instead.
import soundfile as sf

# results1.pkl is produced by the conversion step above; pickle must not be
# used on files from untrusted sources. The `with` block closes the handle.
with open('results1.pkl', 'rb') as handle:
    spect_vc = pickle.load(handle)
device = torch.device("cuda")
model = build_model().to(device)
# map_location places the checkpoint tensors on `device` regardless of the
# device they were saved from.
checkpoint = torch.load("checkpoint_step001000000_ema.pth", map_location=device)
model.load_state_dict(checkpoint["state_dict"])

# Each entry is (name, spectrogram); synthesise one '<name>.wav' per entry.
for spect in spect_vc:
    name = spect[0]
    c = spect[1]
    print(name)
    waveform = wavegen(model, c=c)
    # 32-bit float WAV at 16 kHz, the same sample format write_wav produced
    # for floating-point input.
    sf.write(name + '.wav', waveform, 16000, subtype='FLOAT')