# 演習：声質変換

## 環境構築

In [None]:
from google.colab import drive
drive.mount('/content/drive', force_remount=True)

Go to this URL in a browser: https://accounts.google.com/o/oauth2/auth?client_id=947318989803-6bn6qk8qdgf4n4g3pfee6491hc0brc4i.apps.googleusercontent.com&redirect_uri=urn%3Aietf%3Awg%3Aoauth%3A2.0%3Aoob&scope=email%20https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fdocs.test%20https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fdrive%20https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fdrive.photos.readonly%20https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fpeopleapi.readonly&response_type=code

Enter your authorization code:
··········
Mounted at /content/drive


In [None]:
cd /content/drive/'My Drive'/vc_exercise2019

/content/drive/My Drive/vc_exercise2019


In [None]:
!pip3 install pyworld
!pip3 install pysptk
!pip3 install dtw

Collecting pyworld
[?25l  Downloading https://files.pythonhosted.org/packages/dc/8d/f83aef293df2fb9f3435f129ea7a02f55fd0fe04ada69bf4207d5ffbc92c/pyworld-0.2.8.tar.gz (74kB)
[K     |████████████████████████████████| 81kB 5.2MB/s 
Building wheels for collected packages: pyworld
  Building wheel for pyworld (setup.py) ... [?25l[?25hdone
  Created wheel for pyworld: filename=pyworld-0.2.8-cp36-cp36m-linux_x86_64.whl size=618600 sha256=c2c92020c83a79053686dd46c023dc20c7a749628b827425eb3a131ba430c8a5
  Stored in directory: /root/.cache/pip/wheels/c3/58/e5/a7e39ab92c56825f976970b97066753c68406c7fb0d80d4a4a
Successfully built pyworld
Installing collected packages: pyworld
Successfully installed pyworld-0.2.8
Collecting pysptk
[?25l  Downloading https://files.pythonhosted.org/packages/c0/7a/fb419737b13aaebb3ccea43e6ef42245a0982e339bc778bcfaddfd32d120/pysptk-0.1.17.tar.gz (412kB)
[K     |████████████████████████████████| 419kB 4.8MB/s 
Building wheels for collected packages: pysptk
  Build

## 特徴量の分析

In [1]:
import os
import sys
import glob

from scipy.io import wavfile # for wavfile I/O
import pyworld as pw
import numpy as np
import pysptk as sptk

In [2]:
spklist = ["SF", "TF"]  # speaker list [source female speaker, target female speaker]
featlist = ["mgc","f0","ap"]

In [3]:
# Making directories for speech features
for s in spklist:
    for f in featlist:
        if not os.path.exists("data/{}/{}".format(s,f)):
            os.mkdir("data/{}/{}".format(s,f))

In [7]:
for s in spklist:
    wavlist = os.listdir("data/{}/wav".format(s))
    for wf in wavlist:
        # WORLD analysis for each file
        print("spekaer: {} file: {}".format(s,wf))
        fs, data = wavfile.read("data/{}/wav/{}".format(s,wf))
        data = data.astype(np.double)

        f0, t = pw.harvest(data, fs)
        sp = pw.cheaptrick(data, f0, t, fs)
        ap = pw.d4c(data, f0, t, fs)

        alpha = 0.42
        dim = 24
        mgc = sptk.sp2mc(sp, dim, alpha)

        bn, _ = os.path.splitext(wf)

        with open("data/{}/mgc/{}.mgc".format(s,bn),"wb") as f:
            mgc.tofile(f)
        with open("data/{}/f0/{}.f0".format(s,bn),"wb") as f:
            f0.tofile(f)
        with open("data/{}/ap/{}.ap".format(s,bn),"wb") as f:
            ap.tofile(f)

spekaer: SF file: atr503_a37.wav
spekaer: SF file: atr503_a47.wav
spekaer: SF file: atr503_a21.wav
spekaer: SF file: atr503_a15.wav
spekaer: SF file: atr503_a19.wav
spekaer: SF file: atr503_a46.wav
spekaer: SF file: atr503_a05.wav
spekaer: SF file: atr503_a07.wav
spekaer: SF file: atr503_a44.wav
spekaer: SF file: atr503_a08.wav
spekaer: SF file: atr503_a09.wav
spekaer: SF file: atr503_a24.wav
spekaer: SF file: atr503_a12.wav
spekaer: SF file: atr503_a18.wav
spekaer: SF file: atr503_a02.wav
spekaer: SF file: atr503_a13.wav
spekaer: SF file: atr503_a39.wav
spekaer: SF file: atr503_a33.wav
spekaer: SF file: atr503_a36.wav
spekaer: SF file: atr503_a30.wav
spekaer: SF file: atr503_a01.wav
spekaer: SF file: atr503_a32.wav
spekaer: SF file: atr503_a04.wav
spekaer: SF file: atr503_a34.wav
spekaer: SF file: atr503_a06.wav
spekaer: SF file: atr503_a29.wav
spekaer: SF file: atr503_a42.wav
spekaer: SF file: atr503_a27.wav
spekaer: SF file: atr503_a38.wav
spekaer: SF file: atr503_a22.wav
spekaer: S

## フレーム毎時間アラインメント

In [8]:
import os
import sys
import array

from dtw import dtw
import numpy as np
import pysptk as sptk

In [9]:
srcspk = "SF"
tgtspk = "TF"

mgclist = os.listdir("data/{}/mgc".format(srcspk))

if not os.path.isdir("data/{}/data".format(srcspk)):
    os.mkdir("data/{}/data".format(srcspk))
if not os.path.isdir("data/{}/data".format(tgtspk)):
    os.mkdir("data/{}/data".format(tgtspk))

In [10]:
def distfunc(x,y):
    # Euclid distance except first dim
    return np.linalg.norm(x[1:]-y[1:])

In [11]:
dim = 25 # mgc dim + 1
for mf in mgclist:
    print(mf)
    bn, _ = os.path.splitext(mf)
    srcfile = "data/{}/mgc/{}".format(srcspk,mf)
    tgtfile = "data/{}/mgc/{}".format(tgtspk,mf)

    with open(srcfile,"rb") as f:
        x = np.fromfile(f, dtype="<f8", sep="")
        x = x.reshape(len(x)//dim,dim)
    with open(tgtfile,"rb") as f:
        y = np.fromfile(f, dtype="<f8", sep="")
        y = y.reshape(len(y)//dim,dim)
    print("framelen: (x,y) = {} {}".format(len(x),len(y)))
    _,_,_, twf = dtw(x,y,distfunc)
    srcout = "data/{}/data/{}.dat".format(srcspk,bn)
    tgtout = "data/{}/data/{}.dat".format(tgtspk,bn)

    with open(srcout,"wb") as f:
        x[twf[0]].tofile(f)
    with open(tgtout,"wb") as f:
        y[twf[1]].tofile(f)

atr503_a08.mgc
framelen: (x,y) = 1153 1039
atr503_a45.mgc
framelen: (x,y) = 1585 1525
atr503_a33.mgc
framelen: (x,y) = 1951 1943
atr503_a42.mgc
framelen: (x,y) = 1249 1291
atr503_a25.mgc
framelen: (x,y) = 1087 1009
atr503_a26.mgc
framelen: (x,y) = 1461 1455
atr503_a04.mgc
framelen: (x,y) = 1095 1149
atr503_a31.mgc
framelen: (x,y) = 1549 1807
atr503_a11.mgc
framelen: (x,y) = 1165 1115
atr503_a32.mgc
framelen: (x,y) = 1079 1313
atr503_a23.mgc
framelen: (x,y) = 1353 1395
atr503_a29.mgc
framelen: (x,y) = 1611 1883
atr503_a49.mgc
framelen: (x,y) = 1357 1465
atr503_a41.mgc
framelen: (x,y) = 913 1109
atr503_a47.mgc
framelen: (x,y) = 1297 1123
atr503_a50.mgc
framelen: (x,y) = 1587 1755
atr503_a10.mgc
framelen: (x,y) = 881 891
atr503_a19.mgc
framelen: (x,y) = 1107 961
atr503_a06.mgc
framelen: (x,y) = 1297 1207
atr503_a02.mgc
framelen: (x,y) = 971 923
atr503_a18.mgc
framelen: (x,y) = 1349 1217
atr503_a34.mgc
framelen: (x,y) = 1633 1573
atr503_a36.mgc
framelen: (x,y) = 863 911
atr503_a03.mgc
fram

## 音声変換モデルの学習

In [12]:
# Listing training/evaluation data
!mkdir -p conf
!ls data/SF/data/ | head -45 | sed -e 's/\.dat//' > conf/train.list
!ls data/SF/data/ | tail -5 | sed -e 's/\.dat//' > conf/eval.list

In [14]:
import numpy as np
import torch
from torch import nn, optim
from torch.nn import functional as F
import os
import sys
import time

In [15]:
def get_dataset(dim=25):
    x = []
    y = []
    datalist = []
    with open("conf/train.list","r") as f:
        for line in f:
            line = line.rstrip()
            datalist.append(line)

    for d in datalist:
        print(d)
        with open("data/SF/data/{}.dat".format(d),"rb") as f:
            dat = np.fromfile(f,dtype="<f8",sep="")
            x.append(dat.reshape(len(dat)//dim,dim))
        with open("data/TF/data/{}.dat".format(d),"rb") as f:
            dat = np.fromfile(f,dtype="<f8",sep="")
            y.append(dat.reshape(len(dat)//dim,dim))
    return x,y

In [16]:
class VCDNN(nn.Module):
        def __init__(self, dim=25, n_units=256):
            super(VCDNN, self).__init__()
            self.fc = nn.ModuleList([
                           nn.Linear(dim, n_units),
                           nn.Linear(n_units, n_units),
                           nn.Linear(n_units, dim)
            ])
            
        def forward(self, x):
            h1 = F.relu(self.fc[0](x))
            h2 = F.relu(self.fc[1](h1))
            h3 = self.fc[2](h2)
            return h3
        
        def get_predata(self, x):
            _x = torch.from_numpy(x.astype(np.float32))
            return self.forward(_x).detach().numpy()

In [17]:
x_train, y_train = get_dataset()
# parameters for training
n_epoch = 50
dim = 25
n_units = 128
N = len(x_train)

model = VCDNN(dim,n_units)
model.double()
optimizer = optim.Adam(model.parameters())

loss_fn = nn.MSELoss()

# loop
model.train()

losses = []
sum_loss = 0

for epoch in range(1, n_epoch + 1):
    sum_loss = 0

    for i in range(0, N):
        x_batch =torch.from_numpy(x_train[i])
        y_batch = torch.from_numpy(y_train[i])
        
        optimizer.zero_grad()
        
        predict_y_batch = model(x_batch)
        loss = loss_fn(predict_y_batch, y_batch)
        loss.backward()
        optimizer.step()
        sum_loss += loss.item()
        
        average_loss = sum_loss / N
        losses.append(average_loss)

        print("epoch: {}/{}  loss: {}".format(epoch, n_epoch, average_loss))

if not os.path.isdir("model"):
    os.mkdir("model")
torch.save(model.state_dict(), "model/vcmodel.model")


atr503_a01
atr503_a02
atr503_a03
atr503_a04
atr503_a05
atr503_a06
atr503_a07
atr503_a08
atr503_a09
atr503_a10
atr503_a11
atr503_a12
atr503_a13
atr503_a14
atr503_a15
atr503_a16
atr503_a17
atr503_a18
atr503_a19
atr503_a20
atr503_a21
atr503_a22
atr503_a23
atr503_a24
atr503_a25
atr503_a26
atr503_a27
atr503_a28
atr503_a29
atr503_a30
atr503_a31
atr503_a32
atr503_a33
atr503_a34
atr503_a35
atr503_a36
atr503_a37
atr503_a38
atr503_a39
atr503_a40
atr503_a41
atr503_a42
atr503_a43
atr503_a44
atr503_a45
epoch: 1/50  loss: 0.021133784034394466
epoch: 1/50  loss: 0.03707276570023339
epoch: 1/50  loss: 0.05307777572367844
epoch: 1/50  loss: 0.07066636330042625
epoch: 1/50  loss: 0.08578565258691914
epoch: 1/50  loss: 0.09952290432452568
epoch: 1/50  loss: 0.11505515111451889
epoch: 1/50  loss: 0.12711918784999815
epoch: 1/50  loss: 0.13985510507344712
epoch: 1/50  loss: 0.15114384561057098
epoch: 1/50  loss: 0.16266611475709733
epoch: 1/50  loss: 0.1729439949566214
epoch: 1/50  loss: 0.1801594937875791

In [18]:
!ls ./model/

vcmodel.model


## 学習したモデルによる音声の変換

In [19]:
import numpy as np
import pysptk as sptk
import pyworld as pw
from scipy.io import wavfile
import os
import sys
import time

In [20]:
dim = 25
n_units = 128

model = VCDNN(dim,n_units)
_ = model.load_state_dict(torch.load("model/vcmodel.model"))

In [21]:
# test data
x = []
datalist = []
with open("conf/eval.list","r") as f:
    for line in f:
        line = line.rstrip()
        datalist.append(line)

for d in datalist:
    with open("data/SF/mgc/{}.mgc".format(d),"rb") as f:
        dat = np.fromfile(f,dtype="<f8",sep="")
        x.append(dat.reshape(len(dat)//dim,dim))

if not os.path.isdir("result"):
    os.mkdir("result")
if not os.path.isdir("result/wav"):
    os.mkdir("result/wav")

fs = 16000
fftlen = 512
alpha = 0.42
for i in range(0,len(datalist)):
    outfile = "result/wav/{}.wav".format(datalist[i])
    with open("data/SF/f0/{}.f0".format(datalist[i]),"rb") as f:
        f0 = np.fromfile(f, dtype="<f8", sep="")
    with open("data/SF/ap/{}.ap".format(datalist[i]),"rb") as f:
        ap = np.fromfile(f, dtype="<f8", sep="")
        ap = ap.reshape(len(ap)//(fftlen+1),fftlen+1)
    y = model.get_predata(x[i])
    y = y.astype(np.float64)
    sp = sptk.mc2sp(y, alpha, fftlen*2)
    owav = pw.synthesize(f0, sp, ap, fs)
    owav = np.clip(owav, -32768, 32767)
    wavfile.write(outfile, fs, owav.astype(np.int16))

In [22]:
!ls result/wav

atr503_a46.wav	atr503_a47.wav	atr503_a48.wav	atr503_a49.wav	atr503_a50.wav
