In [1]:
import sys
sys.path.insert(1, '/home/gyzhang/merlin/src')
# import matplotlib.pyplot as plt
# %matplotlib inline

In [2]:
from nnmnkwii.preprocessing import trim_zeros_frames, remove_zeros_frames
from nnmnkwii.preprocessing import minmax, meanvar, minmax_scale, scale
from nnmnkwii import paramgen
from nnmnkwii.io import hts
from nnmnkwii.frontend import merlin as fe
from nnmnkwii.postfilters import merlin_post_filter

In [3]:
from io_funcs.binary_io import BinaryIOCollection
import pysptk
import pyworld
import numpy as np
from scipy.io import wavfile

In [4]:
filename = "/home/gyzhang/merlin/egs/cmu_arctic/s1/database/wav/arctic_a0004.wav"

In [5]:
fs, x = wavfile.read(filename)
alpha = pysptk.util.mcepalpha(fs)
hopesize = int(0.005 * fs)

In [6]:
f0 = pysptk.rapt(x.astype(np.float32), fs=fs, hopsize=hopesize,
                     min=60, max=600, voice_bias=0.0, otype=1)
f0 = f0.astype(np.float64)

In [7]:
f0.shape

(516,)

In [8]:
x = x.astype(np.float64) / (2**15)

In [9]:
f0h, timeaxis = pyworld.harvest(x, fs, frame_period=5, f0_floor=60.0, f0_ceil=600.0)

In [10]:
f0h.shape

(516,)

In [13]:
bin_io = BinaryIOCollection()

In [14]:
mgc_file = "/home/gyzhang/merlin/egs/cmu_arctic/s1/experiments/cmu_arctic/acoustic_model/inter_module/nn_norm_mgc_lf0_vuv_bap_187/arctic_a0001.cmp"

In [15]:
mgc, dim = bin_io.load_binary_file_frame(mgc_file, 187)

In [17]:
norm_info_file="/home/gyzhang/merlin/egs/cmu_arctic/s1/experiments/cmu_arctic_2/acoustic_model/inter_module/norm_info__mgc_lf0_vuv_bap_187_MVN.dat"
fid = open(norm_info_file, 'rb')
cmp_min_max = np.fromfile(fid, dtype=np.float32)
fid.close()
cmp_min_max = cmp_min_max.reshape((2, -1))
cmp_mean_vector = cmp_min_max[0, ]
cmp_std_vector  = cmp_min_max[1, ]

In [18]:
norm_features = mgc * cmp_std_vector + cmp_mean_vector


In [19]:
mgc_start_idx = 0
lf0_start_idx = 180
vuv_start_idx = 183
bap_start_idx = 184
sr = 16000
alpha = pysptk.util.mcepalpha(sr)
fftlen = 1024
frame_period = 5
windows = [
    (0, 0, np.array([1.0])),
    (1, 1, np.array([-0.5, 0.0, 0.5])),
    (1, 1, np.array([1.0, -2.0, 1.0])),
]

In [20]:
T = norm_features.shape[0]
# Split acoustic features
mgc = norm_features[:,:lf0_start_idx]
lf0 = norm_features[:,lf0_start_idx:vuv_start_idx]
vuv = norm_features[:,vuv_start_idx]
bap = norm_features[:,bap_start_idx:]
cmp_var_vector = cmp_std_vector**2
mgc_variances = np.tile(cmp_var_vector[:lf0_start_idx], (T, 1))
mgc = paramgen.mlpg(mgc, mgc_variances, windows)
lf0_variances = np.tile(cmp_var_vector[lf0_start_idx:vuv_start_idx], (T,1))
lf0 = paramgen.mlpg(lf0, lf0_variances, windows)
bap_variances = np.tile(cmp_var_vector[bap_start_idx:], (T, 1))
bap = paramgen.mlpg(bap, bap_variances, windows)
f0 = lf0.copy()
f0[vuv < 0.5] = 0
f0[np.nonzero(f0)] = np.exp(f0[np.nonzero(f0)])
spectrogram = pysptk.mc2sp(mgc, fftlen=fftlen, alpha=alpha)
aperiodicity = pyworld.decode_aperiodicity(bap.astype(np.float64), sr, fftlen)

In [25]:
lf0_variances

array([[0.0881185 , 0.00064071, 0.00172567],
       [0.0881185 , 0.00064071, 0.00172567],
       [0.0881185 , 0.00064071, 0.00172567],
       ...,
       [0.0881185 , 0.00064071, 0.00172567],
       [0.0881185 , 0.00064071, 0.00172567],
       [0.0881185 , 0.00064071, 0.00172567]], dtype=float32)

In [27]:
from frontend.mlpg import MLParameterGenerationFast as MLParameterGeneration


In [29]:
lf0 = norm_features[:,lf0_start_idx:vuv_start_idx]

lf0_variances = np.tile(cmp_var_vector[lf0_start_idx:vuv_start_idx], (T,1))

mlpg_algo = MLParameterGeneration()
gen_features = mlpg_algo.generation(lf0, lf0_variances, 1)

In [33]:
f0 = gen_features.copy()
f0[vuv < 0.5] = 0
f0[np.nonzero(f0)] = np.exp(f0[np.nonzero(f0)])

In [34]:
f0

array([[  0.        ],
       [  0.        ],
       [  0.        ],
       [  0.        ],
       [130.58197245],
       [128.17944869],
       [127.10792224],
       [127.94282462],
       [127.53259554],
       [127.03745265],
       [125.91283557],
       [125.49726605],
       [127.45575198],
       [129.1869287 ],
       [129.63888408],
       [129.51438336],
       [130.193946  ],
       [130.8458121 ],
       [131.1578323 ],
       [132.76755241],
       [134.12532502],
       [134.89171578],
       [135.93451617],
       [136.62483132],
       [136.17124514],
       [139.0522468 ],
       [141.32190008],
       [134.96268073],
       [136.56113165],
       [139.62603983],
       [141.31448768],
       [140.69128355],
       [  0.        ],
       [  0.        ],
       [  0.        ],
       [  0.        ],
       [  0.        ],
       [  0.        ],
       [  0.        ],
       [  0.        ],
       [  0.        ],
       [  0.        ],
       [  0.        ],
       [  0

In [1]:
y, sr = librosa.load(librosa.util.example_audio_file())
S = np.abs(librosa.stft(y))
librosa.power_to_db(S**2)

NameError: name 'librosa' is not defined

In [57]:
test_static = np.random.rand(20)
test_static = np.reshape(test_static,(20,1))

In [54]:
from frontend.acoustic_composition import AcousticComposition
ac = AcousticComposition()

In [62]:
windows[1]

(1, 1, array([-0.5,  0. ,  0.5]))

In [69]:
delta_features = ac.compute_dynamic_matrix(test_static, [-0.5, 0.0, 0.5], 20,1)
acc_features = ac.compute_dynamic_matrix(test_static, [1, -2.0, 1], 20,1)
merge = np.concatenate((test_static, delta_features, acc_features),axis=1)

In [86]:
mean_test_d = np.mean(merge, axis=0)
std_test_d = np.std(merge,axis=0)
var_test_d = std_test_d**2

In [89]:
lf01 = paramgen.mlpg(merge, var_test_d, windows)
gen_features1 = mlpg_algo.generation(merge, var_test_d, 1)

> /home/gyzhang/merlin/src/frontend/mlpg.py(85)generation()
-> for d in range(static_dimension):
(Pdb) c


In [91]:
gen_features1

array([[0.],
       [0.],
       [0.],
       [0.],
       [0.],
       [0.],
       [0.],
       [0.],
       [0.],
       [0.],
       [0.],
       [0.],
       [0.],
       [0.],
       [0.],
       [0.],
       [0.],
       [0.],
       [0.],
       [0.]])

In [94]:
lf01 - test_static

array([[-0.14788927],
       [-0.11456224],
       [-0.05361493],
       [-0.03003307],
       [-0.01567416],
       [-0.0084059 ],
       [-0.0044648 ],
       [-0.0023876 ],
       [-0.00128725],
       [-0.00071765],
       [-0.0004431 ],
       [-0.00035088],
       [-0.00040316],
       [-0.00062062],
       [-0.00109685],
       [-0.00200767],
       [-0.0038261 ],
       [-0.00682033],
       [-0.01456656],
       [-0.01880251]])

In [93]:
test_static

array([[0.6071765 ],
       [0.60305104],
       [0.92175918],
       [0.47041947],
       [0.18472407],
       [0.96092644],
       [0.90768257],
       [0.98365084],
       [0.76143108],
       [0.24797026],
       [0.16685686],
       [0.86616368],
       [0.07333278],
       [0.00160913],
       [0.5217674 ],
       [0.09237178],
       [0.25385672],
       [0.01096424],
       [0.60136626],
       [0.0771906 ]])