In [1]:
import sys
sys.path.insert(1, '/home/gyzhang/merlin/src')
# import matplotlib.pyplot as plt
# %matplotlib inline

In [2]:
import numpy as np
import configuration
import os
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3' 
import tensorflow as tf
tf.logging.set_verbosity(tf.logging.ERROR)

In [3]:
from io_funcs.binary_io import BinaryIOCollection
from run_tensorflow_with_merlin_io import TensorflowClass
from tensorflow_lib.train import TrainTensorflowModels




In [4]:
import threading
from IPython.display import display
import pysptk
import pyworld
from scipy.io import wavfile

In [5]:
from nnmnkwii.preprocessing import trim_zeros_frames, remove_zeros_frames
from nnmnkwii.preprocessing import minmax, meanvar, minmax_scale, scale
from nnmnkwii import paramgen
from nnmnkwii.io import hts
from nnmnkwii.frontend import merlin as fe
from nnmnkwii.postfilters import merlin_post_filter

In [6]:
from ipywidgets import widgets
from ipywidgets.widgets import Label, FloatProgress, FloatSlider
from ipywidgets.widgets import Layout, HBox, VBox
import bqplot as bq
from ipywidgets.widgets import Audio

**Basic parameters: MLPG windows, vocoder settings, feature layout, model and data paths**

In [11]:
# Delta windows given to paramgen.mlpg further down: one static window
# followed by two three-tap windows.
windows = [
    (len(coef) // 2, len(coef) // 2, np.array(coef))
    for coef in ([1.0], [-0.5, 0.0, 0.5], [1.0, -2.0, 1.0])
]

# Vocoder / analysis settings shared by pysptk and pyworld calls below.
sr = 16000
alpha = pysptk.util.mcepalpha(sr)
fftlen = 1024
frame_period = 5

# Column offsets of each acoustic stream inside the network output.
mgc_start_idx = 0
lf0_start_idx = 180
vuv_start_idx = 183
bap_start_idx = 184

# Neural network configuration (six tanh layers of 512 units each).
n_in = 413
hidden_layer_size = [512] * 6
n_out = 187
hidden_layer_type = ['tanh'] * 6
model_dir = '/home/gyzhang/merlin/egs/casia/s1/experiments/liuchang/acoustic_model/nnets_model/tensorflow/feed_forward_6_tanh'

# Paths.
# Acoustic normalisation statistics (read below as a mean row and a std row).
norm_info_file = "/home/gyzhang/merlin/egs/casia/s1/experiments/liuchang/acoustic_model/inter_module/norm_info__mgc_lf0_vuv_bap_187_MVN.dat"
# Normalised linguistic features of the test utterance.
test_norm_path = "/home/gyzhang/merlin/egs/casia/s1/experiments/liuchang/acoustic_model/inter_module/nn_no_silence_lab_norm_413/liuchanhg-neutral-209.lab"


**Load the acoustic normalisation statistics (mean and standard deviation) and the linguistic features**

In [9]:
# Read the normalisation statistics as a flat float32 array. The context
# manager guarantees the handle is closed even if np.fromfile raises.
with open(norm_info_file, 'rb') as fid:
    cmp_min_max = np.fromfile(fid, dtype=np.float32)

# Despite the variable name, the two rows are used below as the per-dimension
# mean (row 0) and standard deviation (row 1) for de-normalisation.
cmp_min_max = cmp_min_max.reshape((2, -1))
cmp_mean_vector = cmp_min_max[0]
cmp_std_vector = cmp_min_max[1]

In [10]:
# Load the 413-dimensional normalised input frames of the test utterance.
io_funcs = BinaryIOCollection()
inp_features, frame_number = io_funcs.load_binary_file_frame(
    test_norm_path,
    413,
)

# Everything but the last column goes to the linguistic input; the last
# column is fed separately (to `utt_index_t`) at inference time.
test_lin_x, test_lab_x = np.hsplit(inp_features, [-1])

In [12]:
# Build the feed-forward model graph with the configuration chosen above.
tensorflow_models = TrainTensorflowModels(
    n_in,
    hidden_layer_size,
    n_out,
    hidden_layer_type,
    model_dir,
)
tensorflow_models.define_feedforward_model_utt()

In [13]:
# One forward pass with row 452 of the 'utt-embeddings' table overwritten by a
# hand-picked 2-D point. The change lives only inside this session.
with tensorflow_models.graph.as_default():
    new_saver = tf.train.Saver()
    with tf.Session() as sess:
        # Initialise, then overwrite every variable from the latest checkpoint.
        sess.run(tf.global_variables_initializer())
        latest_ckpt = tf.train.latest_checkpoint(
            tensorflow_models.ckpt_dir,
            latest_filename=None,
        )
        new_saver.restore(sess, latest_ckpt)

        # Fetch the embedding table, edit one row, and write it back.
        v1 = sess.graph.get_tensor_by_name('utt-embeddings:0')
        v1_array = sess.run(v1)
        v1_array[452] = [-0.4,  0.7 ]
        sess.run(tf.assign(v1, v1_array))
        v2_array = sess.run(v1)  # table as seen after the assignment

        inference_feed = {
            tensorflow_models.input_lin_layer: test_lin_x,
            tensorflow_models.utt_index_t: test_lab_x,
            tensorflow_models.is_training_batch: False,
        }
        y_predict = sess.run(tensorflow_models.output_layer, feed_dict=inference_feed)

In [17]:
# Undo the mean/std normalisation applied to the network targets.
norm_features = y_predict * cmp_std_vector + cmp_mean_vector
T = norm_features.shape[0]  # number of frames

# Slice the output into its acoustic streams using the configured offsets.
# `vuv` is taken as a single column, so it is 1-D; the others stay 2-D.
mgc = norm_features[:, :lf0_start_idx]
lf0 = norm_features[:, lf0_start_idx:vuv_start_idx]
vuv = norm_features[:, vuv_start_idx]
bap = norm_features[:, bap_start_idx:]

In [18]:
# MLPG needs a variance for every frame; the global variance of each stream is
# simply repeated T times.
cmp_var_vector = cmp_std_vector**2

mgc_variances = np.tile(cmp_var_vector[:lf0_start_idx], (T, 1))
lf0_variances = np.tile(cmp_var_vector[lf0_start_idx:vuv_start_idx], (T, 1))
bap_variances = np.tile(cmp_var_vector[bap_start_idx:], (T, 1))

# Replace each stream by the trajectory generated from it and its variances.
mgc = paramgen.mlpg(mgc, mgc_variances, windows)
lf0 = paramgen.mlpg(lf0, lf0_variances, windows)
bap = paramgen.mlpg(bap, bap_variances, windows)

In [19]:
# Work on a copy so `lf0` itself is left untouched.
f0 = lf0.copy()
# Frames whose voicing value is below 0.5 are zeroed ...
f0[vuv < 0.5] = 0
# ... and every remaining non-zero entry is exponentiated.
nonzero_idx = np.nonzero(f0)
f0[nonzero_idx] = np.exp(f0[nonzero_idx])

In [20]:
# Convert the generated mgc and bap trajectories into the spectral envelope and
# aperiodicity that pyworld.synthesize takes.
spectrogram = pysptk.mc2sp(mgc, alpha=alpha, fftlen=fftlen)
bap_float64 = bap.astype(np.float64)
aperiodicity = pyworld.decode_aperiodicity(bap_float64, sr, fftlen)
   

In [21]:
# Every array is cast to float64 before being handed to pyworld.synthesize.
generated_waveform = pyworld.synthesize(
    f0.flatten().astype(np.float64),
    spectrogram.astype(np.float64),
    aperiodicity.astype(np.float64),
    sr,
    frame_period,
)

In [22]:
# Peak-normalise to the int16 range before writing.
# - Use the absolute peak so a dominant negative excursion cannot overflow.
# - Scale to 32767 (not 32768): +32768 does not fit in int16 and would wrap.
# - A silent signal (peak == 0) is written as zeros instead of dividing by 0.
peak = np.max(np.abs(generated_waveform))
gain = 32767.0 / peak if peak > 0 else 0.0
x2 = generated_waveform * gain
x2 = x2.astype(np.int16)
wavfile.write("gen.wav", sr, x2)

# Read the encoded file back as bytes for the Audio widget.
with open("gen.wav", 'rb') as fd:
    contents = fd.read()

In [23]:
# Audio player for the synthesised wav bytes; playback does not loop.
au = Audio(
    value=contents,
    format="wav",
    loop=False,
)

In [24]:
# F0 contour drawn as a scatter plot (one point per frame) with a tooltip
# showing the y value.
x_sc = bq.LinearScale()
y_sc = bq.LinearScale()
x_ax = bq.Axis(label='time', scale=x_sc)
y_ax = bq.Axis(label='F0', scale=y_sc, orientation="vertical")

y_data = f0.flatten()
x_data = np.arange(len(y_data))

def_tt = bq.Tooltip(fields=['y'], formats=['.2f'])
line_chart = bq.Scatter(
    x=x_data,
    y=y_data,
    tooltip=def_tt,
    scales={'x': x_sc, 'y': y_sc},
    default_size=5,
)

In [25]:
# Line chart of the first five mgc columns, one line per column, with a legend
# and a tooltip showing the line name and point index.
x_sc2 = bq.LinearScale()
y_sc2 = bq.LinearScale()
x_ax2 = bq.Axis(label='time', scale=x_sc2)
y_ax2 = bq.Axis(label='mgc', scale=y_sc2, orientation="vertical")
def_tt2 = bq.Tooltip(fields=['name', 'index'], formats=['', '.2f'])

x_data = np.arange(mgc.shape[0])
y_data = mgc[:, :5]

line_chart_2 = bq.Lines(
    x=x_data,
    y=y_data.T,  # transposed so that each row is one line
    tooltip=def_tt2,
    scales={'x': x_sc2, 'y': y_sc2},
    labels=["mgc 1", "mgc 2", "mgc 3", "mgc 4", "mgc 5"],
    display_legend=True,
)

In [26]:
# One figure per chart; each gets its own Layout instance of the same size.
fig1 = bq.Figure(
    marks=[line_chart],
    axes=[x_ax, y_ax],
    layout=Layout(width="700px", height="500px"),
)
fig2 = bq.Figure(
    marks=[line_chart_2],
    axes=[x_ax2, y_ax2],
    layout=Layout(width="700px", height="500px"),
)

In [27]:
# Two sliders in [-2, 2] whose values are passed to parms_gen as z1 and z2 by
# the polling loop in `work`.
wA1 = FloatSlider(
    value=0,
    min=-2,
    max=2,
    step=0.01,
    description="control 1:",
    layout=Layout(width='490px', margin='0 0 5px 0'),
)
wA2 = FloatSlider(
    value=0,
    min=-2,
    max=2,
    step=0.01,
    description="control 2:",
    layout=Layout(width='490px', margin='0 0 5px 0'),
)

In [28]:
# Stack both figures, both sliders and the audio player vertically.
# The border width was "spx", which is not a valid CSS length, so the border
# declaration was ignored; "1px" is the intended value.
box = VBox(
    children=(fig1, fig2, wA1, wA2, au),
    layout=Layout(border="solid 1px gray", width="900px"),
)

In [29]:
def _parms_gen_ops():
    """Return the TensorFlow ops needed by ``parms_gen``, building them once.

    Creating a Saver, an initializer or an assign op adds nodes to the graph.
    Doing so on every call makes the shared model graph grow without bound as
    the sliders are moved, so the ops are cached on this function instead.
    The cache is rebuilt if ``tensorflow_models.graph`` is a different graph
    object (e.g. the model cell was re-run).
    """
    graph = tensorflow_models.graph
    cached = getattr(_parms_gen_ops, '_cache', None)
    if cached is None or cached['graph'] is not graph:
        with graph.as_default():
            embeddings = graph.get_tensor_by_name('utt-embeddings:0')
            # Placeholder through which the edited table is fed to the assign
            # op, so the new values are not baked into the graph as constants.
            new_value = tf.placeholder(embeddings.dtype.base_dtype,
                                       shape=embeddings.shape)
            cached = {
                'graph': graph,
                'saver': tf.train.Saver(),
                'init': tf.global_variables_initializer(),
                'embeddings': embeddings,
                'new_value': new_value,
                'assign': tf.assign(embeddings, new_value),
            }
        _parms_gen_ops._cache = cached
    return cached


def _parms_gen_decode(y_predict):
    """Convert normalised network output into ``(f0, mgc, bap)`` trajectories."""
    # Undo the mean/std normalisation.
    norm_features = y_predict * cmp_std_vector + cmp_mean_vector
    T = norm_features.shape[0]
    cmp_var_vector = cmp_std_vector**2

    def generate(start, stop):
        # Run MLPG on one stream, repeating its global variance for all frames.
        variances = np.tile(cmp_var_vector[start:stop], (T, 1))
        return paramgen.mlpg(norm_features[:, start:stop], variances, windows)

    mgc = generate(None, lf0_start_idx)
    lf0 = generate(lf0_start_idx, vuv_start_idx)
    bap = generate(bap_start_idx, None)
    vuv = norm_features[:, vuv_start_idx]

    # Zero frames whose voicing value is below 0.5, exponentiate the rest.
    f0 = lf0.copy()
    f0[vuv < 0.5] = 0
    nonzero_idx = np.nonzero(f0)
    f0[nonzero_idx] = np.exp(f0[nonzero_idx])
    return f0, mgc, bap


def parms_gen(z1, z2, utt_index=452):
    """Generate acoustic parameters with one embedding row set to ``(z1, z2)``.

    The latest checkpoint is restored into a fresh session, row ``utt_index``
    of the 'utt-embeddings' table is overwritten with ``[z1, z2]``, and the
    network is run on the pre-loaded test utterance. The edit only lives in
    that session; the checkpoint on disk is not modified.

    Parameters
    ----------
    z1, z2 : float
        The two values written into the selected embedding row.
    utt_index : int, optional
        Row of the embedding table to overwrite. Defaults to 452, the value
        previously hard-coded here (presumably the row used by the test
        utterance -- verify against ``test_lab_x``).

    Returns
    -------
    tuple
        ``(f0, mgc, bap)`` after MLPG; ``f0`` is zero where the predicted
        voicing value is below 0.5 and the exponential of the generated lf0
        elsewhere.
    """
    ops = _parms_gen_ops()
    with tensorflow_models.graph.as_default():
        with tf.Session() as sess:
            sess.run(ops['init'])
            latest_ckpt = tf.train.latest_checkpoint(tensorflow_models.ckpt_dir, latest_filename=None)
            ops['saver'].restore(sess, latest_ckpt)

            # Fetch the table, edit one row and write the whole table back.
            table = sess.run(ops['embeddings'])
            table[utt_index] = [z1, z2]
            sess.run(ops['assign'], feed_dict={ops['new_value']: table})

            y_predict = sess.run(
                tensorflow_models.output_layer,
                feed_dict={
                    tensorflow_models.input_lin_layer: test_lin_x,
                    tensorflow_models.utt_index_t: test_lab_x,
                    tensorflow_models.is_training_batch: False,
                },
            )
    # Post-processing is pure NumPy and needs neither the session nor the graph.
    return _parms_gen_decode(y_predict)

In [30]:
def work():
    """Poll the two sliders forever and re-synthesise whenever either changes.

    On a change the acoustic parameters are regenerated with ``parms_gen``,
    both charts are updated, the waveform is synthesised, written to
    ``gen.wav`` and loaded into the audio widget. Intended to run in a
    background thread; it never returns.
    """
    import time  # local import so this cell does not depend on another edit

    poll_interval = 0.05  # seconds to sleep when nothing changed (avoids a busy-wait)
    old_w1 = 0
    old_w2 = 0
    while True:
        # Snapshot both values once. Re-reading the widgets after the (slow)
        # synthesis could record a newer value as "done" and lose that change.
        w1, w2 = wA1.value, wA2.value
        if old_w1 == w1 and old_w2 == w2:
            time.sleep(poll_interval)
            continue

        f0, mgc, bap = parms_gen(w1, w2)
        line_chart.y = f0.flatten()
        line_chart_2.y = mgc[:,0:5].T

        spectrogram = pysptk.mc2sp(mgc, fftlen=fftlen, alpha=alpha)
        aperiodicity = pyworld.decode_aperiodicity(bap.astype(np.float64), sr, fftlen)
        generated_waveform = pyworld.synthesize(f0.flatten().astype(np.float64),
                                                spectrogram.astype(np.float64),
                                                aperiodicity.astype(np.float64),
                                                sr, frame_period)

        # Peak-normalise into int16: use the absolute peak and 32767 so neither
        # polarity overflows, and guard against an all-zero signal.
        peak = np.max(np.abs(generated_waveform))
        gain = 32767.0 / peak if peak > 0 else 0.0
        x2 = (generated_waveform * gain).astype(np.int16)

        wavfile.write("gen.wav", sr, x2)
        with open("gen.wav", 'rb') as fd:
            contents = fd.read()
        au.value = contents

        old_w1, old_w2 = w1, w2
        

In [31]:
# `work` loops forever, so run it as a daemon thread: a non-daemon thread would
# keep the interpreter alive and block a clean shutdown.
thread = threading.Thread(target=work)
thread.daemon = True
display(box)
thread.start()

VBox(children=(Figure(axes=[Axis(label='time', scale=LinearScale(), side='bottom'), Axis(label='F0', orientati…