In [1]:
import os

# Blank out CUDA_VISIBLE_DEVICES before TensorFlow is imported in the next cell
# (presumably to keep this export/inference run on CPU — confirm).
os.environ['CUDA_VISIBLE_DEVICES'] = ''

In [2]:
import tensorflow as tf
import numpy as np
from glob import glob
from itertools import cycle

# Collect the saved mel arrays and wrap the list in an endless iterator;
# `generate` pulls one path per example from `file_cycle`.
mels = glob('output-universal/mels/*.npy')
file_cycle = cycle(mels)

In [3]:
# Directory with the training checkpoints; resolve the most recent one
# (the displayed value is what gets restored into the session later).
path = 'universal-melgan-1024'
ckpt_path = tf.train.latest_checkpoint(path)
ckpt_path

'universal-melgan-1024/model.ckpt-220000'

In [4]:
def generate(batch_max_steps = 8192, hop_size = 256):
    """Endlessly yield ``{'mel': ..., 'audio': ...}`` examples loaded from disk.

    Each iteration takes the next path from the module-level ``file_cycle``,
    loads it as the mel array, and loads the matching audio array from the
    same path with every ``'mels'`` substring replaced by ``'audios'``.

    ``batch_max_steps`` and ``hop_size`` are accepted but not used by the body.
    """
    while True:
        mel_path = next(file_cycle)
        # The audio file lives at the same path with 'mels' swapped for 'audios'.
        audio_path = mel_path.replace('mels', 'audios')

        example = {'mel': np.load(mel_path)}
        example['audio'] = np.load(audio_path)
        yield example

In [5]:
# Wrap the Python generator as a tf.data pipeline. Each element is a dict with a
# variable-length float32 'mel' of shape [?, 80] and a variable-length 1-D float32 'audio'.
dataset = tf.data.Dataset.from_generator(
    generate,
    {'mel': tf.float32, 'audio': tf.float32},
    output_shapes = {
        'mel': tf.TensorShape([None, 80]),
        'audio': tf.TensorShape([None]),
    },
)
# NOTE: `make_one_shot_iterator` emits a deprecation warning in this TF version;
# the warning suggests `tf.compat.v1.data.make_one_shot_iterator(dataset)` as the replacement.
features = dataset.make_one_shot_iterator().get_next()
features

Instructions for updating:
Use `for ... in dataset:` to iterate over a dataset. If using `tf.estimator`, return the `Dataset` object directly from your input function. As a last resort, you can use `tf.compat.v1.data.make_one_shot_iterator(dataset)`.


{'mel': <tf.Tensor 'IteratorGetNext:1' shape=(?, 80) dtype=float32>,
 'audio': <tf.Tensor 'IteratorGetNext:0' shape=(?,) dtype=float32>}

In [6]:
import malaya_speech
import malaya_speech.train
from malaya_speech.train.model import universal_melgan as melgan
import malaya_speech.config

# NOTE(review): this binds the library's config object directly (no copy), so the
# `filters` override below presumably mutates malaya_speech.config's dict in place — confirm.
melgan_config = malaya_speech.config.universal_melgan_config
# Override the generator width to 1024 filters (the checkpoint directory is named
# 'universal-melgan-1024'; presumably the two must match — confirm).
melgan_config['melgan_generator_params']['filters'] = 1024
# Instantiate the generator from the (modified) generator parameters.
generator = melgan.Generator(
    melgan.GeneratorConfig(**melgan_config['melgan_generator_params']),
    name = 'universalmelgan-generator',
)






The TensorFlow contrib module will not be included in TensorFlow 2.0.
For more information, please see:
  * https://github.com/tensorflow/community/blob/master/rfcs/20180907-contrib-sunset.md
  * https://github.com/tensorflow/addons
  * https://github.com/tensorflow/io (for I/O related ops)
If you depend on functionality not listed there, please file an issue.





In [7]:
# Run the generator on the dataset's mel tensor. Wrapping it in a list adds a
# leading axis of size 1, which is why the output is reported with shape (1, ?, 1).
y_hat = generator([features['mel']], training = False)
y_hat

Instructions for updating:
If using Keras pass *_constraint arguments to layers.


<tf.Tensor 'universalmelgan-generator/sequential/activation_6/Tanh:0' shape=(1, ?, 1) dtype=float32>

In [8]:
# Second call of the same generator object, this time fed from a placeholder of
# shape [?, ?, 80] so arbitrary mels can be supplied through feed_dict.
# This is the first placeholder created in the graph, so its node is named
# 'Placeholder' — the export cells below look it up by that name.
x = tf.placeholder(tf.float32, [None, None, 80])
y_hat_ = generator(x, training = False)
y_hat_

<tf.Tensor 'universalmelgan-generator_1/sequential/activation_6/Tanh:0' shape=(?, ?, 1) dtype=float32>

In [9]:
# Give the placeholder-fed output a stable node name ('logits') so it can be
# selected as the output node when freezing and fetched by name afterwards.
y_hat_ = tf.identity(y_hat_, name = 'logits')

In [10]:
# Open a session on the default graph and initialize every global variable;
# the checkpoint restore in the next cell then overwrites the restored ones.
sess = tf.InteractiveSession()
sess.run(tf.global_variables_initializer())

In [11]:
# Restore every variable in the GLOBAL_VARIABLES collection from the checkpoint
# resolved earlier (`ckpt_path`).
var_list = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES)
saver = tf.train.Saver(var_list = var_list)
saver.restore(sess, ckpt_path)

INFO:tensorflow:Restoring parameters from universal-melgan-1024/model.ckpt-220000


In [12]:
import IPython.display as ipd

In [13]:
%%time
# Fetch one dataset example and the generator output for that same example in a
# single run call, so `f` (features dict) and `y_` (generated audio) correspond.
f, y_ = sess.run([features, y_hat])

CPU times: user 8.59 s, sys: 1.02 s, total: 9.62 s
Wall time: 592 ms


In [14]:
# Play the reference waveform stored alongside the sampled mel.
ipd.Audio(f['audio'], rate = 22050)

In [15]:
# Play the generator output for the same example (first batch item, last axis index 0).
ipd.Audio(y_[0,:,0], rate = 22050)

In [16]:
# Normalization statistics for the mel features; index 0 is used as the mean and
# index 1 as the scale by the cells below.
mel_stats = np.load('universal-stats/stats.npy')

In [17]:
from sklearn.preprocessing import StandardScaler

# Populate a StandardScaler by hand from the saved statistics instead of fitting it.
# NOTE(review): no other visible cell references `standard`; `get_mel` normalizes
# with `mel_stats` directly — this cell may be removable.
standard = StandardScaler()
standard.mean_ = mel_stats[0]
standard.scale_ = mel_stats[1]

In [18]:
import librosa

# Feature-extraction settings read by `get_mel`: sampling rate, STFT size / hop /
# window (win_length=None is passed straight through to librosa.stft), and the
# mel filterbank size and frequency range.
# `global_gain_scale` and the `trim_*` keys are not read by `get_mel` in this notebook.
config = {'sampling_rate': 22050,
 'fft_size': 1024,
 'hop_size': 256,
 'win_length': None,
 'window': 'hann',
 'num_mels': 80,
 'fmin': 80,
 'fmax': 7600,
 'global_gain_scale': 1.0,
 'trim_silence': True,
 'trim_threshold_in_db': 20,
 'trim_frame_size': 2048,
 'trim_hop_size': 512}

def get_mel(file):
    """Load an audio file and compute its normalized log-mel spectrogram.

    Uses the module-level ``config`` for STFT / filterbank settings and
    ``mel_stats`` (index 0 = mean, index 1 = scale) for normalization.

    Parameters
    ----------
    file : str
        Path handed to ``malaya_speech.load``; audio is loaded at
        ``config['sampling_rate']``.

    Returns
    -------
    audio : np.ndarray
        The waveform, edge-padded by ``fft_size`` samples and then cut to at
        most ``len(mel) * hop_size`` samples so it lines up with the mel frames.
    mel : np.ndarray
        ``log10`` mel spectrogram, transposed so frames are on the first axis,
        then standardized as ``(mel - mel_stats[0]) / mel_stats[1]``.
    """
    sampling_rate = config['sampling_rate']
    audio, _ = malaya_speech.load(file, sr = sampling_rate)
    D = librosa.stft(
        audio,
        n_fft=config['fft_size'],
        hop_length=config['hop_size'],
        win_length=config['win_length'],
        window=config['window'],
        pad_mode='reflect',
    )
    # Keep only the magnitude; the phase is discarded.
    S, _ = librosa.magphase(D)
    fmin = 0 if config["fmin"] is None else config["fmin"]
    # Fall back to the Nyquist frequency when no upper bound is configured.
    # (The original referenced an undefined `sampling_rate` name here, which
    # raised NameError whenever config['fmax'] was None.)
    fmax = sampling_rate // 2 if config["fmax"] is None else config["fmax"]
    mel_basis = librosa.filters.mel(
        sr=sampling_rate,
        n_fft=config["fft_size"],
        n_mels=config["num_mels"],
        fmin=fmin,
        fmax=fmax,
    )
    # Floor at 1e-10 before the log to avoid log10(0).
    mel = np.log10(np.maximum(np.dot(mel_basis, S), 1e-10)).T
    # Pad, then trim, so the audio length is tied to the number of mel frames.
    audio = np.pad(audio, (0, config["fft_size"]), mode="edge")
    audio = audio[: len(mel) * config['hop_size']]
    mel = (mel - mel_stats[0]) / mel_stats[1]
    return audio, mel

In [19]:
# Extract the waveform and normalized mel features for a local test recording.
audio, mel = get_mel('test-2.wav')

In [20]:
# Inspect the normalized mel features (numpy abbreviates the printed array).
mel

array([[-0.14638856, -0.01382679, -0.16360685, ...,  1.7198304 ,
         1.3034906 ,  1.6417838 ],
       [-0.40992635, -0.05997385, -0.03319113, ...,  1.6473283 ,
         1.361473  ,  1.6237243 ],
       [-0.5363678 , -0.12396435,  0.04365544, ...,  1.706375  ,
         1.4786478 ,  1.5164778 ],
       ...,
       [ 1.0116304 ,  0.3580254 ,  0.6754571 , ..., -0.01249518,
        -0.0976165 , -0.16462201],
       [ 0.9467808 ,  0.31517428,  0.7169664 , ..., -0.10384646,
        -0.23302224, -0.18855421],
       [ 0.9138673 ,  0.46920583,  0.710272  , ..., -0.30128983,
        -0.13648339, -0.21230017]], dtype=float32)

In [21]:
# Vocode the mel from 'test-2.wav' through the placeholder-fed graph (batch of one)
# and play the result.
y_ = sess.run(y_hat_, feed_dict = {x: [mel]})
ipd.Audio(y_[0,:,0], rate = 22050)

In [None]:
import pickle

# NOTE(review): pickle.load executes arbitrary code from the file — only use with
# a trusted 'a.pkl'. The cells below index `data` at 0 and 1 and feed each item
# as a mel input; the file's origin is not shown in this notebook.
with open('a.pkl', 'rb') as fopen:
    data = pickle.load(fopen)

In [None]:
# Vocode the first pickled item as a single-element batch and play it.
y_ = sess.run(y_hat_, feed_dict = {x: [data[0]]})
ipd.Audio(y_[0,:,0], rate = 22050)

In [None]:
# Same as the previous cell, for the second pickled item.
y_ = sess.run(y_hat_, feed_dict = {x: [data[1]]})
ipd.Audio(y_[0,:,0], rate = 22050)

In [22]:
# Re-save the session (which now includes the 'Placeholder' -> 'logits' path) into
# the output directory; `freeze_graph` later reads this checkpoint and its .meta file.
saver = tf.train.Saver()
saver.save(sess, 'universal-melgan-1024-output/model.ckpt')

'universal-melgan-1024-output/model.ckpt'

In [23]:
# Build the comma-separated list of node names to keep when freezing:
# variable / gather ops plus anything named like the input placeholder or the
# 'logits' output, minus optimizer, step-counter and bookkeeping nodes.
strings = ','.join(
    node.name
    for node in tf.get_default_graph().as_graph_def().node
    if (
        'Variable' in node.op
        or 'gather' in node.op.lower()
        or 'Placeholder' in node.name
        or 'logits' in node.name
    )
    and not any(
        token in node.name
        for token in ('adam', 'global_step', 'Assign', 'ReadVariableOp', 'Gather')
    )
)
strings.split(',')

['Placeholder', 'logits']

In [24]:
def freeze_graph(model_dir, output_node_names):
    """Freeze the checkpoint recorded in ``model_dir`` into ``frozen_model.pb``.

    The checkpoint's meta graph is imported into a fresh graph, its variables
    are restored and converted to constants, and the resulting GraphDef is
    written as ``frozen_model.pb`` in the checkpoint's directory. The number of
    ops in the frozen graph is printed.

    Parameters
    ----------
    model_dir : str
        Directory containing the checkpoint state and the ``.meta`` file.
    output_node_names : str
        Comma-separated node names to keep as outputs of the frozen graph.

    Raises
    ------
    AssertionError
        If ``model_dir`` does not exist, or contains no checkpoint state.
    """
    if not tf.gfile.Exists(model_dir):
        raise AssertionError(
            "Export directory doesn't exists. Please specify an export "
            'directory: %s' % model_dir
        )

    checkpoint = tf.train.get_checkpoint_state(model_dir)
    # get_checkpoint_state returns None when the directory holds no checkpoint
    # state; fail with a clear message instead of an AttributeError below.
    if checkpoint is None or not checkpoint.model_checkpoint_path:
        raise AssertionError(
            'No checkpoint found in export directory: %s' % model_dir
        )
    input_checkpoint = checkpoint.model_checkpoint_path

    # Write the frozen graph next to the checkpoint files.
    absolute_model_dir = '/'.join(input_checkpoint.split('/')[:-1])
    output_graph = absolute_model_dir + '/frozen_model.pb'
    clear_devices = True
    # Use a fresh graph so nothing from the notebook's default graph leaks in.
    with tf.Session(graph = tf.Graph()) as sess:
        saver = tf.train.import_meta_graph(
            input_checkpoint + '.meta', clear_devices = clear_devices
        )
        saver.restore(sess, input_checkpoint)
        output_graph_def = tf.graph_util.convert_variables_to_constants(
            sess,
            tf.get_default_graph().as_graph_def(),
            output_node_names.split(','),
        )
        with tf.gfile.GFile(output_graph, 'wb') as f:
            f.write(output_graph_def.SerializeToString())
        print('%d ops in the final graph.' % len(output_graph_def.node))

In [25]:
# Freeze the re-saved checkpoint, keeping the node names selected in `strings`.
freeze_graph('universal-melgan-1024-output', strings)

INFO:tensorflow:Restoring parameters from universal-melgan-1024-output/model.ckpt
Instructions for updating:
Use `tf.compat.v1.graph_util.convert_variables_to_constants`
Instructions for updating:
Use `tf.compat.v1.graph_util.extract_sub_graph`
INFO:tensorflow:Froze 82 variables.
INFO:tensorflow:Converted 82 variables to const ops.
1122 ops in the final graph.


In [26]:
def load_graph(frozen_graph_filename):
    """Read a serialized GraphDef from disk and import it into a new ``tf.Graph``.

    ``tf.import_graph_def`` is called without a ``name`` argument, so callers in
    this notebook look tensors up under the ``import/`` prefix.

    Returns the newly created graph.
    """
    graph_def = tf.GraphDef()
    with tf.gfile.GFile(frozen_graph_filename, 'rb') as pb_file:
        graph_def.ParseFromString(pb_file.read())

    graph = tf.Graph()
    with graph.as_default():
        tf.import_graph_def(graph_def)

    return graph

In [27]:
# Load the frozen (float) graph written by `freeze_graph` to verify it standalone.
g = load_graph('universal-melgan-1024-output/frozen_model.pb')

In [28]:
# Separate session bound to the imported frozen graph (distinct from `sess`).
test_sess = tf.InteractiveSession(graph = g)



In [29]:
# Look up the input and output tensors of the imported graph; `load_graph`
# imports without a name, so nodes sit under the 'import/' prefix.
# (Plain string literals: the original f-strings had no placeholders.)
X = g.get_tensor_by_name('import/Placeholder:0')
logits = g.get_tensor_by_name('import/logits:0')

In [30]:
# Run the frozen graph on the test mel (batch of one) and play the output.
y_ = test_sess.run(logits, feed_dict = {X: [mel]})
ipd.Audio(y_[0,:,0], rate = 22050)

In [31]:
from tensorflow.tools.graph_transforms import TransformGraph

In [32]:
# Ordered list of Graph Transform Tool passes applied to the frozen graph.
# `quantize_weights` is the pass the '.quantized' output file is named after;
# the others clean up / fold / reorder nodes around it.
transforms = ['add_default_attributes',
             'remove_nodes(op=Identity, op=CheckNumerics)',
             'fold_batch_norms',
             'fold_old_batch_norms',
             'quantize_weights(fallback_min=-1024, fallback_max=1024)',
             'strip_unused_nodes',
             'sort_by_execution_order']

In [33]:
# Path of the frozen graph to transform; the output is written to f'{pb}.quantized'.
pb = 'universal-melgan-1024-output/frozen_model.pb'

In [34]:
# Read the frozen GraphDef from disk. tf.gfile.GFile replaces the deprecated
# tf.gfile.FastGFile and matches the file handling used elsewhere in this notebook.
input_graph_def = tf.GraphDef()
with tf.gfile.GFile(pb, 'rb') as f:
    input_graph_def.ParseFromString(f.read())

# Apply the transform passes between the named input and output nodes.
transformed_graph_def = TransformGraph(
    input_graph_def, ['Placeholder'], ['logits'], transforms
)

# Store the transformed graph next to the original with a '.quantized' suffix.
with tf.gfile.GFile(f'{pb}.quantized', 'wb') as f:
    f.write(transformed_graph_def.SerializeToString())

Instructions for updating:
Use tf.gfile.GFile.


In [35]:
# Load the transformed graph; this rebinds `g`, replacing the float graph loaded earlier.
g = load_graph('universal-melgan-1024-output/frozen_model.pb.quantized')

In [36]:
# New session on the quantized graph, then look up its input/output tensors
# under the 'import/' prefix added by `load_graph`.
# (Plain string literals: the original f-strings had no placeholders.)
test_sess = tf.InteractiveSession(graph = g)
X = g.get_tensor_by_name('import/Placeholder:0')
logits = g.get_tensor_by_name('import/logits:0')

In [37]:
# Run the quantized graph on the same test mel and play the output, for a
# by-ear comparison with the float graph's result.
y_ = test_sess.run(logits, feed_dict = {X: [mel]})
ipd.Audio(y_[0,:,0], rate = 22050)