In [1]:
import os

os.environ['CUDA_VISIBLE_DEVICES'] = ''
os.environ['TF_FORCE_GPU_ALLOW_GROWTH'] = 'true'

In [2]:
import tensorflow as tf
import numpy as np
import malaya_speech
import malaya_speech.train
import malaya_speech.config
import malaya_speech.train as train
from malaya_speech.train.model.vits import model
from malaya_speech.train.model.vits import commons
from malaya_speech.train.model import vits
from malaya_speech.train.model.vits import gen
from malaya_speech.train.model import fastspeech2
from malaya_speech.train.model.fastspeech2 import model_stochastic
from librosa.filters import mel as librosa_mel_fn
from glob import glob
import random
import re



2022-08-27 17:13:07.000570: I tensorflow/stream_executor/platform/default/dso_loader.cc:49] Successfully opened dynamic library libcudart.so.11.0
  from .autonotebook import tqdm as notebook_tqdm


In [3]:
_pad = 'pad'
_start = 'start'
_eos = 'eos'
_punctuation = "!'(),.:;? "
_special = '-'
_letters = 'ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz'

MALAYA_SPEECH_SYMBOLS = (
    [_pad, _start, _eos] + list(_special) + list(_punctuation) + list(_letters)
)

hparams = vits.HParams(**malaya_speech.config.vits_base_config)
spec_channels = hparams.data.filter_length // 2 + 1
segment_size = hparams.train.segment_size // hparams.data.hop_length

melbank = librosa_mel_fn(hparams.data.sampling_rate, hparams.data.filter_length,
                         hparams.data.n_mel_channels, hparams.data.mel_fmin, hparams.data.mel_fmax)

MEL = tf.convert_to_tensor(melbank)

  melbank = librosa_mel_fn(hparams.data.sampling_rate, hparams.data.filter_length,


In [4]:
def dynamic_range_compression(x, C=1, clip_val=1e-5):
    """
    PARAMS
    ------
    C: compression factor
    """
    return tf.log(tf.clip_by_value(x, clip_val, tf.reduce_max(x)) * C)


def dynamic_range_decompression(x, C=1):
    """
    PARAMS
    ------
    C: compression factor used to compress
    """
    return tf.exp(x) / C


def spectral_normalize(magnitudes):
    output = dynamic_range_compression(magnitudes)
    return output


def spectral_de_normalize_torch(magnitudes):
    output = dynamic_range_decompression(magnitudes)
    return output


def spectrogram_tf(audio_norm, filter_length, hop_length):
    p = int((filter_length-hop_length)/2)
    padded = tf.pad(audio_norm, [[p, p]], mode='reflect')
    spec = tf.abs(tf.signal.stft(
        padded,
        filter_length,
        hop_length,
        fft_length=None,
        window_fn=tf.signal.hann_window,
        pad_end=False,
    ))
    spec = tf.sqrt(spec ** 2 + 1e-6)
    return spec


def spec_to_mel(spec):
    spec = tf.matmul(spec, tf.transpose(MEL))
    spec = spectral_normalize(spec)
    return spec

def average_by_duration(x, durs):
    mel_len = durs.sum()
    durs_cum = np.cumsum(np.pad(durs, (1, 0)))

    x_char = np.zeros((durs.shape[0],), dtype=np.float32)
    for idx, start, end in zip(range(mel_len), durs_cum[:-1], durs_cum[1:]):
        values = x[start:end][np.where(x[start:end] != 0.0)[0]]
        x_char[idx] = np.mean(values) if len(values) > 0 else 0.0

    return x_char.astype(np.float32)


def get_alignment(f):
    f = f"tacotron2-osman-alignment/{f.split('speech-bahasa/')[1].replace('/', '-')}"
    if os.path.exists(f):
        return np.load(f)
    else:
        return None

def generate(files):
    while True:
        random.shuffle(files)
        for f in files:
            f = f.decode() if isinstance(f, bytes) else f
            
            alignment = get_alignment(f)
            if alignment is None:
                continue
                
            wav = np.load(f.replace('mels', 'audios'))
            wav_len = len(wav)
            if wav_len > maxlen or wav_len < minlen:
                continue

            text_ids = np.load(f.replace('mels', 'text_ids'), allow_pickle=True)[
                0
            ]
            text_ids = ''.join([c for c in text_ids if c in MALAYA_SPEECH_SYMBOLS])
            text_ids = re.sub(r'[ ]+', ' ', text_ids).strip()
            text_input = np.array(
                [
                    MALAYA_SPEECH_SYMBOLS.index(c)
                    for c in text_ids
                ]
            )
            num_pad = pad_to - ((len(text_input) + 2) % pad_to)
            text_input = np.pad(
                text_input, ((1, 1)), 'constant', constant_values=((1, 2))
            )
            text_input = np.pad(
                text_input, ((0, num_pad)), 'constant', constant_values=0
            )
            
            f0 = np.load(f.replace('mels', 'f0s'))
            f0 = norm_mean_std(f0, f0_stat[0], f0_stat[1])
            f0 = average_by_duration(f0, alignment)
            len_f0 = [len(f0)]

            energy = np.load(f.replace('mels', 'energies'))
            energy = norm_mean_std(energy, energy_stat[0], energy_stat[1])
            energy = average_by_duration(energy, alignment)
            len_energy = [len(energy)]
            
            yield {
                'text_ids': text_input,
                'text_ids_len': [len(text_input)],
                'wav': wav,
                'wav_len': [wav_len],
                'f0': f0,
                'len_f0': len_f0,
                'energy': energy,
                'len_energy': len_energy,
                'alignment': alignment,
            }


def preprocess_inputs(example):
    s = spectrogram_tf(example['wav'], hparams.data.filter_length, hparams.data.hop_length)
    length = tf.cast(tf.shape(s)[0], tf.int32)
    length = tf.expand_dims(length, 0)
    example['inputs'] = s
    example['inputs_length'] = length
    return example


def get_dataset(
    files,
    batch_size=20,
    thread_count=24,
):
    def get():
        dataset = tf.data.Dataset.from_generator(
            generate,
            {
                'text_ids': tf.int32,
                'text_ids_len': tf.int32,
                'wav': tf.float32,
                'wav_len': tf.int32,
                'f0': tf.float32,
                'len_f0': tf.int32,
                'energy': tf.float32,
                'len_energy': tf.int32,
                'alignment': tf.int32,
            },
            output_shapes={
                'text_ids': tf.TensorShape([None]),
                'text_ids_len': tf.TensorShape([None]),
                'wav': tf.TensorShape([None]),
                'wav_len': tf.TensorShape([None]),
                'f0': tf.TensorShape([None]),
                'len_f0': tf.TensorShape([1]),
                'energy': tf.TensorShape([None]),
                'len_energy': tf.TensorShape([1]),
                'alignment': tf.TensorShape([None]),
            },
            args=(files,),
        )
        dataset = dataset.map(
            preprocess_inputs, num_parallel_calls=thread_count
        )
        dataset = dataset.padded_batch(
            batch_size,
            padded_shapes={
                'inputs': tf.TensorShape([None, spec_channels]),
                'inputs_length': tf.TensorShape([None]),
                'text_ids': tf.TensorShape([None]),
                'text_ids_len': tf.TensorShape([None]),
                'wav': tf.TensorShape([None]),
                'wav_len': tf.TensorShape([None]),
                'f0': tf.TensorShape([None]),
                'len_f0': tf.TensorShape([1]),
                'energy': tf.TensorShape([None]),
                'len_energy': tf.TensorShape([1]),
                'alignment': tf.TensorShape([None]),
            },
            padding_values={
                'inputs': tf.constant(0, dtype=tf.float32),
                'inputs_length': tf.constant(0, dtype=tf.int32),
                'text_ids': tf.constant(0, dtype=tf.int32),
                'text_ids_len': tf.constant(0, dtype=tf.int32),
                'wav': tf.constant(0, dtype=tf.float32),
                'wav_len': tf.constant(0, dtype=tf.int32),
                'f0': tf.constant(0, dtype=tf.float32),
                'len_f0': tf.constant(0, dtype=tf.int32),
                'energy': tf.constant(0, dtype=tf.float32),
                'len_energy': tf.constant(0, dtype=tf.int32),
                'alignment': tf.constant(0, dtype=tf.int32),
            },
        )
        dataset = dataset.prefetch(tf.contrib.data.AUTOTUNE)
        return dataset

    return get


def discriminator_loss(disc_real_outputs, disc_generated_outputs):
    loss = 0
    r_losses = []
    g_losses = []
    for dr, dg in zip(disc_real_outputs, disc_generated_outputs):
        r_loss = tf.reduce_mean((1-dr)**2)
        g_loss = tf.reduce_mean(dg**2)
        loss += (r_loss + g_loss)
        r_losses.append(r_loss)
        g_losses.append(g_loss)

    return loss, r_losses, g_losses


def kl_loss(z_p, logs_q, m_p, logs_p, z_mask):
    kl = logs_p - logs_q - 0.5
    kl += 0.5 * ((z_p - m_p)**2) * tf.exp(-2. * logs_p)
    kl = tf.reduce_sum(kl * z_mask)
    l = kl / tf.reduce_sum(z_mask)
    return l


def feature_loss(fmap_r, fmap_g):
    loss = 0
    for dr, dg in zip(fmap_r, fmap_g):
        for rl, gl in zip(dr, dg):
            rl = tf.stop_gradient(rl)
            loss += tf.reduce_mean(tf.abs(rl - gl))

    return loss * 2


def generator_loss(disc_outputs):
    loss = 0
    gen_losses = []
    for dg in disc_outputs:
        l = tf.reduce_mean((1-dg)**2)
        gen_losses.append(l)
        loss += l

    return loss, gen_losses

In [5]:
model = gen.Model(len(MALAYA_SPEECH_SYMBOLS), spec_channels, segment_size, **hparams.model)




In [6]:
config_fs = malaya_speech.config.fastspeech2_config
config_fs = fastspeech2.Config(
    vocab_size=len(MALAYA_SPEECH_SYMBOLS), **config_fs
)
config_fs.enable_postnet = False

In [7]:
model_fs = model_stochastic.Model(config_fs)




In [8]:
input_ids = tf.placeholder(tf.int32, [None, None])
lens = tf.placeholder(tf.int32, [None, None])
energies = tf.placeholder(tf.float32, [None, None])
energies_lengths = tf.placeholder(tf.int32, [None])
f0s = tf.placeholder(tf.float32, [None, None])
f0s_lengths = tf.placeholder(tf.int32, [None])




In [9]:
r_training = model_fs(input_ids, lens, f0s, energies, training = False)

2022-08-27 17:13:10.881662: I tensorflow/stream_executor/platform/default/dso_loader.cc:49] Successfully opened dynamic library libcuda.so.1
2022-08-27 17:13:10.884427: E tensorflow/stream_executor/cuda/cuda_driver.cc:282] failed call to cuInit: CUDA_ERROR_NO_DEVICE: no CUDA-capable device is detected
2022-08-27 17:13:10.884443: I tensorflow/stream_executor/cuda/cuda_diagnostics.cc:169] retrieving CUDA diagnostic information for host: huseincomel-desktop
2022-08-27 17:13:10.884446: I tensorflow/stream_executor/cuda/cuda_diagnostics.cc:176] hostname: huseincomel-desktop
2022-08-27 17:13:10.884492: I tensorflow/stream_executor/cuda/cuda_diagnostics.cc:200] libcuda reported version is: 470.141.3
2022-08-27 17:13:10.884504: I tensorflow/stream_executor/cuda/cuda_diagnostics.cc:204] kernel reported version is: 470.141.3
2022-08-27 17:13:10.884507: I tensorflow/stream_executor/cuda/cuda_diagnostics.cc:310] kernel version seems to match DSO: 470.141.3
2022-08-27 17:13:10.889297: I tensorflow/

In [10]:
r_training

(<tf.Tensor 'model_1/mel_before/BiasAdd:0' shape=(?, ?, 80) dtype=float32>,
 None,
 <tf.Tensor 'model_1/Sum_1:0' shape=() dtype=float32>,
 <tf.Tensor 'model_1/f0_predictor/Squeeze:0' shape=(?, ?) dtype=float32>,
 <tf.Tensor 'model_1/energy_predictor/Squeeze:0' shape=(?, ?) dtype=float32>)

In [11]:
speed_ratios = tf.placeholder(tf.float32, (), name = 'speed_ratios')
noise_scale_w = tf.placeholder(tf.float32, (), name = 'noise_scale_w')
f0_ratios = tf.placeholder(tf.float32, [None], name = 'f0_ratios')
energy_ratios = tf.placeholder(tf.float32, [None], name = 'energy_ratios')

r = model_fs.inference(input_ids, speed_ratios, f0_ratios, energy_ratios, noise_scale_w)
r

(<tf.Tensor 'mel_before/BiasAdd:0' shape=(?, ?, 80) dtype=float32>,
 None,
 <tf.Tensor 'Cast_1:0' shape=(?, ?) dtype=int32>,
 <tf.Tensor 'mul_2:0' shape=(?, ?) dtype=float32>,
 <tf.Tensor 'mul_3:0' shape=(?, ?) dtype=float32>)

In [12]:
outputs = model.infer(r[0])
y_hat = tf.identity(outputs[0,:,0], name = 'y_hat')

In [13]:
sess = tf.InteractiveSession()
sess.run(tf.global_variables_initializer())





In [14]:
path = 'e2e-fastspeech2-osman-v3'
ckpt_path = tf.train.latest_checkpoint(path)
ckpt_path

'e2e-fastspeech2-osman-v3/model.ckpt-636000'

In [15]:
tvars = tf.trainable_variables()




In [16]:
import collections
import re

def get_assignment_map_from_checkpoint(tvars, init_checkpoint):
    """
    Compute the union of the current variables and checkpoint variables.
    """
    assignment_map = {}
    initialized_variable_names = {}

    name_to_variable = collections.OrderedDict()
    for var in tvars:
        name = var.name
        m = re.match('^(.*):\\d+$', name)
        if m is not None:
            name = m.group(1)
        name_to_variable[name] = var

    init_vars = tf.train.list_variables(init_checkpoint)

    assignment_map = collections.OrderedDict()
    for x in init_vars:
        (name, var) = (x[0], x[1])
        splitted = '/'.join(name.split('/')[1:])
        if name not in name_to_variable and splitted not in name_to_variable:
            continue
        if name in name_to_variable:
            selected = name
        if splitted in name_to_variable:
            selected = splitted
        assignment_map[name] = name_to_variable[selected]
        initialized_variable_names[name] = 1
        initialized_variable_names[name + ':0'] = 1

    return (assignment_map, initialized_variable_names)

In [17]:
assignment_map, initialized_variable_names = get_assignment_map_from_checkpoint(tvars, ckpt_path)

In [18]:
!ls e2e-fastspeech2-osman-v3/

checkpoint
events.out.tfevents.1661274157.huseincomel-desktop
events.out.tfevents.1661355476.huseincomel-desktop
events.out.tfevents.1661440943.huseincomel-desktop
events.out.tfevents.1661491550.huseincomel-desktop
events.out.tfevents.1661532733.huseincomel-desktop
events.out.tfevents.1661591033.huseincomel-desktop
model.ckpt-155000.data-00000-of-00001
model.ckpt-155000.index
model.ckpt-155000.meta
model.ckpt-156000.data-00000-of-00001
model.ckpt-156000.index
model.ckpt-156000.meta
model.ckpt-157000.data-00000-of-00001
model.ckpt-157000.index
model.ckpt-157000.meta
model.ckpt-158000.data-00000-of-00001
model.ckpt-158000.index
model.ckpt-158000.meta
model.ckpt-159000.data-00000-of-00001
model.ckpt-159000.index
model.ckpt-159000.meta
model.ckpt-331000.data-00000-of-00001
model.ckpt-331000.index
model.ckpt-331000.meta
model.ckpt-332000.data-00000-of-00001
model.ckpt-332000.index
model.ckpt-332000.meta
model.ckpt-333000.data-00000-of-00001
model.ckpt-333000.ind

In [19]:
saver = tf.train.Saver(var_list = assignment_map)
saver.restore(sess, 'e2e-fastspeech2-osman-v3/model.ckpt-635000')


INFO:tensorflow:Restoring parameters from e2e-fastspeech2-osman-v3/model.ckpt-635000


In [20]:
import re
from unidecode import unidecode
import malaya

# Regular expression matching text enclosed in curly braces:
_curly_re = re.compile(r"(.*?)\{(.+?)\}(.*)")
pad_to = 8

def tts_encode(string: str, add_eos: bool = True):
    r = [MALAYA_SPEECH_SYMBOLS.index(c) for c in string if c in MALAYA_SPEECH_SYMBOLS]
    if add_eos:
        r = r + [MALAYA_SPEECH_SYMBOLS.index('eos')]
    return r

def put_spacing_num(string):
    string = re.sub('[A-Za-z]+', lambda ele: ' ' + ele[0] + ' ', string)
    return re.sub(r'[ ]+', ' ', string).strip()

def convert_to_ascii(string):
    return unidecode(string)

def collapse_whitespace(string):
    return re.sub(_whitespace_re, ' ', string)

def cleaning(string, add_eos = False):
    sequence = []
    string = convert_to_ascii(string)
    string = re.sub(r'[ ]+', ' ', string).strip()
    string = string.replace('=', ' sama dengan ')
    string = string.replace('*', ' asterisk ')
    if string[-1] in ['-', ',']:
        string = string[:-1]
    if string[-1] != '.':
        string = string + '.'
    string = put_spacing_num(string)
    string = re.sub(r'[ ]+', ' ', string).strip()
    ids = tts_encode(string, add_eos = add_eos)
    text_input = np.array(ids)
    num_pad = pad_to - ((len(text_input) + 2) % pad_to)
    text_input = np.pad(
        text_input, ((1, 1)), 'constant', constant_values = ((1, 2))
    )
    text_input = np.pad(
        text_input, ((0, num_pad)), 'constant', constant_values = 0
    )
    
    return string, text_input

In [21]:
# https://umno-online.my/2020/12/28/isu-kartel-daging-haram-lagi-pihak-gesa-kerajaan-ambil-tindakan-tegas-drastik/

t, ids = cleaning('Haqkiem adalah pelajar tahun akhir yang mengambil Ijazah Sarjana Muda Sains Komputer Kecerdasan Buatan utama dari Universiti Teknikal Malaysia Melaka (UTeM) yang kini berusaha untuk latihan industri di mana dia secara praktikal dapat menerapkan pengetahuannya dalam Perisikan Perisian dan Pengaturcaraan ke arah organisasi atau industri yang berkaitan.')
t, ids

('Haqkiem adalah pelajar tahun akhir yang mengambil Ijazah Sarjana Muda Sains Komputer Kecerdasan Buatan utama dari Universiti Teknikal Malaysia Melaka ( UTeM ) yang kini berusaha untuk latihan industri di mana dia secara praktikal dapat menerapkan pengetahuannya dalam Perisikan Perisian dan Pengaturcaraan ke arah organisasi atau industri yang berkaitan .',
 array([ 1, 21, 40, 56, 50, 48, 44, 52, 13, 40, 43, 40, 51, 40, 47, 13, 55,
        44, 51, 40, 49, 40, 57, 13, 59, 40, 47, 60, 53, 13, 40, 50, 47, 48,
        57, 13, 64, 40, 53, 46, 13, 52, 44, 53, 46, 40, 52, 41, 48, 51, 13,
        22, 49, 40, 65, 40, 47, 13, 32, 40, 57, 49, 40, 53, 40, 13, 26, 60,
        43, 40, 13, 32, 40, 48, 53, 58, 13, 24, 54, 52, 55, 60, 59, 44, 57,
        13, 24, 44, 42, 44, 57, 43, 40, 58, 40, 53, 13, 15, 60, 40, 59, 40,
        53, 13, 60, 59, 40, 52, 40, 13, 43, 40, 57, 48, 13, 34, 53, 48, 61,
        44, 57, 58, 48, 59, 48, 13, 33, 44, 50, 53, 48, 50, 40, 51, 13, 26,
        40, 51, 40, 64, 58, 48, 

In [22]:
%%time

o = sess.run(y_hat, feed_dict = {input_ids: [ids], 
                             speed_ratios: 1.0, noise_scale_w: 0.6666,
                             f0_ratios: [1.0], 
                             energy_ratios: [1.0]})

CPU times: user 32.2 s, sys: 3.58 s, total: 35.8 s
Wall time: 3.94 s


In [23]:
o.shape

(460800,)

In [24]:
import IPython.display as ipd
ipd.Audio(o, rate = 22050)

In [25]:
saver = tf.train.Saver()
saver.save(sess, 'e2e-fastspeech2-osman-output/model.ckpt')

'e2e-fastspeech2-osman-output/model.ckpt'

In [26]:
strings = ','.join(
    [
        n.name
        for n in tf.get_default_graph().as_graph_def().node
        if ('Variable' in n.op
        or 'gather' in n.op.lower()
        or 'Placeholder' in n.name
        or 'ratios' in n.name
        or 'post_mel_outputs' in n.name
        or 'decoder_output' in n.name
        or 'y_hat' in n.name
        or 'alignment_histories' in n.name)
        and 'adam' not in n.name
        and 'global_step' not in n.name
        and 'Assign' not in n.name
        and 'ReadVariableOp' not in n.name
        and 'Gather' not in n.name
        and 'IsVariableInitialized' not in n.name
    ]
)
strings.split(',')





['Variable',
 'Variable_1',
 'Variable_2',
 'Variable_3',
 'Placeholder',
 'Placeholder_1',
 'Placeholder_2',
 'Placeholder_3',
 'Placeholder_4',
 'Placeholder_5',
 'weight',
 'model_1/embeddings/position_embeddings/Placeholder',
 'model_1/decoder/position_embeddings/Placeholder',
 'speed_ratios',
 'f0_ratios',
 'energy_ratios',
 'y_hat']

In [27]:
def freeze_graph(model_dir, output_node_names):

    if not tf.gfile.Exists(model_dir):
        raise AssertionError(
            "Export directory doesn't exists. Please specify an export "
            'directory: %s' % model_dir
        )

    checkpoint = tf.train.get_checkpoint_state(model_dir)
    input_checkpoint = checkpoint.model_checkpoint_path

    absolute_model_dir = '/'.join(input_checkpoint.split('/')[:-1])
    output_graph = absolute_model_dir + '/frozen_model.pb'
    clear_devices = True
    with tf.Session(graph = tf.Graph()) as sess:
        saver = tf.train.import_meta_graph(
            input_checkpoint + '.meta', clear_devices = clear_devices
        )
        saver.restore(sess, input_checkpoint)
        output_graph_def = tf.graph_util.convert_variables_to_constants(
            sess,
            tf.get_default_graph().as_graph_def(),
            output_node_names.split(','),
        )
        with tf.gfile.GFile(output_graph, 'wb') as f:
            f.write(output_graph_def.SerializeToString())
        print('%d ops in the final graph.' % len(output_graph_def.node))

In [28]:
freeze_graph('e2e-fastspeech2-osman-output', strings)




INFO:tensorflow:Restoring parameters from e2e-fastspeech2-osman-output/model.ckpt
INFO:tensorflow:Froze 504 variables.
INFO:tensorflow:Converted 504 variables to const ops.

10213 ops in the final graph.


In [29]:
def load_graph(frozen_graph_filename):
    with tf.gfile.GFile(frozen_graph_filename, 'rb') as f:
        graph_def = tf.GraphDef()
        graph_def.ParseFromString(f.read())
                
    with tf.Graph().as_default() as graph:
        tf.import_graph_def(graph_def)
        
    return graph

In [30]:
g = load_graph('e2e-fastspeech2-osman-output/frozen_model.pb')




In [31]:
test_sess = tf.InteractiveSession(graph = g)

In [35]:
X = g.get_tensor_by_name('import/Placeholder:0')
f0_ratios = g.get_tensor_by_name('import/f0_ratios:0')
energy_ratios = g.get_tensor_by_name('import/energy_ratios:0')
speed_ratios = g.get_tensor_by_name('import/speed_ratios:0')
noise_scale_w = g.get_tensor_by_name('import/noise_scale_w:0')

In [33]:
output_nodes = ['y_hat']
outputs = {n: g.get_tensor_by_name(f'import/{n}:0') for n in output_nodes}

In [36]:
%%time

o = test_sess.run(outputs, feed_dict = {X: [ids], 
                             speed_ratios: 1.0,
                             noise_scale_w: 1.0,
                             f0_ratios: [1.0], 
                             energy_ratios: [1.0]})

CPU times: user 26.3 s, sys: 2.87 s, total: 29.2 s
Wall time: 3.36 s


In [37]:
from tensorflow.tools.graph_transforms import TransformGraph

In [38]:
transforms = ['add_default_attributes',
             'remove_nodes(op=Identity, op=CheckNumerics)',
             'fold_batch_norms',
             'fold_old_batch_norms',
             'quantize_weights(fallback_min=-1024, fallback_max=1024)',
             'strip_unused_nodes',
             'sort_by_execution_order']

In [39]:
pb = 'e2e-fastspeech2-osman-output/frozen_model.pb'

In [40]:
input_graph_def = tf.GraphDef()
with tf.gfile.FastGFile(pb, 'rb') as f:
    input_graph_def.ParseFromString(f.read())

transformed_graph_def = TransformGraph(input_graph_def, 
                                           ['Placeholder', 'speed_ratios', 'f0_ratios', 'energy_ratios',
                                           'noise_scale_w'],
                                           output_nodes, transforms)
    
with tf.gfile.GFile(f'{pb}.quantized', 'wb') as f:
    f.write(transformed_graph_def.SerializeToString())

2022-08-27 17:15:50.823877: I tensorflow/tools/graph_transforms/transform_graph.cc:318] Applying add_default_attributes
2022-08-27 17:15:50.893364: I tensorflow/tools/graph_transforms/transform_graph.cc:318] Applying remove_nodes
2022-08-27 17:15:50.954920: I tensorflow/tools/graph_transforms/remove_nodes.cc:78] Skipping replacement for y_hat
2022-08-27 17:15:51.029795: I tensorflow/tools/graph_transforms/remove_nodes.cc:78] Skipping replacement for y_hat
2022-08-27 17:15:51.168710: I tensorflow/tools/graph_transforms/transform_graph.cc:318] Applying fold_batch_norms
2022-08-27 17:15:51.278570: I tensorflow/tools/graph_transforms/transform_graph.cc:318] Applying fold_old_batch_norms
2022-08-27 17:15:51.588958: I tensorflow/tools/graph_transforms/transform_graph.cc:318] Applying quantize_weights
2022-08-27 17:15:52.136916: I tensorflow/tools/graph_transforms/transform_graph.cc:318] Applying strip_unused_nodes
2022-08-27 17:15:52.244002: I tensorflow/tools/graph_transforms/transform_grap

In [45]:
g = load_graph('e2e-fastspeech2-osman-output/frozen_model.pb.quantized')

In [42]:
!tar -cvf e2e-fastspeech2-osman-output.tar e2e-fastspeech2-osman-output

e2e-fastspeech2-osman-output/
e2e-fastspeech2-osman-output/checkpoint
e2e-fastspeech2-osman-output/frozen_model.pb.quantized
e2e-fastspeech2-osman-output/model.ckpt.index
e2e-fastspeech2-osman-output/model.ckpt.data-00000-of-00001
e2e-fastspeech2-osman-output/model.ckpt.meta
e2e-fastspeech2-osman-output/frozen_model.pb


In [43]:
from malaya_boilerplate.huggingface import upload_dict

In [44]:
files_mapping = {'e2e-fastspeech2-osman-output.tar': 'e2e-fastspeech2-osman-output.tar'}
upload_dict(model = 'pretrained-fastspeech2', files_mapping = files_mapping)

409 Client Error: Conflict for url: https://huggingface.co/api/repos/create - You already created this model repo


In [46]:
files_mapping = {'e2e-fastspeech2-osman-output/frozen_model.pb': 'model.pb'}
upload_dict(model = 'text-to-speech-e2e-fastspeech-osman', files_mapping = files_mapping)

In [47]:
files_mapping = {'e2e-fastspeech2-osman-output/frozen_model.pb.quantized': 'model.pb'}
upload_dict(model = 'text-to-speech-e2e-fastspeech-osman-quantized', files_mapping = files_mapping)