In [1]:
import os

# Export the GPU selection before TensorFlow is imported in the next cell.
# NOTE(review): '1' presumably refers to the second physical GPU on the
# training host - adjust for the machine this notebook runs on.
GPU_INDEX = '1'
os.environ['CUDA_VISIBLE_DEVICES'] = GPU_INDEX

In [2]:
import tensorflow as tf
import malaya_speech
import malaya_speech.augmentation.waveform as augmentation
import malaya_speech.augmentation.spectrogram as mask_augmentation
import malaya_speech.train.model.alconformer as conformer
import malaya_speech.train.model.transducer as transducer
import json
import random






The TensorFlow contrib module will not be included in TensorFlow 2.0.
For more information, please see:
  * https://github.com/tensorflow/community/blob/master/rfcs/20180907-contrib-sunset.md
  * https://github.com/tensorflow/addons
  * https://github.com/tensorflow/io (for I/O related ops)
If you depend on functionality not listed there, please file an issue.



In [3]:
# Subword vocabulary shared by the rest of the notebook: it supplies
# `vocab_size` for the model and is used to encode the training texts.
subword_path = 'transducer.subword'
subwords = malaya_speech.subword.load(subword_path)

In [4]:
# Hyperparameters for both halves of the model come from the packaged
# "conformer small" presets.
config = malaya_speech.config.conformer_small_encoder_config
decoder_config = malaya_speech.config.conformer_small_decoder_config

# Encoder: conformer built from the preset, with kernel/bias regularisation
# explicitly disabled.
conformer_model = conformer.Model(
    **config,
    kernel_regularizer = None,
    bias_regularizer = None,
)

# Full RNN transducer wrapped around the encoder; the output vocabulary size
# is taken from the loaded subword vocabulary.
transducer_model = transducer.rnn.Model(
    conformer_model,
    vocabulary_size = subwords.vocab_size,
    **decoder_config,
)

In [5]:
# Graph inputs for the language-model objective. The training loop feeds
# padded subword-id windows into X / Y (Y is X shifted one token ahead) and
# the unpadded lengths into X_len / Y_len.
# Creation order is kept as X, X_len, Y, Y_len so auto-generated op names
# stay the same.
X = tf.placeholder(dtype = tf.int32, shape = (None, None))
X_len = tf.placeholder(dtype = tf.int32, shape = (None,))
Y = tf.placeholder(dtype = tf.int32, shape = (None, None))
Y_len = tf.placeholder(dtype = tf.int32, shape = (None,))

In [6]:
# Run only the transducer's prediction network on the token ids and their
# lengths. The result is projected onto the vocabulary in the next cell.
# NOTE(review): presumably a (batch, time, hidden) tensor - confirm against
# the predict_net implementation.
prediction_inputs = [X, X_len]
logits = transducer_model.predict_net(prediction_inputs)


Instructions for updating:
If using Keras pass *_constraint arguments to layers.
Instructions for updating:
Use tf.where in 2.0, which has the same broadcast rule as np.where


In [7]:
# Project the prediction-network output onto the vocabulary by multiplying
# with the transposed embedding matrix of the same network, so no separate
# output layer is created.
# NOTE(review): assumes `embeddings` is (vocab, hidden) - confirm.
#
# This cell rebinds `logits`: re-running it without first re-running the
# previous cell would apply the projection to already-projected values.
embedding_table = transducer_model.predict_net.embed.embeddings
embed = tf.transpose(embedding_table)
logits = tf.matmul(logits, embed)

In [8]:
# Weight 1.0 for real target positions and 0.0 for padding, derived from the
# per-example target lengths, so padded steps do not contribute to the loss.
longest_target = tf.reduce_max(Y_len)
masks = tf.sequence_mask(Y_len, longest_target, dtype = tf.float32)

# Masked next-token cross-entropy between the projected logits and Y.
cost = tf.contrib.seq2seq.sequence_loss(
    logits = logits,
    targets = Y,
    weights = masks,
)

In [9]:
# Adam with a fixed learning rate. Despite its name, `optimizer` holds the
# op returned by `minimize()`, which the training loop passes to `sess.run`
# to apply one update.
learning_rate = 1e-4
adam = tf.train.AdamOptimizer(learning_rate = learning_rate)
optimizer = adam.minimize(cost)

In [10]:
# Load the language-model training corpus. The training loop samples items
# from `txts` and subword-encodes each one, so it is expected to be a list of
# text strings with at least `batch_size` entries.
#
# JSON text is UTF-8 by specification; the encoding is passed explicitly so
# decoding does not depend on the platform's locale default.
with open('cleaned-rnn-lm.json', encoding = 'utf-8') as fopen:
    txts = json.load(fopen)

In [11]:
# Session options: allocate GPU memory on demand instead of reserving it all
# up front.
config = tf.compat.v1.ConfigProto()
config.gpu_options.allow_growth = True

sess = tf.compat.v1.Session(config = config)

# Initialise every variable created so far, then build the saver used for
# the checkpoints written during and after training.
init_op = tf.global_variables_initializer()
sess.run(init_op)
saver = tf.train.Saver()

In [12]:
from tqdm import tqdm


def _sample_lm_pair(ids, min_len, max_len):
    """Cut a random next-token training pair out of one encoded text.

    Picks a window of between `min_len` and `max_len` tokens and returns it
    together with the same window shifted one token ahead.

    Args:
        ids: sequence of subword ids for a single text.
        min_len: smallest window length to sample.
        max_len: largest window length to sample.

    Returns:
        `(inputs, targets)`, two slices of `ids` with identical length, or
        `None` when `ids` is too short to yield a window plus the one extra
        token the shifted target needs.
    """
    if len(ids) <= min_len:
        return None
    window = random.randint(min_len, min(len(ids) - 1, max_len))
    # The last valid start leaves one token after the window; otherwise the
    # shifted target slice would run off the end and come out one token short.
    start = random.randint(0, len(ids) - window - 1)
    return ids[start: start + window], ids[start + 1: start + window + 1]


total_cost = []
steps = 100000
checkpoint = 10000
checkpoint_folder = 'transducer-rnn-small'
min_len = 2
max_len = 300
batch_size = 128

# Steps that could not be trained on are counted and their errors kept for
# inspection after the run, instead of being silently discarded.
skipped_steps = 0
step_errors = []

pbar = tqdm(range(0, steps))
# Note: `epoch` counts optimizer steps (one random batch each), not full
# passes over `txts`.
for epoch in pbar:
    try:
        batch = random.sample(txts, batch_size)
        batch = [malaya_speech.subword.encode(subwords, t) for t in batch]
        pairs = [_sample_lm_pair(ids, min_len, max_len) for ids in batch]
        pairs = [pair for pair in pairs if pair is not None]
        if not pairs:
            raise ValueError('no text in the batch has more than min_len subword ids')
        batch_x = [inputs for inputs, _ in pairs]
        batch_y = [targets for _, targets in pairs]
        batch_x, batch_x_len = malaya_speech.padding.sequence_1d(batch_x, return_len = True)
        batch_y, batch_y_len = malaya_speech.padding.sequence_1d(batch_y, return_len = True)
        _, c = sess.run([optimizer, cost], feed_dict = {X: batch_x, X_len: batch_x_len,
                                            Y: batch_y, Y_len: batch_y_len})
        total_cost.append(c)
        pbar.set_postfix(cost = c, skipped = skipped_steps)
    except Exception as exc:
        # Best effort: one bad batch must not end a multi-hour run, but it is
        # recorded. KeyboardInterrupt is not an Exception, so the loop can
        # still be interrupted from the notebook.
        skipped_steps += 1
        step_errors.append((epoch, repr(exc)))
        pbar.set_postfix(skipped = skipped_steps, last_error = type(exc).__name__)
    if epoch % checkpoint == 0:
        saver.save(sess, f'{checkpoint_folder}/model.ckpt')

100%|██████████| 100000/100000 [8:45:41<00:00,  3.17it/s, cost=3.01]  


In [13]:
# Save the final weights. This is the same path the training loop uses for
# its periodic checkpoints, so the last periodic save is superseded.
final_checkpoint_path = f'{checkpoint_folder}/model.ckpt'
saver.save(sess, final_checkpoint_path)

'transducer-rnn-small/model.ckpt'