In [1]:
import os

# Default the run to CUDA device index 1, but only when the launcher has not
# already chosen devices: an externally exported CUDA_VISIBLE_DEVICES is kept,
# so the notebook is not forced onto a GPU that may not exist on this host.
# This cell runs before the cell that imports tensorflow.
os.environ.setdefault('CUDA_VISIBLE_DEVICES', '1')

In [2]:
import sys

# Put the parent of the current working directory at the front of the import
# path. The previous spelling, abspath(__name__), passed the module *name*
# ('__main__' in a notebook) as if it were a file path; it resolved to
# <cwd>/__main__, so two dirname() calls yielded exactly this directory.
# NOTE(review): presumably the notebook is launched from a directory one level
# below the repository root, so this makes the local checkout importable.
SOURCE_DIR = os.path.dirname(os.getcwd())
sys.path.insert(0, SOURCE_DIR)

In [3]:
import malaya_speech
import malaya_speech.config
import malaya_speech.train.model.quartznet as quartznet
import tensorflow as tf
import numpy as np






The TensorFlow contrib module will not be included in TensorFlow 2.0.
For more information, please see:
  * https://github.com/tensorflow/community/blob/master/rfcs/20180907-contrib-sunset.md
  * https://github.com/tensorflow/addons
  * https://github.com/tensorflow/io (for I/O related ops)
If you depend on functionality not listed there, please file an issue.




In [4]:
# Shared speech featurizer; `preprocess_inputs` calls its `vectorize` method.
featurizer = malaya_speech.tf_featurization.STTFeaturizer(
    normalize_per_feature = True,
)

# Width of one feature frame; used to reshape features and to pad batches.
n_mels = featurizer.num_feature_bins

In [5]:
import malaya_speech.train.model.ctc as ctc
import malaya_speech.train as train

In [6]:
# Training hyper-parameters, read by `learning_rate_scheduler` and `model_fn`.
# Keys that `model_fn` looks up with `.get` but that are absent here
# ('summaries', 'larc_params', 'loss_scaling_params') fall back to None.
parameters = dict(
    # Passed as-is to the optimizer constructor by `model_fn`.
    optimizer_params = dict(
        beta1 = 0.95,
        beta2 = 0.5,
        epsilon = 1e-08,
        weight_decay = 0.001,
        grad_averaging = False,
    ),
    # Expanded into keyword arguments of the learning-rate schedule.
    lr_policy_params = dict(
        learning_rate = 0.01,
        min_lr = 0.0,
        warmup_steps = 1000,
        decay_steps = 1000,
    ),
    loss_scaling = 'Backoff',
)

def learning_rate_scheduler(global_step):
    """Return the learning rate for `global_step`.

    Delegates to `train.schedule.cosine_decay`, forwarding every entry of
    `parameters['lr_policy_params']` as a keyword argument.
    """
    schedule_kwargs = parameters['lr_policy_params']
    return train.schedule.cosine_decay(global_step, **schedule_kwargs)

In [7]:
import random
import string

def get_random_string(length):
    """Return a string of `length` randomly chosen lowercase ASCII letters."""
    alphabet = string.ascii_lowercase
    picked = [random.choice(alphabet) for _ in range(length)]
    return ''.join(picked)

def generate(sr = 16000):
    """Endlessly yield synthetic examples for smoke-testing the pipeline.

    Each example is a dict with:
      'x': Gaussian noise of `sr * seconds` samples, `seconds` drawn from 1..5,
      'targets': `malaya_speech.char.encode` applied to a random lowercase
                 string of `seconds * 10` characters.
    """
    while True:
        seconds = random.randint(1, 5)
        noise = np.random.normal(size = (sr * seconds))
        transcript = get_random_string(seconds * 10)
        yield {'x': noise, 'targets': malaya_speech.char.encode(transcript)}

In [8]:
def preprocess_inputs(example):
    """Turn a raw example into model features, in place.

    Adds 'inputs' (the featurizer output reshaped to rows of `n_mels` values)
    and 'inputs_length' (the number of rows, as a one-element int32 tensor),
    then removes every key other than 'inputs', 'inputs_length' and 'targets'.
    The same dict object that was passed in is returned.
    """
    features = featurizer.vectorize(example['x'])
    frames = tf.reshape(features, (-1, n_mels))
    n_frames = tf.expand_dims(tf.cast(tf.shape(frames)[0], tf.int32), 0)
    example['inputs'] = frames
    example['inputs_length'] = n_frames

    # Snapshot the keys to drop first: a dict cannot change size while iterated.
    kept_keys = ('inputs', 'inputs_length', 'targets')
    for key in [name for name in example if name not in kept_keys]:
        del example[key]

    return example

def get_dataset(batch_size = 4):
    """Return a zero-argument function that builds the batched dataset.

    The returned callable wraps `generate` in a `tf.data.Dataset`, maps
    `preprocess_inputs` over it and zero-pads each batch of `batch_size`
    examples to a common shape.
    """
    def get():
        raw_types = {'x': tf.float32, 'targets': tf.int32}
        raw_shapes = {
            'x': tf.TensorShape([None]),
            'targets': tf.TensorShape([None]),
        }
        examples = tf.data.Dataset.from_generator(
            generate, raw_types, output_shapes = raw_shapes
        )
        examples = examples.map(preprocess_inputs)

        # Variable-length axes are left as None; the feature axis is fixed.
        batch_shapes = {
            'inputs': tf.TensorShape([None, n_mels]),
            'inputs_length': tf.TensorShape([None]),
            'targets': tf.TensorShape([None]),
        }
        pad_values = {
            'inputs': tf.constant(0, dtype = tf.float32),
            'inputs_length': tf.constant(0, dtype = tf.int32),
            'targets': tf.constant(0, dtype = tf.int32),
        }
        return examples.padded_batch(
            batch_size,
            padded_shapes = batch_shapes,
            padding_values = pad_values,
        )

    return get

In [9]:
# Start from an empty graph so re-running this cell does not pile up ops.
tf.reset_default_graph()
sess = tf.InteractiveSession()

# `Dataset.make_one_shot_iterator()` is deprecated; use the compat.v1 helper
# that the deprecation notice recommends. `iterator` ends up bound to the
# `get_next()` result (the dict of batch tensors), which is what the next cell
# passes to `sess.run`.
iterator = tf.compat.v1.data.make_one_shot_iterator(get_dataset()()).get_next()

Instructions for updating:
Use `for ... in dataset:` to iterate over a dataset. If using `tf.estimator`, return the `Dataset` object directly from your input function. As a last resort, you can use `tf.compat.v1.data.make_one_shot_iterator(dataset)`.


In [10]:
# Pull one batch and display the shape of every entry as a sanity check.
r = sess.run(iterator)
tuple(r[key].shape for key in ('inputs', 'inputs_length', 'targets'))

((4, 498, 80), (4, 1), (4, 51))

In [11]:
def model_fn(features, labels, mode, params):
    """Estimator `model_fn`: QuartzNet encoder, dense projection, CTC loss.

    Args:
        features: dict providing 'inputs', 'inputs_length' and 'targets'
            (the keys produced by `preprocess_inputs`).
        labels: not used; targets are read from `features`.
        mode: `tf.estimator.ModeKeys.TRAIN` or `tf.estimator.ModeKeys.EVAL`.
        params: not used; hyper-parameters come from the module-level
            `parameters` dict.

    Returns:
        A `tf.estimator.EstimatorSpec` for the requested mode.

    Raises:
        ValueError: if `mode` is neither TRAIN nor EVAL. Previously such a
            mode fell through both branches and failed with an
            UnboundLocalError on the final return.
    """
    supported_modes = (
        tf.estimator.ModeKeys.TRAIN,
        tf.estimator.ModeKeys.EVAL,
    )
    if mode not in supported_modes:
        raise ValueError(
            'model_fn only supports TRAIN and EVAL modes, got: %r' % (mode,)
        )

    # 'inputs_length' is batched with a trailing axis of size 1; take column 0.
    # NOTE(review): the encoder is always built with mode = 'train', even when
    # evaluating — confirm this is intended.
    model = quartznet.Model(
        features['inputs'], features['inputs_length'][:, 0], mode = 'train'
    )
    logits = tf.layers.dense(
        model.logits['outputs'], malaya_speech.char.VOCAB_SIZE
    )
    seq_lens = model.logits['src_length']
    targets_int32 = tf.cast(features['targets'], tf.int32)

    mean_error, sum_error, sum_weight = ctc.loss.ctc_loss(
        logits, targets_int32, seq_lens
    )

    loss = mean_error
    accuracy = ctc.metrics.ctc_sequence_accuracy(
        logits, targets_int32, seq_lens
    )
    # Named aliases so the LoggingTensorHook can look the tensors up by name.
    tf.identity(loss, name = 'train_loss')
    tf.identity(accuracy, name = 'train_accuracy')

    if mode == tf.estimator.ModeKeys.TRAIN:
        train_op = train.optimizer.optimize_loss(
            loss,
            train.optimizer.NovoGrad,
            parameters['optimizer_params'],
            learning_rate_scheduler,
            summaries = parameters.get('summaries', None),
            larc_params = parameters.get('larc_params', None),
            loss_scaling = parameters.get('loss_scaling', 1.0),
            loss_scaling_params = parameters.get('loss_scaling_params', None),
        )
        estimator_spec = tf.estimator.EstimatorSpec(
            mode = mode, loss = loss, train_op = train_op
        )
    else:
        # Only EVAL can reach this branch because of the guard above.
        estimator_spec = tf.estimator.EstimatorSpec(
            mode = tf.estimator.ModeKeys.EVAL,
            loss = loss,
            eval_metric_ops = {
                'accuracy': ctc.metrics.ctc_sequence_accuracy_estimator(
                    logits, targets_int32, seq_lens
                ),
                'WER': ctc.metrics.word_error_rate_estimator(
                    logits, targets_int32
                ),
            },
        )

    return estimator_spec

In [12]:
# Log the tensors that `model_fn` names 'train_accuracy' and 'train_loss'
# on every training iteration.
train_hooks = [
    tf.train.LoggingTensorHook(
        tensors = ['train_accuracy', 'train_loss'],
        every_n_iter = 1,
    ),
]

# Input function for training: a callable that builds the batched dataset.
train_dataset = get_dataset()

In [None]:
# Short smoke run: training stops at `max_steps`, which here equals
# `save_checkpoint_step`. Output is written under `model_dir`.
train.run_training(
    model_fn = model_fn,
    train_fn = train_dataset,
    train_hooks = train_hooks,
    model_dir = 'asr-quartznet',
    num_gpus = 1,
    max_steps = 10,
    save_checkpoint_step = 10,
    log_step = 1,
)



INFO:tensorflow:Using config: {'_model_dir': 'asr-quartznet', '_tf_random_seed': None, '_save_summary_steps': 100, '_save_checkpoints_steps': 10, '_save_checkpoints_secs': None, '_session_config': allow_soft_placement: true
graph_options {
  rewrite_options {
    meta_optimizer_iterations: ONE
  }
}
, '_keep_checkpoint_max': 5, '_keep_checkpoint_every_n_hours': 10000, '_log_step_count_steps': 1, '_train_distribute': None, '_device_fn': None, '_protocol': None, '_eval_distribute': None, '_experimental_distribute': None, '_experimental_max_worker_delay_secs': None, '_session_creation_timeout_secs': 7200, '_service': None, '_cluster_spec': <tensorflow.python.training.server_lib.ClusterSpec object at 0x143ae4d90>, '_task_type': 'worker', '_task_id': 0, '_global_id_in_cluster': 0, '_master': '', '_evaluation_master': '', '_is_chief': True, '_num_ps_replicas': 0, '_num_worker_replicas': 1}
Instructions for updating:
Use Variable.read_value. Variables in 2.X are initialized automatically bo