In [1]:
import sys
import os

# Put the parent of the working directory (presumably the repository root --
# TODO confirm) at the front of sys.path so it is searched first on import.
# A notebook has no __file__, so the location is derived from os.getcwd().
# The earlier abspath(__name__) form resolved to the same directory only
# because a relative name is resolved against the working directory.
SOURCE_DIR = os.path.dirname(os.getcwd())
# Skip the insert when the entry is already first, so re-running this cell
# does not stack duplicate entries on sys.path.
if sys.path[:1] != [SOURCE_DIR]:
    sys.path.insert(0, SOURCE_DIR)

In [2]:
import malaya_speech
import malaya_speech.train as train
import malaya_speech.train.model.ctc as ctc
import numpy as np
import tensorflow as tf






The TensorFlow contrib module will not be included in TensorFlow 2.0.
For more information, please see:
  * https://github.com/tensorflow/community/blob/master/rfcs/20180907-contrib-sunset.md
  * https://github.com/tensorflow/addons
  * https://github.com/tensorflow/io (for I/O related ops)
If you depend on functionality not listed there, please file an issue.




In [3]:
# Toy token ids for the metric demo. Id 34 is used as the word separator.
# NOTE(review): in plain ASCII, space is 32 and 34 is '"'. Presumably the
# default character vocabulary shifts ASCII codes by its reserved tokens, which
# would make id 34 decode to a space -- confirm against malaya_speech.char.
# The letter labels below are shorthand for "three distinct words per row".

ref = np.asarray([
    # a b c
    [97, 34, 98, 34, 99],
    [97, 34, 98, 34, 99],
    [97, 34, 98, 34, 99],
    [97, 34, 98, 34, 99],
])

hyp = np.asarray([
    [97, 34, 98, 34, 99],   # a b c
    [97, 34, 98, 0, 0],     # a b
    [97, 34, 98, 34, 100],  # a b d
    [0, 0, 0, 0, 0],        # empty
])

# One-hot encode the hypotheses: predictions[i, j, hyp[i, j]] = 1, zeros elsewhere.
# The last axis is sized by the library's default vocabulary size.
predictions = np.zeros((len(ref), hyp.shape[1], malaya_speech.char.VOCAB_SIZE))
row_idx, step_idx = np.indices(hyp.shape)
predictions[row_idx, step_idx, hyp] = 1

In [4]:
# Interactive TF1 session shared by the cells below, which evaluate ops via sess.run(...).
sess = tf.InteractiveSession()

In [5]:
# Build the metric op for the one-hot hypotheses against the references, then
# evaluate it. The recorded result pairs a rate with 12.0, which matches
# 4 references x 3 words -- presumably (WER, total reference words); TODO confirm.
wer_op = ctc.metrics.word_error_rate(predictions, ref)
sess.run(wer_op)





(0.41666666, 12.0)

In [6]:
# Same metric with the last hypothesis row dropped while all four references
# are kept. The recorded result is identical to the full-batch call above.
# NOTE(review): the batch sizes differ (3 vs 4); presumably the missing row is
# scored like the all-zero hypothesis -- confirm this is the intended behavior.
truncated_predictions = predictions[:-1]
sess.run(ctc.metrics.word_error_rate(truncated_predictions, ref))

(0.41666666, 12.0)

In [7]:
# Small corpus used to derive a custom character vocabulary.
texts = [
    'hello saya sakit',
    'jom makan',
    'dangdut 123',
]

# NOTE(review): the recorded vocabulary is not in sorted order after the two
# reserved tokens; if generate_vocab builds on a set, the ids may differ
# between kernel runs -- confirm before relying on the exact ids shown later.
unique_chars = malaya_speech.char.generate_vocab(texts)
unique_chars

['<PAD>',
 '<EOS>',
 'a',
 ' ',
 'l',
 'o',
 's',
 't',
 'm',
 'k',
 'd',
 'n',
 '2',
 'g',
 'j',
 'y',
 'h',
 '1',
 'e',
 'i',
 '3',
 'u']

In [8]:
# Encode every text with the custom vocabulary, then pad the id sequences
# into one rectangular array (the recorded output is zero-padded on the right).
encoded_texts = [
    malaya_speech.char.encode(sentence, lookup = unique_chars)
    for sentence in texts
]
refs = malaya_speech.padding.sequence_1d(encoded_texts)
refs

array([[16, 18,  4,  4,  5,  3,  6,  2, 15,  2,  3,  6,  2,  9, 19,  7,
         1],
       [14,  5,  8,  3,  8,  2,  9,  2, 11,  1,  0,  0,  0,  0,  0,  0,
         0],
       [10,  2, 11, 13, 10, 21,  7,  3, 17, 12, 20,  1,  0,  0,  0,  0,
         0]])

In [9]:
# Use the references themselves as hypotheses (same object, not a copy), so the
# word error rate computed from them below is expected to be 0.
hyp = refs

In [10]:
# One-hot encode `hyp`: predictions[i, j, hyp[i, j]] = 1, zeros elsewhere.
# The last axis has one more slot than the vocabulary (presumably reserved for
# the CTC blank -- TODO confirm).
hyp_ids = np.asarray(hyp)
predictions = np.zeros((len(refs), hyp_ids.shape[1], len(unique_chars) + 1))
sample_idx, step_idx = np.indices(hyp_ids.shape)
predictions[sample_idx, step_idx, hyp_ids] = 1

In [11]:
# Hypotheses equal the references here, so the recorded rate is 0.0; the 7.0
# matches the 3 + 2 + 2 words in `texts`. `lookup` passes the custom vocabulary
# instead of the default one.
exact_match_wer = ctc.metrics.word_error_rate(predictions, refs, lookup = unique_chars)
sess.run(exact_match_wer)

(0.0, 7.0)

In [12]:
# Build hypotheses that each carry one extra trailing word (' makan') compared
# with the reference text, then pad them into one rectangular array.
extended_texts = [
    malaya_speech.char.encode(sentence + ' makan', lookup = unique_chars)
    for sentence in texts
]
hyp = malaya_speech.padding.sequence_1d(extended_texts)

In [13]:
# One-hot encode the extended hypotheses: predictions[i, j, hyp[i, j]] = 1.
# The last axis has one more slot than the vocabulary (presumably reserved for
# the CTC blank -- TODO confirm).
hyp_ids = np.asarray(hyp)
predictions = np.zeros((len(refs), hyp_ids.shape[1], len(unique_chars) + 1))
sample_idx, step_idx = np.indices(hyp_ids.shape)
predictions[sample_idx, step_idx, hyp_ids] = 1

In [14]:
# Each hypothesis has one word more than its reference, so the recorded rate
# 0.42857143 corresponds to 3 extra words over 7 reference words.
extra_word_wer = ctc.metrics.word_error_rate(predictions, refs, lookup = unique_chars)
sess.run(extra_word_wer)

(0.42857143, 7.0)

In [15]:
# Turn the padded reference ids back into words using the custom vocabulary.
# The recorded result is a sparse tensor with one byte-string entry per word.
ref_words = ctc.metrics.from_tokens(refs, unique_chars)
sess.run(ref_words)

SparseTensorValue(indices=array([[0, 0],
       [0, 1],
       [0, 2],
       [1, 0],
       [1, 1],
       [2, 0],
       [2, 1]]), values=array([b'hello', b'saya', b'sakit', b'jom', b'makan', b'dangdut', b'123'],
      dtype=object), dense_shape=array([3, 3]))

In [16]:
# Recover token ids from the one-hot predictions (argmax over the last axis),
# then turn them into words. The recorded output shows the extra 'makan' per row.
predicted_ids = tf.argmax(predictions, axis = -1)
predicted_words = ctc.metrics.from_tokens(predicted_ids, unique_chars)
sess.run(predicted_words)

SparseTensorValue(indices=array([[0, 0],
       [0, 1],
       [0, 2],
       [0, 3],
       [1, 0],
       [1, 1],
       [1, 2],
       [2, 0],
       [2, 1],
       [2, 2]]), values=array([b'hello', b'saya', b'sakit', b'makan', b'jom', b'makan', b'makan',
       b'dangdut', b'123', b'makan'], dtype=object), dense_shape=array([3, 4]))