In [1]:
import os

# An empty CUDA_VISIBLE_DEVICES exposes no GPU to CUDA. It is set here, in the
# first cell, so it is already in place when TensorFlow is imported in the
# next cell.
os.environ['CUDA_VISIBLE_DEVICES'] = ''

In [2]:
from malaya_speech.train.model import wav2vec2, ctc
from malaya_speech.train.model.conformer.model import Model as ConformerModel
import malaya_speech
import tensorflow as tf
import numpy as np
import json
from glob import glob






The TensorFlow contrib module will not be included in TensorFlow 2.0.
For more information, please see:
  * https://github.com/tensorflow/community/blob/master/rfcs/20180907-contrib-sunset.md
  * https://github.com/tensorflow/addons
  * https://github.com/tensorflow/io (for I/O related ops)
If you depend on functionality not listed there, please file an issue.



In [3]:
with open('malaya-speech-sst-vocab.json') as fopen:
    unique_vocab = json.load(fopen) + ['{', '}', '[']

In [4]:
n_mels = 80  # NOTE(review): not referenced by any other visible cell
sr = 16000  # sample rate handed to the audio loaders and used to convert sample counts to seconds
maxlen = 18  # generate() skips clips whose len(samples) / sr exceeds this (i.e. seconds)
minlen_text = 1  # generate() skips transcripts whose len() is below this

def mp3_to_wav(file, sr = sr):
    """
    Decode an audio file through `AudioSegment` and return `(samples, sr)`.

    The segment is converted to frame rate `sr` and to a single channel, its
    samples are collected into a numpy array, and that array is passed through
    `malaya_speech.astype.int_to_float` before being returned together with
    `sr`.

    NOTE(review): `AudioSegment` is not imported in any visible cell
    (presumably `from pydub import AudioSegment` is intended). As written,
    calling this raises NameError, which `generate` catches and prints, so
    every `.mp3` entry is skipped. Confirm and add the import.
    """
    segment = AudioSegment.from_file(file).set_frame_rate(sr).set_channels(1)
    pcm = np.array(segment.get_array_of_samples())
    return malaya_speech.astype.int_to_float(pcm), sr


def generate(file):
    """
    Yield one example dict per usable (audio, transcript) pair listed in `file`.

    `file` is a JSON document with parallel lists under 'X' (audio paths) and
    'Y' (transcripts). For every pair the audio is loaded at `sr`, then the
    pair is skipped when the clip lasts longer than `maxlen` seconds or the
    transcript is shorter than `minlen_text`.

    Each yielded dict has the keys 'waveforms', 'waveforms_length', 'targets'
    and 'targets_length'; the two length fields are one-element lists.

    Loading is best-effort: any exception raised while handling one pair is
    reported together with the audio path, and the generator moves on to the
    next pair.
    """
    with open(file) as fopen:
        dataset = json.load(fopen)
    audios, cleaned_texts = dataset['X'], dataset['Y']

    # Lookup table built once per call. `setdefault` keeps the FIRST index of a
    # repeated symbol, which is exactly what `unique_vocab.index(c)` returns,
    # without re-scanning the list for every character.
    char_to_id = {}
    for position, symbol in enumerate(unique_vocab):
        char_to_id.setdefault(symbol, position)

    for i in range(len(audios)):
        try:
            if audios[i].endswith('.mp3'):
                wav_data, _ = mp3_to_wav(audios[i])
            else:
                wav_data, _ = malaya_speech.load(audios[i], sr = sr)

            if (len(wav_data) / sr) > maxlen:
                # clip too long
                continue

            if len(cleaned_texts[i]) < minlen_text:
                # transcript too short
                continue

            # A character missing from the vocabulary raises KeyError here and
            # the pair is skipped by the handler below.
            t = [char_to_id[c] for c in cleaned_texts[i]]

            yield {
                'waveforms': wav_data,
                'waveforms_length': [len(wav_data)],
                'targets': t,
                'targets_length': [len(t)],
            }
        except Exception as e:
            # Name the sample so that dropped evaluation data can be traced.
            print('skipped', audios[i], '-', repr(e))


def get_dataset(
    file,
    batch_size = 3,
    shuffle_size = 20,
    thread_count = 24,
    maxlen_feature = 1800,
):
    """
    Return a zero-argument callable that builds the padded, batched dataset.

    The dataset wraps `generate(file)`, prefetches with
    `tf.contrib.data.AUTOTUNE`, and batches `batch_size` examples together,
    padding every field with zeros.

    `shuffle_size`, `thread_count` and `maxlen_feature` are accepted but are
    not used anywhere in this function.
    """
    field_dtypes = {
        'waveforms': tf.float32,
        'waveforms_length': tf.int32,
        'targets': tf.int32,
        'targets_length': tf.int32,
    }

    def variable_length_shapes():
        # Every field is declared as a 1-D tensor of unknown length.
        return {name: tf.TensorShape([None]) for name in field_dtypes}

    def get():
        examples = tf.data.Dataset.from_generator(
            generate,
            field_dtypes,
            output_shapes = variable_length_shapes(),
            args = (file,),
        )
        examples = examples.prefetch(tf.contrib.data.AUTOTUNE)

        zero_padding = {
            name: tf.constant(0, dtype = dtype)
            for name, dtype in field_dtypes.items()
        }
        return examples.padded_batch(
            batch_size,
            padded_shapes = variable_length_shapes(),
            padding_values = zero_padding,
        )

    return get

In [5]:
# Build the evaluation dataset and take the symbolic "next batch" tensors.
# `Dataset.make_one_shot_iterator()` is deprecated; the free function below is
# the replacement named by the deprecation notice itself.
dev_dataset = get_dataset('bahasa-asr-test.json')()
features = tf.compat.v1.data.make_one_shot_iterator(dev_dataset).get_next()
features

Instructions for updating:
Use `for ... in dataset:` to iterate over a dataset. If using `tf.estimator`, return the `Dataset` object directly from your input function. As a last resort, you can use `tf.compat.v1.data.make_one_shot_iterator(dataset)`.


{'waveforms': <tf.Tensor 'IteratorGetNext:2' shape=(?, ?) dtype=float32>,
 'waveforms_length': <tf.Tensor 'IteratorGetNext:3' shape=(?, ?) dtype=int32>,
 'targets': <tf.Tensor 'IteratorGetNext:0' shape=(?, ?) dtype=int32>,
 'targets_length': <tf.Tensor 'IteratorGetNext:1' shape=(?, ?) dtype=int32>}

In [6]:
# NOTE(review): this flag is not referenced by any later visible cell;
# `Encoder.__call__` carries its own `training = True` default. Confirm whether
# an evaluation run is meant to build the graph with training disabled.
training = True

In [7]:
class Encoder:
    """
    Thin adapter around `ConformerModel`.

    It exposes a `(x, input_mask, training)` call signature and forwards only
    `x` and `training` to the wrapped model.
    """

    def __init__(self, config):
        # Keep the config around and build the conformer from its entries.
        self.config = config
        self.encoder = ConformerModel(**config)

    def __call__(self, x, input_mask, training = True):
        # `input_mask` is accepted but is not passed on to the conformer.
        encoded = self.encoder(x, training = training)
        return encoded

In [8]:
# Build the acoustic model graph. Statement order is left untouched: it
# determines the order in which graph variables are created, and the
# checkpoint restored in a later cell has to match those variables.

# NOTE: this is the library's own config object, not a copy, so the two
# overrides below modify `malaya_speech.config.conformer_base_encoder_config`
# in place for the rest of the kernel session.
config_conformer = malaya_speech.config.conformer_base_encoder_config
config_conformer['subsampling']['type'] = 'none'
config_conformer['dropout'] = 0.0
encoder = Encoder(config_conformer)
# Every dropout-style option is set to 0.0 for this evaluation graph.
cfg = wav2vec2.Wav2Vec2Config(
    extractor_mode = 'layer_norm',
    dropout = 0.0,
    attention_dropout = 0.0,
    encoder_layerdrop = 0.0,
    dropout_input = 0.0,
    dropout_features = 0.0,
    final_dim = 256,
)
model = wav2vec2.Model(cfg, encoder)
X = features['waveforms']
# 'waveforms_length' is batched as (batch, 1); take column 0 to get one length per row.
X_len = features['waveforms_length'][:, 0]
r = model(X, padding_mask = X_len, features_only = True, mask = False)
# One output unit per vocabulary symbol plus one extra class
# (presumably the CTC blank - TODO confirm against the training script).
logits = tf.layers.dense(r['x'], len(unique_vocab) + 1)
# Per-row count of positions where r['padding_mask'] is False.
seq_lens = tf.reduce_sum(
    tf.cast(tf.logical_not(r['padding_mask']), tf.int32), axis = 1
)


Instructions for updating:
If using Keras pass *_constraint arguments to layers.
Instructions for updating:
Use keras.layers.Dense instead.
Instructions for updating:
Please use `layer.__call__` method instead.


In [9]:
# Swap the first two axes of `logits` and give both tensors stable graph names
# ('logits', 'seq_lens').
# NOTE: re-running this cell swaps the axes of `logits` again, so it is only
# correct when run exactly once after the model-building cell.
logits = tf.identity(tf.transpose(logits, [1, 0, 2]), name = 'logits')
seq_lens = tf.identity(seq_lens, name = 'seq_lens')

In [10]:
# decoded = tf.nn.ctc_beam_search_decoder(
#     logits,
#     seq_lens,
#     beam_width = beam_size,
#     top_paths = 1,
#     merge_repeated = True)[0][0]
# decoded._indices, decoded._values

In [11]:
# Display the two named tensors to check their static shapes and dtypes.
logits, seq_lens

(<tf.Tensor 'logits:0' shape=(?, ?, 53) dtype=float32>,
 <tf.Tensor 'seq_lens:0' shape=(?,) dtype=int32>)

In [12]:
# CTC beam search over the named logits; only the best path is kept.
# NOTE(review): `merge_repeated = True` is kept so the reported metrics stay
# reproducible, but it presumably collapses genuinely repeated characters in
# the decoded output - confirm against the decoder documentation.
decoded = tf.nn.ctc_beam_search_decoder(
    logits, seq_lens, beam_width = 100, top_paths = 1, merge_repeated = True
)
# decoded[0][0] is the top path as a sparse tensor. `tf.cast` replaces the
# deprecated `tf.to_int32`; densifying fills the gaps with 0.
preds = tf.sparse.to_dense(tf.cast(decoded[0][0], tf.int32))
preds = tf.identity(preds, 'preds')
preds

Instructions for updating:
Use `tf.cast` instead.


<tf.Tensor 'preds:0' shape=(?, ?) dtype=int32>

In [13]:
# Checkpoint whose weights are loaded into the graph built above.
CHECKPOINT_PATH = 'wav2vec2-conformer-base-ctc/model.ckpt-2000000'

sess = tf.Session()
# Initialise every global variable first, then restore from the checkpoint.
sess.run(tf.global_variables_initializer())

var_list = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES)
saver = tf.train.Saver(var_list = var_list)
saver.restore(sess, CHECKPOINT_PATH)

INFO:tensorflow:Restoring parameters from wav2vec2-conformer-base-ctc/model.ckpt-2000000


In [14]:
# Run the whole evaluation set through the graph and collect one WER and one
# CER value per utterance.
wer, cer = [], []
index = 0  # number of batches evaluated so far
while True:
    try:
        batch_preds, batch_targets = sess.run([preds, features['targets']])
    except tf.errors.OutOfRangeError:
        # The one-shot iterator is exhausted: this is the normal way out.
        # Any other exception now propagates instead of silently ending the
        # loop and leaving metrics computed on only part of the data.
        break

    for hyp_ids, ref_ids in zip(batch_preds, batch_targets):
        # Decode ids back to text and strip the '<PAD>' symbols.
        d = malaya_speech.char.decode(hyp_ids, lookup = unique_vocab).replace('<PAD>', '')
        t = malaya_speech.char.decode(ref_ids, lookup = unique_vocab).replace('<PAD>', '')
        wer.append(malaya_speech.metrics.calculate_wer(t, d))
        cer.append(malaya_speech.metrics.calculate_cer(t, d))
    index += 1

In [15]:
# Unweighted mean of the per-utterance WER and CER values collected above.
np.mean(wer), np.mean(cer)

(0.2589901416517151, 0.0635017168532852)