In [1]:
import os

# Restrict CUDA to the device with index 1. This is set here, before
# TensorFlow is imported in a later cell, so the setting is already in the
# environment when that import happens.
os.environ.update({'CUDA_VISIBLE_DEVICES': '1'})

In [2]:
import sys

# Put the parent of the current working directory at the front of the import
# path so packages located there can be imported.
#
# The working directory is used explicitly: `__name__` is a module name (not a
# file path), so `os.path.abspath(__name__)` would merely resolve that name
# against the working directory and yield the same parent directory by accident.
SOURCE_DIR = os.path.dirname(os.getcwd())

# Re-running this cell must not stack duplicate entries at the head of sys.path.
if sys.path[:1] != [SOURCE_DIR]:
    sys.path.insert(0, SOURCE_DIR)

In [3]:
import tensorflow as tf
# tf.compat.v1.enable_eager_execution()

In [4]:
import numpy as np
import malaya_speech
import malaya_speech.config
from malaya_speech.train.model import wav2vec2, transducer






The TensorFlow contrib module will not be included in TensorFlow 2.0.
For more information, please see:
  * https://github.com/tensorflow/community/blob/master/rfcs/20180907-contrib-sunset.md
  * https://github.com/tensorflow/addons
  * https://github.com/tensorflow/io (for I/O related ops)
If you depend on functionality not listed there, please file an issue.



In [5]:
from malaya_speech.train.model.conformer.model import Model as ConformerModel

class Encoder:
    """Callable adapter around the Conformer encoder network.

    The instance is built from a dict of keyword settings and is called with
    ``(x, input_mask, training)``.
    NOTE(review): presumably this is the call signature ``wav2vec2.Model``
    expects from its encoder argument — confirm against that class.
    """

    def __init__(self, config):
        """Keep ``config`` and build the Conformer network from its entries."""
        self.config = config
        self.encoder = ConformerModel(**config)

    def __call__(self, x, input_mask, training = True):
        """Run the wrapped network on ``x`` and return its output.

        ``input_mask`` is accepted but not forwarded; only ``x`` and
        ``training`` reach the underlying network.
        """
        conformer = self.encoder
        return conformer(x, training = training)
    
import copy

# Work on a private deep copy of the library preset. Binding the module
# attribute directly would alias the shared dict, and the overrides below
# (including the nested 'subsampling' entry) would then leak into every later
# use of malaya_speech.config.conformer_base_encoder_config in this kernel.
config_conformer = copy.deepcopy(malaya_speech.config.conformer_base_encoder_config)

# Disable the Conformer's own subsampling and turn dropout off.
# NOTE(review): presumably subsampling is disabled because the wav2vec2
# feature extractor in front of the encoder already downsamples — confirm.
config_conformer['subsampling']['type'] = 'none'
config_conformer['dropout'] = 0.0

encoder = Encoder(config_conformer)

In [6]:
class Model:
    """Adapter that exposes the wav2vec2 network as a feature encoder.

    Calling the instance with an ``(X, X_len)`` pair returns the ``'x'`` entry
    of the wav2vec2 output and stores the ``'padding_mask'`` entry on the
    instance as ``self.padding_mask``.
    """

    def __init__(self, cfg, encoder):
        """Build the wrapped ``wav2vec2.Model`` from ``cfg`` and ``encoder``."""
        self.model = wav2vec2.Model(cfg, encoder)

    def __call__(self, inputs, training = True):
        """Run wav2vec2 in features-only mode on ``inputs``.

        ``inputs`` must unpack into exactly two items; the second one is
        passed to the wrapped model as ``padding_mask``.
        NOTE(review): ``training`` is accepted but not forwarded to the
        wrapped model — confirm this is intended.
        """
        signal, signal_len = inputs
        outputs = self.model(signal, padding_mask = signal_len, features_only = True)
        # Remember the mask reported by the model for later use by callers.
        self.padding_mask = outputs['padding_mask']
        return outputs['x']

In [7]:
# Default wav2vec2 configuration, combined with the Conformer encoder wrapper
# into the speech feature model.
cfg = wav2vec2.Wav2Vec2Config()
model = Model(cfg = cfg, encoder = encoder)




In [8]:
# Graph inputs for the audio side: a float32 rank-2 tensor and an int32 rank-1
# tensor, with every dimension left dynamic. Later cells feed the padded
# waveform batch into X and the per-example lengths into X_len.
X = tf.placeholder(dtype = tf.float32, shape = [None, None])
X_len = tf.placeholder(dtype = tf.int32, shape = [None])

In [9]:
# Build the RNN transducer on top of the wav2vec2 feature model, using the
# library's small Conformer decoder preset and a vocabulary of 100 entries.
config = malaya_speech.config.conformer_small_decoder_config
small_transducer = transducer.rnn.Model(
    model,
    vocabulary_size = 100,
    **config
)

In [10]:
# Graph inputs for the second and third transducer arguments: an int32 rank-2
# tensor and an int32 rank-1 tensor, all dimensions dynamic.
# NOTE(review): presumably target token ids and their lengths — confirm.
p = tf.placeholder(dtype = tf.int32, shape = (None, None))
p_len = tf.placeholder(dtype = tf.int32, shape = (None,))

In [11]:
# Assemble the transducer inputs (audio pair first, then p and p_len) and
# build the logits tensor in training mode; the bare name displays it.
transducer_inputs = [(X, X_len), p, p_len]
small_logits = small_transducer(transducer_inputs, training = True)
small_logits

Instructions for updating:
If using Keras pass *_constraint arguments to layers.
Instructions for updating:
Use tf.where in 2.0, which has the same broadcast rule as np.where


<tf.Tensor 'transducer/transducer_joint/transducer_joint_vocab/BiasAdd:0' shape=(?, ?, ?, 100) dtype=float32>

In [12]:
# Add the greedy-decoding ops to the graph, driven directly by the raw audio
# placeholders and built with training disabled.
decoded = small_transducer.greedy_decoder(
    X,
    X_len,
    from_wav2vec2 = True,
    training = False,
)

In [13]:
# Create the initializer op for all global variables, open a session on the
# default graph, and run the initializer once.
init_op = tf.global_variables_initializer()
sess = tf.Session()
sess.run(init_op)

In [14]:
# Load the two example recordings, requesting sr = 16000 for both; `sr` is
# rebound by the second call. The closing tuple displays both lengths.
y, sr = malaya_speech.load(
    '../speech/example-speaker/husein-zolkepli.wav',
    sr = 16000,
)
y1, sr = malaya_speech.load(
    '../speech/example-speaker/shafiqah-idayu.wav',
    sr = 16000,
)
(len(y), len(y1))

(90090, 56298)

In [15]:
# Dummy values for `p`: two rows of six ones each.
t = [[1] * 6 for _ in range(2)]

# Pad both recordings into one batch and also get back their lengths.
padded, lens = malaya_speech.padding.sequence_1d([y, y1], return_len = True)

In [16]:
%%time

sess.run(small_logits, feed_dict = {X: padded, X_len: lens, p: t, p_len: [6, 6]}).shape

CPU times: user 5.85 s, sys: 806 ms, total: 6.66 s
Wall time: 3.53 s


(2, 281, 6, 100)

In [19]:
%%time

r = sess.run(decoded, feed_dict = {X: padded, X_len: lens, p: t, p_len: [6, 6]})

CPU times: user 4.2 s, sys: 798 ms, total: 5 s
Wall time: 1.29 s


In [17]:
# Display every variable currently registered as trainable, to inspect what
# the wav2vec2 + transducer graph created.
tf.trainable_variables()

[<tf.Variable 'vars:0' shape=(1, 640, 128) dtype=float32_ref>,
 <tf.Variable 'mask_emb:0' shape=(768,) dtype=float32_ref>,
 <tf.Variable 'transducer/wav2vec2/ConvFeatureExtractionModel/sequential/conv1d/kernel:0' shape=(10, 1, 512) dtype=float32>,
 <tf.Variable 'transducer/wav2vec2/ConvFeatureExtractionModel/sequential/group_normalization/gamma:0' shape=(512,) dtype=float32>,
 <tf.Variable 'transducer/wav2vec2/ConvFeatureExtractionModel/sequential/group_normalization/beta:0' shape=(512,) dtype=float32>,
 <tf.Variable 'transducer/wav2vec2/ConvFeatureExtractionModel/sequential_1/conv1d_1/kernel:0' shape=(3, 512, 512) dtype=float32>,
 <tf.Variable 'transducer/wav2vec2/ConvFeatureExtractionModel/sequential_2/conv1d_2/kernel:0' shape=(3, 512, 512) dtype=float32>,
 <tf.Variable 'transducer/wav2vec2/ConvFeatureExtractionModel/sequential_3/conv1d_3/kernel:0' shape=(3, 512, 512) dtype=float32>,
 <tf.Variable 'transducer/wav2vec2/ConvFeatureExtractionModel/sequential_4/conv1d_4/kernel:0' shape=(