In [1]:
import os

os.environ['CUDA_VISIBLE_DEVICES'] = ''

In [2]:
import tensorflow as tf
import malaya_speech
from malaya_speech.train.model.conformer.model import Model as ConformerModel
from malaya_speech.train.model import hubert
import string
import tensorflow.keras as keras
import tensorflow.keras.backend as K
import numpy as np

In [3]:
unique_vocab = [''] + list(string.ascii_lowercase + string.digits) + [' ']

In [4]:
class Encoder:
    def __init__(self, config):
        self.config = config
        self.encoder = ConformerModel(**self.config)

    def __call__(self, x, input_mask, training=True):
        return self.encoder(x, training=training)

In [5]:
config_conformer = malaya_speech.config.conformer_base_encoder_config
config_conformer['subsampling']['type'] = 'none'
config_conformer['dropout'] = 0.0
encoder = Encoder(config_conformer)
cfg = hubert.HuBERTConfig(
    extractor_mode='layer_norm',
    dropout=0.0,
    attention_dropout=0.0,
    encoder_layerdrop=0.0,
    dropout_input=0.0,
    dropout_features=0.0,
    final_dim=256,
)
model = hubert.Model(cfg, encoder, ['pad', 'eos', 'unk'] + [str(i) for i in range(100)])




In [6]:
weight_decay = 1e-5
num_class = 5994

In [7]:
X = tf.placeholder(tf.float32, [None, None])
X_len = tf.placeholder(tf.int32, [None])

In [8]:
Y = tf.placeholder(tf.int32, [None])
Y_onehot = tf.one_hot(Y, depth = num_class)
Y_onehot

<tf.Tensor 'one_hot:0' shape=(?, 5994) dtype=float32>

In [9]:
r = model(X, padding_mask=X_len, features_only=True, mask=False)

Instructions for updating:
If using Keras pass *_constraint arguments to layers.


In [10]:
def amsoftmax_loss(y_true, y_pred, scale=30, margin=0.35):
    y_pred = y_true * (y_pred - margin) + (1 - y_true) * y_pred
    y_pred *= scale
    return K.categorical_crossentropy(y_true, y_pred, from_logits=True)

In [11]:
first_token_tensor = tf.squeeze(r['x'][:, 0:1, :], axis=1)

first_token_tensor

<tf.Tensor 'Squeeze:0' shape=(?, 256) dtype=float32>

In [12]:
pooled_output = keras.layers.Dense(cfg.final_dim * 2, activation='tanh',
                           kernel_initializer='orthogonal',
                           use_bias=True, trainable=True,
                           kernel_regularizer=keras.regularizers.l2(weight_decay),
                           bias_regularizer=keras.regularizers.l2(weight_decay))(first_token_tensor)
pooled_output

<tf.Tensor 'dense_2/Tanh:0' shape=(?, 512) dtype=float32>

In [13]:
y = keras.layers.Dense(num_class,
                               kernel_initializer='orthogonal',
                               use_bias=False, trainable=True,
                               kernel_constraint=keras.constraints.unit_norm(),
                               kernel_regularizer=keras.regularizers.l2(weight_decay),
                               bias_regularizer=keras.regularizers.l2(weight_decay),
                               name='prediction')(pooled_output)
y

<tf.Tensor 'prediction/MatMul:0' shape=(?, 5994) dtype=float32>

In [14]:
loss = amsoftmax_loss(Y_onehot, y)
loss

<tf.Tensor 'softmax_cross_entropy_with_logits/Reshape_2:0' shape=(?,) dtype=float32>

In [15]:
sess = tf.Session()
sess.run(tf.global_variables_initializer())

In [16]:
x = np.random.normal(size = (2, 16000))
y = np.array([1, 2])

In [19]:
sess.run(loss, feed_dict = {X: x, Y: y})

array([28.691189, 30.897335], dtype=float32)