In [1]:
# Run test-mfcc-kmean.ipynb first to get the data; this notebook reads out.npy, out.len and kmean.km from the working directory.

In [2]:
import os

# Restrict CUDA to the device with index 1; this must run before TensorFlow is imported.
os.environ.update({'CUDA_VISIBLE_DEVICES': '1'})

In [3]:
import sys

# Parent of the current working directory. The original expression
# `abspath(__name__)` only worked because the module name was treated as a
# fake file inside the cwd; asking for the cwd directly gives the same value.
SOURCE_DIR = os.path.dirname(os.getcwd())
# Prepend so modules under SOURCE_DIR take priority on import
# (presumably the local malaya_speech checkout -- confirm).
sys.path.insert(0, SOURCE_DIR)

In [4]:
import tensorflow as tf
# tf.compat.v1.enable_eager_execution()

In [5]:
import numpy as np
import malaya_speech
import malaya_speech.config
from malaya_speech.train.model import hubert, bert, fastspeech

In [6]:
# Display the stock small-squeezeformer encoder config before it is customised below.
malaya_speech.config.squeezeformer_s_encoder_config

{'encoder_subsampling': {'type': 'conv2d',
  'filters': 196,
  'kernel_size': 3,
  'strides': 2},
 'encoder_dmodel': 196,
 'encoder_num_blocks': 18,
 'encoder_head_size': 49,
 'encoder_num_heads': 4,
 'encoder_mha_type': 'relmha',
 'encoder_kernel_size': 31,
 'encoder_fc_factor': 1.0,
 'encoder_dropout': 0.1,
 'encoder_time_reduce_idx': [8],
 'encoder_time_recover_idx': [17],
 'encoder_conv_use_glu': False,
 'encoder_ds_subsample': True,
 'encoder_no_post_ln': True,
 'encoder_adaptive_scale': True,
 'encoder_fixed_arch': ['M', 's', 'C', 's']}

In [7]:
from malaya_speech.train.model.squeezeformer.model import Model as ConformerModel

class Encoder:
    """Adapter that lets a mask-based caller drive the squeezeformer encoder.

    The wrapped ``ConformerModel`` is called with per-row lengths, while this
    wrapper is called with a mask; ``__call__`` converts one into the other.
    """

    def __init__(self, config):
        """Keep ``config`` and build the wrapped encoder from it.

        Args:
            config: mapping whose items are forwarded as keyword arguments
                to ``ConformerModel``.
        """
        self.config = config
        self.encoder = ConformerModel(**self.config)

    def __call__(self, x, input_mask, training = True):
        """Run the wrapped encoder on ``x``.

        Args:
            x: encoder input, passed through unchanged.
            input_mask: mask tensor; it is cast to int32 and summed along
                axis 1 to obtain one length per row.
            training: accepted but not forwarded to the wrapped encoder.
                NOTE(review): confirm whether ``ConformerModel`` should
                receive this flag.

        Returns:
            Whatever the wrapped encoder returns for ``(x, lengths)``.
        """
        input_lengths = tf.reduce_sum(tf.cast(input_mask, tf.int32), axis = 1)
        return self.encoder(x, input_lengths)
    
import copy

# Work on a private deep copy: the nested 'encoder_subsampling' dict is edited
# below, and mutating the library's module-level config in place would leak
# into every later use of it in this kernel (and make re-runs order-dependent).
config_conformer = copy.deepcopy(malaya_speech.config.squeezeformer_s_encoder_config)
# Override the subsampling type and switch dropout off for this test.
config_conformer['encoder_subsampling']['type'] = 'none'
config_conformer['encoder_dropout'] = 0.0
encoder = Encoder(config_conformer)


Instructions for updating:
Call initializer instance with the dtype argument instead of passing it to the constructor
['M', 's', 'C', 's']
['M', 's', 'C', 's']
['M', 's', 'C', 's']
['M', 's', 'C', 's']
['M', 's', 'C', 's']
['M', 's', 'C', 's']
['M', 's', 'C', 's']
['M', 's', 'C', 's']
['M', 's', 'C', 's']
['M', 's', 'C', 's']
['M', 's', 'C', 's']
['M', 's', 'C', 's']
['M', 's', 'C', 's']
['M', 's', 'C', 's']
['M', 's', 'C', 's']
['M', 's', 'C', 's']
['M', 's', 'C', 's']
['M', 's', 'C', 's']


In [8]:
# Default HuBERT configuration wrapped around the squeezeformer encoder.
# The label inventory is three special symbols followed by the strings '0'..'99'.
cfg = hubert.HuBERTConfig()
model = hubert.Model(
    cfg,
    encoder,
    ['pad', 'eos', 'unk'] + list(map(str, range(100))),
)




In [9]:
# Graph inputs with fully dynamic shapes:
#   X     - float32 2-D batch (fed with the padded waveforms at run time)
#   Y     - int32 2-D batch (fed with the padded label ids at run time)
#   X_len - int32 vector (fed with one length per row of X)
X = tf.placeholder(dtype = tf.float32, shape = (None, None))
Y = tf.placeholder(dtype = tf.int32, shape = (None, None))
X_len = tf.placeholder(dtype = tf.int32, shape = (None,))

In [10]:
# Build the forward graph and display the returned dict of tensors.
# NOTE(review): X_len is a vector of lengths but is passed as `padding_mask`;
# the returned 'padding_mask' entry is a 2-D bool tensor, so the model
# presumably derives the mask from the lengths -- confirm against hubert.Model.
r = model(X, padding_mask = X_len, target_list = Y)
r

Instructions for updating:
If using Keras pass *_constraint arguments to layers.
Instructions for updating:
Use tf.where in 2.0, which has the same broadcast rule as np.where
Tensor("hubert/LogicalNot:0", shape=(?, ?), dtype=bool)
Instructions for updating:
dim is deprecated, use axis instead


{'logit_m_list': <tf.Tensor 'hubert/transpose:0' shape=(?, 104) dtype=float32>,
 'logit_u_list': <tf.Tensor 'hubert/transpose_1:0' shape=(?, 104) dtype=float32>,
 'padding_mask': <tf.Tensor 'hubert/LogicalNot:0' shape=(?, ?) dtype=bool>,
 'features_pen': <tf.Tensor 'hubert/Mean:0' shape=() dtype=float32>,
 'x': <tf.Tensor 'hubert/conformer/conformer_encoder/conformer_encoder_block_17/conformer_encoder_block_17_layer3/conformer_encoder_block_17_layer3_ln/batchnorm/add_1:0' shape=(?, ?, 196) dtype=float32>}

In [11]:
# One int32 target per logit row, all pointing at class index 0.
num_rows_m = tf.shape(r['logit_m_list'])[0]
num_rows_u = tf.shape(r['logit_u_list'])[0]
target_m = tf.zeros((num_rows_m,), dtype = tf.int32)
target_u = tf.zeros((num_rows_u,), dtype = tf.int32)

In [12]:
# Cross-entropy over the 'logit_m_list' rows, summed and divided by the row count.
entropy = tf.nn.sparse_softmax_cross_entropy_with_logits(
    labels = target_m,
    logits = r['logit_m_list'],
)
sample_size = tf.cast(tf.shape(target_m)[0], tf.float32)
entropy_m = tf.reduce_sum(entropy) / sample_size

In [13]:
# Cross-entropy over the 'logit_u_list' rows, summed and divided by the row count.
# `sample_size` and `entropy` are deliberately re-bound here, as in the previous cell.
entropy = tf.nn.sparse_softmax_cross_entropy_with_logits(
    labels = target_u,
    logits = r['logit_u_list'],
)
sample_size = tf.cast(tf.shape(target_u)[0], tf.float32)
entropy_u = tf.reduce_sum(entropy) / sample_size

In [14]:
# Weighted sum of the two cross-entropy terms: 95% from the `m` logits, 5% from the `u` logits.
loss = 0.95 * entropy_m + 0.05 * entropy_u

In [15]:
# Adam with default hyper-parameters. Note that `optimizer` ends up holding
# the op returned by minimize(), not the optimizer object itself.
adam = tf.train.AdamOptimizer()
optimizer = adam.minimize(loss)

In [16]:
# Open a session and run the global variable initialiser once.
sess = tf.Session()
init_op = tf.global_variables_initializer()
sess.run(init_op)

In [17]:
# Memory-map the feature matrix read-only instead of loading it into RAM.
feat = np.load('out.npy', mmap_mode="r")
leng_path = 'out.len'
with open(leng_path, "r") as f:
    # One integer length per line; blank lines (e.g. a trailing newline at
    # the end of the file) are skipped instead of crashing int().
    lengs = [int(line) for line in f if line.strip()]
# Start row of each segment: running sum of all preceding lengths.
offsets = [0] + np.cumsum(lengs[:-1]).tolist()
# The slices taken from `feat` later are only aligned if the lengths cover it exactly.
assert sum(lengs) == feat.shape[0], (
    'lengths in %s sum to %d but out.npy has %d rows'
    % (leng_path, sum(lengs), feat.shape[0])
)
feat.shape

(456, 90)

In [18]:
# Build the k-means applier from the file 'kmean.km'
# (presumably a fitted model saved by the preparation notebook -- confirm).
kmean = hubert.kmeans.ApplyKmeans_TF('kmean.km')

In [19]:
# Shift applied to every k-means id. The model's label list starts with three
# special symbols ('pad', 'eos', 'unk'), so ids are presumably moved past
# them here -- keep this in sync with that list.
LABEL_OFFSET = 3

xs, ys = [], []
for offset, leng in zip(offsets, lengs):
    # Rows [offset, offset + leng) of the concatenated feature matrix.
    x = feat[offset: offset + leng]
    y = kmean(x) + LABEL_OFFSET
    ys.append(y)
    xs.append(x)
len(ys), len(xs)

(2, 2)

In [20]:
# Load the two example recordings with sr = 16000; `sr` is re-bound by the second call.
y, sr = malaya_speech.load(
    '../speech/example-speaker/husein-zolkepli.wav',
    sr = 16000,
)
y1, sr = malaya_speech.load(
    '../speech/example-speaker/shafiqah-idayu.wav',
    sr = 16000,
)
len(y), len(y1)

(90090, 56298)

In [21]:
# Batch the two waveforms into a single 2-D array; the displayed shape shows
# that the shorter clip is padded up to the length of the longer one.
X_ = malaya_speech.padding.sequence_1d([y, y1])
X_.shape

(2, 90090)

In [22]:
# Batch the per-utterance label sequences the same way as the waveforms.
# NOTE(review): the pad value is presumably 0, which would coincide with the
# 'pad' entry of the label list -- verify against sequence_1d.
Y_ = malaya_speech.padding.sequence_1d(ys)
Y_.shape

(2, 281)

In [23]:
# One training step. `o` is [evaluated model outputs, loss value, result of the train op].
feed = {
    X: X_,
    X_len: [len(y), len(y1)],
    Y: Y_,
}
o = sess.run([r, loss, optimizer], feed_dict = feed)

In [24]:
# Show the loss value and the train-op result (None), skipping the large model-output dict.
o[1:]

[4.69257, None]