In [1]:
import os

# Hide every GPU from this process; this is set before TensorFlow is imported
# in the next cell.
os.environ.update({'CUDA_VISIBLE_DEVICES': ''})

In [2]:
import malaya_speech
import malaya_speech.config
import tensorflow as tf
import numpy as np
import json
import joblib
import random
from malaya_speech.train.model import hubert
from sklearn.utils import shuffle






The TensorFlow contrib module will not be included in TensorFlow 2.0.
For more information, please see:
  * https://github.com/tensorflow/community/blob/master/rfcs/20180907-contrib-sunset.md
  * https://github.com/tensorflow/addons
  * https://github.com/tensorflow/io (for I/O related ops)
If you depend on functionality not listed there, please file an issue.



In [3]:
# Start from the library's transducer featurizer settings, but work on a copy:
# assigning keys on `malaya_speech.config.transducer_featurizer_config` itself
# would silently change that module-level dict for everything else that reads
# it in this kernel.
config = dict(malaya_speech.config.transducer_featurizer_config)
# Overrides for this experiment: MFCC features with 30 feature bins
# (the concatenated feature tensor built later is 3 x 30 = 90 wide).
config['feature_type'] = 'mfcc'
config['num_feature_bins'] = 30
config['stride_ms'] = 20
featurizer = malaya_speech.utils.tf_featurization.STTFeaturizer(**config)

In [4]:
# Load the training manifest; 'X' holds the audio file paths and 'Y' the
# matching entries bound to `cleaned_texts` (not used by the visible cells).
with open('bahasa-asr-train.json') as fopen:
    dataset = json.load(fopen)

audios, cleaned_texts = dataset['X'], dataset['Y']

# Draw the training subset from a dedicated, seeded generator so that a
# "Restart & Run All" selects the same files every time and the global
# `random` state is left untouched.
SAMPLE_SEED = 0
# Never ask for more items than exist: random.sample raises ValueError when
# the requested size exceeds the population.
sample_size = min(400000, len(audios))
audios = random.Random(SAMPLE_SEED).sample(audios, sample_size)
len(audios)

400000

In [5]:
# Graph input: a 1-D float32 placeholder of unknown length. Later cells feed
# it with the audio array returned by `malaya_speech.load`.
i = tf.placeholder(tf.float32, [None])
# Features produced by the featurizer configured earlier in the notebook.
v = featurizer.vectorize(i)
# `deltas` is applied once to the features and then again to its own output.
deltas = malaya_speech.utils.tf_featurization.deltas(v)
ddeltas = malaya_speech.utils.tf_featurization.deltas(deltas)
# Join the three feature sets along axis 1; the displayed tensor has shape
# (?, 90).
concated = tf.concat([v, deltas, ddeltas], axis = 1)
concated

<tf.Tensor 'concat_1:0' shape=(?, 90) dtype=float32>

In [6]:
# One TF1 session shared by the cells below; the visible part of the notebook
# never closes it explicitly.
sess = tf.Session()

In [7]:
# Unfitted clustering model from the hubert helpers; the cell output shows it
# is a MiniBatchKMeans with n_clusters=100 and batch_size=10000.
km_model = hubert.kmeans.get_km_model()
km_model

MiniBatchKMeans(batch_size=10000, compute_labels=False, init='k-means++',
                init_size=None, max_iter=100, max_no_improvement=100,
                n_clusters=100, n_init=20, random_state=None,
                reassignment_ratio=0.0, tol=0.0, verbose=1)

In [8]:
from tqdm import tqdm

batch_size = 10
# Stream the sampled audio paths through the feature graph in groups of
# `batch_size` files and update the k-means model incrementally per group.
for start in tqdm(range(0, len(audios), batch_size)):
    # `b` and `y` deliberately stay notebook-level names: later cells reuse
    # the last feature batch and the last loaded audio for sanity checks.
    b = []
    for path in audios[start: start + batch_size]:
        # Skip mp3 inputs and paths that do not exist on disk.
        if path.endswith('.mp3') or not os.path.exists(path):
            continue
        y, _ = malaya_speech.load(path, sr = 16000)
        b.append(sess.run(concated, feed_dict = {i: y}))
    # Only fit when at least one file in this group produced features.
    if b:
        b = np.concatenate(b)
        km_model = km_model.partial_fit(b)

100%|██████████| 40000/40000 [5:21:22<00:00,  2.07it/s]  


In [9]:
# Persist the fitted model to the relative path 'kmean.km'; the next cell
# loads it back from the same path.
joblib.dump(km_model, 'kmean.km')

['kmean.km']

In [10]:
# Wrap the saved model file in the hubert helper. In the cells that follow the
# resulting object is called both on a NumPy feature batch and on a TF tensor.
kmean = hubert.kmeans.ApplyKmeans_TF('kmean.km')

In [14]:
# Sanity check on the first 100 rows of `b`, i.e. whatever feature batch the
# training loop left behind (hidden state: that loop must have run in this
# kernel, and its last group must have produced features).
kmean(b[:100])

array([58, 62, 51, 19, 11, 29, 75, 82, 27, 27, 27, 27, 27, 53, 45, 45, 54,
        7, 77, 77, 58, 58, 58, 46, 55, 27, 52, 39, 51, 51, 19, 40, 61, 79,
       66, 66,  5,  5,  5,  5,  5, 15,  5,  5,  5,  5, 30, 85, 90,  7, 25,
       52, 11, 37, 82, 77, 46, 63,  2,  2, 96, 96, 16, 58, 17, 39, 37, 82,
       57, 57, 35, 64, 64, 39, 51, 51, 19, 74, 67, 61, 61, 61, 79, 79, 66,
        5,  5, 66,  5, 79, 79,  5,  5, 15,  5, 15,  5, 15, 43, 30])

In [15]:
# Calling the wrapper on the symbolic feature tensor yields a graph tensor
# rather than an array (displayed as an int64 'ArgMin' tensor of shape (?,)).
kmean_tf = kmean(concated)
kmean_tf

Instructions for updating:
keep_dims is deprecated, use keepdims instead


<tf.Tensor 'ArgMin:0' shape=(?,) dtype=int64>

In [17]:
# Evaluate the audio -> cluster-id graph on `y`, the last audio array loaded
# by the training loop (leftover state from that cell).
sess.run(kmean_tf, feed_dict = {i: y})

array([50, 74, 45, 74, 50, 78, 78, 78, 78, 37, 37, 89, 95, 40, 99, 86, 10,
       92, 30, 90, 90, 52, 52, 38, 28, 28, 38, 36, 36, 29, 97, 39, 97, 67,
       89, 89, 50, 43, 30, 90, 27, 52, 50, 10, 92, 97, 97, 89, 67, 38, 38,
       38, 36, 36,  1,  1,  1,  1,  1, 21, 73, 73, 44, 18, 46, 25, 25, 95,
       50, 99, 15, 66, 15,  9,  5, 10, 29, 97, 19, 99, 86, 30, 73, 37, 39,
       39, 39, 52, 40, 99, 86, 92, 85, 37, 11, 40, 67, 11, 11, 11, 11, 39,
       37, 37, 37, 37, 37, 26, 26, 37, 37, 37, 37, 37, 37, 37, 37, 26, 26,
       37, 37, 37,  2,  2, 26, 26, 26, 26, 51, 74, 21, 21, 50, 38, 38, 38,
       30, 60, 90, 58, 58, 25, 39, 39, 27, 27, 52, 52, 37, 37, 37, 39, 52,
       52, 22, 50, 78, 81, 81, 81, 81, 30, 54, 90, 52, 95, 78, 78, 34, 78,
       95, 43, 30, 31, 90, 58, 64, 51, 19, 74, 89, 50, 15, 78, 78, 78, 78,
       78, 60, 27, 52, 50, 38, 21, 37, 61, 66, 28, 28, 28, 50, 99, 99, 43,
       30,  1, 21, 38, 43, 86, 36, 67, 50, 99, 86, 92, 85,  1, 28, 28, 50,
       99,  1,  1,  1, 28