In [1]:
import os
# Select the GPU before TensorFlow is imported. GPU index 3 stays the default,
# but a CUDA_VISIBLE_DEVICES value already exported by the user or a job
# scheduler is respected instead of being overwritten.
os.environ.setdefault("CUDA_VISIBLE_DEVICES", "3")

In [2]:
import pickle
import youtokentome as yttm
import json
from sklearn.preprocessing import LabelEncoder
from sklearn.naive_bayes import ComplementNB
import numpy as np

In [3]:
# Load the pickled bag-of-words transformer used below via bow.transform(...)
# (presumably a fitted scikit-learn vectorizer — TODO confirm).
# NOTE(review): pickle.load can execute arbitrary code while deserializing;
# only load this file if it comes from a trusted source.
with open('bow-language-detection.pkl', 'rb') as fopen:
    bow = pickle.load(fopen)

In [4]:
# Sanity check: vectorize one space-joined subword string and display the
# resulting sparse matrix (its repr shows the shape and stored-element count).
v = bow.transform(['▁dengan ▁stim ▁dan ▁pengeluaran'])
v

<1x300000 sparse matrix of type '<class 'numpy.int64'>'
	with 4 stored elements in Compressed Sparse Row format>

In [5]:
# Load the pre-split dataset from JSON; the keys are displayed to confirm
# which splits (features and labels) the file contains.
with open('train-test.json') as fopen:
    train_test = json.load(fopen)
    
train_test.keys()

dict_keys(['train_X', 'test_X', 'train_Y', 'test_Y'])

In [6]:
# Encode the training labels as integer class ids. The fitted encoder is kept
# (instead of being thrown away) so ids can be mapped back to label strings
# later through label_encoder.classes_ / label_encoder.inverse_transform.
label_encoder = LabelEncoder().fit(train_test['train_Y'])
train_Y = label_encoder.transform(train_test['train_Y'])

In [7]:
# Encode the test labels with the mapping learned from the TRAINING labels.
# Fitting a second encoder on the test labels only yields the same ids when
# both splits contain exactly the same label set; otherwise ids silently shift.
# With this form, a test label unseen in training raises a ValueError instead.
test_Y = LabelEncoder().fit(train_test['train_Y']).transform(train_test['test_Y'])

In [8]:
# Load the YouTokenToMe BPE model from disk; it is used below to split raw
# texts into subword pieces.
bpe = yttm.BPE(model='language-detection.model')

In [9]:
# Peek at one raw (un-encoded) training label.
train_test['train_Y'][8]

'malay'

In [10]:
# Split every training text into BPE subword pieces, then glue each piece
# sequence back into one space-separated string for the bag-of-words step.
subs = list(map(
    ' '.join,
    bpe.encode(train_test['train_X'], output_type=yttm.OutputType.SUBWORD),
))
len(subs)

8993612

In [11]:
# Same subword preprocessing as for the training texts, applied to the test texts.
test_subs = list(map(
    ' '.join,
    bpe.encode(train_test['test_X'], output_type=yttm.OutputType.SUBWORD),
))
len(test_subs)

2248403

In [12]:
# Vectorize the space-joined training subword strings with the loaded
# bag-of-words transformer (one row per text).
train_X = bow.transform(subs)

In [13]:
# Vectorize the test subword strings with the same transformer so train and
# test share one feature space.
test_X = bow.transform(test_subs)

In [14]:
import tensorflow as tf

  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])
  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])


In [15]:
def convert_sparse_matrix_to_sparse_tensor(X):
    """Turn a SciPy sparse matrix into the pair of sparse feeds the model expects.

    Args:
        X: a SciPy sparse matrix (anything exposing ``tocoo()``).

    Returns:
        A tuple ``(ids, weights)`` of ``tf.SparseTensorValue`` objects that share
        the same ``[row, col]`` indices and dense shape:
          * ``ids``     -- values are the column index of every stored entry,
          * ``weights`` -- values are the stored entries themselves.
    """
    coo = X.tocoo()
    # (nnz, 2) array of [row, col] pairs. np.column_stack replaces the
    # deprecated np.mat (removed in NumPy 2.0) and yields a plain ndarray
    # rather than an np.matrix.
    indices = np.column_stack((coo.row, coo.col))
    ids = tf.SparseTensorValue(indices, coo.col, coo.shape)
    weights = tf.SparseTensorValue(indices, coo.data, coo.shape)
    return ids, weights

In [16]:
class Model:
    """Linear classifier on top of a weighted mean of feature embeddings (TF 1.x graph).

    Every input row is a sparse set of feature ids with integer weights. The
    ids are looked up in an embedding table, combined per row with a weighted
    mean, and passed through a single dense layer that produces class logits.

    Args:
        learning_rate: step size for the Adam optimizer.
        dimension: width of each embedding vector.
        output: number of classes (size of the logits layer).
        vocab_size: number of rows in the embedding table. When left as
            ``None`` it falls back to ``train_X.shape[1]`` from the notebook
            namespace, which is what the class always did before; passing it
            explicitly lets the graph be built without the training matrix.

    Attributes:
        X: sparse int32 placeholder with the feature ids of each row.
        W: sparse int32 placeholder with the weight of each id.
        Y: int32 placeholder of shape [None] with the class label of each row.
        embed: per-row combined embedding.
        logits: un-normalised class scores.
        cost: mean sparse softmax cross-entropy over the batch.
        optimizer: Adam training op minimising ``cost``.
        accuracy: fraction of rows whose arg-max logit equals the label.
    """
    def __init__(self, learning_rate, dimension = 64, output = 6, vocab_size = None):
        if vocab_size is None:
            # Backward-compatible default: size the table from the global
            # training matrix, as the original implementation did.
            vocab_size = train_X.shape[1]
        self.X = tf.sparse_placeholder(tf.int32)
        self.W = tf.sparse_placeholder(tf.int32)
        self.Y = tf.placeholder(tf.int32, [None])
        # Variable creation order and (default) names are kept unchanged so
        # existing checkpoints still map onto this graph.
        embeddings = tf.Variable(tf.truncated_normal([vocab_size, dimension]))
        # combiner='mean' divides the weighted sum of embeddings by the sum of weights.
        embed = tf.nn.embedding_lookup_sparse(embeddings, self.X, self.W, combiner='mean')
        self.embed = embed
        self.logits = tf.layers.dense(embed, output)
        self.cost = tf.reduce_mean(tf.nn.sparse_softmax_cross_entropy_with_logits(
            logits = self.logits, labels = self.Y))
        self.optimizer = tf.train.AdamOptimizer(learning_rate = learning_rate).minimize(self.cost)
        correct_pred = tf.equal(tf.argmax(self.logits, 1,output_type=tf.int32), self.Y)
        self.accuracy = tf.reduce_mean(tf.cast(correct_pred, tf.float32))

In [17]:
# Start from an empty default graph so re-running this cell does not stack a
# second copy of the model on top of the previous one.
tf.reset_default_graph()
sess = tf.InteractiveSession()
# Learning rate 1e-3; embedding width and class count use Model's defaults.
model = Model(learning_rate = 1e-3)
init_op = tf.global_variables_initializer()
sess.run(init_op)

Instructions for updating:
Deprecated in favor of operator or tf.math.divide.
Instructions for updating:
Use keras.layers.dense instead.
Instructions for updating:
Call initializer instance with the dtype argument instead of passing it to the constructor


In [18]:
# Training hyper-parameters: rows per mini-batch and number of passes over the data.
batch_size, epoch = 256, 10

In [19]:
# Checkpoint only the trainable variables (optimizer slots are not saved).
saver = tf.train.Saver(tf.trainable_variables())
# Saver.save may fail when the parent directory of the checkpoint is missing,
# so make sure it exists first (no-op when it is already there).
os.makedirs('lang-detection-w', exist_ok = True)
saver.save(sess, 'lang-detection-w/model.ckpt')

'lang-detection-w/model.ckpt'

In [23]:
import time
from tqdm import tqdm

# Row counts do not change during training, so read them once.
n_train = train_X.shape[0]
n_test = test_X.shape[0]

# NOTE(review): mini-batches are visited in the same order every epoch (no
# shuffling) — confirm this is intended.
for e in range(epoch):
    lasttime = time.time()
    train_acc, train_loss, test_acc, test_loss = [], [], [], []

    # ---- training pass: one optimizer step per mini-batch ----
    pbar = tqdm(range(0, n_train, batch_size), desc = 'train minibatch loop')
    for i in pbar:
        stop = min(i + batch_size, n_train)
        # batch_x is an (ids, weights) pair of sparse feeds; batch_x and
        # batch_y stay notebook-level names and hold the last processed batch.
        batch_x = convert_sparse_matrix_to_sparse_tensor(train_X[i : stop])
        batch_y = train_Y[i : stop]
        acc, cost, _ = sess.run(
            [model.accuracy, model.cost, model.optimizer],
            feed_dict = {
                model.X: batch_x[0],
                model.W: batch_x[1],
                model.Y: batch_y
            },
        )
        # Stop immediately if the loss has diverged.
        assert not np.isnan(cost)
        train_loss.append(cost)
        train_acc.append(acc)
        pbar.set_postfix(cost = cost, accuracy = acc)

    # ---- evaluation pass: metrics only, no weight update ----
    pbar = tqdm(range(0, n_test, batch_size), desc = 'test minibatch loop')
    for i in pbar:
        stop = min(i + batch_size, n_test)
        batch_x = convert_sparse_matrix_to_sparse_tensor(test_X[i : stop])
        batch_y = test_Y[i : stop]
        acc, cost = sess.run(
            [model.accuracy, model.cost],
            feed_dict = {
                model.X: batch_x[0],
                model.W: batch_x[1],
                model.Y: batch_y
            },
        )
        test_loss.append(cost)
        test_acc.append(acc)
        pbar.set_postfix(cost = cost, accuracy = acc)

    # Per-batch averages: the final batch of each pass may be smaller than
    # batch_size, so these are not exactly per-example means.
    train_loss = np.mean(train_loss)
    train_acc = np.mean(train_acc)
    test_loss = np.mean(test_loss)
    test_acc = np.mean(test_acc)

    print('time taken:', time.time() - lasttime)
    print(
        'epoch: %d, training loss: %f, training acc: %f, valid loss: %f, valid acc: %f\n'
        % (e, train_loss, train_acc, test_loss, test_acc)
    )

train minibatch loop: 100%|██████████| 35132/35132 [04:23<00:00, 133.27it/s, accuracy=0.947, cost=0.284] 
test minibatch loop: 100%|██████████| 8783/8783 [00:42<00:00, 205.31it/s, accuracy=0.938, cost=0.274] 
train minibatch loop:   0%|          | 14/35132 [00:00<04:11, 139.67it/s, accuracy=0.953, cost=0.121] 

time taken: 306.4216287136078
epoch: 0, training loss: 0.156121, training acc: 0.959438, valid loss: 0.183558, valid acc: 0.952933



train minibatch loop: 100%|██████████| 35132/35132 [04:22<00:00, 133.92it/s, accuracy=0.947, cost=0.28]  
test minibatch loop: 100%|██████████| 8783/8783 [00:42<00:00, 206.69it/s, accuracy=0.938, cost=0.275] 
train minibatch loop:   0%|          | 14/35132 [00:00<04:14, 137.95it/s, accuracy=0.969, cost=0.119] 

time taken: 304.8448679447174
epoch: 1, training loss: 0.154890, training acc: 0.959840, valid loss: 0.184024, valid acc: 0.952879



train minibatch loop: 100%|██████████| 35132/35132 [04:22<00:00, 133.78it/s, accuracy=0.947, cost=0.274] 
test minibatch loop: 100%|██████████| 8783/8783 [00:42<00:00, 207.42it/s, accuracy=0.943, cost=0.276] 
train minibatch loop:   0%|          | 14/35132 [00:00<04:18, 135.66it/s, accuracy=0.969, cost=0.118] 

time taken: 304.9676516056061
epoch: 2, training loss: 0.153498, training acc: 0.960270, valid loss: 0.184866, valid acc: 0.952716



train minibatch loop: 100%|██████████| 35132/35132 [04:22<00:00, 133.64it/s, accuracy=0.947, cost=0.266] 
test minibatch loop: 100%|██████████| 8783/8783 [00:42<00:00, 206.05it/s, accuracy=0.938, cost=0.279] 
train minibatch loop:   0%|          | 14/35132 [00:00<04:12, 138.96it/s, accuracy=0.953, cost=0.125] 

time taken: 305.5243253707886
epoch: 3, training loss: 0.151917, training acc: 0.960743, valid loss: 0.186118, valid acc: 0.952455



train minibatch loop: 100%|██████████| 35132/35132 [04:22<00:00, 133.82it/s, accuracy=0.961, cost=0.256] 
test minibatch loop: 100%|██████████| 8783/8783 [00:43<00:00, 204.22it/s, accuracy=0.938, cost=0.283] 
train minibatch loop:   0%|          | 14/35132 [00:00<04:19, 135.48it/s, accuracy=0.953, cost=0.125] 

time taken: 305.5525336265564
epoch: 4, training loss: 0.150175, training acc: 0.961276, valid loss: 0.187784, valid acc: 0.952113



train minibatch loop: 100%|██████████| 35132/35132 [04:22<00:00, 134.04it/s, accuracy=0.961, cost=0.245] 
test minibatch loop: 100%|██████████| 8783/8783 [00:42<00:00, 207.00it/s, accuracy=0.938, cost=0.289] 
train minibatch loop:   0%|          | 14/35132 [00:00<04:12, 139.31it/s, accuracy=0.953, cost=0.123] 

time taken: 304.55069947242737
epoch: 5, training loss: 0.148361, training acc: 0.961827, valid loss: 0.189791, valid acc: 0.951668



train minibatch loop: 100%|██████████| 35132/35132 [04:22<00:00, 133.86it/s, accuracy=0.961, cost=0.235] 
test minibatch loop: 100%|██████████| 8783/8783 [00:42<00:00, 206.89it/s, accuracy=0.938, cost=0.295] 
train minibatch loop:   0%|          | 15/35132 [00:00<04:08, 141.22it/s, accuracy=0.957, cost=0.12]  

time taken: 304.92592549324036
epoch: 6, training loss: 0.146584, training acc: 0.962368, valid loss: 0.191855, valid acc: 0.951175



train minibatch loop: 100%|██████████| 35132/35132 [04:22<00:00, 134.05it/s, accuracy=0.961, cost=0.227] 
test minibatch loop: 100%|██████████| 8783/8783 [00:41<00:00, 209.41it/s, accuracy=0.943, cost=0.301] 
train minibatch loop:   0%|          | 15/35132 [00:00<04:07, 141.98it/s, accuracy=0.957, cost=0.116] 

time taken: 304.05493998527527
epoch: 7, training loss: 0.144904, training acc: 0.962868, valid loss: 0.193770, valid acc: 0.950830



train minibatch loop: 100%|██████████| 35132/35132 [04:21<00:00, 134.15it/s, accuracy=0.961, cost=0.222] 
test minibatch loop: 100%|██████████| 8783/8783 [00:42<00:00, 207.03it/s, accuracy=0.938, cost=0.306] 
train minibatch loop:   0%|          | 14/35132 [00:00<04:19, 135.35it/s, accuracy=0.965, cost=0.112] 

time taken: 304.331600189209
epoch: 8, training loss: 0.143346, training acc: 0.963358, valid loss: 0.195505, valid acc: 0.950540



train minibatch loop: 100%|██████████| 35132/35132 [04:22<00:00, 133.98it/s, accuracy=0.961, cost=0.218] 
test minibatch loop: 100%|██████████| 8783/8783 [00:42<00:00, 207.25it/s, accuracy=0.938, cost=0.311] 


time taken: 304.6103057861328
epoch: 9, training loss: 0.141917, training acc: 0.963811, valid loss: 0.197138, valid acc: 0.950260



In [24]:
# Inspect accuracy, loss and class probabilities on the final (possibly short)
# test mini-batch. The batch is rebuilt explicitly so this cell does not depend
# on variables left over from a previously executed loop.
last_start = ((test_X.shape[0] - 1) // batch_size) * batch_size
last_ids, last_weights = convert_sparse_matrix_to_sparse_tensor(test_X[last_start:])
last_y = test_Y[last_start:]
sess.run(
    [model.accuracy, model.cost, tf.nn.softmax(model.logits)],
    feed_dict = {
        model.X: last_ids,
        model.W: last_weights,
        model.Y: last_y
    },
)

[0.93838865,
 0.31134793,
 array([[5.0617434e-11, 2.0252114e-04, 9.9978167e-01, 1.1864756e-11,
         1.0962449e-09, 1.5879028e-05],
        [2.7475774e-10, 7.1108835e-03, 9.9227810e-01, 4.1108267e-10,
         5.5411442e-07, 6.1040366e-04],
        [2.9119096e-09, 6.4927568e-05, 9.9993169e-01, 1.5422963e-10,
         1.3789548e-07, 3.2100718e-06],
        ...,
        [1.9514712e-08, 9.9913657e-02, 8.8389254e-01, 8.2639948e-09,
         2.6031348e-06, 1.6191164e-02],
        [1.5951296e-34, 3.2408203e-16, 5.4073075e-16, 1.0000000e+00,
         2.1566860e-17, 3.7014437e-16],
        [1.5463880e-09, 4.0730190e-01, 4.3093303e-01, 1.9030123e-08,
         5.5830089e-05, 1.6170926e-01]], dtype=float32)]

In [25]:
# Write the current session's variables to the checkpoint path; this is the
# same path used by the earlier save, so that file is overwritten.
saver.save(sess, 'lang-detection-w/model.ckpt')

'lang-detection-w/model.ckpt'