In [1]:
import os
# Expose only CUDA device "3" to this process; this runs ahead of the later
# `import tensorflow` cell.
os.environ.update({"CUDA_VISIBLE_DEVICES": "3"})

In [2]:
import pickle
import youtokentome as yttm
import json
from sklearn.preprocessing import LabelEncoder
from sklearn.naive_bayes import ComplementNB
import numpy as np

In [3]:
# Restore the pickled object whose `.transform` is used below as the
# bag-of-words vectoriser.
# NOTE: unpickling can execute arbitrary code -- only load a file you trust.
with open('bow-language-detection.pkl', 'rb') as pkl_file:
    bow = pickle.load(pkl_file)

In [4]:
# Smoke test: vectorise a single, already subword-segmented sentence and
# display the resulting matrix summary.
sample_sentence = '▁dengan ▁stim ▁dan ▁pengeluaran'
v = bow.transform([sample_sentence])
v

<1x400000 sparse matrix of type '<class 'numpy.int64'>'
	with 4 stored elements in Compressed Sparse Row format>

In [5]:
# Read the dataset from JSON, then list the keys it contains.
with open('train-test.json') as dataset_file:
    train_test = json.load(dataset_file)

train_test.keys()

dict_keys(['train_X', 'test_X', 'train_Y', 'test_Y'])

In [6]:
# Turn the string training labels into integer class ids.
train_labels = train_test['train_Y']
train_Y = LabelEncoder().fit_transform(train_labels)

In [7]:
# Encode the test labels with the class -> id mapping derived from the
# TRAINING labels. Fitting a separate encoder on the test labels alone would
# silently shift the ids whenever the test split is missing a class, making
# test_Y disagree with the ids the model was trained on.
# LabelEncoder sorts its classes, so fitting on the unique training labels
# yields the same mapping as fitting on the full training label list.
# A test label that never occurs in training now raises ValueError instead of
# being mapped to a misleading id.
test_Y = LabelEncoder().fit(sorted(set(train_test['train_Y']))).transform(train_test['test_Y'])

In [8]:
# Load the YouTokenToMe BPE model used to split raw text into subwords.
bpe_model_path = 'language-detection.model'
bpe = yttm.BPE(model=bpe_model_path)

In [9]:
# Spot-check one raw (string, un-encoded) training label.
train_test['train_Y'][8]

'other'

In [10]:
# BPE-segment every training text and glue each subword list back into one
# space-separated string, the form passed to `bow.transform` below.
subs = list(map(
    ' '.join,
    bpe.encode(train_test['train_X'], output_type=yttm.OutputType.SUBWORD),
))
len(subs)

18918596

In [11]:
# Same subword segmentation + space-join as for the training texts, applied
# to the test texts.
test_subs = list(map(
    ' '.join,
    bpe.encode(train_test['test_X'], output_type=yttm.OutputType.SUBWORD),
))
len(test_subs)

4729650

In [12]:
# Vectorise the space-joined training subword strings with `bow`.
# `train_X.shape[1]` is later used to size the model's embedding table.
train_X = bow.transform(subs)

In [13]:
# Vectorise the test subword strings with the same `bow` object used for
# the training data.
test_X = bow.transform(test_subs)

In [14]:
import tensorflow as tf

In [15]:
def convert_sparse_matrix_to_sparse_tensor(X):
    """Split a SciPy sparse matrix into two sparse feed values.

    Parameters
    ----------
    X : sparse matrix
        Any object exposing ``.tocoo()`` (in this notebook, a row slice of
        the bag-of-words matrix).

    Returns
    -------
    tuple of (tf.SparseTensorValue, tf.SparseTensorValue)
        Both values share the same ``[row, col]`` indices and dense shape.
        The first carries, for every stored entry, its column index as the
        value; the second carries the stored entry itself. The stored
        entries are passed through unchanged (no clipping).
    """
    coo = X.tocoo()
    # (nnz, 2) array of [row, col] pairs. A plain ndarray is used instead of
    # np.mat(...).transpose(): np.mat is a legacy alias that was removed in
    # NumPy 2.0, and the resulting index layout is the same.
    indices = np.column_stack((coo.row, coo.col))
    return (
        tf.SparseTensorValue(indices, coo.col, coo.shape),
        tf.SparseTensorValue(indices, coo.data, coo.shape),
    )

In [16]:
class Model:
    """Sparse bag-of-subwords classifier: embedding lookup + one dense layer.

    Parameters
    ----------
    learning_rate : float
        Learning rate passed to the Adam optimizer.
    dimension : int, default 32
        Width of each embedding vector.
    output : int, default 6
        Number of logits produced by the dense layer.
    vocab_size : int or None, default None
        Number of rows in the embedding table. When None, falls back to
        ``train_X.shape[1]`` from the notebook namespace, which is what the
        class did unconditionally before this parameter existed.

    Attributes
    ----------
    X, W : sparse placeholders (int32) fed as ids and weights respectively
    Y : int32 placeholder of shape [None] holding the class ids
    embed : output of the sparse embedding lookup
    logits, cost, optimizer, accuracy : graph nodes for inference/training
    """
    def __init__(self, learning_rate, dimension = 32, output = 6, vocab_size = None):
        if vocab_size is None:
            # Backward-compatible default: size the table from the global
            # training matrix, exactly as before.
            vocab_size = train_X.shape[1]
        self.X = tf.sparse_placeholder(tf.int32)
        self.W = tf.sparse_placeholder(tf.int32)
        self.Y = tf.placeholder(tf.int32, [None])
        embeddings = tf.Variable(tf.truncated_normal([vocab_size, dimension]))
        # Combine, per row, the embeddings selected by X using the weights in
        # W with the 'mean' combiner.
        embed = tf.nn.embedding_lookup_sparse(embeddings, self.X, self.W, combiner='mean')
        self.embed = embed
        self.logits = tf.layers.dense(embed, output)
        self.cost = tf.reduce_mean(tf.nn.sparse_softmax_cross_entropy_with_logits(
            logits = self.logits, labels = self.Y))
        self.optimizer = tf.train.AdamOptimizer(learning_rate = learning_rate).minimize(self.cost)
        # Fraction of rows whose arg-max logit equals the fed label.
        correct_pred = tf.equal(tf.argmax(self.logits, 1,output_type=tf.int32), self.Y)
        self.accuracy = tf.reduce_mean(tf.cast(correct_pred, tf.float32))

In [17]:
# Clear the default graph before building the model, then open an
# interactive session and initialise all variables.
tf.reset_default_graph()
sess = tf.InteractiveSession()
model = Model(learning_rate = 1e-3)
init_op = tf.global_variables_initializer()
sess.run(init_op)

Instructions for updating:
Deprecated in favor of operator or tf.math.divide.
Instructions for updating:
Use keras.layers.Dense instead.
Instructions for updating:
Please use `layer.__call__` method instead.


In [18]:
# Minibatch size and number of passes over the training set.
batch_size, epoch = 256, 10

In [19]:
# Checkpoint only the trainable variables, and write the checkpoint once
# here, before the training loop runs.
saver = tf.train.Saver(var_list = tf.trainable_variables())
saver.save(sess, save_path = 'lang-detection-w/model.ckpt')

'lang-detection-w/model.ckpt'

In [20]:
import time
from tqdm import tqdm

# One iteration per epoch: a full training pass over train_X followed by an
# evaluation pass over test_X, both in contiguous minibatches of `batch_size`
# rows (no shuffling). Per-epoch metrics are the unweighted mean of the
# per-batch values, so a short final batch counts as much as a full one.
for e in range(epoch):
    lasttime = time.time()
    train_acc, train_loss, test_acc, test_loss = [], [], [], []

    # ---- training pass: runs the optimizer on every minibatch ----
    n_train = train_X.shape[0]
    pbar = tqdm(
        range(0, n_train, batch_size), desc = 'train minibatch loop'
    )
    for i in pbar:
        end = min(i + batch_size, n_train)
        # batch_x is (ids, weights): two sparse feed values for model.X / model.W.
        batch_x = convert_sparse_matrix_to_sparse_tensor(train_X[i : end])
        batch_y = train_Y[i : end]
        acc, cost, _ = sess.run(
            [model.accuracy, model.cost, model.optimizer],
            feed_dict = {
                model.X: batch_x[0],
                model.W: batch_x[1],
                model.Y: batch_y
            },
        )
        # Stop immediately if the loss diverged.
        assert not np.isnan(cost), 'training cost became NaN'
        train_loss.append(cost)
        train_acc.append(acc)
        pbar.set_postfix(cost = cost, accuracy = acc)

    # ---- evaluation pass: no optimizer step ----
    # The former `np.expand_dims(batch_x, axis = 1)` line was removed: its
    # result was never used, and it was applied to a tuple of ragged sparse
    # values, which recent NumPy versions refuse to turn into an array.
    n_test = test_X.shape[0]
    pbar = tqdm(range(0, n_test, batch_size), desc = 'test minibatch loop')
    for i in pbar:
        end = min(i + batch_size, n_test)
        batch_x = convert_sparse_matrix_to_sparse_tensor(test_X[i : end])
        batch_y = test_Y[i : end]
        acc, cost = sess.run(
            [model.accuracy, model.cost],
            feed_dict = {
                model.X: batch_x[0],
                model.W: batch_x[1],
                model.Y: batch_y
            },
        )
        test_loss.append(cost)
        test_acc.append(acc)
        pbar.set_postfix(cost = cost, accuracy = acc)

    train_loss = np.mean(train_loss)
    train_acc = np.mean(train_acc)
    test_loss = np.mean(test_loss)
    test_acc = np.mean(test_acc)

    print('time taken:', time.time() - lasttime)
    print(
        'epoch: %d, training loss: %f, training acc: %f, valid loss: %f, valid acc: %f\n'
        % (e, train_loss, train_acc, test_loss, test_acc)
    )

train minibatch loop: 100%|██████████| 73901/73901 [07:16<00:00, 169.40it/s, accuracy=0.974, cost=0.123] 
test minibatch loop: 100%|██████████| 18476/18476 [01:10<00:00, 260.37it/s, accuracy=0.98, cost=0.043]  
train minibatch loop:   0%|          | 18/73901 [00:00<07:14, 170.19it/s, accuracy=0.977, cost=0.0879]

time taken: 507.2389669418335
epoch: 0, training loss: 0.168605, training acc: 0.949238, valid loss: 0.103548, valid acc: 0.972824



train minibatch loop: 100%|██████████| 73901/73901 [07:16<00:00, 169.34it/s, accuracy=0.98, cost=0.101]  
test minibatch loop: 100%|██████████| 18476/18476 [01:11<00:00, 259.92it/s, accuracy=1, cost=0.0307]    
train minibatch loop:   0%|          | 18/73901 [00:00<07:07, 172.92it/s, accuracy=0.969, cost=0.103] 

time taken: 507.52713799476624
epoch: 1, training loss: 0.093084, training acc: 0.975911, valid loss: 0.094879, valid acc: 0.975816



train minibatch loop: 100%|██████████| 73901/73901 [07:15<00:00, 169.68it/s, accuracy=0.98, cost=0.0923] 
test minibatch loop: 100%|██████████| 18476/18476 [01:11<00:00, 259.97it/s, accuracy=1, cost=0.0261]    
train minibatch loop:   0%|          | 18/73901 [00:00<07:02, 174.90it/s, accuracy=0.973, cost=0.0773]

time taken: 506.637300491333
epoch: 2, training loss: 0.086510, training acc: 0.977947, valid loss: 0.092312, valid acc: 0.976671



train minibatch loop: 100%|██████████| 73901/73901 [07:16<00:00, 169.16it/s, accuracy=0.98, cost=0.0861] 
test minibatch loop: 100%|██████████| 18476/18476 [01:10<00:00, 260.24it/s, accuracy=1, cost=0.0229]    
train minibatch loop:   0%|          | 18/73901 [00:00<06:58, 176.73it/s, accuracy=0.969, cost=0.105] 

time taken: 507.90590500831604
epoch: 3, training loss: 0.083693, training acc: 0.978807, valid loss: 0.091118, valid acc: 0.977115



train minibatch loop: 100%|██████████| 73901/73901 [07:14<00:00, 169.92it/s, accuracy=0.98, cost=0.0815] 
test minibatch loop: 100%|██████████| 18476/18476 [01:11<00:00, 259.59it/s, accuracy=1, cost=0.0207]    
train minibatch loop:   0%|          | 18/73901 [00:00<07:11, 171.40it/s, accuracy=0.969, cost=0.105] 

time taken: 506.1323070526123
epoch: 4, training loss: 0.082089, training acc: 0.979305, valid loss: 0.090457, valid acc: 0.977387



train minibatch loop: 100%|██████████| 73901/73901 [07:15<00:00, 169.50it/s, accuracy=0.98, cost=0.078]  
test minibatch loop: 100%|██████████| 18476/18476 [01:10<00:00, 260.34it/s, accuracy=1, cost=0.0194]    
train minibatch loop:   0%|          | 18/73901 [00:00<07:11, 171.23it/s, accuracy=0.98, cost=0.0619] 

time taken: 506.99291372299194
epoch: 5, training loss: 0.081031, training acc: 0.979639, valid loss: 0.090023, valid acc: 0.977549



train minibatch loop: 100%|██████████| 73901/73901 [07:16<00:00, 169.14it/s, accuracy=0.98, cost=0.0742] 
test minibatch loop: 100%|██████████| 18476/18476 [01:11<00:00, 259.65it/s, accuracy=1, cost=0.0188]    
train minibatch loop:   0%|          | 18/73901 [00:00<07:07, 172.77it/s, accuracy=0.973, cost=0.103] 

time taken: 508.1200499534607
epoch: 6, training loss: 0.080218, training acc: 0.979908, valid loss: 0.089695, valid acc: 0.977713



train minibatch loop: 100%|██████████| 73901/73901 [07:14<00:00, 170.19it/s, accuracy=0.98, cost=0.0709] 
test minibatch loop: 100%|██████████| 18476/18476 [01:11<00:00, 259.44it/s, accuracy=1, cost=0.0187]    
train minibatch loop:   0%|          | 18/73901 [00:00<07:08, 172.29it/s, accuracy=0.977, cost=0.102] 

time taken: 505.47494888305664
epoch: 7, training loss: 0.079552, training acc: 0.980120, valid loss: 0.089535, valid acc: 0.977784



train minibatch loop: 100%|██████████| 73901/73901 [07:14<00:00, 170.15it/s, accuracy=0.98, cost=0.0685] 
test minibatch loop: 100%|██████████| 18476/18476 [01:10<00:00, 260.69it/s, accuracy=1, cost=0.0188]    
train minibatch loop:   0%|          | 18/73901 [00:00<06:59, 175.99it/s, accuracy=0.977, cost=0.101] 

time taken: 505.2287175655365
epoch: 8, training loss: 0.079030, training acc: 0.980288, valid loss: 0.089536, valid acc: 0.977805



train minibatch loop: 100%|██████████| 73901/73901 [07:16<00:00, 169.30it/s, accuracy=0.98, cost=0.0668] 
test minibatch loop: 100%|██████████| 18476/18476 [01:11<00:00, 259.80it/s, accuracy=1, cost=0.0193]    


time taken: 507.66829776763916
epoch: 9, training loss: 0.078613, training acc: 0.980419, valid loss: 0.089660, valid acc: 0.977789



In [21]:
# Re-evaluate the minibatch left in `batch_x` / `batch_y` by the loop above
# (the final test batch) and also fetch its softmax class probabilities.
last_batch_feed = {
    model.X: batch_x[0],
    model.W: batch_x[1],
    model.Y: batch_y,
}
sess.run(
    [model.accuracy, model.cost, tf.nn.softmax(model.logits)],
    feed_dict = last_batch_feed,
)

[1.0,
 0.019317048,
 array([[1.84335859e-05, 1.39014555e-09, 1.04053121e-04, 2.16701878e-07,
         9.99714434e-01, 1.62753495e-04],
        [9.76559639e-01, 5.64002767e-06, 6.04392244e-06, 2.34174430e-02,
         5.90871352e-08, 1.12169728e-05],
        [3.14420220e-07, 8.30683261e-02, 1.10712834e-01, 1.18992460e-09,
         1.53369969e-03, 8.04684877e-01],
        [1.15116722e-10, 1.63969863e-02, 9.56447959e-01, 4.57371727e-12,
         1.53386327e-06, 2.71535385e-02],
        [9.90425646e-01, 2.91815994e-08, 3.31482725e-13, 9.50208399e-03,
         6.22980920e-08, 7.21805773e-05],
        [3.67394226e-10, 1.41238458e-02, 9.78182018e-01, 1.79507581e-12,
         1.75307184e-06, 7.69246696e-03],
        [9.99271452e-01, 1.35744153e-06, 1.43437719e-04, 5.67995885e-04,
         8.24067513e-07, 1.49628686e-05],
        [9.84049559e-01, 1.24926373e-05, 1.76603135e-04, 1.55735407e-02,
         1.02394843e-04, 8.55438993e-05],
        [1.27655198e-09, 5.64305713e-09, 3.10502263e-10, 1.3

In [22]:
# Write the current variable values over the checkpoint saved before training.
saver.save(sess, save_path = 'lang-detection-w/model.ckpt')

'lang-detection-w/model.ckpt'

In [23]:
import boto3

# Destination bucket; the later upload cells reuse this name.
bucketName = 'huseinhouse-storage'

# `Key` is the LOCAL file path and `outPutname` is the object key in S3.
Key = 'lang-detection-w/model.ckpt.data-00000-of-00001'
outPutname = "v34/language-detection/model.ckpt.data-00000-of-00001"

s3 = boto3.client('s3')
s3.upload_file(Filename = Key, Bucket = bucketName, Key = outPutname)

In [24]:
# Upload the checkpoint index file (`Key` = local path, `outPutname` = S3 key).
Key = 'lang-detection-w/model.ckpt.index'
outPutname = "v34/language-detection/model.ckpt.index"

s3 = boto3.client('s3')
s3.upload_file(Filename = Key, Bucket = bucketName, Key = outPutname)

In [25]:
# Upload the checkpoint .meta file (`Key` = local path, `outPutname` = S3 key).
Key = 'lang-detection-w/model.ckpt.meta'
outPutname = "v34/language-detection/model.ckpt.meta"

s3 = boto3.client('s3')
s3.upload_file(Filename = Key, Bucket = bucketName, Key = outPutname)

In [26]:
# Upload the pickled bag-of-words object loaded at the top of the notebook
# (`Key` = local path, `outPutname` = S3 key).
Key = 'bow-language-detection.pkl'
outPutname = "v34/language-detection/bow-language-detection.pkl"

s3 = boto3.client('s3')
s3.upload_file(Filename = Key, Bucket = bucketName, Key = outPutname)

In [27]:
# Upload the BPE model file used for subword segmentation
# (`Key` = local path, `outPutname` = S3 key).
Key = 'language-detection.model'
outPutname = "v34/language-detection/language-detection.model"

s3 = boto3.client('s3')
s3.upload_file(Filename = Key, Bucket = bucketName, Key = outPutname)