In [1]:
import os
# Restrict this kernel to GPU index 3. TensorFlow is only imported in a later
# cell; presumably this is set first so it is in place before CUDA is
# initialised (TODO confirm on the target machine).
os.environ["CUDA_VISIBLE_DEVICES"]="3"

In [2]:
import pickle
import youtokentome as yttm
import json
from sklearn.preprocessing import LabelEncoder
from sklearn.naive_bayes import ComplementNB
import numpy as np

In [3]:
# Load the pre-fitted bag-of-words vectorizer from disk.
# NOTE(review): pickle.load can execute arbitrary code embedded in the file —
# only load artifacts that come from a trusted source.
with open('bow-language-detection.pkl', 'rb') as fopen:
    bow = pickle.load(fopen)

In [4]:
# Sanity check: vectorize one string of space-separated subword pieces and
# display the resulting sparse row.
v = bow.transform(['▁dengan ▁stim ▁dan ▁pengeluaran'])
v

<1x400000 sparse matrix of type '<class 'numpy.int64'>'
	with 4 stored elements in Compressed Sparse Row format>

In [5]:
# Read the pre-split dataset from JSON, then display its top-level keys.
with open('train-test.json') as dataset_file:
    train_test = json.load(dataset_file)

train_test.keys()

dict_keys(['train_X', 'test_X', 'train_Y', 'test_Y'])

In [6]:
# Encode the training labels (strings) as integer class ids.
# NOTE(review): the fitted encoder is not kept, so the id -> label mapping is
# not stored anywhere in this notebook.
train_Y = LabelEncoder().fit_transform(train_test['train_Y'])

In [7]:
# Encode the test labels with an encoder fitted on the *training* labels so
# both splits share a single label -> id mapping. Fitting an independent
# encoder on the test labels would silently shift the ids if the test split
# were missing a class. transform() raises ValueError for a label that never
# appeared in training, which surfaces the problem instead of hiding it.
test_Y = LabelEncoder().fit(train_test['train_Y']).transform(train_test['test_Y'])

In [8]:
# Load the trained YouTokenToMe BPE model used to split text into subwords.
bpe = yttm.BPE(model='language-detection.model')

In [9]:
# Peek at one raw training label to see the label format.
train_test['train_Y'][8]

'rojak'

In [10]:
# Tokenize every training text into BPE subword pieces, then glue each piece
# list back together with single spaces (the input format `bow.transform`
# is given below). The last line displays the number of samples.
subs = list(map(
    ' '.join,
    bpe.encode(train_test['train_X'], output_type=yttm.OutputType.SUBWORD),
))
len(subs)

9092852

In [11]:
# Tokenize every test text into BPE subword pieces, then glue each piece list
# back together with single spaces (the input format `bow.transform` is given
# below). The last line displays the number of samples.
test_subs = list(map(
    ' '.join,
    bpe.encode(train_test['test_X'], output_type=yttm.OutputType.SUBWORD),
))
len(test_subs)

2273214

In [12]:
# Vectorize the space-joined training subwords with the pre-fitted vectorizer.
train_X = bow.transform(subs)

In [13]:
# Vectorize the space-joined test subwords with the same pre-fitted vectorizer.
test_X = bow.transform(test_subs)

In [14]:
import tensorflow as tf

  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])
  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])


In [15]:
def convert_sparse_matrix_to_sparse_tensor(X):
    """Turn a SciPy sparse matrix into the two sparse feeds used by the model.

    Parameters
    ----------
    X : scipy.sparse matrix
        Any sparse matrix exposing ``tocoo()``.

    Returns
    -------
    tuple of tf.SparseTensorValue
        ``(ids, weights)``. Both share the same ``(row, col)`` index array and
        the dense shape of ``X``. The values of ``ids`` are the column index
        of each stored entry; the values of ``weights`` are the stored entries
        themselves.
    """
    coo = X.tocoo()
    # Build a plain (nnz, 2) ndarray of [row, col] pairs. The former
    # ``np.mat([...]).transpose()`` produced an ``np.matrix`` and relies on an
    # alias that no longer exists in NumPy 2.0.
    indices = np.column_stack((coo.row, coo.col))
    return (
        tf.SparseTensorValue(indices, coo.col, coo.shape),
        tf.SparseTensorValue(indices, coo.data, coo.shape),
    )

In [16]:
class Model:
    """Sparse bag-of-subwords classifier: embedding lookup + one dense layer.

    Attributes set by the constructor:
        X, W      -- int32 sparse placeholders for feature ids and per-feature
                     weights (the two values returned by
                     ``convert_sparse_matrix_to_sparse_tensor``).
        Y         -- int32 placeholder of shape [None] for class ids.
        embed     -- output of ``tf.nn.embedding_lookup_sparse`` with
                     ``combiner='mean'``.
        logits    -- dense projection of ``embed`` to ``output`` units.
        cost      -- mean sparse softmax cross-entropy over the batch.
        optimizer -- Adam train op minimizing ``cost``.
        accuracy  -- mean of (argmax(logits) == Y) over the batch.
    """

    def __init__(self, learning_rate, dimension = 32, output = 6, vocab_size = None):
        """Build the graph in the current default TensorFlow graph.

        Args:
            learning_rate: learning rate passed to ``tf.train.AdamOptimizer``.
            dimension: width of each embedding row.
            output: number of output units (classes).
            vocab_size: number of rows in the embedding table. When omitted,
                falls back to ``train_X.shape[1]`` from the notebook's global
                scope, which is what this class always used before the
                parameter existed.
        """
        if vocab_size is None:
            # Backward-compatible default; pass vocab_size explicitly to build
            # the model without the training matrix in scope.
            vocab_size = train_X.shape[1]
        self.X = tf.sparse_placeholder(tf.int32)
        self.W = tf.sparse_placeholder(tf.int32)
        self.Y = tf.placeholder(tf.int32, [None])
        # Variable creation order is kept as-is so the auto-generated variable
        # names in saved checkpoints do not change.
        embeddings = tf.Variable(tf.truncated_normal([vocab_size, dimension]))
        embed = tf.nn.embedding_lookup_sparse(embeddings, self.X, self.W, combiner='mean')
        self.embed = embed
        self.logits = tf.layers.dense(embed, output)
        self.cost = tf.reduce_mean(tf.nn.sparse_softmax_cross_entropy_with_logits(
            logits = self.logits, labels = self.Y))
        self.optimizer = tf.train.AdamOptimizer(learning_rate = learning_rate).minimize(self.cost)
        correct_pred = tf.equal(tf.argmax(self.logits, 1,output_type=tf.int32), self.Y)
        self.accuracy = tf.reduce_mean(tf.cast(correct_pred, tf.float32))

In [17]:
# Start from an empty default graph so re-running this cell does not add a
# second copy of the model to the previous graph.
tf.reset_default_graph()
# NOTE(review): this session is not closed in any cell visible here.
sess = tf.InteractiveSession()
model = Model(1e-3)  # learning rate 1e-3; remaining arguments use their defaults
sess.run(tf.global_variables_initializer())

Instructions for updating:
Deprecated in favor of operator or tf.math.divide.
Instructions for updating:
Use keras.layers.dense instead.
Instructions for updating:
Call initializer instance with the dtype argument instead of passing it to the constructor


In [18]:
# Training hyper-parameters: samples per minibatch and number of full passes.
batch_size, epoch = 256, 10

In [19]:
# Checkpoint only the trainable variables, and write a first checkpoint right
# away (before any training step has run).
saver = tf.train.Saver(tf.trainable_variables())
saver.save(sess, 'lang-detection-w/model.ckpt')

'lang-detection-w/model.ckpt'

In [20]:
import time
from tqdm import tqdm

# Train for `epoch` passes over the training matrix and evaluate on the full
# test matrix after each pass. `batch_x` / `batch_y` are deliberately left as
# top-level names: a later cell re-feeds the last evaluation batch.
for e in range(epoch):
    lasttime = time.time()
    train_acc, train_loss, test_acc, test_loss = [], [], [], []

    # ---- training pass (runs the optimizer op) ----
    n_train = train_X.shape[0]
    pbar = tqdm(
        range(0, n_train, batch_size), desc = 'train minibatch loop'
    )
    for i in pbar:
        stop = min(i + batch_size, n_train)  # last batch may be shorter
        batch_x = convert_sparse_matrix_to_sparse_tensor(train_X[i : stop])
        batch_y = train_Y[i : stop]
        acc, cost, _ = sess.run(
            [model.accuracy, model.cost, model.optimizer],
            feed_dict = {
                model.X: batch_x[0],
                model.W: batch_x[1],
                model.Y: batch_y
            },
        )
        # Abort immediately if the loss diverged.
        assert not np.isnan(cost)
        train_loss.append(cost)
        train_acc.append(acc)
        # refresh=False: let tqdm redraw on its own schedule instead of forcing
        # a redraw per batch, which floods the notebook output channel.
        pbar.set_postfix(cost = cost, accuracy = acc, refresh = False)

    # ---- evaluation pass (no optimizer op, weights are not updated) ----
    n_test = test_X.shape[0]
    pbar = tqdm(range(0, n_test, batch_size), desc = 'test minibatch loop')
    for i in pbar:
        stop = min(i + batch_size, n_test)
        batch_x = convert_sparse_matrix_to_sparse_tensor(test_X[i : stop])
        batch_y = test_Y[i : stop]
        acc, cost = sess.run(
            [model.accuracy, model.cost],
            feed_dict = {
                model.X: batch_x[0],
                model.W: batch_x[1],
                model.Y: batch_y
            },
        )
        test_loss.append(cost)
        test_acc.append(acc)
        pbar.set_postfix(cost = cost, accuracy = acc, refresh = False)

    # Per-epoch summaries are unweighted means over the per-batch values.
    train_loss = np.mean(train_loss)
    train_acc = np.mean(train_acc)
    test_loss = np.mean(test_loss)
    test_acc = np.mean(test_acc)

    print('time taken:', time.time() - lasttime)
    print(
        'epoch: %d, training loss: %f, training acc: %f, valid loss: %f, valid acc: %f\n'
        % (e, train_loss, train_acc, test_loss, test_acc)
    )

train minibatch loop: 100%|██████████| 35519/35519 [04:08<00:00, 142.94it/s, accuracy=0.984, cost=0.0713]
test minibatch loop: 100%|██████████| 8880/8880 [00:43<00:00, 206.07it/s, accuracy=0.963, cost=0.128] 
train minibatch loop:   0%|          | 16/35519 [00:00<03:53, 151.91it/s, accuracy=0.953, cost=0.141] 

time taken: 291.5948407649994
epoch: 0, training loss: 0.242081, training acc: 0.922339, valid loss: 0.128911, valid acc: 0.964310



train minibatch loop: 100%|██████████| 35519/35519 [04:06<00:00, 144.01it/s, accuracy=0.984, cost=0.0498]
test minibatch loop: 100%|██████████| 8880/8880 [00:42<00:00, 208.91it/s, accuracy=0.968, cost=0.122] 
train minibatch loop:   0%|          | 15/35519 [00:00<04:08, 142.64it/s, accuracy=0.969, cost=0.38]  

time taken: 289.1633069515228
epoch: 1, training loss: 0.109026, training acc: 0.970937, valid loss: 0.108970, valid acc: 0.971439



train minibatch loop: 100%|██████████| 35519/35519 [04:06<00:00, 143.89it/s, accuracy=0.988, cost=0.0421]
test minibatch loop: 100%|██████████| 8880/8880 [00:42<00:00, 207.95it/s, accuracy=0.968, cost=0.125] 
train minibatch loop:   0%|          | 16/35519 [00:00<03:44, 157.88it/s, accuracy=0.98, cost=0.0763] 

time taken: 289.5675256252289
epoch: 2, training loss: 0.094672, training acc: 0.975500, valid loss: 0.103545, valid acc: 0.973257



train minibatch loop: 100%|██████████| 35519/35519 [04:06<00:00, 143.81it/s, accuracy=0.988, cost=0.0384]
test minibatch loop: 100%|██████████| 8880/8880 [00:42<00:00, 209.77it/s, accuracy=0.968, cost=0.128] 
train minibatch loop:   0%|          | 17/35519 [00:00<03:35, 164.92it/s, accuracy=0.965, cost=0.113] 

time taken: 289.3350875377655
epoch: 3, training loss: 0.088695, training acc: 0.977267, valid loss: 0.101133, valid acc: 0.974143



train minibatch loop: 100%|██████████| 35519/35519 [04:05<00:00, 144.79it/s, accuracy=0.992, cost=0.0364]
test minibatch loop: 100%|██████████| 8880/8880 [00:42<00:00, 206.77it/s, accuracy=0.968, cost=0.13]  
train minibatch loop:   0%|          | 15/35519 [00:00<04:02, 146.50it/s, accuracy=0.973, cost=0.346] 

time taken: 288.2864398956299
epoch: 4, training loss: 0.085259, training acc: 0.978300, valid loss: 0.099764, valid acc: 0.974620



train minibatch loop: 100%|██████████| 35519/35519 [04:06<00:00, 144.10it/s, accuracy=0.992, cost=0.0351]
test minibatch loop: 100%|██████████| 8880/8880 [00:43<00:00, 206.32it/s, accuracy=0.968, cost=0.132] 
train minibatch loop:   0%|          | 15/35519 [00:00<03:59, 148.02it/s, accuracy=0.961, cost=0.0981]

time taken: 289.55000376701355
epoch: 5, training loss: 0.082959, training acc: 0.978992, valid loss: 0.098903, valid acc: 0.974944



train minibatch loop: 100%|██████████| 35519/35519 [04:05<00:00, 144.56it/s, accuracy=0.992, cost=0.034] 
test minibatch loop: 100%|██████████| 8880/8880 [00:42<00:00, 209.00it/s, accuracy=0.963, cost=0.133] 
train minibatch loop:   0%|          | 17/35519 [00:00<03:40, 160.93it/s, accuracy=0.984, cost=0.0651]

time taken: 288.22125267982483
epoch: 6, training loss: 0.081302, training acc: 0.979492, valid loss: 0.098345, valid acc: 0.975168



train minibatch loop: 100%|██████████| 35519/35519 [03:54<00:00, 151.44it/s, accuracy=0.992, cost=0.0332]
test minibatch loop: 100%|██████████| 8880/8880 [00:41<00:00, 215.55it/s, accuracy=0.963, cost=0.134] 
train minibatch loop:   0%|          | 16/35519 [00:00<03:51, 153.48it/s, accuracy=0.984, cost=0.0635]

time taken: 275.7643311023712
epoch: 7, training loss: 0.080048, training acc: 0.979871, valid loss: 0.097943, valid acc: 0.975329



train minibatch loop: 100%|██████████| 35519/35519 [04:03<00:00, 145.78it/s, accuracy=0.992, cost=0.0324]
test minibatch loop:  28%|██▊       | 2459/8880 [00:12<00:31, 204.05it/s, accuracy=0.973, cost=0.0906]IOPub message rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_msg_rate_limit`.

Current values:
NotebookApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
NotebookApp.rate_limit_window=3.0 (secs)

test minibatch loop: 100%|██████████| 8880/8880 [00:43<00:00, 202.52it/s, accuracy=0.963, cost=0.136] 
train minibatch loop:   0%|          | 15/35519 [00:00<03:59, 147.95it/s, accuracy=0.961, cost=0.0907]

time taken: 287.51899456977844
epoch: 8, training loss: 0.079045, training acc: 0.980166, valid loss: 0.097645, valid acc: 0.975452



train minibatch loop:   4%|▎         | 1296/35519 [00:09<03:58, 143.54it/s, accuracy=0.984, cost=0.101] IOPub message rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_msg_rate_limit`.

Current values:
NotebookApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
NotebookApp.rate_limit_window=3.0 (secs)

train minibatch loop:  25%|██▍       | 8791/35519 [01:01<03:07, 142.61it/s, accuracy=0.984, cost=0.0453]IOPub message rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_msg_rate_limit`.

Current values:
NotebookApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
NotebookApp.rate_limit_window=3.0 (secs)

train minibatch loop:  46%|████▌     | 16395/35519 [01:54<02:10, 146.05it/s, accuracy=0.977, cost=0.108] IOPub message rate exceeded.
The note

time taken: 291.4929618835449
epoch: 9, training loss: 0.078234, training acc: 0.980402, valid loss: 0.097460, valid acc: 0.975538



In [28]:
# Re-feed the most recent evaluation batch (`batch_x` / `batch_y` still hold
# the values from the last iteration of the training cell) and display its
# accuracy, loss and per-class softmax probabilities.
last_batch_feed = {
    model.X: batch_x[0],
    model.W: batch_x[1],
    model.Y: batch_y,
}
sess.run(
    [model.accuracy, model.cost, tf.nn.softmax(model.logits)],
    feed_dict = last_batch_feed,
)

[0.9631579,
 0.13742845,
 array([[8.4514604e-11, 5.7017021e-03, 9.9155653e-01, 2.0396769e-09,
         3.6607255e-06, 2.7380467e-03],
        [1.9640605e-14, 4.1212270e-04, 9.9814296e-01, 1.6417166e-10,
         1.9403663e-08, 1.4447921e-03],
        [4.0854372e-11, 1.0252681e-03, 9.9603099e-01, 3.1995379e-10,
         2.6191387e-06, 2.9410752e-03],
        ...,
        [2.7619373e-07, 1.1631415e-02, 6.8352171e-03, 1.5306705e-06,
         1.0520320e-05, 9.8152107e-01],
        [3.1336927e-07, 4.7213722e-05, 9.9055862e-01, 1.7231294e-07,
         6.8178160e-06, 9.3869381e-03],
        [8.4683719e-13, 1.0000000e+00, 8.2563373e-11, 3.6112133e-12,
         2.8948026e-09, 2.9940570e-09]], dtype=float32)]

In [29]:
# Overwrite the checkpoint written before training with the trained weights.
saver.save(sess, 'lang-detection-w/model.ckpt')

'lang-detection-w/model.ckpt'

In [30]:
import boto3

# Upload the checkpoint data file to S3. `Key` is the local path and
# `outPutname` is the destination object key inside the bucket.
bucketName = 'huseinhouse-storage'
Key = 'lang-detection-w/model.ckpt.data-00000-of-00001'
outPutname = "v33/language-detection/model.ckpt.data-00000-of-00001"

s3 = boto3.client('s3')
s3.upload_file(Filename = Key, Bucket = bucketName, Key = outPutname)

In [31]:
# Upload the checkpoint index file to S3 (local path -> object key).
Key = 'lang-detection-w/model.ckpt.index'
outPutname = "v33/language-detection/model.ckpt.index"

s3 = boto3.client('s3')
s3.upload_file(Filename = Key, Bucket = bucketName, Key = outPutname)

In [32]:
# Upload the checkpoint meta file to S3 (local path -> object key).
Key = 'lang-detection-w/model.ckpt.meta'
outPutname = "v33/language-detection/model.ckpt.meta"

s3 = boto3.client('s3')
s3.upload_file(Filename = Key, Bucket = bucketName, Key = outPutname)

In [33]:
# Upload the pickled bag-of-words vectorizer to S3 (local path -> object key).
Key = 'bow-language-detection.pkl'
outPutname = "v33/language-detection/bow-language-detection.pkl"

s3 = boto3.client('s3')
s3.upload_file(Filename = Key, Bucket = bucketName, Key = outPutname)

In [34]:
# Upload the BPE tokenizer model to S3 (local path -> object key).
Key = 'language-detection.model'
outPutname = "v33/language-detection/language-detection.model"

s3 = boto3.client('s3')
s3.upload_file(Filename = Key, Bucket = bucketName, Key = outPutname)