In [1]:
import tensorflow as tf
import numpy as np
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
import os
import re
import pickle
import json

# Characters replaced by a space: ASCII digits plus a fixed set of ASCII punctuation.
# Written as a raw string so that `\-` reaches the regex engine verbatim instead of
# being an invalid Python string escape (which newer Python versions warn about).
_NOISE_CHARS = re.compile(r"""[0-9!@#$%^&*()_\-+{}|~`'";:?/.>,<]""", flags=re.UNICODE)
# Runs of plain spaces only; tabs and newlines are deliberately left untouched.
_SPACE_RUN = re.compile(r'[ ]+')


def clean_text(string):
    """Normalise a sentence before it is vectorised.

    The text is lower-cased, every ASCII digit and every punctuation character
    listed in ``_NOISE_CHARS`` is replaced by a space, runs of spaces are
    collapsed to a single space and leading/trailing whitespace is stripped.

    Args:
        string: the raw text.

    Returns:
        The cleaned, lower-cased text (``''`` for empty input).
    """
    lowered = string.lower()
    without_noise = _NOISE_CHARS.sub(' ', lowered)
    return _SPACE_RUN.sub(' ', without_noise).strip()

In [2]:
# JSON text is UTF-8 by specification; state the encoding explicitly so decoding the
# multilingual corpus does not depend on the platform's default locale.
with open('language-detection-data-v5.json', 'r', encoding = 'utf-8') as fopen:
    loaded = json.load(fopen)

# The file only needs to stay open while it is parsed; cleaning happens afterwards.
sentences = [clean_text(text) for text in loaded['text']]
langs = loaded['label']
# sentences[i] and langs[i] are later vectorised / encoded and split together,
# so they must line up one-to-one.
assert len(sentences) == len(langs), 'text and label lists must have the same length'

In [3]:
# Restore the already-fitted vectorizer from disk (it is only ever used through
# `.transform` in this notebook; presumably a character-level bag-of-words, going by
# its name).
# NOTE(review): unpickling can execute arbitrary code - only load this file when it
# comes from a trusted source.
with open('language-detection-vectorizer.pkl', 'rb') as vectorizer_file:
    bow_chars = pickle.load(vectorizer_file)

In [4]:
%%time
target = LabelEncoder().fit_transform(langs)
features = bow_chars.transform(sentences)
features.shape

CPU times: user 1min 22s, sys: 124 ms, total: 1min 22s
Wall time: 1min 22s


In [5]:
# Fix the seed so the 80/20 split - and therefore every metric reported further
# down - is the same after a kernel restart.
SPLIT_SEED = 42
train_X, test_X, train_Y, test_Y = train_test_split(
    features, target, test_size = 0.2, random_state = SPLIT_SEED
)
# The full matrix is not used again; drop the reference to free memory.
del features

In [6]:
from sklearn import metrics

In [7]:
# Shape of the training split: (number of training rows, number of vectorizer features).
# The second value is the number of rows the model's embedding table is built with.
train_X.shape

(166343, 660726)

In [8]:
def convert_sparse_matrix_to_sparse_tensor(X, limit = 5):
    """Convert a SciPy sparse matrix into the two sparse inputs fed to the model.

    Args:
        X: SciPy sparse matrix (anything providing ``tocoo()``), one row per sample.
        limit: upper bound for the stored values; anything larger is clipped
            down to ``limit``.

    Returns:
        A pair ``(ids, weights)`` of ``tf.SparseTensorValue`` objects that share
        the same ``(row, col)`` indices and dense shape. The values of ``ids``
        are the column indices of the stored entries; the values of ``weights``
        are the clipped entries themselves.
    """
    coo = X.tocoo()
    # One (row, col) pair per stored entry. A plain 2-D ndarray replaces the
    # former `np.mat(...)` call, which NumPy 2.0 no longer provides.
    indices = np.stack([coo.row, coo.col], axis = 1)
    # Clip on a copy: `tocoo()` can hand back `X` itself when `X` is already in
    # COO format, and the caller's matrix must not be modified.
    weights = coo.data.copy()
    weights[weights > limit] = limit
    return (
        tf.SparseTensorValue(indices, coo.col, coo.shape),
        tf.SparseTensorValue(indices, weights, coo.shape),
    )

In [9]:
class Model:
    """Sparse-input classifier: weighted embedding lookup followed by one dense layer.

    Built with the TensorFlow 1.x graph API; constructing an instance adds its
    ops to the current default graph.
    """

    def __init__(self, learning_rate, vocab_size = None, embedding_dim = 40, num_classes = 4):
        """Create placeholders, variables, loss, optimizer and accuracy ops.

        Args:
            learning_rate: learning rate passed to the Adam optimizer.
            vocab_size: number of rows of the embedding table. When omitted it
                falls back to ``train_X.shape[1]`` from the notebook namespace,
                which is what the original hard-coded behaviour used.
            embedding_dim: width of each embedding vector (default 40).
            num_classes: number of output logits (default 4).
        """
        if vocab_size is None:
            vocab_size = train_X.shape[1]

        # Ops are created in the same order as before and without explicit
        # names, so the auto-generated node names ('Placeholder*', 'Variable',
        # 'dense/...') - and with them existing checkpoints - stay valid.

        # Sparse feature ids, one row per sample.
        self.X = tf.sparse_placeholder(tf.int32)
        # Sparse per-feature weights with the same indices as self.X.
        self.W = tf.sparse_placeholder(tf.int32)
        # Integer class label per sample.
        self.Y = tf.placeholder(tf.int32, [None])

        embeddings = tf.Variable(tf.truncated_normal([vocab_size, embedding_dim]))
        # Per-row combination of the looked-up embeddings using the 'mean' combiner.
        embed = tf.nn.embedding_lookup_sparse(embeddings, self.X, self.W, combiner = 'mean')
        self.logits = tf.layers.dense(embed, num_classes)

        # Mean softmax cross-entropy over the batch.
        self.cost = tf.reduce_mean(
            tf.nn.sparse_softmax_cross_entropy_with_logits(
                logits = self.logits, labels = self.Y
            )
        )
        self.optimizer = tf.train.AdamOptimizer(learning_rate = learning_rate).minimize(self.cost)

        # Fraction of rows whose arg-max logit equals the label.
        correct_pred = tf.equal(tf.argmax(self.logits, 1, output_type = tf.int32), self.Y)
        self.accuracy = tf.reduce_mean(tf.cast(correct_pred, tf.float32))

In [10]:
# Seed the graph before any variable is created so that the random weight
# initialisation is repeatable between runs.
tf.set_random_seed(42)
# Learning rate handed to the model's Adam optimizer.
LEARNING_RATE = 1e-4

sess = tf.InteractiveSession()
model = Model(LEARNING_RATE)
sess.run(tf.global_variables_initializer())

  "Converting sparse IndexedSlices to a dense Tensor of unknown shape. "


In [11]:
# Saver.save does not create a missing parent directory, so make sure the
# checkpoint folder exists before the first save.
os.makedirs('lang-detection-w', exist_ok = True)
# Only the trainable variables are stored; the optimizer's own variables are left out.
saver = tf.train.Saver(tf.trainable_variables())
# Checkpoint of the freshly initialised (still untrained) weights.
saver.save(sess, 'lang-detection-w/model.ckpt')

'lang-detection-w/model.ckpt'

In [12]:
# Collect the names of the graph nodes of interest: variables, placeholders and
# anything with 'logits' in its name, skipping nodes that belong to the Adam
# optimizer (names containing 'Adam' or 'beta').
graph_nodes = tf.get_default_graph().as_graph_def().node
selected_names = []
for node in graph_nodes:
    wanted = (
        'Variable' in node.op
        or 'Placeholder' in node.name
        or 'logits' in node.name
    )
    optimizer_related = 'Adam' in node.name or 'beta' in node.name
    if wanted and not optimizer_related:
        selected_names.append(node.name)

# Kept as a single comma-separated string; the split below is only for display.
strings = ','.join(selected_names)
strings.split(',')

['Placeholder',
 'Placeholder_1',
 'Placeholder_2',
 'Placeholder_3',
 'Placeholder_4',
 'Placeholder_5',
 'Placeholder_6',
 'Variable',
 'dense/kernel',
 'dense/bias']

In [13]:
# List the trainable variables of the default graph - the same collection the
# Saver above was constructed with.
tf.trainable_variables()

[<tf.Variable 'Variable:0' shape=(660726, 40) dtype=float32_ref>,
 <tf.Variable 'dense/kernel:0' shape=(40, 4) dtype=float32_ref>,
 <tf.Variable 'dense/bias:0' shape=(4,) dtype=float32_ref>]

In [14]:
import time
from tqdm import tqdm

batch_size = 64
num_epochs = 20


def _run_minibatches(X, Y, desc, training):
    """Run one full pass over ``(X, Y)`` in mini-batches of ``batch_size`` rows.

    Args:
        X: sparse feature matrix, one row per sample.
        Y: integer labels aligned with the rows of ``X``.
        desc: text shown on the progress bar.
        training: when true, the optimizer step is run as well and a NaN loss
            aborts the pass.

    Returns:
        ``(mean_loss, mean_accuracy)`` averaged over samples, so that a short
        final batch contributes in proportion to its size.
    """
    fetches = [model.accuracy, model.cost]
    if training:
        fetches.append(model.optimizer)

    loss_sum, acc_sum = 0.0, 0.0
    pbar = tqdm(range(0, X.shape[0], batch_size), desc = desc)
    for i in pbar:
        # Slicing clamps at the end of the data, so no explicit min() is needed.
        batch_ids, batch_weights = convert_sparse_matrix_to_sparse_tensor(X[i : i + batch_size])
        batch_y = Y[i : i + batch_size]
        acc, cost = sess.run(
            fetches,
            feed_dict = {
                model.X: batch_ids,
                model.W: batch_weights,
                model.Y: batch_y
            },
        )[:2]
        if training:
            assert not np.isnan(cost)
        # Weight each batch by its row count; the last batch is usually shorter.
        loss_sum += cost * len(batch_y)
        acc_sum += acc * len(batch_y)
        pbar.set_postfix(cost = cost, accuracy = acc)

    return loss_sum / X.shape[0], acc_sum / X.shape[0]


for e in range(num_epochs):
    lasttime = time.time()
    train_loss, train_acc = _run_minibatches(
        train_X, train_Y, 'train minibatch loop', training = True
    )
    # NOTE(review): the recorded run printed `valid loss: nan` for every epoch.
    # As before, evaluation does not assert on NaN. Presumably some evaluation
    # rows carry no stored features - TODO confirm and handle explicitly.
    test_loss, test_acc = _run_minibatches(
        test_X, test_Y, 'test minibatch loop', training = False
    )

    print('time taken:', time.time() - lasttime)
    print(
        'epoch: %d, training loss: %f, training acc: %f, valid loss: %f, valid acc: %f\n'
        % (e, train_loss, train_acc, test_loss, test_acc)
    )

train minibatch loop: 100%|██████████| 2600/2600 [01:19<00:00, 33.00it/s, accuracy=0.857, cost=0.732]
test minibatch loop: 100%|██████████| 650/650 [00:03<00:00, 211.46it/s, accuracy=0.94, cost=0.558] 
train minibatch loop:   0%|          | 4/2600 [00:00<01:19, 32.75it/s, accuracy=0.75, cost=0.814] 

time taken: 82.36624956130981
epoch: 0, training loss: 1.042807, training acc: 0.739603, valid loss: nan, valid acc: 0.870874



train minibatch loop: 100%|██████████| 2600/2600 [01:19<00:00, 33.05it/s, accuracy=1, cost=0.309]    
test minibatch loop: 100%|██████████| 650/650 [00:03<00:00, 211.62it/s, accuracy=0.98, cost=0.204] 
train minibatch loop:   0%|          | 4/2600 [00:00<01:18, 32.88it/s, accuracy=0.891, cost=0.442]

time taken: 82.26686263084412
epoch: 1, training loss: 0.420440, training acc: 0.917406, valid loss: nan, valid acc: 0.951756



train minibatch loop: 100%|██████████| 2600/2600 [01:19<00:00, 33.09it/s, accuracy=1, cost=0.144]     
test minibatch loop: 100%|██████████| 650/650 [00:03<00:00, 211.66it/s, accuracy=0.98, cost=0.0981] 
train minibatch loop:   0%|          | 4/2600 [00:00<01:18, 32.91it/s, accuracy=0.953, cost=0.275]

time taken: 82.24010252952576
epoch: 2, training loss: 0.196528, training acc: 0.970849, valid loss: nan, valid acc: 0.979674



train minibatch loop: 100%|██████████| 2600/2600 [01:19<00:00, 32.97it/s, accuracy=1, cost=0.0669]    
test minibatch loop: 100%|██████████| 650/650 [00:03<00:00, 210.46it/s, accuracy=1, cost=0.0534]    
train minibatch loop:   0%|          | 4/2600 [00:00<01:19, 32.74it/s, accuracy=0.953, cost=0.193] 

time taken: 82.17026662826538
epoch: 3, training loss: 0.105076, training acc: 0.985867, valid loss: nan, valid acc: 0.987784



train minibatch loop: 100%|██████████| 2600/2600 [01:19<00:00, 33.01it/s, accuracy=1, cost=0.0318]    
test minibatch loop: 100%|██████████| 650/650 [00:03<00:00, 211.34it/s, accuracy=1, cost=0.0312]    
train minibatch loop:   0%|          | 4/2600 [00:00<01:18, 32.96it/s, accuracy=0.953, cost=0.151] 

time taken: 82.29047513008118
epoch: 4, training loss: 0.062966, training acc: 0.990598, valid loss: nan, valid acc: 0.991103



train minibatch loop: 100%|██████████| 2600/2600 [01:19<00:00, 32.93it/s, accuracy=1, cost=0.0163]    
test minibatch loop: 100%|██████████| 650/650 [00:03<00:00, 211.20it/s, accuracy=1, cost=0.0194]    
train minibatch loop:   0%|          | 4/2600 [00:00<01:18, 32.92it/s, accuracy=0.953, cost=0.125] 

time taken: 82.23359727859497
epoch: 5, training loss: 0.042258, training acc: 0.993159, valid loss: nan, valid acc: 0.993051



train minibatch loop: 100%|██████████| 2600/2600 [01:19<00:00, 33.00it/s, accuracy=1, cost=0.00904]   
test minibatch loop: 100%|██████████| 650/650 [00:03<00:00, 212.27it/s, accuracy=1, cost=0.0128]    
train minibatch loop:   0%|          | 4/2600 [00:00<01:18, 32.87it/s, accuracy=0.969, cost=0.108] 

time taken: 82.21385073661804
epoch: 6, training loss: 0.030973, training acc: 0.994511, valid loss: nan, valid acc: 0.994036



train minibatch loop: 100%|██████████| 2600/2600 [01:19<00:00, 33.08it/s, accuracy=1, cost=0.00537]   
test minibatch loop: 100%|██████████| 650/650 [00:03<00:00, 211.77it/s, accuracy=1, cost=0.00886]   
train minibatch loop:   0%|          | 4/2600 [00:00<01:19, 32.82it/s, accuracy=0.969, cost=0.0947]

time taken: 82.13079595565796
epoch: 7, training loss: 0.024160, training acc: 0.995479, valid loss: nan, valid acc: 0.994878



train minibatch loop: 100%|██████████| 2600/2600 [01:19<00:00, 32.97it/s, accuracy=1, cost=0.00337]   
test minibatch loop: 100%|██████████| 650/650 [00:03<00:00, 212.59it/s, accuracy=1, cost=0.00643]   
train minibatch loop:   0%|          | 4/2600 [00:00<01:18, 32.87it/s, accuracy=0.969, cost=0.0848]

time taken: 82.28797364234924
epoch: 8, training loss: 0.019677, training acc: 0.996183, valid loss: nan, valid acc: 0.995527



train minibatch loop: 100%|██████████| 2600/2600 [01:19<00:00, 33.04it/s, accuracy=1, cost=0.00221]   
test minibatch loop: 100%|██████████| 650/650 [00:03<00:00, 214.02it/s, accuracy=1, cost=0.00483]   
train minibatch loop:   0%|          | 4/2600 [00:00<01:19, 32.86it/s, accuracy=0.969, cost=0.0769]

time taken: 82.11047410964966
epoch: 9, training loss: 0.016514, training acc: 0.996748, valid loss: nan, valid acc: 0.995864



train minibatch loop: 100%|██████████| 2600/2600 [01:19<00:00, 33.02it/s, accuracy=1, cost=0.0015]    
test minibatch loop: 100%|██████████| 650/650 [00:03<00:00, 213.58it/s, accuracy=1, cost=0.00375]   
train minibatch loop:   0%|          | 4/2600 [00:00<01:18, 33.02it/s, accuracy=0.969, cost=0.0705]

time taken: 82.07126927375793
epoch: 10, training loss: 0.014159, training acc: 0.997277, valid loss: nan, valid acc: 0.996080



train minibatch loop: 100%|██████████| 2600/2600 [01:19<00:00, 32.91it/s, accuracy=1, cost=0.00104]   
test minibatch loop: 100%|██████████| 650/650 [00:03<00:00, 212.99it/s, accuracy=1, cost=0.00298]   
train minibatch loop:   0%|          | 4/2600 [00:00<01:19, 32.82it/s, accuracy=0.969, cost=0.0651]

time taken: 82.13535642623901
epoch: 11, training loss: 0.012330, training acc: 0.997601, valid loss: nan, valid acc: 0.996417



train minibatch loop: 100%|██████████| 2600/2600 [01:19<00:00, 33.03it/s, accuracy=1, cost=0.000745]  
test minibatch loop: 100%|██████████| 650/650 [00:03<00:00, 214.46it/s, accuracy=1, cost=0.00241]   
train minibatch loop:   0%|          | 4/2600 [00:00<01:18, 32.89it/s, accuracy=0.969, cost=0.0605]

time taken: 82.07676434516907
epoch: 12, training loss: 0.010864, training acc: 0.997842, valid loss: nan, valid acc: 0.996802



train minibatch loop: 100%|██████████| 2600/2600 [01:19<00:00, 33.11it/s, accuracy=1, cost=0.000542]  
test minibatch loop: 100%|██████████| 650/650 [00:02<00:00, 216.87it/s, accuracy=1, cost=0.00199]   
train minibatch loop:   0%|          | 4/2600 [00:00<01:18, 33.09it/s, accuracy=0.969, cost=0.0565]

time taken: 82.01633334159851
epoch: 13, training loss: 0.009658, training acc: 0.998100, valid loss: nan, valid acc: 0.996874



train minibatch loop: 100%|██████████| 2600/2600 [01:18<00:00, 33.13it/s, accuracy=1, cost=0.000401]  
test minibatch loop: 100%|██████████| 650/650 [00:03<00:00, 214.32it/s, accuracy=1, cost=0.00167]   
train minibatch loop:   0%|          | 4/2600 [00:00<01:18, 32.94it/s, accuracy=0.984, cost=0.053]

time taken: 81.9391098022461
epoch: 14, training loss: 0.008647, training acc: 0.998317, valid loss: nan, valid acc: 0.996970



train minibatch loop: 100%|██████████| 2600/2600 [01:19<00:00, 32.96it/s, accuracy=1, cost=0.000301]  
test minibatch loop: 100%|██████████| 650/650 [00:03<00:00, 215.30it/s, accuracy=1, cost=0.00142]   
train minibatch loop:   0%|          | 4/2600 [00:00<01:18, 32.95it/s, accuracy=0.984, cost=0.0498]

time taken: 82.04596328735352
epoch: 15, training loss: 0.007786, training acc: 0.998563, valid loss: nan, valid acc: 0.997066



train minibatch loop: 100%|██████████| 2600/2600 [01:18<00:00, 33.06it/s, accuracy=1, cost=0.000229]  
test minibatch loop: 100%|██████████| 650/650 [00:03<00:00, 215.03it/s, accuracy=1, cost=0.00122]   
train minibatch loop:   0%|          | 4/2600 [00:00<01:18, 32.87it/s, accuracy=0.984, cost=0.0469]

time taken: 81.97728490829468
epoch: 16, training loss: 0.007043, training acc: 0.998707, valid loss: nan, valid acc: 0.997211



train minibatch loop: 100%|██████████| 2600/2600 [01:19<00:00, 33.06it/s, accuracy=1, cost=0.000176]  
test minibatch loop: 100%|██████████| 650/650 [00:03<00:00, 214.48it/s, accuracy=1, cost=0.00106]   
train minibatch loop:   0%|          | 4/2600 [00:00<01:19, 32.84it/s, accuracy=0.984, cost=0.0442]

time taken: 82.04397535324097
epoch: 17, training loss: 0.006396, training acc: 0.998828, valid loss: nan, valid acc: 0.997283



train minibatch loop: 100%|██████████| 2600/2600 [01:19<00:00, 33.05it/s, accuracy=1, cost=0.000138]  
test minibatch loop: 100%|██████████| 650/650 [00:03<00:00, 215.15it/s, accuracy=1, cost=0.000927]  
train minibatch loop:   0%|          | 4/2600 [00:00<01:18, 33.01it/s, accuracy=0.984, cost=0.0417]

time taken: 82.10938167572021
epoch: 18, training loss: 0.005828, training acc: 0.999002, valid loss: nan, valid acc: 0.997331



train minibatch loop: 100%|██████████| 2600/2600 [01:19<00:00, 33.10it/s, accuracy=1, cost=0.000109]  
test minibatch loop: 100%|██████████| 650/650 [00:03<00:00, 214.98it/s, accuracy=1, cost=0.000821]  

time taken: 82.04319453239441
epoch: 19, training loss: 0.005327, training acc: 0.999152, valid loss: nan, valid acc: 0.997475






In [18]:
# Gather true labels and arg-max predictions over the whole held-out split.
real_Y, predict_Y = [], []

pbar = tqdm(
    range(0, test_X.shape[0], batch_size), desc = 'validation minibatch loop'
)
for i in pbar:
    # Slicing clamps at the end of the data, so no explicit min() is needed.
    batch_ids, batch_weights = convert_sparse_matrix_to_sparse_tensor(test_X[i : i + batch_size])
    # The logits depend only on the two sparse inputs, so labels are not fed.
    batch_logits = sess.run(
        model.logits, feed_dict = {model.X: batch_ids, model.W: batch_weights}
    )
    predict_Y.extend(np.argmax(batch_logits, 1).tolist())
    real_Y.extend(test_Y[i : i + batch_size].tolist())

# Every row must have received exactly one prediction, otherwise the report
# computed from these lists would compare misaligned entries.
assert len(real_Y) == len(predict_Y), 'number of predictions does not match number of labels'



validation minibatch loop:   0%|          | 0/650 [00:00<?, ?it/s][A[A

validation minibatch loop:   3%|▎         | 21/650 [00:00<00:03, 205.41it/s][A[A

validation minibatch loop:   8%|▊         | 51/650 [00:00<00:02, 225.54it/s][A[A

validation minibatch loop:  13%|█▎        | 83/650 [00:00<00:02, 246.72it/s][A[A

validation minibatch loop:  18%|█▊        | 115/650 [00:00<00:02, 262.74it/s][A[A

validation minibatch loop:  23%|██▎       | 147/650 [00:00<00:01, 275.87it/s][A[A

validation minibatch loop:  27%|██▋       | 178/650 [00:00<00:01, 281.85it/s][A[A

validation minibatch loop:  32%|███▏      | 210/650 [00:00<00:01, 291.31it/s][A[A

validation minibatch loop:  37%|███▋      | 241/650 [00:00<00:01, 295.55it/s][A[A

validation minibatch loop:  42%|████▏     | 272/650 [00:00<00:01, 297.95it/s][A[A

validation minibatch loop:  47%|████▋     | 305/650 [00:01<00:01, 304.03it/s][A[A

validation minibatch loop:  52%|█████▏    | 337/650 [00:01<00:01, 307.32it/s]

In [20]:
# Display names for the integer class codes, listed in code order (0, 1, 2, 3).
# NOTE(review): assumes this order matches the codes produced when the labels
# were encoded - confirm against the encoder's classes.
class_names = ['other','english','indonesian','malay']
report = metrics.classification_report(real_Y, predict_Y, target_names = class_names)
print(report)

              precision    recall  f1-score   support

       other       1.00      0.99      0.99      9445
     english       1.00      1.00      1.00      9987
  indonesian       1.00      1.00      1.00     11518
       malay       1.00      1.00      1.00     10636

   micro avg       1.00      1.00      1.00     41586
   macro avg       1.00      1.00      1.00     41586
weighted avg       1.00      1.00      1.00     41586



In [21]:
# Hand-picked sample sentences used as a quick smoke test of the trained model
# in the next cell (one per variable; the variable name states the language).
chinese_text = '今天是６月１８号，也是Muiriel的生日！'
english_text = 'i totally love it man'
indon_text = 'menjabat saleh perombakan menjabat periode komisi energi fraksi partai pengurus partai periode periode partai terpilih periode menjabat komisi perdagangan investasi persatuan periode'
malay_text = 'beliau berkata program Inisitif Peduli Rakyat (IPR) yang diperkenalkan oleh kerajaan negeri Selangor lebih besar sumbangannya'

In [24]:
# The training sentences went through clean_text before being vectorised, so the
# samples must get the same normalisation; otherwise the model sees punctuation,
# digits and casing it was never trained on.
sample_texts = [chinese_text, english_text, indon_text, malay_text]
transformed = bow_chars.transform([clean_text(text) for text in sample_texts])
batch_x = convert_sparse_matrix_to_sparse_tensor(transformed)
# Predicted class code per sample (arg-max over the logits).
np.argmax(sess.run(model.logits, feed_dict = {model.X: batch_x[0], model.W: batch_x[1]}),axis=1)

array([0, 1, 2, 3])

In [25]:
# Re-display the training matrix shape (rows, vectorizer features) for reference.
train_X.shape

(166343, 660726)

In [26]:
# Save the session's current (trained) weights to the same checkpoint path that
# was used for the save made before training.
saver.save(sess, 'lang-detection-w/model.ckpt')

'lang-detection-w/model.ckpt'