In [1]:
import re
import numpy as np
import pandas as pd
import collections
from sklearn import metrics
from sklearn.cross_validation import train_test_split
import tensorflow as tf
import pandas as pd
from unidecode import unidecode
from nltk.util import ngrams
from tqdm import tqdm
import time



In [2]:
permulaan = [
    'bel',
    'se',
    'ter',
    'men',
    'meng',
    'mem',
    'memper',
    'di',
    'pe',
    'me',
    'ke',
    'ber',
    'pen',
    'per',
]

hujung = ['kan', 'kah', 'lah', 'tah', 'nya', 'an', 'wan', 'wati', 'ita']

def naive_stemmer(word):
    assert isinstance(word, str), 'input must be a string'
    hujung_result = [e for e in hujung if word.endswith(e)]
    if len(hujung_result):
        hujung_result = max(hujung_result, key = len)
        if len(hujung_result):
            word = word[: -len(hujung_result)]
    permulaan_result = [e for e in permulaan if word.startswith(e)]
    if len(permulaan_result):
        permulaan_result = max(permulaan_result, key = len)
        if len(permulaan_result):
            word = word[len(permulaan_result) :]
    return word

def build_dataset(words, n_words):
    count = [['GO', 0], ['PAD', 1], ['EOS', 2], ['UNK', 3]]
    counter = collections.Counter(words).most_common(n_words)
    count.extend(counter)
    dictionary = dict()
    for word, _ in count:
        dictionary[word] = len(dictionary)
    data = list()
    unk_count = 0
    for word in words:
        index = dictionary.get(word, 3)
        if index == 0:
            unk_count += 1
        data.append(index)
    count[0][1] = unk_count
    reversed_dictionary = dict(zip(dictionary.values(), dictionary.keys()))
    return data, count, dictionary, reversed_dictionary


def classification_textcleaning(string):
    string = re.sub(
        'http\S+|www.\S+',
        '',
        ' '.join(
            [i for i in string.split() if i.find('#') < 0 and i.find('@') < 0]
        ),
    )
    string = unidecode(string).replace('.', ' . ').replace(',', ' , ')
    string = re.sub('[^A-Za-z ]+', ' ', string)
    string = re.sub(r'[ ]+', ' ', string).strip()
    string = ' '.join(
        [i for i in re.findall('[\\w\']+|[;:\-\(\)&.,!?"]', string) if len(i)]
    )
    string = string.lower().split()
    string = [naive_stemmer(word) for word in string]
    return ' '.join([word for word in string if len(word) > 1])


def str_idx(corpus, dic, maxlen, UNK = 3):
    X = np.zeros((len(corpus), maxlen))
    for i in range(len(corpus)):
        for no, k in enumerate(corpus[i].split()[:maxlen][::-1]):
            X[i, -1 - no] = dic.get(k, UNK)
    return X

In [3]:
classification_textcleaning('kerajaan sebenarnya sangat bencikan rakyatnya, minyak naik dan segalanya')

'raja benar sangat benci rakyat minyak naik gala'

In [4]:
df = pd.read_csv('toxic-bm.csv')
df = df.dropna()
df.shape

(40911, 7)

In [5]:
texts = df.iloc[:,0].tolist()

In [6]:
for i in range(len(texts)):
    texts[i] = classification_textcleaning(texts[i])

In [7]:
concat = ' '.join(texts).split()
vocabulary_size = len(list(set(concat)))
data, count, dictionary, rev_dictionary = build_dataset(concat, vocabulary_size)
print('vocab from size: %d'%(vocabulary_size))
print('Most common words', count[4:10])
print('Sample data', data[:10], [rev_dictionary[i] for i in data[:10]])

vocab from size: 56964
Most common words [('yang', 103249), ('anda', 68711), ('tidak', 54325), ('untuk', 50521), ('ada', 39375), ('saya', 32581)]
Sample data [69, 14, 208, 4, 41, 124, 277, 2543, 6095, 74] ['jelas', 'apa', 'gedit', 'yang', 'buat', 'bawah', 'minat', 'tegar', 'tallica', 'nama']


In [8]:
max_features = len(dictionary)
maxlen = 100
batch_size = 32
embedded_size = 256

In [9]:
max_features
X = str_idx(texts, dictionary, maxlen)

In [10]:
list_classes = ["toxic", "severe_toxic", "obscene", "threat", "insult", "identity_hate"]
Y = df[list_classes].values
Y.shape

(40911, 6)

In [11]:
train_X, test_X, train_Y, test_Y = train_test_split(X, Y, test_size = 0.2)

In [12]:
class Model:
    def __init__(
        self, embedded_size, dict_size, dimension_output, learning_rate
    ):

        self.X = tf.placeholder(tf.int32, [None, None])
        self.Y = tf.placeholder(tf.float32, [None, dimension_output])
        encoder_embeddings = tf.Variable(
            tf.random_uniform([dict_size, embedded_size], -1, 1)
        )
        encoder_embedded = tf.nn.embedding_lookup(encoder_embeddings, self.X)
        self.logits = tf.identity(
            tf.layers.dense(
                tf.reduce_mean(encoder_embedded, 1), dimension_output
            ),
            name = 'logits',
        )
        self.cost = tf.reduce_mean(
            tf.nn.sigmoid_cross_entropy_with_logits(
                logits = self.logits, labels = self.Y
            )
        )
        self.optimizer = tf.train.AdamOptimizer(learning_rate).minimize(
            self.cost
        )
        correct_prediction = tf.equal(tf.round(tf.nn.sigmoid(self.logits)), tf.round(self.Y))
        all_labels_true = tf.reduce_min(tf.cast(correct_prediction, tf.float32), 1)
        self.accuracy = tf.reduce_mean(all_labels_true)

In [13]:
tf.reset_default_graph()
sess = tf.InteractiveSession()
model = Model(embedded_size, max_features, Y.shape[1], 5e-4)
sess.run(tf.global_variables_initializer())
saver = tf.train.Saver(tf.trainable_variables())
saver.save(sess, 'fast-text/model.ckpt')

'fast-text/model.ckpt'

In [14]:
strings = ','.join(
    [
        n.name
        for n in tf.get_default_graph().as_graph_def().node
        if ('Variable' in n.op
        or 'Placeholder' in n.name
        or 'logits' in n.name)
        and 'Adam' not in n.name
        and 'beta' not in n.name
    ]
)
strings.split(',')

['Placeholder',
 'Placeholder_1',
 'Variable',
 'dense/kernel',
 'dense/bias',
 'logits']

In [15]:
tf.trainable_variables()

[<tf.Variable 'Variable:0' shape=(56968, 256) dtype=float32_ref>,
 <tf.Variable 'dense/kernel:0' shape=(256, 6) dtype=float32_ref>,
 <tf.Variable 'dense/bias:0' shape=(6,) dtype=float32_ref>]

In [16]:
EARLY_STOPPING, CURRENT_CHECKPOINT, CURRENT_ACC, EPOCH = 5, 0, 0, 0

while True:
    lasttime = time.time()
    if CURRENT_CHECKPOINT == EARLY_STOPPING:
        print('break epoch:%d\n' % (EPOCH))
        break

    train_acc, train_loss, test_acc, test_loss = 0, 0, 0, 0
    pbar = tqdm(
        range(0, len(train_X), batch_size), desc = 'train minibatch loop'
    )
    for i in pbar:
        batch_x = train_X[i : min(i + batch_size, train_X.shape[0])]
        batch_y = train_Y[i : min(i + batch_size, train_X.shape[0])]
        acc, cost, _ = sess.run(
            [model.accuracy, model.cost, model.optimizer],
            feed_dict = {
                model.X: batch_x,
                model.Y: batch_y
            },
        )
        assert not np.isnan(cost)
        train_loss += cost
        train_acc += acc
        pbar.set_postfix(cost = cost, accuracy = acc)

    pbar = tqdm(range(0, len(test_X), batch_size), desc = 'test minibatch loop')
    for i in pbar:
        batch_x = test_X[i : min(i + batch_size, test_X.shape[0])]
        batch_y = test_Y[i : min(i + batch_size, test_X.shape[0])]
        acc, cost = sess.run(
            [model.accuracy, model.cost],
            feed_dict = {
                model.X: batch_x,
                model.Y: batch_y
            },
        )
        test_loss += cost
        test_acc += acc
        pbar.set_postfix(cost = cost, accuracy = acc)

    train_loss /= len(train_X) / batch_size
    train_acc /= len(train_X) / batch_size
    test_loss /= len(test_X) / batch_size
    test_acc /= len(test_X) / batch_size

    if test_acc > CURRENT_ACC:
        print(
            'epoch: %d, pass acc: %f, current acc: %f'
            % (EPOCH, CURRENT_ACC, test_acc)
        )
        CURRENT_ACC = test_acc
        CURRENT_CHECKPOINT = 0
    else:
        CURRENT_CHECKPOINT += 1

    print('time taken:', time.time() - lasttime)
    print(
        'epoch: %d, training loss: %f, training acc: %f, valid loss: %f, valid acc: %f\n'
        % (EPOCH, train_loss, train_acc, test_loss, test_acc)
    )
    EPOCH += 1

saver.save(sess, "fast-text/model.ckpt")

train minibatch loop: 100%|██████████| 1023/1023 [00:17<00:00, 59.28it/s, accuracy=0.833, cost=0.176] 
test minibatch loop: 100%|██████████| 256/256 [00:00<00:00, 623.44it/s, accuracy=0.87, cost=0.15]   
train minibatch loop:   1%|          | 6/1023 [00:00<00:16, 59.94it/s, accuracy=0.812, cost=0.26] 

epoch: 0, pass acc: 0.000000, current acc: 0.900749
time taken: 17.67021131515503
epoch: 0, training loss: 0.175856, training acc: 0.888678, valid loss: 0.126303, valid acc: 0.900749



train minibatch loop: 100%|██████████| 1023/1023 [00:17<00:00, 58.55it/s, accuracy=0.833, cost=0.159] 
test minibatch loop: 100%|██████████| 256/256 [00:00<00:00, 580.00it/s, accuracy=0.87, cost=0.139]  
train minibatch loop:   1%|          | 6/1023 [00:00<00:17, 58.19it/s, accuracy=0.844, cost=0.196] 

time taken: 17.9156494140625
epoch: 1, training loss: 0.126398, training acc: 0.895584, valid loss: 0.116331, valid acc: 0.900749



train minibatch loop: 100%|██████████| 1023/1023 [00:26<00:00, 38.49it/s, accuracy=0.833, cost=0.13]  
test minibatch loop: 100%|██████████| 256/256 [00:00<00:00, 357.52it/s, accuracy=0.87, cost=0.133]  
train minibatch loop:   0%|          | 3/1023 [00:00<00:34, 29.89it/s, accuracy=0.812, cost=0.124] 

time taken: 27.295220375061035
epoch: 2, training loss: 0.115411, training acc: 0.895798, valid loss: 0.108283, valid acc: 0.900626



train minibatch loop: 100%|██████████| 1023/1023 [00:28<00:00, 35.89it/s, accuracy=0.875, cost=0.105] 
test minibatch loop: 100%|██████████| 256/256 [00:00<00:00, 355.35it/s, accuracy=0.913, cost=0.129] 
train minibatch loop:   0%|          | 4/1023 [00:00<00:26, 38.99it/s, accuracy=0.844, cost=0.127] 

epoch: 3, pass acc: 0.900749, current acc: 0.901530
time taken: 29.228374481201172
epoch: 3, training loss: 0.105398, training acc: 0.896572, valid loss: 0.102002, valid acc: 0.901530



train minibatch loop: 100%|██████████| 1023/1023 [00:28<00:00, 35.90it/s, accuracy=0.875, cost=0.0883]
test minibatch loop: 100%|██████████| 256/256 [00:00<00:00, 354.32it/s, accuracy=0.87, cost=0.128]  
train minibatch loop:   0%|          | 4/1023 [00:00<00:29, 34.43it/s, accuracy=0.844, cost=0.117] 

epoch: 4, pass acc: 0.901530, current acc: 0.901971
time taken: 29.222991704940796
epoch: 4, training loss: 0.097251, training acc: 0.897763, valid loss: 0.097868, valid acc: 0.901971



train minibatch loop: 100%|██████████| 1023/1023 [00:28<00:00, 35.89it/s, accuracy=0.917, cost=0.077] 
test minibatch loop: 100%|██████████| 256/256 [00:00<00:00, 348.90it/s, accuracy=0.87, cost=0.127]  
train minibatch loop:   0%|          | 4/1023 [00:00<00:26, 38.90it/s, accuracy=0.844, cost=0.11]  

epoch: 5, pass acc: 0.901971, current acc: 0.902704
time taken: 29.240747928619385
epoch: 5, training loss: 0.090905, training acc: 0.899240, valid loss: 0.095145, valid acc: 0.902704



train minibatch loop: 100%|██████████| 1023/1023 [00:28<00:00, 35.80it/s, accuracy=0.917, cost=0.0689]
test minibatch loop: 100%|██████████| 256/256 [00:00<00:00, 347.63it/s, accuracy=0.87, cost=0.126]  
train minibatch loop:   0%|          | 4/1023 [00:00<00:30, 33.60it/s, accuracy=0.844, cost=0.104] 

epoch: 6, pass acc: 0.902704, current acc: 0.902826
time taken: 29.31447458267212
epoch: 6, training loss: 0.085769, training acc: 0.900982, valid loss: 0.093267, valid acc: 0.902826



train minibatch loop: 100%|██████████| 1023/1023 [00:28<00:00, 35.87it/s, accuracy=0.958, cost=0.0629]
test minibatch loop: 100%|██████████| 256/256 [00:00<00:00, 348.21it/s, accuracy=0.87, cost=0.124]  
train minibatch loop:   0%|          | 4/1023 [00:00<00:31, 32.30it/s, accuracy=0.812, cost=0.1]   

epoch: 7, pass acc: 0.902826, current acc: 0.903315
time taken: 29.26025366783142
epoch: 7, training loss: 0.081451, training acc: 0.902703, valid loss: 0.091942, valid acc: 0.903315



train minibatch loop: 100%|██████████| 1023/1023 [00:28<00:00, 35.87it/s, accuracy=0.958, cost=0.0584]
test minibatch loop: 100%|██████████| 256/256 [00:00<00:00, 357.90it/s, accuracy=0.87, cost=0.122]  
train minibatch loop:   0%|          | 4/1023 [00:00<00:28, 35.55it/s, accuracy=0.812, cost=0.0968]

epoch: 8, pass acc: 0.903315, current acc: 0.904415
time taken: 29.236499071121216
epoch: 8, training loss: 0.077724, training acc: 0.904934, valid loss: 0.091003, valid acc: 0.904415



train minibatch loop: 100%|██████████| 1023/1023 [00:28<00:00, 35.75it/s, accuracy=0.958, cost=0.055] 
test minibatch loop: 100%|██████████| 256/256 [00:00<00:00, 350.84it/s, accuracy=0.87, cost=0.119]  
train minibatch loop:   0%|          | 4/1023 [00:00<00:29, 34.19it/s, accuracy=0.812, cost=0.0937]

epoch: 9, pass acc: 0.904415, current acc: 0.904904
time taken: 29.344812393188477
epoch: 9, training loss: 0.074444, training acc: 0.906309, valid loss: 0.090348, valid acc: 0.904904



train minibatch loop: 100%|██████████| 1023/1023 [00:28<00:00, 35.58it/s, accuracy=0.958, cost=0.0524]
test minibatch loop: 100%|██████████| 256/256 [00:00<00:00, 347.77it/s, accuracy=0.87, cost=0.116]  
train minibatch loop:   0%|          | 5/1023 [00:00<00:23, 42.49it/s, accuracy=0.906, cost=0.0409]

epoch: 10, pass acc: 0.904904, current acc: 0.905148
time taken: 29.26092839241028
epoch: 10, training loss: 0.071510, training acc: 0.908631, valid loss: 0.089916, valid acc: 0.905148



train minibatch loop: 100%|██████████| 1023/1023 [00:28<00:00, 36.33it/s, accuracy=0.958, cost=0.0502]
test minibatch loop: 100%|██████████| 256/256 [00:00<00:00, 349.04it/s, accuracy=0.87, cost=0.114]  
train minibatch loop:   0%|          | 4/1023 [00:00<00:32, 31.31it/s, accuracy=0.812, cost=0.0884]

epoch: 11, pass acc: 0.905148, current acc: 0.905637
time taken: 29.171502828598022
epoch: 11, training loss: 0.068853, training acc: 0.909670, valid loss: 0.089668, valid acc: 0.905637



train minibatch loop: 100%|██████████| 1023/1023 [00:28<00:00, 35.90it/s, accuracy=0.958, cost=0.0485]
test minibatch loop: 100%|██████████| 256/256 [00:00<00:00, 353.38it/s, accuracy=0.87, cost=0.111]  
train minibatch loop:   0%|          | 5/1023 [00:00<00:26, 37.96it/s, accuracy=0.875, cost=0.0752]

time taken: 29.224568128585815
epoch: 12, training loss: 0.066423, training acc: 0.910647, valid loss: 0.089580, valid acc: 0.905270



train minibatch loop: 100%|██████████| 1023/1023 [00:28<00:00, 35.85it/s, accuracy=0.958, cost=0.047] 
test minibatch loop: 100%|██████████| 256/256 [00:00<00:00, 357.96it/s, accuracy=0.87, cost=0.109]  
train minibatch loop:   0%|          | 4/1023 [00:00<00:30, 33.35it/s, accuracy=0.875, cost=0.0718]

time taken: 29.255865335464478
epoch: 13, training loss: 0.064184, training acc: 0.912389, valid loss: 0.089629, valid acc: 0.905515



train minibatch loop: 100%|██████████| 1023/1023 [00:28<00:00, 36.00it/s, accuracy=0.958, cost=0.0456]
test minibatch loop: 100%|██████████| 256/256 [00:00<00:00, 354.33it/s, accuracy=0.87, cost=0.107]  
train minibatch loop:   0%|          | 4/1023 [00:00<00:33, 30.33it/s, accuracy=0.844, cost=0.0828]

epoch: 14, pass acc: 0.905637, current acc: 0.905759
time taken: 29.13993811607361
epoch: 14, training loss: 0.062105, training acc: 0.914283, valid loss: 0.089792, valid acc: 0.905759



train minibatch loop: 100%|██████████| 1023/1023 [00:28<00:00, 35.81it/s, accuracy=0.958, cost=0.0444]
test minibatch loop: 100%|██████████| 256/256 [00:00<00:00, 359.60it/s, accuracy=0.87, cost=0.106]  
train minibatch loop:   0%|          | 5/1023 [00:00<00:23, 43.47it/s, accuracy=0.938, cost=0.0344]

time taken: 29.2790687084198
epoch: 15, training loss: 0.060164, training acc: 0.916300, valid loss: 0.090051, valid acc: 0.905637



train minibatch loop: 100%|██████████| 1023/1023 [00:28<00:00, 35.84it/s, accuracy=0.958, cost=0.0432]
test minibatch loop: 100%|██████████| 256/256 [00:00<00:00, 354.78it/s, accuracy=0.87, cost=0.105]  
train minibatch loop:   0%|          | 4/1023 [00:00<00:29, 33.98it/s, accuracy=0.875, cost=0.0639]

time taken: 29.27013373374939
epoch: 16, training loss: 0.058344, training acc: 0.917736, valid loss: 0.090388, valid acc: 0.905392



train minibatch loop: 100%|██████████| 1023/1023 [00:28<00:00, 35.91it/s, accuracy=0.958, cost=0.042] 
test minibatch loop: 100%|██████████| 256/256 [00:00<00:00, 350.08it/s, accuracy=0.87, cost=0.104]  
train minibatch loop:   0%|          | 4/1023 [00:00<00:30, 33.31it/s, accuracy=0.875, cost=0.062] 

epoch: 17, pass acc: 0.905759, current acc: 0.906370
time taken: 29.219120979309082
epoch: 17, training loss: 0.056629, training acc: 0.919325, valid loss: 0.090790, valid acc: 0.906370



train minibatch loop: 100%|██████████| 1023/1023 [00:28<00:00, 35.88it/s, accuracy=0.958, cost=0.0408]
test minibatch loop: 100%|██████████| 256/256 [00:00<00:00, 352.86it/s, accuracy=0.87, cost=0.103]  
train minibatch loop:   0%|          | 4/1023 [00:00<00:28, 36.34it/s, accuracy=0.844, cost=0.0765]

epoch: 18, pass acc: 0.906370, current acc: 0.906614
time taken: 29.237867832183838
epoch: 18, training loss: 0.055008, training acc: 0.920883, valid loss: 0.091249, valid acc: 0.906614



train minibatch loop: 100%|██████████| 1023/1023 [00:28<00:00, 35.85it/s, accuracy=0.958, cost=0.0397]
test minibatch loop: 100%|██████████| 256/256 [00:00<00:00, 348.56it/s, accuracy=0.87, cost=0.102]  
train minibatch loop:   0%|          | 5/1023 [00:00<00:24, 42.08it/s, accuracy=0.938, cost=0.0307]

epoch: 19, pass acc: 0.906614, current acc: 0.906859
time taken: 29.268934726715088
epoch: 19, training loss: 0.053469, training acc: 0.922472, valid loss: 0.091758, valid acc: 0.906859



train minibatch loop: 100%|██████████| 1023/1023 [00:28<00:00, 36.01it/s, accuracy=0.958, cost=0.0385]
test minibatch loop: 100%|██████████| 256/256 [00:00<00:00, 339.79it/s, accuracy=0.87, cost=0.102]  
train minibatch loop:   0%|          | 3/1023 [00:00<00:34, 29.86it/s, accuracy=0.844, cost=0.0731]

epoch: 20, pass acc: 0.906859, current acc: 0.907103
time taken: 29.162500619888306
epoch: 20, training loss: 0.052005, training acc: 0.924305, valid loss: 0.092313, valid acc: 0.907103



train minibatch loop: 100%|██████████| 1023/1023 [00:28<00:00, 35.33it/s, accuracy=0.958, cost=0.0372]
test minibatch loop: 100%|██████████| 256/256 [00:00<00:00, 358.43it/s, accuracy=0.87, cost=0.102]  
train minibatch loop:   0%|          | 4/1023 [00:00<00:29, 34.42it/s, accuracy=0.906, cost=0.0567]

time taken: 29.101248264312744
epoch: 21, training loss: 0.050609, training acc: 0.925986, valid loss: 0.092906, valid acc: 0.907103



train minibatch loop: 100%|██████████| 1023/1023 [00:28<00:00, 35.82it/s, accuracy=0.958, cost=0.036] 
test minibatch loop: 100%|██████████| 256/256 [00:00<00:00, 355.42it/s, accuracy=0.87, cost=0.102]  
train minibatch loop:   0%|          | 4/1023 [00:00<00:33, 30.48it/s, accuracy=0.875, cost=0.0695]

epoch: 22, pass acc: 0.907103, current acc: 0.907225
time taken: 29.283747673034668
epoch: 22, training loss: 0.049276, training acc: 0.927483, valid loss: 0.093537, valid acc: 0.907225



train minibatch loop: 100%|██████████| 1023/1023 [00:28<00:00, 34.35it/s, accuracy=0.958, cost=0.0348]
test minibatch loop: 100%|██████████| 256/256 [00:00<00:00, 352.76it/s, accuracy=0.87, cost=0.102]  
train minibatch loop:   0%|          | 5/1023 [00:00<00:25, 40.34it/s, accuracy=0.906, cost=0.0547]

epoch: 23, pass acc: 0.907225, current acc: 0.907348
time taken: 29.24654722213745
epoch: 23, training loss: 0.048001, training acc: 0.929225, valid loss: 0.094200, valid acc: 0.907348



train minibatch loop: 100%|██████████| 1023/1023 [00:28<00:00, 35.87it/s, accuracy=1, cost=0.0336]    
test minibatch loop: 100%|██████████| 256/256 [00:00<00:00, 360.20it/s, accuracy=0.87, cost=0.102]  
train minibatch loop:   0%|          | 4/1023 [00:00<00:30, 33.43it/s, accuracy=0.906, cost=0.0537]

time taken: 29.238682508468628
epoch: 24, training loss: 0.046778, training acc: 0.930946, valid loss: 0.094891, valid acc: 0.907225



train minibatch loop: 100%|██████████| 1023/1023 [00:28<00:00, 36.51it/s, accuracy=1, cost=0.0323]    
test minibatch loop: 100%|██████████| 256/256 [00:00<00:00, 348.39it/s, accuracy=0.87, cost=0.102]  
train minibatch loop:   0%|          | 4/1023 [00:00<00:30, 33.57it/s, accuracy=0.906, cost=0.0527]

time taken: 29.24742031097412
epoch: 25, training loss: 0.045603, training acc: 0.932688, valid loss: 0.095596, valid acc: 0.907225



train minibatch loop: 100%|██████████| 1023/1023 [00:28<00:00, 36.15it/s, accuracy=1, cost=0.0311]    
test minibatch loop: 100%|██████████| 256/256 [00:00<00:00, 351.29it/s, accuracy=0.87, cost=0.102]  
train minibatch loop:   0%|          | 4/1023 [00:00<00:30, 33.10it/s, accuracy=0.906, cost=0.0517]

time taken: 29.029107570648193
epoch: 26, training loss: 0.044470, training acc: 0.934093, valid loss: 0.096319, valid acc: 0.906981



train minibatch loop: 100%|██████████| 1023/1023 [00:28<00:00, 35.99it/s, accuracy=1, cost=0.0298]    
test minibatch loop: 100%|██████████| 256/256 [00:00<00:00, 322.37it/s, accuracy=0.87, cost=0.102]  
train minibatch loop:   0%|          | 5/1023 [00:00<00:24, 41.90it/s, accuracy=0.938, cost=0.0245]

time taken: 29.21823024749756
epoch: 27, training loss: 0.043376, training acc: 0.935560, valid loss: 0.097054, valid acc: 0.906737



train minibatch loop: 100%|██████████| 1023/1023 [00:28<00:00, 35.96it/s, accuracy=1, cost=0.0285]    
test minibatch loop: 100%|██████████| 256/256 [00:00<00:00, 349.76it/s, accuracy=0.87, cost=0.103]  


time taken: 29.18390130996704
epoch: 28, training loss: 0.042318, training acc: 0.937088, valid loss: 0.097813, valid acc: 0.907225

break epoch:29



'fast-text/model.ckpt'

In [17]:
stack = []
pbar = tqdm(range(0, len(test_X), batch_size), desc = 'test minibatch loop')
for i in pbar:
    batch_x = test_X[i : min(i + batch_size, test_X.shape[0])]
    batch_y = test_Y[i : min(i + batch_size, test_X.shape[0])]
    stack.append(sess.run(tf.nn.sigmoid(model.logits),
                         feed_dict = {model.X: batch_x}))

test minibatch loop: 100%|██████████| 256/256 [00:01<00:00, 136.64it/s]


In [18]:
print(metrics.classification_report(test_Y,np.around(np.concatenate(stack,axis=0)),
                                    target_names=["toxic", "severe_toxic", "obscene", 
                                            "threat", "insult", "identity_hate"]))

               precision    recall  f1-score   support

        toxic       0.80      0.52      0.63       772
 severe_toxic       0.49      0.25      0.33        72
      obscene       0.78      0.44      0.56       422
       threat       0.29      0.15      0.20        26
       insult       0.70      0.46      0.56       380
identity_hate       0.28      0.14      0.19        57

  avg / total       0.74      0.46      0.56      1729



In [19]:
text = 'kerajaan sebenarnya sangat bencikan rakyatnya, minyak naik dan segalanya'
new_vector = str_idx([classification_textcleaning(text)], dictionary, len(text.split()))
sess.run(tf.nn.sigmoid(model.logits), feed_dict={model.X:new_vector})

array([[0.99999166, 0.26489863, 0.97606987, 0.00815156, 0.09597817,
        0.00756723]], dtype=float32)

In [20]:
import json
with open('fast-text-toxic.json','w') as fopen:
    fopen.write(json.dumps({'dictionary':dictionary,'reverse_dictionary':rev_dictionary}))

In [21]:
def freeze_graph(model_dir, output_node_names):

    if not tf.gfile.Exists(model_dir):
        raise AssertionError(
            "Export directory doesn't exists. Please specify an export "
            'directory: %s' % model_dir
        )

    checkpoint = tf.train.get_checkpoint_state(model_dir)
    input_checkpoint = checkpoint.model_checkpoint_path

    absolute_model_dir = '/'.join(input_checkpoint.split('/')[:-1])
    output_graph = absolute_model_dir + '/frozen_model.pb'
    clear_devices = True
    with tf.Session(graph = tf.Graph()) as sess:
        saver = tf.train.import_meta_graph(
            input_checkpoint + '.meta', clear_devices = clear_devices
        )
        saver.restore(sess, input_checkpoint)
        output_graph_def = tf.graph_util.convert_variables_to_constants(
            sess,
            tf.get_default_graph().as_graph_def(),
            output_node_names.split(','),
        )
        with tf.gfile.GFile(output_graph, 'wb') as f:
            f.write(output_graph_def.SerializeToString())
        print('%d ops in the final graph.' % len(output_graph_def.node))

In [22]:
freeze_graph('fast-text', strings)

INFO:tensorflow:Restoring parameters from fast-text/model.ckpt
INFO:tensorflow:Froze 3 variables.
INFO:tensorflow:Converted 3 variables to const ops.
16 ops in the final graph.


In [23]:
def load_graph(frozen_graph_filename):
    with tf.gfile.GFile(frozen_graph_filename, 'rb') as f:
        graph_def = tf.GraphDef()
        graph_def.ParseFromString(f.read())
    with tf.Graph().as_default() as graph:
        tf.import_graph_def(graph_def)
    return graph

In [24]:
g = load_graph('fast-text/frozen_model.pb')
x = g.get_tensor_by_name('import/Placeholder:0')
logits = g.get_tensor_by_name('import/logits:0')
test_sess = tf.InteractiveSession(graph = g)
test_sess.run(tf.nn.sigmoid(logits), feed_dict = {x: new_vector})



array([[0.99999166, 0.26489863, 0.97606987, 0.00815156, 0.09597817,
        0.00756723]], dtype=float32)