In [1]:
from entity_network import EntityNetwork
import re
import numpy as np
import pandas as pd
import collections
from sklearn import metrics
from sklearn.cross_validation import train_test_split
import tensorflow as tf
import pandas as pd
from unidecode import unidecode
from nltk.util import ngrams
from tqdm import tqdm
import time



In [2]:
permulaan = [
    'bel',
    'se',
    'ter',
    'men',
    'meng',
    'mem',
    'memper',
    'di',
    'pe',
    'me',
    'ke',
    'ber',
    'pen',
    'per',
]

hujung = ['kan', 'kah', 'lah', 'tah', 'nya', 'an', 'wan', 'wati', 'ita']

def naive_stemmer(word):
    assert isinstance(word, str), 'input must be a string'
    hujung_result = re.findall(r'^(.*?)(%s)$' % ('|'.join(hujung)), word)
    word = hujung_result[0][0] if len(hujung_result) else word
    permulaan_result = re.findall(r'^(.*?)(%s)' % ('|'.join(permulaan[::-1])), word)
    permulaan_result.extend(re.findall(r'^(.*?)(%s)' % ('|'.join(permulaan)), word))
    mula = permulaan_result if len(permulaan_result) else ''
    if len(mula):
        mula = mula[1][1] if len(mula[1][1]) > len(mula[0][1]) else mula[0][1]
    return word.replace(mula, '')

def build_dataset(words, n_words):
    count = [['GO', 0], ['PAD', 1], ['EOS', 2], ['UNK', 3]]
    counter = collections.Counter(words).most_common(n_words)
    count.extend(counter)
    dictionary = dict()
    for word, _ in count:
        dictionary[word] = len(dictionary)
    data = list()
    unk_count = 0
    for word in words:
        index = dictionary.get(word, 3)
        if index == 0:
            unk_count += 1
        data.append(index)
    count[0][1] = unk_count
    reversed_dictionary = dict(zip(dictionary.values(), dictionary.keys()))
    return data, count, dictionary, reversed_dictionary


def classification_textcleaning(string):
    string = re.sub(
        'http\S+|www.\S+',
        '',
        ' '.join(
            [i for i in string.split() if i.find('#') < 0 and i.find('@') < 0]
        ),
    )
    string = unidecode(string).replace('.', ' . ').replace(',', ' , ')
    string = re.sub('[^A-Za-z ]+', ' ', string)
    string = re.sub(r'[ ]+', ' ', string).strip()
    string = ' '.join(
        [i for i in re.findall('[\\w\']+|[;:\-\(\)&.,!?"]', string) if len(i)]
    )
    string = string.lower().split()
    string = [(naive_stemmer(word), word) for word in string]
    return (
        ' '.join([word[0] for word in string if len(word[0]) > 1]),
        ' '.join([word[1] for word in string if len(word[0]) > 1]),
    )


def str_idx(corpus, dic, maxlen, UNK = 3):
    X = np.zeros((len(corpus), maxlen))
    for i in range(len(corpus)):
        for no, k in enumerate(corpus[i].split()[:maxlen][::-1]):
            val = dic[k] if k in dic else UNK
            X[i, -1 - no] = val
    return X

In [3]:
df = pd.read_csv('toxic-bm.csv')
df = df.dropna()
df.shape

(40911, 7)

In [4]:
for i in range(df.shape[0]):
    df.iloc[i,0] = classification_textcleaning(df.iloc[i,0])[0]

In [5]:
texts = df.iloc[:,0].tolist()
concat = ' '.join(texts).split()
vocabulary_size = len(list(set(concat)))
data, count, dictionary, rev_dictionary = build_dataset(concat, vocabulary_size)
print('vocab from size: %d'%(vocabulary_size))
print('Most common words', count[4:10])
print('Sample data', data[:10], [rev_dictionary[i] for i in data[:10]])

vocab from size: 55906
Most common words [('yang', 103249), ('anda', 68711), ('tidak', 54325), ('untuk', 50517), ('ada', 39335), ('saya', 32581)]
Sample data [68, 96, 78, 4, 41, 126, 276, 2602, 6427, 73] ['jelas', 'gapa', 'gedit', 'yang', 'buat', 'bawah', 'minat', 'tegar', 'tallica', 'nama']


In [6]:
list_classes = ["toxic", "severe_toxic", "obscene", "threat", "insult", "identity_hate"]
Y = df[list_classes].values
Y.shape

(40911, 6)

In [7]:
size_layer = 256
num_layers = 2
embedded_size = 256
dimension_output = Y.shape[1]
learning_rate = 1e-3
maxlen = 80
batch_size = 128
decay_step = 1e4
decay_rate = 1.0
story_len = 1

In [8]:
tf.reset_default_graph()
sess = tf.InteractiveSession()

model = EntityNetwork(
    dimension_output,
    learning_rate,
    decay_step,
    decay_rate,
    maxlen,
    story_len,
    len(dictionary),
    embedded_size,
    size_layer
)

sess.run(tf.global_variables_initializer())

In [9]:
vectors = str_idx(df.iloc[:,0].tolist(), dictionary, maxlen)
train_X, test_X, train_Y, test_Y = train_test_split(
    vectors, Y, test_size = 0.2
)

In [10]:
strings = ','.join(
    [
        n.name
        for n in tf.get_default_graph().as_graph_def().node
        if ('Variable' in n.op
        or 'Placeholder' in n.name
        or 'logits' in n.name
        or 'alphas' in n.name)
        and 'Adam' not in n.name
        and 'beta' not in n.name
        and 'learning_rate' not in n.name
        and 'OptimizeLoss' not in n.name
        and 'Global_Step' not in n.name
        and 'Epoch_Step' not in n.name
    ]
)

In [11]:
saver = tf.train.Saver(tf.trainable_variables())
saver.save(sess, 'entity_network/model.ckpt')

'entity_network/model.ckpt'

In [12]:
from tqdm import tqdm
import time

EARLY_STOPPING, CURRENT_CHECKPOINT, CURRENT_ACC, EPOCH = 3, 0, 0, 0

while True:
    lasttime = time.time()
    if CURRENT_CHECKPOINT == EARLY_STOPPING:
        print('break epoch:%d\n' % (EPOCH))
        break

    train_acc, train_loss, test_acc, test_loss = 0, 0, 0, 0
    pbar = tqdm(
        range(0, len(train_X), batch_size), desc = 'train minibatch loop'
    )
    for i in pbar:
        batch_x = train_X[i : min(i + batch_size, train_X.shape[0])]
        batch_y = train_Y[i : min(i + batch_size, train_X.shape[0])]
        batch_x_expand = np.expand_dims(batch_x,axis = 1)
        acc, cost, _ = sess.run(
            [model.accuracy, model.cost, model.optimizer],
            feed_dict = {
                model.answer_single: batch_y,
                model.query: batch_x,
                model.story: batch_x_expand,
                model.dropout_keep_prob: 1.0
            },
        )
        assert not np.isnan(cost)
        train_loss += cost
        train_acc += acc
        pbar.set_postfix(cost = cost, accuracy = acc)
        
    pbar = tqdm(range(0, len(test_X), batch_size), desc = 'test minibatch loop')
    for i in pbar:
        batch_x = test_X[i : min(i + batch_size, test_X.shape[0])]
        batch_y = test_Y[i : min(i + batch_size, test_X.shape[0])]
        batch_x_expand = np.expand_dims(batch_x,axis = 1)
        acc, cost = sess.run(
            [model.accuracy, model.cost],
            feed_dict = {
                model.answer_single: batch_y,
                model.query: batch_x,
                model.story: batch_x_expand,
                model.dropout_keep_prob: 1.0
            },
        )
        test_loss += cost
        test_acc += acc
        pbar.set_postfix(cost = cost, accuracy = acc)

    train_loss /= len(train_X) / batch_size
    train_acc /= len(train_X) / batch_size
    test_loss /= len(test_X) / batch_size
    test_acc /= len(test_X) / batch_size

    if test_acc > CURRENT_ACC:
        print(
            'epoch: %d, pass acc: %f, current acc: %f'
            % (EPOCH, CURRENT_ACC, test_acc)
        )
        CURRENT_ACC = test_acc
        CURRENT_CHECKPOINT = 0
    else:
        CURRENT_CHECKPOINT += 1
        
    print('time taken:', time.time() - lasttime)
    print(
        'epoch: %d, training loss: %f, training acc: %f, valid loss: %f, valid acc: %f\n'
        % (EPOCH, train_loss, train_acc, test_loss, test_acc)
    )
    EPOCH += 1

train minibatch loop: 100%|██████████| 256/256 [00:17<00:00, 14.86it/s, accuracy=0.983, cost=0.282]
test minibatch loop: 100%|██████████| 64/64 [00:00<00:00, 86.02it/s, accuracy=0.968, cost=0.297]
train minibatch loop:   1%|          | 2/256 [00:00<00:16, 15.01it/s, accuracy=0.956, cost=0.311]

epoch: 0, pass acc: 0.000000, current acc: 0.973446
time taken: 17.992044925689697
epoch: 0, training loss: 1.953666, training acc: 0.959789, valid loss: 0.284721, valid acc: 0.973446



train minibatch loop: 100%|██████████| 256/256 [00:16<00:00, 14.94it/s, accuracy=0.979, cost=0.103] 
test minibatch loop: 100%|██████████| 64/64 [00:00<00:00, 99.93it/s, accuracy=0.968, cost=0.127]  
train minibatch loop:   1%|          | 2/256 [00:00<00:19, 13.36it/s, accuracy=0.961, cost=0.14]

epoch: 1, pass acc: 0.973446, current acc: 0.977296
time taken: 17.50123882293701
epoch: 1, training loss: 0.156030, training acc: 0.976269, valid loss: 0.112349, valid acc: 0.977296



train minibatch loop: 100%|██████████| 256/256 [00:16<00:00, 15.37it/s, accuracy=0.989, cost=0.0795]
test minibatch loop: 100%|██████████| 64/64 [00:00<00:00, 97.40it/s, accuracy=0.968, cost=0.122] 
train minibatch loop:   1%|          | 2/256 [00:00<00:18, 13.76it/s, accuracy=0.966, cost=0.0969]

epoch: 2, pass acc: 0.977296, current acc: 0.978457
time taken: 17.61652660369873
epoch: 2, training loss: 0.093206, training acc: 0.980222, valid loss: 0.099878, valid acc: 0.978457



train minibatch loop: 100%|██████████| 256/256 [00:16<00:00, 16.33it/s, accuracy=0.985, cost=0.0607]
test minibatch loop: 100%|██████████| 64/64 [00:00<00:00, 98.74it/s, accuracy=0.965, cost=0.121]  
train minibatch loop:   1%|          | 2/256 [00:00<00:18, 13.71it/s, accuracy=0.977, cost=0.0766]

time taken: 17.451355934143066
epoch: 3, training loss: 0.080592, training acc: 0.981893, valid loss: 0.096978, valid acc: 0.977639



train minibatch loop: 100%|██████████| 256/256 [00:16<00:00, 15.59it/s, accuracy=0.989, cost=0.055] 
test minibatch loop: 100%|██████████| 64/64 [00:00<00:00, 99.77it/s, accuracy=0.966, cost=0.116]  
train minibatch loop:   1%|          | 2/256 [00:00<00:15, 16.89it/s, accuracy=0.99, cost=0.0595]

time taken: 17.438061237335205
epoch: 4, training loss: 0.073441, training acc: 0.983415, valid loss: 0.094267, valid acc: 0.977946



train minibatch loop: 100%|██████████| 256/256 [00:16<00:00, 15.58it/s, accuracy=0.989, cost=0.0532]
test minibatch loop: 100%|██████████| 64/64 [00:00<00:00, 89.09it/s, accuracy=0.966, cost=0.115] 

time taken: 17.63407874107361
epoch: 5, training loss: 0.068948, training acc: 0.984322, valid loss: 0.093276, valid acc: 0.977783

break epoch:6






In [13]:
stack = []
pbar = tqdm(range(0, len(test_X), batch_size), desc = 'test minibatch loop')
for i in pbar:
    batch_x = test_X[i : min(i + batch_size, test_X.shape[0])]
    batch_y = test_Y[i : min(i + batch_size, test_X.shape[0])]
    batch_x_expand = np.expand_dims(batch_x,axis = 1)
    stack.append(sess.run(tf.nn.sigmoid(model.logits),
                         feed_dict = {model.query: batch_x,
                model.story: batch_x_expand,
                model.dropout_keep_prob: 1.0}))

test minibatch loop: 100%|██████████| 64/64 [00:04<00:00, 13.25it/s]


In [14]:
print(metrics.classification_report(test_Y,np.around(np.concatenate(stack,axis=0))))

             precision    recall  f1-score   support

          0       0.77      0.61      0.68       787
          1       1.00      0.08      0.14        92
          2       0.75      0.66      0.71       432
          3       0.44      0.19      0.27        21
          4       0.68      0.61      0.65       400
          5       0.57      0.05      0.09        79

avg / total       0.75      0.57      0.62      1811



In [15]:
saver.save(sess, 'entity_network/model.ckpt')

'entity_network/model.ckpt'

In [16]:
strings = ','.join(
    [
        n.name
        for n in tf.get_default_graph().as_graph_def().node
        if ('Variable' in n.op
        or 'Placeholder' in n.name
        or 'logits' in n.name
        or 'alphas' in n.name)
        and 'Adam' not in n.name
        and 'beta' not in n.name
        and 'learning_rate' not in n.name
        and 'OptimizeLoss' not in n.name
        and 'Global_Step' not in n.name
        and 'Epoch_Step' not in n.name
    ]
)

In [17]:
text = 'bodoh lah anti sosial'
batch_x = str_idx([classification_textcleaning(text)[0]], dictionary, maxlen)
batch_x_expand = np.expand_dims(batch_x,axis = 1)
sess.run(tf.nn.sigmoid(model.logits),
                         feed_dict = {model.query: batch_x,
                model.story: batch_x_expand,
                model.dropout_keep_prob: 1.0})

array([[0.9761134 , 0.03188965, 0.5244638 , 0.08057603, 0.76619023,
        0.17096736]], dtype=float32)

In [18]:
import json
with open('entity-toxic.json','w') as fopen:
    fopen.write(json.dumps({'dictionary':dictionary,'reverse_dictionary':rev_dictionary}))

In [19]:
def freeze_graph(model_dir, output_node_names):

    if not tf.gfile.Exists(model_dir):
        raise AssertionError(
            "Export directory doesn't exists. Please specify an export "
            'directory: %s' % model_dir
        )

    checkpoint = tf.train.get_checkpoint_state(model_dir)
    input_checkpoint = checkpoint.model_checkpoint_path

    absolute_model_dir = '/'.join(input_checkpoint.split('/')[:-1])
    output_graph = absolute_model_dir + '/frozen_model.pb'
    clear_devices = True
    with tf.Session(graph = tf.Graph()) as sess:
        saver = tf.train.import_meta_graph(
            input_checkpoint + '.meta', clear_devices = clear_devices
        )
        saver.restore(sess, input_checkpoint)
        output_graph_def = tf.graph_util.convert_variables_to_constants(
            sess,
            tf.get_default_graph().as_graph_def(),
            output_node_names.split(','),
        )
        with tf.gfile.GFile(output_graph, 'wb') as f:
            f.write(output_graph_def.SerializeToString())
        print('%d ops in the final graph.' % len(output_graph_def.node))

In [21]:
freeze_graph('entity_network', strings)

INFO:tensorflow:Restoring parameters from entity_network/model.ckpt
INFO:tensorflow:Froze 17 variables.
INFO:tensorflow:Converted 17 variables to const ops.
139 ops in the final graph.


In [22]:
def load_graph(frozen_graph_filename):
    with tf.gfile.GFile(frozen_graph_filename, 'rb') as f:
        graph_def = tf.GraphDef()
        graph_def.ParseFromString(f.read())
    with tf.Graph().as_default() as graph:
        tf.import_graph_def(graph_def)
    return graph

In [24]:
g = load_graph('entity_network/frozen_model.pb')

Placeholder_story = g.get_tensor_by_name('import/Placeholder_story:0')
Placeholder_question = g.get_tensor_by_name('import/Placeholder_question:0')
Placeholder_dropout_keep_prob = g.get_tensor_by_name(
    'import/Placeholder_dropout_keep_prob:0'
)
logits = g.get_tensor_by_name('import/logits:0')
test_sess = tf.InteractiveSession(graph = g)
test_sess.run(
    tf.nn.sigmoid(logits),
    feed_dict = {
        Placeholder_question: batch_x,
        Placeholder_story: batch_x_expand,
        Placeholder_dropout_keep_prob: 1.0,
    },
)



array([[0.9761134 , 0.03188965, 0.5244638 , 0.08057603, 0.76619023,
        0.17096736]], dtype=float32)