In [1]:
import re
import numpy as np
import pandas as pd
import collections
from sklearn import metrics
from sklearn.cross_validation import train_test_split
import tensorflow as tf
import pandas as pd
from unidecode import unidecode
from nltk.util import ngrams
from tqdm import tqdm
import time



In [2]:
permulaan = [
    'bel',
    'se',
    'ter',
    'men',
    'meng',
    'mem',
    'memper',
    'di',
    'pe',
    'me',
    'ke',
    'ber',
    'pen',
    'per',
]

hujung = ['kan', 'kah', 'lah', 'tah', 'nya', 'an', 'wan', 'wati', 'ita']

def naive_stemmer(word):
    assert isinstance(word, str), 'input must be a string'
    hujung_result = re.findall(r'^(.*?)(%s)$' % ('|'.join(hujung)), word)
    word = hujung_result[0][0] if len(hujung_result) else word
    permulaan_result = re.findall(r'^(.*?)(%s)' % ('|'.join(permulaan[::-1])), word)
    permulaan_result.extend(re.findall(r'^(.*?)(%s)' % ('|'.join(permulaan)), word))
    mula = permulaan_result if len(permulaan_result) else ''
    if len(mula):
        mula = mula[1][1] if len(mula[1][1]) > len(mula[0][1]) else mula[0][1]
    return word.replace(mula, '')

def build_dataset(words, n_words):
    count = [['GO', 0], ['PAD', 1], ['EOS', 2], ['UNK', 3]]
    counter = collections.Counter(words).most_common(n_words)
    count.extend(counter)
    dictionary = dict()
    for word, _ in count:
        dictionary[word] = len(dictionary)
    data = list()
    unk_count = 0
    for word in words:
        index = dictionary.get(word, 3)
        if index == 0:
            unk_count += 1
        data.append(index)
    count[0][1] = unk_count
    reversed_dictionary = dict(zip(dictionary.values(), dictionary.keys()))
    return data, count, dictionary, reversed_dictionary


def classification_textcleaning(string):
    string = re.sub(
        'http\S+|www.\S+',
        '',
        ' '.join(
            [i for i in string.split() if i.find('#') < 0 and i.find('@') < 0]
        ),
    )
    string = unidecode(string).replace('.', ' . ').replace(',', ' , ')
    string = re.sub('[^A-Za-z ]+', ' ', string)
    string = re.sub(r'[ ]+', ' ', string).strip()
    string = ' '.join(
        [i for i in re.findall('[\\w\']+|[;:\-\(\)&.,!?"]', string) if len(i)]
    )
    string = string.lower().split()
    string = [naive_stemmer(word) for word in string]
    return ' '.join([word for word in string if len(word) > 1])


def str_idx(corpus, dic, UNK = 3):
    X = []
    for sentence in corpus:
        X.append([dic[w] if w in dic else UNK for w in sentence.split()[:maxlen]])
    return X

def create_ngram_set(input_list, ngram_value):
    return set(zip(*[input_list[i:] for i in range(ngram_value)]))


def build_ngram(x_train):
    global max_features
    ngram_set = set()
    for input_list in tqdm(x_train, total = len(x_train), ncols = 70):
        for i in range(2, 3):
            set_of_ngram = create_ngram_set(input_list, ngram_value = i)
            ngram_set.update(set_of_ngram)
    start_index = max_features + 1
    token_indice = {v: k + start_index for k, v in enumerate(ngram_set)}
    indice_token = {token_indice[k]: k for k in token_indice}

    max_features = np.max(list(indice_token.keys())) + 1
    return token_indice


def add_ngram(sequences, token_indice):
    new_sequences = []
    for input_list in sequences:
        new_list = input_list[:]
        for ngram_value in range(2, ngram_range + 1):
            for i in range(len(new_list) - ngram_value + 1):
                ngram = tuple(new_list[i : i + ngram_value])
                if ngram in token_indice:
                    new_list.append(token_indice[ngram])
        new_sequences.append(new_list)
    return new_sequences

In [3]:
df = pd.read_csv('toxic-bm.csv')
df = df.dropna()
df.shape

(40911, 7)

In [4]:
for i in range(df.shape[0]):
    df.iloc[i,0] = classification_textcleaning(df.iloc[i,0])

In [5]:
texts = df.iloc[:,0].tolist()
concat = ' '.join(texts).split()
vocabulary_size = len(list(set(concat)))
data, count, dictionary, rev_dictionary = build_dataset(concat, vocabulary_size)
print('vocab from size: %d'%(vocabulary_size))
print('Most common words', count[4:10])
print('Sample data', data[:10], [rev_dictionary[i] for i in data[:10]])

vocab from size: 55906
Most common words [('yang', 103249), ('anda', 68711), ('tidak', 54325), ('untuk', 50517), ('ada', 39335), ('saya', 32581)]
Sample data [68, 96, 78, 4, 41, 126, 276, 2580, 6231, 73] ['jelas', 'gapa', 'gedit', 'yang', 'buat', 'bawah', 'minat', 'tegar', 'tallica', 'nama']


In [6]:
ngram_range = 2
max_features = len(dictionary)
maxlen = 80
batch_size = 32
embedded_size = 128

In [7]:
max_features

55910

In [8]:
idx_trainset = str_idx(texts, dictionary, maxlen)

In [9]:
token_indice = build_ngram(idx_trainset)
X = add_ngram(idx_trainset, token_indice)
X = tf.keras.preprocessing.sequence.pad_sequences(X, maxlen)
max_features

100%|████████████████████████| 40911/40911 [00:00<00:00, 98810.92it/s]


527480

In [10]:
list_classes = ["toxic", "severe_toxic", "obscene", "threat", "insult", "identity_hate"]
Y = df[list_classes].values
Y.shape

(40911, 6)

In [11]:
train_X, test_X, train_Y, test_Y = train_test_split(X, Y, test_size = 0.2)

In [12]:
class Model:
    def __init__(
        self, embedded_size, dict_size, dimension_output, learning_rate
    ):

        self.X = tf.placeholder(tf.int32, [None, None])
        self.Y = tf.placeholder(tf.float32, [None, dimension_output])
        encoder_embeddings = tf.Variable(
            tf.random_uniform([dict_size, embedded_size], -1, 1)
        )
        encoder_embedded = tf.nn.embedding_lookup(encoder_embeddings, self.X)
        self.logits = tf.identity(
            tf.layers.dense(
                tf.reduce_mean(encoder_embedded, 1), dimension_output
            ),
            name = 'logits',
        )
        self.cost = tf.reduce_mean(
            tf.nn.sigmoid_cross_entropy_with_logits(
                logits = self.logits, labels = self.Y
            )
        )
        self.optimizer = tf.train.AdamOptimizer(learning_rate).minimize(
            self.cost
        )
        correct_prediction = tf.equal(tf.round(tf.nn.sigmoid(self.logits)), tf.round(self.Y))
        self.accuracy = tf.reduce_mean(tf.cast(correct_prediction, tf.float32))

In [13]:
tf.reset_default_graph()
sess = tf.InteractiveSession()
model = Model(embedded_size, max_features, Y.shape[1], 5e-4)
sess.run(tf.global_variables_initializer())
saver = tf.train.Saver(tf.trainable_variables())
saver.save(sess, 'fast-text/model.ckpt')

'fast-text/model.ckpt'

In [14]:
strings = ','.join(
    [
        n.name
        for n in tf.get_default_graph().as_graph_def().node
        if ('Variable' in n.op
        or 'Placeholder' in n.name
        or 'logits' in n.name)
        and 'Adam' not in n.name
        and 'beta' not in n.name
    ]
)
strings.split(',')

['Placeholder',
 'Placeholder_1',
 'Variable',
 'dense/kernel',
 'dense/bias',
 'logits']

In [15]:
tf.trainable_variables()

[<tf.Variable 'Variable:0' shape=(527480, 128) dtype=float32_ref>,
 <tf.Variable 'dense/kernel:0' shape=(128, 6) dtype=float32_ref>,
 <tf.Variable 'dense/bias:0' shape=(6,) dtype=float32_ref>]

In [16]:
EARLY_STOPPING, CURRENT_CHECKPOINT, CURRENT_ACC, EPOCH = 5, 0, 0, 0

while True:
    lasttime = time.time()
    if CURRENT_CHECKPOINT == EARLY_STOPPING:
        print('break epoch:%d\n' % (EPOCH))
        break

    train_acc, train_loss, test_acc, test_loss = 0, 0, 0, 0
    pbar = tqdm(
        range(0, len(train_X), batch_size), desc = 'train minibatch loop'
    )
    for i in pbar:
        batch_x = train_X[i : min(i + batch_size, train_X.shape[0])]
        batch_y = train_Y[i : min(i + batch_size, train_X.shape[0])]
        acc, cost, _ = sess.run(
            [model.accuracy, model.cost, model.optimizer],
            feed_dict = {
                model.X: batch_x,
                model.Y: batch_y
            },
        )
        assert not np.isnan(cost)
        train_loss += cost
        train_acc += acc
        pbar.set_postfix(cost = cost, accuracy = acc)

    pbar = tqdm(range(0, len(test_X), batch_size), desc = 'test minibatch loop')
    for i in pbar:
        batch_x = test_X[i : min(i + batch_size, test_X.shape[0])]
        batch_y = test_Y[i : min(i + batch_size, test_X.shape[0])]
        acc, cost = sess.run(
            [model.accuracy, model.cost],
            feed_dict = {
                model.X: batch_x,
                model.Y: batch_y
            },
        )
        test_loss += cost
        test_acc += acc
        pbar.set_postfix(cost = cost, accuracy = acc)

    train_loss /= len(train_X) / batch_size
    train_acc /= len(train_X) / batch_size
    test_loss /= len(test_X) / batch_size
    test_acc /= len(test_X) / batch_size

    if test_acc > CURRENT_ACC:
        print(
            'epoch: %d, pass acc: %f, current acc: %f'
            % (EPOCH, CURRENT_ACC, test_acc)
        )
        CURRENT_ACC = test_acc
        CURRENT_CHECKPOINT = 0
    else:
        CURRENT_CHECKPOINT += 1

    print('time taken:', time.time() - lasttime)
    print(
        'epoch: %d, training loss: %f, training acc: %f, valid loss: %f, valid acc: %f\n'
        % (EPOCH, train_loss, train_acc, test_loss, test_acc)
    )
    EPOCH += 1

saver.save(sess, "fast-text/model.ckpt")

train minibatch loop: 100%|██████████| 1023/1023 [01:19<00:00, 13.15it/s, accuracy=0.944, cost=0.228]
test minibatch loop: 100%|██████████| 256/256 [00:00<00:00, 331.69it/s, accuracy=0.92, cost=0.282]  
train minibatch loop:   0%|          | 2/1023 [00:00<01:17, 13.11it/s, accuracy=0.964, cost=0.157]

epoch: 0, pass acc: 0.000000, current acc: 0.963475
time taken: 80.2831301689148
epoch: 0, training loss: 0.282075, training acc: 0.944010, valid loss: 0.158684, valid acc: 0.963475



train minibatch loop: 100%|██████████| 1023/1023 [01:19<00:00, 12.88it/s, accuracy=0.944, cost=0.167] 
test minibatch loop: 100%|██████████| 256/256 [00:00<00:00, 355.42it/s, accuracy=0.92, cost=0.205]  
train minibatch loop:   0%|          | 2/1023 [00:00<01:20, 12.61it/s, accuracy=0.964, cost=0.11]

epoch: 1, pass acc: 0.963475, current acc: 0.963801
time taken: 80.16905879974365
epoch: 1, training loss: 0.132708, training acc: 0.963275, valid loss: 0.125312, valid acc: 0.963801



train minibatch loop: 100%|██████████| 1023/1023 [01:19<00:00, 12.87it/s, accuracy=0.944, cost=0.128] 
test minibatch loop: 100%|██████████| 256/256 [00:00<00:00, 350.16it/s, accuracy=0.92, cost=0.184]  
train minibatch loop:   0%|          | 2/1023 [00:00<01:19, 12.89it/s, accuracy=0.964, cost=0.0969]

epoch: 2, pass acc: 0.963801, current acc: 0.964819
time taken: 80.25054669380188
epoch: 2, training loss: 0.108261, training acc: 0.964151, valid loss: 0.111573, valid acc: 0.964819



train minibatch loop: 100%|██████████| 1023/1023 [01:19<00:00, 12.84it/s, accuracy=0.958, cost=0.0972]
test minibatch loop: 100%|██████████| 256/256 [00:00<00:00, 360.10it/s, accuracy=0.92, cost=0.171]  
train minibatch loop:   0%|          | 2/1023 [00:00<01:21, 12.57it/s, accuracy=0.964, cost=0.0875]

epoch: 3, pass acc: 0.964819, current acc: 0.966897
time taken: 80.40604782104492
epoch: 3, training loss: 0.091585, training acc: 0.966772, valid loss: 0.101931, valid acc: 0.966897



train minibatch loop: 100%|██████████| 1023/1023 [01:19<00:00, 12.82it/s, accuracy=0.986, cost=0.0783]
test minibatch loop: 100%|██████████| 256/256 [00:00<00:00, 301.75it/s, accuracy=0.92, cost=0.159]  
train minibatch loop:   0%|          | 2/1023 [00:00<01:21, 12.50it/s, accuracy=0.969, cost=0.0796]

epoch: 4, pass acc: 0.966897, current acc: 0.969117
time taken: 80.63232946395874
epoch: 4, training loss: 0.078405, training acc: 0.970231, valid loss: 0.095992, valid acc: 0.969117



train minibatch loop: 100%|██████████| 1023/1023 [01:19<00:00, 12.80it/s, accuracy=0.979, cost=0.0675]
test minibatch loop: 100%|██████████| 256/256 [00:00<00:00, 394.31it/s, accuracy=0.928, cost=0.148] 
train minibatch loop:   0%|          | 2/1023 [00:00<01:19, 12.78it/s, accuracy=0.969, cost=0.0729]

epoch: 5, pass acc: 0.969117, current acc: 0.970265
time taken: 80.57985949516296
epoch: 5, training loss: 0.068010, training acc: 0.973911, valid loss: 0.092461, valid acc: 0.970265



train minibatch loop: 100%|██████████| 1023/1023 [01:19<00:00, 12.84it/s, accuracy=0.979, cost=0.061] 
test minibatch loop: 100%|██████████| 256/256 [00:00<00:00, 332.39it/s, accuracy=0.928, cost=0.138] 
train minibatch loop:   0%|          | 2/1023 [00:00<01:15, 13.46it/s, accuracy=0.974, cost=0.0673]

epoch: 6, pass acc: 0.970265, current acc: 0.970999
time taken: 80.46009850502014
epoch: 6, training loss: 0.059511, training acc: 0.977298, valid loss: 0.090399, valid acc: 0.970999



train minibatch loop: 100%|██████████| 1023/1023 [01:19<00:00, 12.86it/s, accuracy=0.979, cost=0.0568]
test minibatch loop: 100%|██████████| 256/256 [00:00<00:00, 374.28it/s, accuracy=0.935, cost=0.129] 
train minibatch loop:   0%|          | 2/1023 [00:00<01:16, 13.30it/s, accuracy=0.979, cost=0.0623]

epoch: 7, pass acc: 0.970999, current acc: 0.971699
time taken: 80.21313452720642
epoch: 7, training loss: 0.052523, training acc: 0.980521, valid loss: 0.089425, valid acc: 0.971699



train minibatch loop: 100%|██████████| 1023/1023 [01:19<00:00, 12.82it/s, accuracy=0.986, cost=0.0536]
test minibatch loop: 100%|██████████| 256/256 [00:00<00:00, 381.60it/s, accuracy=0.942, cost=0.122] 
train minibatch loop:   0%|          | 2/1023 [00:00<01:22, 12.38it/s, accuracy=0.979, cost=0.0578]

epoch: 8, pass acc: 0.971699, current acc: 0.972257
time taken: 80.45513844490051
epoch: 8, training loss: 0.046784, training acc: 0.983064, valid loss: 0.089325, valid acc: 0.972257



train minibatch loop: 100%|██████████| 1023/1023 [01:19<00:00, 12.85it/s, accuracy=0.986, cost=0.0509]
test minibatch loop: 100%|██████████| 256/256 [00:00<00:00, 339.14it/s, accuracy=0.942, cost=0.116] 
train minibatch loop:   0%|          | 2/1023 [00:00<01:13, 13.81it/s, accuracy=0.979, cost=0.0535]

epoch: 9, pass acc: 0.972257, current acc: 0.972807
time taken: 80.36855268478394
epoch: 9, training loss: 0.042024, training acc: 0.985127, valid loss: 0.089963, valid acc: 0.972807



train minibatch loop: 100%|██████████| 1023/1023 [01:19<00:00, 12.86it/s, accuracy=0.986, cost=0.0483]
test minibatch loop: 100%|██████████| 256/256 [00:00<00:00, 411.38it/s, accuracy=0.942, cost=0.111] 
train minibatch loop:   0%|          | 2/1023 [00:00<01:18, 12.94it/s, accuracy=0.979, cost=0.0493]

epoch: 10, pass acc: 0.972807, current acc: 0.973214
time taken: 80.19673562049866
epoch: 10, training loss: 0.038003, training acc: 0.986868, valid loss: 0.091143, valid acc: 0.973214



train minibatch loop: 100%|██████████| 1023/1023 [01:19<00:00, 12.86it/s, accuracy=0.986, cost=0.0456]
test minibatch loop: 100%|██████████| 256/256 [00:00<00:00, 343.07it/s, accuracy=0.942, cost=0.106] 
train minibatch loop:   0%|          | 2/1023 [00:00<01:18, 13.08it/s, accuracy=0.979, cost=0.0452]

epoch: 11, pass acc: 0.973214, current acc: 0.973479
time taken: 80.30969452857971
epoch: 11, training loss: 0.034536, training acc: 0.988274, valid loss: 0.092835, valid acc: 0.973479



train minibatch loop: 100%|██████████| 1023/1023 [01:19<00:00, 12.85it/s, accuracy=0.986, cost=0.043] 
test minibatch loop: 100%|██████████| 256/256 [00:00<00:00, 354.88it/s, accuracy=0.949, cost=0.102] 
train minibatch loop:   0%|          | 2/1023 [00:00<01:15, 13.44it/s, accuracy=0.984, cost=0.0412]

epoch: 12, pass acc: 0.973479, current acc: 0.973670
time taken: 80.34174299240112
epoch: 12, training loss: 0.031494, training acc: 0.989542, valid loss: 0.094851, valid acc: 0.973670



train minibatch loop: 100%|██████████| 1023/1023 [01:18<00:00, 13.02it/s, accuracy=0.986, cost=0.0402] 
test minibatch loop: 100%|██████████| 256/256 [00:00<00:00, 431.85it/s, accuracy=0.957, cost=0.0987]
train minibatch loop:   0%|          | 2/1023 [00:00<01:15, 13.54it/s, accuracy=0.984, cost=0.0374]

epoch: 13, pass acc: 0.973670, current acc: 0.974024
time taken: 79.19381928443909
epoch: 13, training loss: 0.028782, training acc: 0.990626, valid loss: 0.097209, valid acc: 0.974024



train minibatch loop: 100%|██████████| 1023/1023 [01:17<00:00, 13.27it/s, accuracy=0.986, cost=0.0374]
test minibatch loop: 100%|██████████| 256/256 [00:00<00:00, 373.90it/s, accuracy=0.957, cost=0.0954]
train minibatch loop:   0%|          | 2/1023 [00:00<01:15, 13.60it/s, accuracy=0.984, cost=0.0337]

epoch: 14, pass acc: 0.974024, current acc: 0.974289
time taken: 77.80120038986206
epoch: 14, training loss: 0.026333, training acc: 0.991696, valid loss: 0.099716, valid acc: 0.974289



train minibatch loop: 100%|██████████| 1023/1023 [01:17<00:00, 13.25it/s, accuracy=0.986, cost=0.0345]
test minibatch loop: 100%|██████████| 256/256 [00:00<00:00, 344.36it/s, accuracy=0.957, cost=0.0926]
train minibatch loop:   0%|          | 2/1023 [00:00<01:22, 12.33it/s, accuracy=0.984, cost=0.0302]

epoch: 15, pass acc: 0.974289, current acc: 0.974310
time taken: 77.94110465049744
epoch: 15, training loss: 0.024104, training acc: 0.992648, valid loss: 0.102476, valid acc: 0.974310



train minibatch loop: 100%|██████████| 1023/1023 [01:19<00:00, 12.91it/s, accuracy=0.986, cost=0.0317]
test minibatch loop: 100%|██████████| 256/256 [00:00<00:00, 383.61it/s, accuracy=0.957, cost=0.09]  
train minibatch loop:   0%|          | 2/1023 [00:00<01:25, 11.91it/s, accuracy=0.984, cost=0.027]

epoch: 16, pass acc: 0.974310, current acc: 0.974554
time taken: 79.93895292282104
epoch: 16, training loss: 0.022071, training acc: 0.993402, valid loss: 0.105253, valid acc: 0.974554



train minibatch loop: 100%|██████████| 1023/1023 [01:19<00:00, 12.90it/s, accuracy=0.986, cost=0.0289] 
test minibatch loop: 100%|██████████| 256/256 [00:00<00:00, 345.93it/s, accuracy=0.957, cost=0.0877]
train minibatch loop:   0%|          | 2/1023 [00:00<01:19, 12.86it/s, accuracy=0.984, cost=0.024]

time taken: 80.0658438205719
epoch: 17, training loss: 0.020211, training acc: 0.994059, valid loss: 0.108256, valid acc: 0.974513



train minibatch loop: 100%|██████████| 1023/1023 [01:19<00:00, 12.89it/s, accuracy=0.986, cost=0.0262] 
test minibatch loop: 100%|██████████| 256/256 [00:00<00:00, 321.66it/s, accuracy=0.957, cost=0.0857]
train minibatch loop:   0%|          | 2/1023 [00:00<01:19, 12.79it/s, accuracy=0.99, cost=0.0213]

epoch: 18, pass acc: 0.974554, current acc: 0.974574
time taken: 80.16297912597656
epoch: 18, training loss: 0.018514, training acc: 0.994751, valid loss: 0.111218, valid acc: 0.974574



train minibatch loop: 100%|██████████| 1023/1023 [01:19<00:00, 12.92it/s, accuracy=0.986, cost=0.0237]
test minibatch loop: 100%|██████████| 256/256 [00:00<00:00, 368.21it/s, accuracy=0.957, cost=0.0838]
train minibatch loop:   0%|          | 2/1023 [00:00<01:18, 12.97it/s, accuracy=0.995, cost=0.0189]

epoch: 19, pass acc: 0.974574, current acc: 0.974656
time taken: 79.90573620796204
epoch: 19, training loss: 0.016963, training acc: 0.995235, valid loss: 0.114379, valid acc: 0.974656



train minibatch loop: 100%|██████████| 1023/1023 [01:19<00:00, 12.90it/s, accuracy=0.986, cost=0.0212]
test minibatch loop: 100%|██████████| 256/256 [00:00<00:00, 312.85it/s, accuracy=0.964, cost=0.0821]
train minibatch loop:   0%|          | 2/1023 [00:00<01:17, 13.18it/s, accuracy=0.995, cost=0.0168]

epoch: 20, pass acc: 0.974656, current acc: 0.974684
time taken: 80.15338468551636
epoch: 20, training loss: 0.015550, training acc: 0.995790, valid loss: 0.117534, valid acc: 0.974684



train minibatch loop: 100%|██████████| 1023/1023 [01:19<00:00, 12.92it/s, accuracy=0.986, cost=0.019] 
test minibatch loop: 100%|██████████| 256/256 [00:00<00:00, 382.90it/s, accuracy=0.964, cost=0.0806] 
train minibatch loop:   0%|          | 2/1023 [00:00<01:21, 12.47it/s, accuracy=0.995, cost=0.0149]

time taken: 79.83264565467834
epoch: 21, training loss: 0.014266, training acc: 0.996213, valid loss: 0.120769, valid acc: 0.974664



train minibatch loop: 100%|██████████| 1023/1023 [01:19<00:00, 12.93it/s, accuracy=0.993, cost=0.0169] 
test minibatch loop: 100%|██████████| 256/256 [00:00<00:00, 363.37it/s, accuracy=0.964, cost=0.0793] 
train minibatch loop:   0%|          | 2/1023 [00:00<01:24, 12.02it/s, accuracy=0.995, cost=0.0132]

time taken: 79.7959213256836
epoch: 22, training loss: 0.013098, training acc: 0.996642, valid loss: 0.124084, valid acc: 0.974582



train minibatch loop: 100%|██████████| 1023/1023 [01:19<00:00, 12.92it/s, accuracy=0.993, cost=0.015]  
test minibatch loop: 100%|██████████| 256/256 [00:00<00:00, 371.85it/s, accuracy=0.964, cost=0.078]  
train minibatch loop:   0%|          | 2/1023 [00:00<01:18, 12.98it/s, accuracy=0.995, cost=0.0118]

time taken: 79.85598611831665
epoch: 23, training loss: 0.012035, training acc: 0.997029, valid loss: 0.127429, valid acc: 0.974603



train minibatch loop: 100%|██████████| 1023/1023 [01:19<00:00, 12.92it/s, accuracy=1, cost=0.0133]     
test minibatch loop: 100%|██████████| 256/256 [00:00<00:00, 356.51it/s, accuracy=0.964, cost=0.077]  
train minibatch loop:   0%|          | 2/1023 [00:00<01:20, 12.72it/s, accuracy=0.995, cost=0.0106]

time taken: 79.882652759552
epoch: 24, training loss: 0.011069, training acc: 0.997240, valid loss: 0.130816, valid acc: 0.974562



train minibatch loop: 100%|██████████| 1023/1023 [01:19<00:00, 12.89it/s, accuracy=1, cost=0.0117]     
test minibatch loop: 100%|██████████| 256/256 [00:00<00:00, 377.80it/s, accuracy=0.964, cost=0.076]  


time taken: 80.07178211212158
epoch: 25, training loss: 0.010190, training acc: 0.997484, valid loss: 0.134234, valid acc: 0.974562

break epoch:26



'fast-text/model.ckpt'

In [17]:
stack = []
pbar = tqdm(range(0, len(test_X), batch_size), desc = 'test minibatch loop')
for i in pbar:
    batch_x = test_X[i : min(i + batch_size, test_X.shape[0])]
    batch_y = test_Y[i : min(i + batch_size, test_X.shape[0])]
    stack.append(sess.run(tf.nn.sigmoid(model.logits),
                         feed_dict = {model.X: batch_x}))

test minibatch loop: 100%|██████████| 256/256 [00:02<00:00, 114.45it/s]


In [18]:
print(metrics.classification_report(test_Y,np.around(np.concatenate(stack,axis=0))))

             precision    recall  f1-score   support

          0       0.80      0.47      0.59       815
          1       0.58      0.19      0.29        94
          2       0.90      0.38      0.54       425
          3       0.40      0.06      0.11        31
          4       0.85      0.35      0.49       410
          5       0.35      0.08      0.13        74

avg / total       0.80      0.39      0.52      1849



In [19]:
text = 'bodoh lah anti sosial'
new_vector = add_ngram(str_idx([classification_textcleaning(text)], dictionary), token_indice)
sess.run(tf.nn.sigmoid(model.logits), feed_dict={model.X:new_vector})

array([[1.        , 0.99998116, 1.        , 0.99980634, 1.        ,
        0.9999999 ]], dtype=float32)

In [20]:
import json
with open('fast-text-toxic.json','w') as fopen:
    fopen.write(json.dumps({'dictionary':dictionary,'reverse_dictionary':rev_dictionary}))

In [21]:
def freeze_graph(model_dir, output_node_names):

    if not tf.gfile.Exists(model_dir):
        raise AssertionError(
            "Export directory doesn't exists. Please specify an export "
            'directory: %s' % model_dir
        )

    checkpoint = tf.train.get_checkpoint_state(model_dir)
    input_checkpoint = checkpoint.model_checkpoint_path

    absolute_model_dir = '/'.join(input_checkpoint.split('/')[:-1])
    output_graph = absolute_model_dir + '/frozen_model.pb'
    clear_devices = True
    with tf.Session(graph = tf.Graph()) as sess:
        saver = tf.train.import_meta_graph(
            input_checkpoint + '.meta', clear_devices = clear_devices
        )
        saver.restore(sess, input_checkpoint)
        output_graph_def = tf.graph_util.convert_variables_to_constants(
            sess,
            tf.get_default_graph().as_graph_def(),
            output_node_names.split(','),
        )
        with tf.gfile.GFile(output_graph, 'wb') as f:
            f.write(output_graph_def.SerializeToString())
        print('%d ops in the final graph.' % len(output_graph_def.node))

In [22]:
freeze_graph('fast-text', strings)

INFO:tensorflow:Restoring parameters from fast-text/model.ckpt
INFO:tensorflow:Froze 3 variables.
INFO:tensorflow:Converted 3 variables to const ops.
16 ops in the final graph.


In [23]:
def load_graph(frozen_graph_filename):
    with tf.gfile.GFile(frozen_graph_filename, 'rb') as f:
        graph_def = tf.GraphDef()
        graph_def.ParseFromString(f.read())
    with tf.Graph().as_default() as graph:
        tf.import_graph_def(graph_def)
    return graph

In [24]:
g = load_graph('fast-text/frozen_model.pb')
x = g.get_tensor_by_name('import/Placeholder:0')
logits = g.get_tensor_by_name('import/logits:0')
test_sess = tf.InteractiveSession(graph = g)
test_sess.run(tf.nn.sigmoid(logits), feed_dict = {x: new_vector})



array([[1.        , 0.99998116, 1.        , 0.99980634, 1.        ,
        0.9999999 ]], dtype=float32)

In [25]:
import pickle
with open('fasttext-toxic.pkl','wb') as fopen:
    pickle.dump(token_indice, fopen)