In [1]:
import json
import re
import sentencepiece as spm

In [2]:
from prepro_utils import preprocess_text, encode_ids, encode_pieces

sp_model = spm.SentencePieceProcessor()
sp_model.Load('bert-base/sp10m.cased.v4.model')

with open('bert-base/sp10m.cased.v4.vocab') as fopen:
    v = fopen.read().split('\n')[:-1]
v = [i.split('\t') for i in v]
v = {i[0]: i[1] for i in v}

class Tokenizer:
    def __init__(self, v):
        self.vocab = v
        pass
    
    def tokenize(self, string):
        return encode_pieces(sp_model, string, return_unicode=False, sample=False)
    
    def convert_tokens_to_ids(self, tokens):
        return [sp_model.PieceToId(piece) for piece in tokens]
    
    def convert_ids_to_tokens(self, ids):
        return [sp_model.IdToPiece(i) for i in ids]
    
tokenizer = Tokenizer(v)

In [3]:
import bert
from bert import run_classifier
from bert import optimization
from bert import tokenization
from bert import modeling
import numpy as np
import json
import tensorflow as tf
import itertools
from unidecode import unidecode
import pandas as pd
from sklearn.preprocessing import LabelEncoder
import re

  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])





  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])


In [4]:
BERT_INIT_CHKPNT = 'bert-base/model.ckpt'
BERT_CONFIG = 'bert-base/bert_config.json'

In [5]:
def cleaning(string):
    string = unidecode(string)
    string = re.sub(r'\w+:\/{2}[\d\w-]+(\.[\d\w-]+)*(?:(?:\/[^\s/]*))*', '', string)
    string = re.sub(r'[ ]+', ' ', string).strip().split()
    string = [w for w in string if w[0] != '@']
    return ' '.join(string)

In [6]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder

In [7]:
df = pd.read_csv('../Malaya-Dataset/toxicity/toxic-bm.csv')
df = df.dropna()
df.shape

(40911, 7)

In [8]:
list_classes = ["toxic", "severe_toxic", "obscene", "threat", "insult", "identity_hate"]
Y = df[list_classes].values.tolist()
X = df['text'].tolist()
len(Y), len(X)

(40911, 40911)

In [9]:
import glob

files = glob.glob('../Malaya-Dataset/toxicity/*.json')
files

['../Malaya-Dataset/toxicity/toxic6.json',
 '../Malaya-Dataset/toxicity/toxic1.json',
 '../Malaya-Dataset/toxicity/toxic4.json',
 '../Malaya-Dataset/toxicity/toxic5.json',
 '../Malaya-Dataset/toxicity/toxic3.json',
 '../Malaya-Dataset/toxicity/toxic0.json',
 '../Malaya-Dataset/toxicity/toxic7.json',
 '../Malaya-Dataset/toxicity/toxic2.json']

In [10]:
for file in files:
    with open(file) as fopen:
        data = json.load(fopen)
    for x, y in data:
        X.append(x)
        Y.append(y)

In [11]:
X[0]

'penjelasan mengapa pengeditan yang dibuat di bawah peminat tegar metallica nama saya telah dibalikkan, mereka bukan vandalisme hanya ditutup pada beberapa gas selepas mengundi di boneka anak york baru dan sila jangan keluarkan templat dari halaman bercakap sejak saya bersara sekarang'

In [12]:
MAX_SEQ_LENGTH = 200

In [13]:
texts = X

In [14]:
from tqdm import tqdm

input_ids, input_masks, segment_ids = [], [], []

for text in tqdm(texts):
    text = cleaning(text)
    tokens_a = tokenizer.tokenize(text)
    if len(tokens_a) > MAX_SEQ_LENGTH - 2:
        tokens_a = tokens_a[:(MAX_SEQ_LENGTH - 2)]
    tokens = ["<cls>"] + tokens_a + ["<sep>"]
    segment_id = [0] * len(tokens)
    input_id = tokenizer.convert_tokens_to_ids(tokens)
    input_mask = [1] * len(input_id)
    padding = [0] * (MAX_SEQ_LENGTH - len(input_id))
    input_id += padding
    input_mask += padding
    segment_id += padding
    
    input_ids.append(input_id)
    input_masks.append(input_mask)
    segment_ids.append(segment_id)

100%|██████████| 192029/192029 [01:15<00:00, 2541.10it/s]


In [15]:
bert_config = modeling.BertConfig.from_json_file(BERT_CONFIG)

epoch = 10
batch_size = 60
warmup_proportion = 0.1
num_train_steps = int(len(texts) / batch_size * epoch)
num_warmup_steps = int(num_train_steps * warmup_proportion)




In [16]:
def create_initializer(initializer_range=0.02):
    return tf.truncated_normal_initializer(stddev=initializer_range)

class Model:
    def __init__(
        self,
        dimension_output,
        learning_rate = 2e-5,
        training = True
    ):
        self.X = tf.placeholder(tf.int32, [None, None])
        self.Y = tf.placeholder(tf.float32, [None, dimension_output])
        
        model = modeling.BertModel(
            config=bert_config,
            is_training=training,
            input_ids=self.X,
            use_one_hot_embeddings=False)
        
        output_layer = model.get_sequence_output()
        output_layer = tf.layers.dense(
            output_layer,
            bert_config.hidden_size,
            activation=tf.tanh,
            kernel_initializer=create_initializer())
        self.logits_seq = tf.layers.dense(output_layer, dimension_output,
                                         kernel_initializer=create_initializer())
        self.logits_seq = tf.identity(self.logits_seq, name = 'logits_seq')
        self.logits = self.logits_seq[:, 0]
        self.logits = tf.identity(self.logits, name = 'logits')
        
        self.cost = tf.reduce_mean(
            tf.nn.sigmoid_cross_entropy_with_logits(
                logits = self.logits, labels = self.Y
            )
        )
        
        self.optimizer = optimization.create_optimizer(self.cost, learning_rate, 
                                                       num_train_steps, num_warmup_steps, False)
        
        correct_prediction = tf.equal(tf.round(tf.nn.sigmoid(self.logits)), tf.round(self.Y))
        all_labels_true = tf.reduce_min(tf.cast(correct_prediction, tf.float32), 1)
        self.accuracy = tf.reduce_mean(all_labels_true)

In [17]:
dimension_output = 6
learning_rate = 2e-5

tf.reset_default_graph()
sess = tf.InteractiveSession()
model = Model(
    dimension_output,
    learning_rate
)

sess.run(tf.global_variables_initializer())
var_lists = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, scope = 'bert')
saver = tf.train.Saver(var_list = var_lists)
saver.restore(sess, BERT_INIT_CHKPNT)




The TensorFlow contrib module will not be included in TensorFlow 2.0.
For more information, please see:
  * https://github.com/tensorflow/community/blob/master/rfcs/20180907-contrib-sunset.md
  * https://github.com/tensorflow/addons
  * https://github.com/tensorflow/io (for I/O related ops)
If you depend on functionality not listed there, please file an issue.

Instructions for updating:
Please use `rate` instead of `keep_prob`. Rate should be set to `rate = 1 - keep_prob`.
Instructions for updating:
Use keras.layers.dense instead.
















Instructions for updating:
Use tf.where in 2.0, which has the same broadcast rule as np.where




Instructions for updating:
Deprecated in favor of operator or tf.math.divide.

Instructions for updating:
Use standard file APIs to check for files with this prefix.
INFO:tensorflow:Restoring parameters from bert-base/model.ckpt


In [18]:
from sklearn.model_selection import train_test_split

train_X, test_X, train_Y, test_Y = train_test_split(
    input_ids, Y, test_size = 0.2
)

In [19]:
from tqdm import tqdm
import time

EARLY_STOPPING, CURRENT_CHECKPOINT, CURRENT_ACC, EPOCH = 2, 0, 0, 0

while True:
    lasttime = time.time()
    if CURRENT_CHECKPOINT == EARLY_STOPPING:
        print('break epoch:%d\n' % (EPOCH))
        break

    train_acc, train_loss, test_acc, test_loss = [], [], [], []
    pbar = tqdm(
        range(0, len(train_X), batch_size), desc = 'train minibatch loop'
    )
    for i in pbar:
        index = min(i + batch_size, len(train_X))
        batch_x = train_X[i: index]
        batch_y = train_Y[i: index]
        acc, cost, _ = sess.run(
            [model.accuracy, model.cost, model.optimizer],
            feed_dict = {
                model.Y: batch_y,
                model.X: batch_x,
            },
        )
        train_loss.append(cost)
        train_acc.append(acc)
        pbar.set_postfix(cost = cost, accuracy = acc)
        
    pbar = tqdm(range(0, len(test_X), batch_size), desc = 'test minibatch loop')
    for i in pbar:
        index = min(i + batch_size, len(test_X))
        batch_x = test_X[i: index]
        batch_y = test_Y[i: index]
        acc, cost = sess.run(
            [model.accuracy, model.cost],
            feed_dict = {
                model.Y: batch_y,
                model.X: batch_x,
            },
        )
        test_loss.append(cost)
        test_acc.append(acc)
        pbar.set_postfix(cost = cost, accuracy = acc)
        
    train_loss = np.mean(train_loss)
    train_acc = np.mean(train_acc)
    test_loss = np.mean(test_loss)
    test_acc = np.mean(test_acc)
        
    if test_acc > CURRENT_ACC:
        print(
            'epoch: %d, pass acc: %f, current acc: %f'
            % (EPOCH, CURRENT_ACC, test_acc)
        )
        CURRENT_ACC = test_acc
        CURRENT_CHECKPOINT = 0
    else:
        CURRENT_CHECKPOINT += 1
        
    print('time taken:', time.time() - lasttime)
    print(
        'epoch: %d, training loss: %f, training acc: %f, valid loss: %f, valid acc: %f\n'
        % (EPOCH, train_loss, train_acc, test_loss, test_acc)
    )
    EPOCH += 1

train minibatch loop: 100%|██████████| 2561/2561 [36:33<00:00,  1.17it/s, accuracy=0.957, cost=0.0597]
test minibatch loop: 100%|██████████| 641/641 [03:17<00:00,  3.25it/s, accuracy=1, cost=0.00411]   
train minibatch loop:   0%|          | 0/2561 [00:00<?, ?it/s]

epoch: 0, pass acc: 0.000000, current acc: 0.909984
time taken: 2390.9338731765747
epoch: 0, training loss: 0.092162, training acc: 0.899410, valid loss: 0.064912, valid acc: 0.909984



train minibatch loop: 100%|██████████| 2561/2561 [36:40<00:00,  1.16it/s, accuracy=0.957, cost=0.0524] 
test minibatch loop: 100%|██████████| 641/641 [03:17<00:00,  3.25it/s, accuracy=1, cost=0.00363]    
train minibatch loop:   0%|          | 0/2561 [00:00<?, ?it/s]

epoch: 1, pass acc: 0.909984, current acc: 0.915913
time taken: 2397.194011449814
epoch: 1, training loss: 0.055102, training acc: 0.916969, valid loss: 0.059234, valid acc: 0.915913



train minibatch loop: 100%|██████████| 2561/2561 [36:42<00:00,  1.16it/s, accuracy=0.957, cost=0.0401] 
test minibatch loop: 100%|██████████| 641/641 [03:17<00:00,  3.25it/s, accuracy=1, cost=0.00447]    
train minibatch loop:   0%|          | 0/2561 [00:00<?, ?it/s]

time taken: 2400.0915756225586
epoch: 2, training loss: 0.046167, training acc: 0.926158, valid loss: 0.062536, valid acc: 0.915211



train minibatch loop: 100%|██████████| 2561/2561 [36:39<00:00,  1.16it/s, accuracy=0.957, cost=0.0372] 
test minibatch loop: 100%|██████████| 641/641 [03:17<00:00,  3.25it/s, accuracy=1, cost=0.00197]    
train minibatch loop:   0%|          | 0/2561 [00:00<?, ?it/s]

epoch: 3, pass acc: 0.915913, current acc: 0.916069
time taken: 2396.6281962394714
epoch: 3, training loss: 0.038815, training acc: 0.936290, valid loss: 0.064270, valid acc: 0.916069



train minibatch loop: 100%|██████████| 2561/2561 [36:38<00:00,  1.16it/s, accuracy=1, cost=0.0185]     
test minibatch loop: 100%|██████████| 641/641 [03:17<00:00,  3.25it/s, accuracy=1, cost=0.00159]    
train minibatch loop:   0%|          | 0/2561 [00:00<?, ?it/s]

epoch: 4, pass acc: 0.916069, current acc: 0.918097
time taken: 2396.0971899032593
epoch: 4, training loss: 0.032544, training acc: 0.946440, valid loss: 0.066203, valid acc: 0.918097



train minibatch loop: 100%|██████████| 2561/2561 [36:39<00:00,  1.16it/s, accuracy=0.957, cost=0.0209] 
test minibatch loop: 100%|██████████| 641/641 [03:17<00:00,  3.25it/s, accuracy=1, cost=0.00151]   
train minibatch loop:   0%|          | 0/2561 [00:00<?, ?it/s]

time taken: 2397.0322093963623
epoch: 5, training loss: 0.026906, training acc: 0.956387, valid loss: 0.069481, valid acc: 0.916017



train minibatch loop: 100%|██████████| 2561/2561 [36:40<00:00,  1.16it/s, accuracy=0.913, cost=0.0344] 
test minibatch loop: 100%|██████████| 641/641 [03:17<00:00,  3.25it/s, accuracy=1, cost=0.00778]    

time taken: 2397.7447509765625
epoch: 6, training loss: 0.022502, training acc: 0.964772, valid loss: 0.073521, valid acc: 0.910582

break epoch:7






In [20]:
saver = tf.train.Saver(tf.trainable_variables())
saver.save(sess, 'bert-base-toxic/model.ckpt')

'bert-base-toxic/model.ckpt'

In [21]:
dimension_output = 6
learning_rate = 2e-5

tf.reset_default_graph()
sess = tf.InteractiveSession()
model = Model(
    dimension_output,
    learning_rate,
    training=False
    
)

sess.run(tf.global_variables_initializer())
saver = tf.train.Saver(tf.trainable_variables())
saver.restore(sess, 'bert-base-toxic/model.ckpt')



















INFO:tensorflow:Restoring parameters from bert-base-toxic/model.ckpt


In [22]:
stack = []

pbar = tqdm(
    range(0, len(test_X), batch_size), desc = 'validation minibatch loop'
)
for i in pbar:
    index = min(i + batch_size, len(test_X))
    batch_x = test_X[i: index]
    stack.append(sess.run(tf.nn.sigmoid(model.logits),
            feed_dict = {
                model.X: batch_x,
            },
    ))

validation minibatch loop: 100%|██████████| 641/641 [07:12<00:00,  1.48it/s]


In [23]:
from sklearn import metrics

print(metrics.classification_report(np.array(test_Y),np.around(np.concatenate(stack,axis=0)),
                                    target_names=["toxic", "severe_toxic", "obscene", 
                                            "threat", "insult", "identity_hate"],
                                   digits=5))

               precision    recall  f1-score   support

        toxic    0.77604   0.73972   0.75745      3696
 severe_toxic    0.46594   0.44531   0.45539       384
      obscene    0.70845   0.75122   0.72921      2054
       threat    0.52525   0.50000   0.51232       104
       insult    0.72469   0.64050   0.68000      1911
identity_hate    0.56610   0.51385   0.53871       325

    micro avg    0.72273   0.69519   0.70869      8474
    macro avg    0.62775   0.59843   0.61218      8474
 weighted avg    0.72290   0.69519   0.70805      8474
  samples avg    0.06576   0.06529   0.06289      8474



  'precision', 'predicted', average, warn_for)
  'recall', 'true', average, warn_for)


In [24]:
strings = ','.join(
    [
        n.name
        for n in tf.get_default_graph().as_graph_def().node
        if ('Variable' in n.op
        or 'Placeholder' in n.name
        or 'logits' in n.name
        or 'alphas' in n.name
        or 'self/Softmax' in n.name)
        and 'adam' not in n.name
        and 'beta' not in n.name
        and 'global_step' not in n.name
    ]
)
strings.split(',')

['Placeholder',
 'Placeholder_1',
 'bert/embeddings/word_embeddings',
 'bert/embeddings/token_type_embeddings',
 'bert/embeddings/position_embeddings',
 'bert/embeddings/LayerNorm/gamma',
 'bert/encoder/layer_0/attention/self/query/kernel',
 'bert/encoder/layer_0/attention/self/query/bias',
 'bert/encoder/layer_0/attention/self/key/kernel',
 'bert/encoder/layer_0/attention/self/key/bias',
 'bert/encoder/layer_0/attention/self/value/kernel',
 'bert/encoder/layer_0/attention/self/value/bias',
 'bert/encoder/layer_0/attention/self/Softmax',
 'bert/encoder/layer_0/attention/output/dense/kernel',
 'bert/encoder/layer_0/attention/output/dense/bias',
 'bert/encoder/layer_0/attention/output/LayerNorm/gamma',
 'bert/encoder/layer_0/intermediate/dense/kernel',
 'bert/encoder/layer_0/intermediate/dense/bias',
 'bert/encoder/layer_0/output/dense/kernel',
 'bert/encoder/layer_0/output/dense/bias',
 'bert/encoder/layer_0/output/LayerNorm/gamma',
 'bert/encoder/layer_1/attention/self/query/kernel',
 

In [25]:
def freeze_graph(model_dir, output_node_names):

    if not tf.gfile.Exists(model_dir):
        raise AssertionError(
            "Export directory doesn't exists. Please specify an export "
            'directory: %s' % model_dir
        )

    checkpoint = tf.train.get_checkpoint_state(model_dir)
    input_checkpoint = checkpoint.model_checkpoint_path

    absolute_model_dir = '/'.join(input_checkpoint.split('/')[:-1])
    output_graph = absolute_model_dir + '/frozen_model.pb'
    clear_devices = True
    with tf.Session(graph = tf.Graph()) as sess:
        saver = tf.train.import_meta_graph(
            input_checkpoint + '.meta', clear_devices = clear_devices
        )
        saver.restore(sess, input_checkpoint)
        output_graph_def = tf.graph_util.convert_variables_to_constants(
            sess,
            tf.get_default_graph().as_graph_def(),
            output_node_names.split(','),
        )
        with tf.gfile.GFile(output_graph, 'wb') as f:
            f.write(output_graph_def.SerializeToString())
        print('%d ops in the final graph.' % len(output_graph_def.node))

In [26]:
freeze_graph('bert-base-toxic', strings)

INFO:tensorflow:Restoring parameters from bert-base-toxic/model.ckpt
Instructions for updating:
Use `tf.compat.v1.graph_util.convert_variables_to_constants`
Instructions for updating:
Use `tf.compat.v1.graph_util.extract_sub_graph`
INFO:tensorflow:Froze 203 variables.
INFO:tensorflow:Converted 203 variables to const ops.
6992 ops in the final graph.


In [27]:
import boto3

bucketName = 'huseinhouse-storage'
Key = 'bert-base-toxic/frozen_model.pb'
outPutname = "v30/toxicity/bert-base-toxicity.pb"

s3 = boto3.client('s3',
                 aws_access_key_id='',
                 aws_secret_access_key='')
s3.upload_file(Key,bucketName,outPutname)