In [1]:
!pip3 install bert-tensorflow sentencepiece boto3

Collecting bert-tensorflow
[?25l  Downloading https://files.pythonhosted.org/packages/a6/66/7eb4e8b6ea35b7cc54c322c816f976167a43019750279a8473d355800a93/bert_tensorflow-1.0.1-py2.py3-none-any.whl (67kB)
[K     |████████████████████████████████| 71kB 8.3MB/s eta 0:00:011
Installing collected packages: bert-tensorflow
Successfully installed bert-tensorflow-1.0.1


In [2]:
from tqdm import tqdm
import json
import bert
from bert import run_classifier
from bert import optimization
from bert import tokenization
from bert import modeling
import numpy as np
import tensorflow as tf

In [3]:
MAX_SEQ_LENGTH = 100

In [4]:
!wget https://storage.googleapis.com/bert_models/2018_11_23/multi_cased_L-12_H-768_A-12.zip
!unzip multi_cased_L-12_H-768_A-12.zip

--2019-08-04 06:44:58--  https://storage.googleapis.com/bert_models/2018_11_23/multi_cased_L-12_H-768_A-12.zip
Resolving storage.googleapis.com (storage.googleapis.com)... 108.177.127.128, 2a00:1450:4013:c07::80
Connecting to storage.googleapis.com (storage.googleapis.com)|108.177.127.128|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 662903077 (632M) [application/zip]
Saving to: ‘multi_cased_L-12_H-768_A-12.zip’


2019-08-04 06:45:03 (128 MB/s) - ‘multi_cased_L-12_H-768_A-12.zip’ saved [662903077/662903077]

Archive:  multi_cased_L-12_H-768_A-12.zip
   creating: multi_cased_L-12_H-768_A-12/
  inflating: multi_cased_L-12_H-768_A-12/bert_model.ckpt.meta  
  inflating: multi_cased_L-12_H-768_A-12/bert_model.ckpt.data-00000-of-00001  
  inflating: multi_cased_L-12_H-768_A-12/vocab.txt  
  inflating: multi_cased_L-12_H-768_A-12/bert_model.ckpt.index  
  inflating: multi_cased_L-12_H-768_A-12/bert_config.json  


In [5]:
BERT_VOCAB = 'multi_cased_L-12_H-768_A-12/vocab.txt'
BERT_INIT_CHKPNT = 'multi_cased_L-12_H-768_A-12/bert_model.ckpt'
BERT_CONFIG = 'multi_cased_L-12_H-768_A-12/bert_config.json'

In [6]:
import json
with open('../input/toxicity.json') as fopen:
    x = json.load(fopen)
texts = x['x']
labels = x['y']

In [7]:
tokenizer = tokenization.FullTokenizer(
      vocab_file=BERT_VOCAB, do_lower_case=False)

In [8]:
input_ids, input_masks, segment_ids = [], [], []

for text in tqdm(texts):
    tokens_a = tokenizer.tokenize(text)
    if len(tokens_a) > MAX_SEQ_LENGTH - 2:
        tokens_a = tokens_a[:(MAX_SEQ_LENGTH - 2)]
    tokens = ["[CLS]"] + tokens_a + ["[SEP]"]
    segment_id = [0] * len(tokens)
    input_id = tokenizer.convert_tokens_to_ids(tokens)
    input_mask = [1] * len(input_id)
    padding = [0] * (MAX_SEQ_LENGTH - len(input_id))
    input_id += padding
    input_mask += padding
    segment_id += padding
    
    input_ids.append(input_id)
    input_masks.append(input_mask)
    segment_ids.append(segment_id)

100%|██████████| 192029/192029 [04:11<00:00, 762.62it/s]


In [9]:
bert_config = modeling.BertConfig.from_json_file(BERT_CONFIG)

epoch = 10
batch_size = 60
warmup_proportion = 0.1
num_train_steps = int(len(texts) / batch_size * epoch)
num_warmup_steps = int(num_train_steps * warmup_proportion)

In [10]:
class Model:
    def __init__(
        self,
        dimension_output,
        learning_rate = 2e-5,
    ):
        self.X = tf.placeholder(tf.int32, [None, None])
        self.Y = tf.placeholder(tf.float32, [None, dimension_output])
        
        model = modeling.BertModel(
            config=bert_config,
            is_training=False,
            input_ids=self.X,
            use_one_hot_embeddings=False)
        
        output_layer = model.get_sequence_output()
        self.logits_seq = tf.layers.dense(output_layer, dimension_output)
        self.logits_seq = tf.identity(self.logits_seq, name = 'logits_seq')
        self.logits = self.logits_seq[:, 0]
        self.logits = tf.identity(self.logits, name = 'logits')
        
        self.cost = tf.reduce_mean(
            tf.nn.sigmoid_cross_entropy_with_logits(
                logits = self.logits, labels = self.Y
            )
        )
        
        self.optimizer = optimization.create_optimizer(self.cost, learning_rate, 
                                                       num_train_steps, num_warmup_steps, False)
        correct_prediction = tf.equal(tf.round(tf.nn.sigmoid(self.logits)), tf.round(self.Y))
        all_labels_true = tf.reduce_min(tf.cast(correct_prediction, tf.float32), 1)
        self.accuracy = tf.reduce_mean(all_labels_true)

In [11]:
dimension_output = 6
learning_rate = 2e-5

tf.reset_default_graph()
sess = tf.InteractiveSession()
model = Model(
    dimension_output,
    learning_rate
)

sess.run(tf.global_variables_initializer())
var_lists = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, scope = 'bert')
saver = tf.train.Saver(var_list = var_lists)
saver.restore(sess, BERT_INIT_CHKPNT)

In [12]:
from sklearn.model_selection import train_test_split

train_input_ids, test_input_ids, train_Y, test_Y = train_test_split(
    input_ids, labels, test_size = 0.2
)

In [13]:
import time

EARLY_STOPPING, CURRENT_CHECKPOINT, CURRENT_ACC, EPOCH = 2, 0, 0, 0

while True:
    lasttime = time.time()
    if CURRENT_CHECKPOINT == EARLY_STOPPING:
        print('break epoch:%d\n' % (EPOCH))
        break

    train_acc, train_loss, test_acc, test_loss = 0, 0, 0, 0
    pbar = tqdm(
        range(0, len(train_input_ids), batch_size), desc = 'train minibatch loop'
    )
    for i in pbar:
        index = min(i + batch_size, len(train_input_ids))
        batch_x = train_input_ids[i: index]
        batch_y = train_Y[i: index]
        acc, cost, _ = sess.run(
            [model.accuracy, model.cost, model.optimizer],
            feed_dict = {
                model.Y: batch_y,
                model.X: batch_x,
            },
        )
        assert not np.isnan(cost)
        train_loss += cost
        train_acc += acc
        pbar.set_postfix(cost = cost, accuracy = acc)
        
    pbar = tqdm(range(0, len(test_input_ids), batch_size), desc = 'test minibatch loop')
    for i in pbar:
        index = min(i + batch_size, len(test_input_ids))
        batch_x = test_input_ids[i: index]
        batch_y = test_Y[i: index]
        acc, cost = sess.run(
            [model.accuracy, model.cost],
            feed_dict = {
                model.Y: batch_y,
                model.X: batch_x,
            },
        )
        test_loss += cost
        test_acc += acc
        pbar.set_postfix(cost = cost, accuracy = acc)

    train_loss /= len(train_input_ids) / batch_size
    train_acc /= len(train_input_ids) / batch_size
    test_loss /= len(test_input_ids) / batch_size
    test_acc /= len(test_input_ids) / batch_size

    if test_acc > CURRENT_ACC:
        print(
            'epoch: %d, pass acc: %f, current acc: %f'
            % (EPOCH, CURRENT_ACC, test_acc)
        )
        CURRENT_ACC = test_acc
        CURRENT_CHECKPOINT = 0
    else:
        CURRENT_CHECKPOINT += 1
        
    print('time taken:', time.time() - lasttime)
    print(
        'epoch: %d, training loss: %f, training acc: %f, valid loss: %f, valid acc: %f\n'
        % (EPOCH, train_loss, train_acc, test_loss, test_acc)
    )
    EPOCH += 1

train minibatch loop: 100%|██████████| 2561/2561 [29:23<00:00,  1.73it/s, accuracy=0.87, cost=0.0508] 
test minibatch loop: 100%|██████████| 641/641 [02:23<00:00,  4.46it/s, accuracy=1, cost=0.038]     
train minibatch loop:   0%|          | 0/2561 [00:00<?, ?it/s]

epoch: 0, pass acc: 0.000000, current acc: 0.915430
time taken: 1906.949625968933
epoch: 0, training loss: 0.083422, training acc: 0.895407, valid loss: 0.058635, valid acc: 0.915430



train minibatch loop: 100%|██████████| 2561/2561 [29:19<00:00,  1.74it/s, accuracy=0.913, cost=0.0389] 
test minibatch loop: 100%|██████████| 641/641 [02:22<00:00,  4.49it/s, accuracy=1, cost=0.0321]     
train minibatch loop:   0%|          | 0/2561 [00:00<?, ?it/s]

time taken: 1902.3830225467682
epoch: 1, training loss: 0.054654, training acc: 0.919106, valid loss: 0.055803, valid acc: 0.914961



train minibatch loop: 100%|██████████| 2561/2561 [29:18<00:00,  1.73it/s, accuracy=0.957, cost=0.0296] 
test minibatch loop: 100%|██████████| 641/641 [02:22<00:00,  4.49it/s, accuracy=0.833, cost=0.0359] 
train minibatch loop:   0%|          | 0/2561 [00:00<?, ?it/s]

epoch: 2, pass acc: 0.915430, current acc: 0.916576
time taken: 1901.6554355621338
epoch: 2, training loss: 0.040511, training acc: 0.933912, valid loss: 0.061490, valid acc: 0.916576



train minibatch loop: 100%|██████████| 2561/2561 [29:18<00:00,  1.73it/s, accuracy=0.957, cost=0.0163] 
test minibatch loop: 100%|██████████| 641/641 [02:22<00:00,  4.49it/s, accuracy=0.833, cost=0.02]   
train minibatch loop:   0%|          | 0/2561 [00:00<?, ?it/s]

epoch: 3, pass acc: 0.916576, current acc: 0.917591
time taken: 1901.1561419963837
epoch: 3, training loss: 0.029528, training acc: 0.950349, valid loss: 0.083887, valid acc: 0.917591



train minibatch loop: 100%|██████████| 2561/2561 [29:18<00:00,  1.74it/s, accuracy=0.957, cost=0.0117] 
test minibatch loop: 100%|██████████| 641/641 [02:22<00:00,  4.49it/s, accuracy=1, cost=0.0102]     
train minibatch loop:   0%|          | 0/2561 [00:00<?, ?it/s]

time taken: 1901.2402818202972
epoch: 4, training loss: 0.019540, training acc: 0.965874, valid loss: 0.075365, valid acc: 0.912175



train minibatch loop: 100%|██████████| 2561/2561 [29:18<00:00,  1.74it/s, accuracy=1, cost=0.00482]    
test minibatch loop: 100%|██████████| 641/641 [02:22<00:00,  4.49it/s, accuracy=1, cost=0.00831]    

time taken: 1900.8933420181274
epoch: 5, training loss: 0.012058, training acc: 0.979157, valid loss: 0.082174, valid acc: 0.914024

break epoch:6






In [14]:
stack = []

pbar = tqdm(
    range(0, len(test_input_ids), batch_size), desc = 'validation minibatch loop'
)
for i in pbar:
    index = min(i + batch_size, len(test_input_ids))
    batch_x = test_input_ids[i: index]
    stack.append(sess.run(tf.nn.sigmoid(model.logits),
            feed_dict = {
                model.Y: batch_y,
                model.X: batch_x,
            },
    ))

validation minibatch loop: 100%|██████████| 641/641 [07:42<00:00,  1.49it/s]


In [15]:
from sklearn import metrics

print(metrics.classification_report(np.array(test_Y),np.around(np.concatenate(stack,axis=0)),
                                    target_names=["toxic", "severe_toxic", "obscene", 
                                            "threat", "insult", "identity_hate"],
                                   digits=6))

               precision    recall  f1-score   support

        toxic   0.758527  0.744709  0.751554      3733
 severe_toxic   0.514184  0.375648  0.434132       386
      obscene   0.768908  0.729811  0.748849      2006
       threat   0.742857  0.201550  0.317073       129
       insult   0.690093  0.737226  0.712881      1918
identity_hate   0.555172  0.449721  0.496914       358

    micro avg   0.728267  0.702227  0.715010      8530
    macro avg   0.671623  0.539777  0.576901      8530
 weighted avg   0.725752  0.702227  0.710601      8530
  samples avg   0.066491  0.066235  0.063811      8530



  'precision', 'predicted', average, warn_for)
  'recall', 'true', average, warn_for)


In [16]:
strings = ','.join(
    [
        n.name
        for n in tf.get_default_graph().as_graph_def().node
        if ('Variable' in n.op
        or 'Placeholder' in n.name
        or 'logits' in n.name
        or 'alphas' in n.name
        or 'self/Softmax' in n.name)
        and 'adam' not in n.name
        and 'beta' not in n.name
        and 'global_step' not in n.name
    ]
)
strings.split(',')

['Placeholder',
 'Placeholder_1',
 'bert/embeddings/word_embeddings',
 'bert/embeddings/token_type_embeddings',
 'bert/embeddings/position_embeddings',
 'bert/embeddings/LayerNorm/gamma',
 'bert/encoder/layer_0/attention/self/query/kernel',
 'bert/encoder/layer_0/attention/self/query/bias',
 'bert/encoder/layer_0/attention/self/key/kernel',
 'bert/encoder/layer_0/attention/self/key/bias',
 'bert/encoder/layer_0/attention/self/value/kernel',
 'bert/encoder/layer_0/attention/self/value/bias',
 'bert/encoder/layer_0/attention/self/Softmax',
 'bert/encoder/layer_0/attention/output/dense/kernel',
 'bert/encoder/layer_0/attention/output/dense/bias',
 'bert/encoder/layer_0/attention/output/LayerNorm/gamma',
 'bert/encoder/layer_0/intermediate/dense/kernel',
 'bert/encoder/layer_0/intermediate/dense/bias',
 'bert/encoder/layer_0/output/dense/kernel',
 'bert/encoder/layer_0/output/dense/bias',
 'bert/encoder/layer_0/output/LayerNorm/gamma',
 'bert/encoder/layer_1/attention/self/query/kernel',
 

In [17]:
def freeze_graph(model_dir, output_node_names):

    if not tf.gfile.Exists(model_dir):
        raise AssertionError(
            "Export directory doesn't exists. Please specify an export "
            'directory: %s' % model_dir
        )

    checkpoint = tf.train.get_checkpoint_state(model_dir)
    input_checkpoint = checkpoint.model_checkpoint_path

    absolute_model_dir = '/'.join(input_checkpoint.split('/')[:-1])
    output_graph = absolute_model_dir + '/frozen_model.pb'
    clear_devices = True
    with tf.Session(graph = tf.Graph()) as sess:
        saver = tf.train.import_meta_graph(
            input_checkpoint + '.meta', clear_devices = clear_devices
        )
        saver.restore(sess, input_checkpoint)
        output_graph_def = tf.graph_util.convert_variables_to_constants(
            sess,
            tf.get_default_graph().as_graph_def(),
            output_node_names.split(','),
        )
        with tf.gfile.GFile(output_graph, 'wb') as f:
            f.write(output_graph_def.SerializeToString())
        print('%d ops in the final graph.' % len(output_graph_def.node))

In [18]:
saver = tf.train.Saver(tf.trainable_variables())
saver.save(sess, 'bert-multilanguage-toxicity/model.ckpt')

'bert-multilanguage-toxicity/model.ckpt'

In [19]:
freeze_graph('bert-multilanguage-toxicity', strings)

5929 ops in the final graph.


In [20]:
def load_graph(frozen_graph_filename):
    with tf.gfile.GFile(frozen_graph_filename, 'rb') as f:
        graph_def = tf.GraphDef()
        graph_def.ParseFromString(f.read())
    with tf.Graph().as_default() as graph:
        tf.import_graph_def(graph_def)
    return graph

In [21]:
g = load_graph('bert-multilanguage-toxicity/frozen_model.pb')
x = g.get_tensor_by_name('import/Placeholder:0')
logits = g.get_tensor_by_name('import/logits:0')
test_sess = tf.InteractiveSession(graph = g)
result = test_sess.run(tf.nn.sigmoid(logits), feed_dict = {x: [input_id]})
result



array([[3.8315356e-03, 7.1981549e-04, 1.1235178e-03, 6.0230494e-05,
        1.2349161e-02, 2.5903422e-03]], dtype=float32)

In [22]:
import boto3

bucketName = 'huseinhouse-storage'
Key = 'bert-multilanguage-toxicity/frozen_model.pb'
outPutname = "v27/stoxicity/bert-multilanguage-toxicity.pb"

s3 = boto3.client('s3',
                 aws_access_key_id='',
                 aws_secret_access_key='')

s3.upload_file(Key,bucketName,outPutname)