In [1]:
import json
import re
import sentencepiece as spm

In [2]:
import os
os.environ['CUDA_VISIBLE_DEVICES'] = '1'

In [3]:
from prepro_utils import preprocess_text, encode_ids, encode_pieces

sp_model = spm.SentencePieceProcessor()
sp_model.Load('albert-base/sp10m.cased.v6.model')

with open('albert-base/sp10m.cased.v6.vocab') as fopen:
    v = fopen.read().split('\n')[:-1]
v = [i.split('\t') for i in v]
v = {i[0]: i[1] for i in v}

class Tokenizer:
    def __init__(self, v):
        self.vocab = v
        pass
    
    def tokenize(self, string):
        return encode_pieces(sp_model, string, return_unicode=False, sample=False)
    
    def convert_tokens_to_ids(self, tokens):
        return [sp_model.PieceToId(piece) for piece in tokens]
    
    def convert_ids_to_tokens(self, ids):
        return [sp_model.IdToPiece(i) for i in ids]
    
tokenizer = Tokenizer(v)

In [4]:
import optimization
import tokenization
import modeling
import numpy as np
import json
import tensorflow as tf
import itertools
from unidecode import unidecode
import pandas as pd
from sklearn.preprocessing import LabelEncoder
import re

  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])





  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])


In [5]:
ALBERT_INIT_CHKPNT = 'albert-base/model.ckpt'
ALBERT_CONFIG = 'albert-base/albert_config_base.json'

In [7]:
def cleaning(string):
    string = unidecode(string)
    string = re.sub(r'\w+:\/{2}[\d\w-]+(\.[\d\w-]+)*(?:(?:\/[^\s/]*))*', '', string)
    string = re.sub(r'[ ]+', ' ', string).strip().split()
    string = [w for w in string if w[0] != '@']
    return ' '.join(string)

In [8]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder

In [9]:
df = pd.read_csv('../Malaya-Dataset/toxicity/toxic-bm.csv')
df = df.dropna()
df.shape

(40911, 7)

In [10]:
list_classes = ["toxic", "severe_toxic", "obscene", "threat", "insult", "identity_hate"]
Y = df[list_classes].values.tolist()
X = df['text'].tolist()
len(Y), len(X)

(40911, 40911)

In [11]:
import glob

files = glob.glob('../Malaya-Dataset/toxicity/*.json')
files

['../Malaya-Dataset/toxicity/toxic6.json',
 '../Malaya-Dataset/toxicity/toxic1.json',
 '../Malaya-Dataset/toxicity/toxic4.json',
 '../Malaya-Dataset/toxicity/toxic5.json',
 '../Malaya-Dataset/toxicity/toxic3.json',
 '../Malaya-Dataset/toxicity/toxic0.json',
 '../Malaya-Dataset/toxicity/toxic7.json',
 '../Malaya-Dataset/toxicity/toxic2.json']

In [12]:
for file in files:
    with open(file) as fopen:
        data = json.load(fopen)
    for x, y in data:
        X.append(x)
        Y.append(y)

In [15]:
MAX_SEQ_LENGTH = 200
texts = X

In [16]:
from tqdm import tqdm

input_ids, input_masks, segment_ids = [], [], []

for text in tqdm(texts):
    text = cleaning(text)
    tokens_a = tokenizer.tokenize(text)
    if len(tokens_a) > MAX_SEQ_LENGTH - 2:
        tokens_a = tokens_a[:(MAX_SEQ_LENGTH - 2)]
    tokens = ["<cls>"] + tokens_a + ["<sep>"]
    segment_id = [0] * len(tokens)
    input_id = tokenizer.convert_tokens_to_ids(tokens)
    input_mask = [1] * len(input_id)
    padding = [0] * (MAX_SEQ_LENGTH - len(input_id))
    input_id += padding
    input_mask += padding
    segment_id += padding
    
    input_ids.append(input_id)
    input_masks.append(input_mask)
    segment_ids.append(segment_id)

100%|██████████| 192029/192029 [01:17<00:00, 2473.70it/s]


In [17]:
albert_config = modeling.BertConfig.from_json_file(ALBERT_CONFIG)




In [18]:
epoch = 10
batch_size = 60
warmup_proportion = 0.1
num_train_steps = int(len(texts) / batch_size * epoch)
num_warmup_steps = int(num_train_steps * warmup_proportion)

In [21]:
def create_initializer(initializer_range=0.02):
    return tf.truncated_normal_initializer(stddev=initializer_range)

class Model:
    def __init__(
        self,
        dimension_output,
        learning_rate = 2e-5,
    ):
        self.X = tf.placeholder(tf.int32, [None, None])
        self.Y = tf.placeholder(tf.float32, [None, dimension_output])
        
        model = modeling.BertModel(
            config=albert_config,
            is_training=False,
            input_ids=self.X,
            use_one_hot_embeddings=False)
        
        output_layer = model.get_sequence_output()
        output_layer = tf.layers.dense(
            output_layer,
            albert_config.hidden_size,
            activation=tf.tanh,
            kernel_initializer=create_initializer())
        self.logits_seq = tf.layers.dense(output_layer, dimension_output,
                                         kernel_initializer=create_initializer())
        self.logits_seq = tf.identity(self.logits_seq, name = 'logits_seq')
        self.logits = self.logits_seq[:, 0]
        self.logits = tf.identity(self.logits, name = 'logits')
        
        self.cost = tf.reduce_mean(
            tf.nn.sigmoid_cross_entropy_with_logits(
                logits = self.logits, labels = self.Y
            )
        )
        
        self.optimizer = optimization.create_optimizer(self.cost, learning_rate, 
                                                       num_train_steps, num_warmup_steps, False)
        
        correct_prediction = tf.equal(tf.round(tf.nn.sigmoid(self.logits)), tf.round(self.Y))
        all_labels_true = tf.reduce_min(tf.cast(correct_prediction, tf.float32), 1)
        self.accuracy = tf.reduce_mean(all_labels_true)

In [22]:
dimension_output = 6
learning_rate = 2e-5

tf.reset_default_graph()
sess = tf.InteractiveSession()
model = Model(
    dimension_output,
    learning_rate
)

sess.run(tf.global_variables_initializer())
var_lists = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, scope = 'bert')
saver = tf.train.Saver(var_list = var_lists)
saver.restore(sess, ALBERT_INIT_CHKPNT)



embedding_lookup_factorized. factorized embedding parameterization is used.
















Instructions for updating:
Use tf.where in 2.0, which has the same broadcast rule as np.where


Instructions for updating:
Deprecated in favor of operator or tf.math.divide.

Instructions for updating:
Use standard file APIs to check for files with this prefix.
INFO:tensorflow:Restoring parameters from albert-base/model.ckpt


In [23]:
from sklearn.model_selection import train_test_split

train_X, test_X, train_Y, test_Y = train_test_split(
    input_ids, Y, test_size = 0.2
)

In [24]:
from tqdm import tqdm
import time

EARLY_STOPPING, CURRENT_CHECKPOINT, CURRENT_ACC, EPOCH = 2, 0, 0, 0

while True:
    lasttime = time.time()
    if CURRENT_CHECKPOINT == EARLY_STOPPING:
        print('break epoch:%d\n' % (EPOCH))
        break

    train_acc, train_loss, test_acc, test_loss = [], [], [], []
    pbar = tqdm(
        range(0, len(train_X), batch_size), desc = 'train minibatch loop'
    )
    for i in pbar:
        index = min(i + batch_size, len(train_X))
        batch_x = train_X[i: index]
        batch_y = train_Y[i: index]
        acc, cost, _ = sess.run(
            [model.accuracy, model.cost, model.optimizer],
            feed_dict = {
                model.Y: batch_y,
                model.X: batch_x,
            },
        )
        train_loss.append(cost)
        train_acc.append(acc)
        pbar.set_postfix(cost = cost, accuracy = acc)
        
    pbar = tqdm(range(0, len(test_X), batch_size), desc = 'test minibatch loop')
    for i in pbar:
        index = min(i + batch_size, len(test_X))
        batch_x = test_X[i: index]
        batch_y = test_Y[i: index]
        acc, cost = sess.run(
            [model.accuracy, model.cost],
            feed_dict = {
                model.Y: batch_y,
                model.X: batch_x,
            },
        )
        test_loss.append(cost)
        test_acc.append(acc)
        pbar.set_postfix(cost = cost, accuracy = acc)
        
    train_loss = np.mean(train_loss)
    train_acc = np.mean(train_acc)
    test_loss = np.mean(test_loss)
    test_acc = np.mean(test_acc)
        
    if test_acc > CURRENT_ACC:
        print(
            'epoch: %d, pass acc: %f, current acc: %f'
            % (EPOCH, CURRENT_ACC, test_acc)
        )
        CURRENT_ACC = test_acc
        CURRENT_CHECKPOINT = 0
    else:
        CURRENT_CHECKPOINT += 1
        
    print('time taken:', time.time() - lasttime)
    print(
        'epoch: %d, training loss: %f, training acc: %f, valid loss: %f, valid acc: %f\n'
        % (EPOCH, train_loss, train_acc, test_loss, test_acc)
    )
    EPOCH += 1

train minibatch loop: 100%|██████████| 2561/2561 [32:27<00:00,  1.32it/s, accuracy=0.913, cost=0.0266] 
test minibatch loop: 100%|██████████| 641/641 [02:49<00:00,  3.79it/s, accuracy=0.833, cost=0.039]  
train minibatch loop:   0%|          | 0/2561 [00:00<?, ?it/s]

epoch: 0, pass acc: 0.000000, current acc: 0.913234
time taken: 2116.3814690113068
epoch: 0, training loss: 0.090588, training acc: 0.896087, valid loss: 0.063953, valid acc: 0.913234



train minibatch loop: 100%|██████████| 2561/2561 [32:29<00:00,  1.31it/s, accuracy=0.957, cost=0.0202] 
test minibatch loop: 100%|██████████| 641/641 [02:49<00:00,  3.79it/s, accuracy=1, cost=0.0296]    
train minibatch loop:   0%|          | 0/2561 [00:00<?, ?it/s]

epoch: 1, pass acc: 0.913234, current acc: 0.915809
time taken: 2118.792075395584
epoch: 1, training loss: 0.059063, training acc: 0.914456, valid loss: 0.058545, valid acc: 0.915809



train minibatch loop: 100%|██████████| 2561/2561 [32:28<00:00,  1.31it/s, accuracy=0.957, cost=0.0171] 
test minibatch loop: 100%|██████████| 641/641 [02:48<00:00,  3.79it/s, accuracy=1, cost=0.0376]     
train minibatch loop:   0%|          | 0/2561 [00:00<?, ?it/s]

time taken: 2117.25768494606
epoch: 2, training loss: 0.050911, training acc: 0.921739, valid loss: 0.058837, valid acc: 0.912402



train minibatch loop: 100%|██████████| 2561/2561 [32:28<00:00,  1.31it/s, accuracy=1, cost=0.0127]     
test minibatch loop: 100%|██████████| 641/641 [02:49<00:00,  3.79it/s, accuracy=0.833, cost=0.0517] 

time taken: 2117.6472158432007
epoch: 3, training loss: 0.044098, training acc: 0.929865, valid loss: 0.062630, valid acc: 0.903796

break epoch:4






In [25]:
saver = tf.train.Saver(tf.trainable_variables())
saver.save(sess, 'albert-base-toxic/model.ckpt')

'albert-base-toxic/model.ckpt'

In [26]:
stack = []

pbar = tqdm(
    range(0, len(test_X), batch_size), desc = 'validation minibatch loop'
)
for i in pbar:
    index = min(i + batch_size, len(test_X))
    batch_x = test_X[i: index]
    stack.append(sess.run(tf.nn.sigmoid(model.logits),
            feed_dict = {
                model.X: batch_x,
            },
    ))

validation minibatch loop: 100%|██████████| 641/641 [05:34<00:00,  1.92it/s]


In [27]:
from sklearn import metrics

print(metrics.classification_report(np.array(test_Y),np.around(np.concatenate(stack,axis=0)),
                                    target_names=["toxic", "severe_toxic", "obscene", 
                                            "threat", "insult", "identity_hate"],
                                   digits=5))

               precision    recall  f1-score   support

        toxic    0.70172   0.75169   0.72585      3693
 severe_toxic    0.46209   0.33420   0.38788       383
      obscene    0.76764   0.74951   0.75847      2032
       threat    0.49296   0.34314   0.40462       102
       insult    0.67535   0.67606   0.67570      1880
identity_hate    0.67879   0.33333   0.44711       336

    micro avg    0.70126   0.69369   0.69745      8426
    macro avg    0.62976   0.53132   0.56660      8426
 weighted avg    0.69740   0.69369   0.69216      8426
  samples avg    0.06495   0.06556   0.06256      8426



  'precision', 'predicted', average, warn_for)
  'recall', 'true', average, warn_for)


In [28]:
strings = ','.join(
    [
        n.name
        for n in tf.get_default_graph().as_graph_def().node
        if ('Variable' in n.op
        or 'Placeholder' in n.name
        or 'logits' in n.name
        or 'alphas' in n.name
        or 'self/Softmax' in n.name)
        and 'adam' not in n.name
        and 'beta' not in n.name
        and 'global_step' not in n.name
    ]
)
strings.split(',')

['Placeholder',
 'Placeholder_1',
 'bert/embeddings/word_embeddings',
 'bert/embeddings/word_embeddings_2',
 'bert/embeddings/token_type_embeddings',
 'bert/embeddings/position_embeddings',
 'bert/embeddings/LayerNorm/gamma',
 'bert/encoder/layer_shared/attention/output/LayerNorm/gamma',
 'bert/encoder/layer_shared/attention/self/query/kernel',
 'bert/encoder/layer_shared/attention/self/query/bias',
 'bert/encoder/layer_shared/attention/self/key/kernel',
 'bert/encoder/layer_shared/attention/self/key/bias',
 'bert/encoder/layer_shared/attention/self/value/kernel',
 'bert/encoder/layer_shared/attention/self/value/bias',
 'bert/encoder/layer_shared/attention/self/Softmax',
 'bert/encoder/layer_shared/attention/output/dense/kernel',
 'bert/encoder/layer_shared/attention/output/dense/bias',
 'bert/encoder/layer_shared/output/LayerNorm/gamma',
 'bert/encoder/layer_shared/intermediate/dense/kernel',
 'bert/encoder/layer_shared/intermediate/dense/bias',
 'bert/encoder/layer_shared/output/dens

In [29]:
def freeze_graph(model_dir, output_node_names):

    if not tf.gfile.Exists(model_dir):
        raise AssertionError(
            "Export directory doesn't exists. Please specify an export "
            'directory: %s' % model_dir
        )

    checkpoint = tf.train.get_checkpoint_state(model_dir)
    input_checkpoint = checkpoint.model_checkpoint_path

    absolute_model_dir = '/'.join(input_checkpoint.split('/')[:-1])
    output_graph = absolute_model_dir + '/frozen_model.pb'
    clear_devices = True
    with tf.Session(graph = tf.Graph()) as sess:
        saver = tf.train.import_meta_graph(
            input_checkpoint + '.meta', clear_devices = clear_devices
        )
        saver.restore(sess, input_checkpoint)
        output_graph_def = tf.graph_util.convert_variables_to_constants(
            sess,
            tf.get_default_graph().as_graph_def(),
            output_node_names.split(','),
        )
        with tf.gfile.GFile(output_graph, 'wb') as f:
            f.write(output_graph_def.SerializeToString())
        print('%d ops in the final graph.' % len(output_graph_def.node))

In [30]:
freeze_graph('albert-base-toxic', strings)

INFO:tensorflow:Restoring parameters from albert-base-toxic/model.ckpt
Instructions for updating:
Use `tf.compat.v1.graph_util.convert_variables_to_constants`
Instructions for updating:
Use `tf.compat.v1.graph_util.extract_sub_graph`
INFO:tensorflow:Froze 28 variables.
INFO:tensorflow:Converted 28 variables to const ops.
5500 ops in the final graph.


In [31]:
import boto3

bucketName = 'huseinhouse-storage'
Key = 'albert-base-toxic/frozen_model.pb'
outPutname = "v30/toxicity/albert-base-toxicity.pb"

s3 = boto3.client('s3',
                 aws_access_key_id='',
                 aws_secret_access_key='')
s3.upload_file(Key,bucketName,outPutname)