In [1]:
import os
os.environ['CUDA_VISIBLE_DEVICES'] = '2'

In [2]:
import xlnet
import numpy as np
import tensorflow as tf
from tqdm import tqdm
import model_utils




In [3]:
import sentencepiece as spm
from prepro_utils import preprocess_text, encode_ids

sp_model = spm.SentencePieceProcessor()
sp_model.Load('xlnet-base/sp10m.cased.v9.model')

True

In [4]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder

In [5]:
from rules import normalized_chars
import random
import re
from unidecode import unidecode

laughing = {
    'huhu',
    'haha',
    'gagaga',
    'hihi',
    'wkawka',
    'wkwk',
    'kiki',
    'keke',
    'huehue',
    'hshs',
    'hoho',
    'hewhew',
    'uwu',
    'sksk',
    'ksks',
    'gituu',
    'gitu',
    'mmeeooww',
    'meow',
    'alhamdulillah',
    'muah',
    'mmuahh',
    'hehe',
    'salamramadhan',
    'happywomensday',
    'jahagaha',
    'ahakss',
    'ahksk'
}

def make_cleaning(s, c_dict):
    s = s.translate(c_dict)
    return s

def cleaning(string):
    """
    use by any transformer model before tokenization
    """
    string = unidecode(string)
    
    string = ' '.join(
        [make_cleaning(w, normalized_chars) for w in string.split()]
    )
    string = re.sub('\(dot\)', '.', string)
    string = (
        re.sub(re.findall(r'\<a(.*?)\>', string)[0], '', string)
        if (len(re.findall(r'\<a (.*?)\>', string)) > 0)
        and ('href' in re.findall(r'\<a (.*?)\>', string)[0])
        else string
    )
    string = re.sub(
        r'\w+:\/{2}[\d\w-]+(\.[\d\w-]+)*(?:(?:\/[^\s/]*))*', ' ', string
    )
    
    chars = '.,/'
    for c in chars:
        string = string.replace(c, f' {c} ')
        
    string = re.sub(r'[ ]+', ' ', string).strip().split()
    string = [w for w in string if w[0] != '@']
    x = []
    for word in string:
        word = word.lower()
        if any([laugh in word for laugh in laughing]):
            if random.random() >= 0.5:
                x.append(word)
        else:
            x.append(word)
    string = [w.title() if w[0].isupper() else w for w in x]
    return ' '.join(string)

In [6]:
labels = """
1. severe toxic
2. obscene
3. identity attack
4. insult
5. threat
6. asian
7. atheist
8. bisexual
9. black
10. buddhist
11. christian
12. female
13. heterosexual
14. indian
15. homosexual, gay or lesbian
16. intellectual or learning disability
17. jewish
18. latino
19. male
20. muslim
21. other disability
22. other gender
23. other race or ethnicity
24. other religion
25. other sexual orientation
26. physical disability
27. psychiatric or mental illness
28. transgender
29. white
30. malay
31. chinese
"""
labels = [l.split('. ')[1].strip() for l in labels.split('\n') if len(l)]
labels

['severe toxic',
 'obscene',
 'identity attack',
 'insult',
 'threat',
 'asian',
 'atheist',
 'bisexual',
 'black',
 'buddhist',
 'christian',
 'female',
 'heterosexual',
 'indian',
 'homosexual, gay or lesbian',
 'intellectual or learning disability',
 'jewish',
 'latino',
 'male',
 'muslim',
 'other disability',
 'other gender',
 'other race or ethnicity',
 'other religion',
 'other sexual orientation',
 'physical disability',
 'psychiatric or mental illness',
 'transgender',
 'white',
 'malay',
 'chinese']

In [7]:
import glob

files = glob.glob('../toxicity/translated*')
files

['../toxicity/translated-1750000.json',
 '../toxicity/translated-1450000.json',
 '../toxicity/translated-700000.json',
 '../toxicity/translated-350000.json',
 '../toxicity/translated-600000.json',
 '../toxicity/translated-900000.json',
 '../toxicity/translated-1000000.json',
 '../toxicity/translated-1100000.json',
 '../toxicity/translated-550000.json',
 '../toxicity/translated-150000.json',
 '../toxicity/translated-500000.json',
 '../toxicity/translated-1500000.json',
 '../toxicity/translated-1150000.json',
 '../toxicity/translated-750000.json',
 '../toxicity/translated-850000.json',
 '../toxicity/translated-1650000.json',
 '../toxicity/translated-300000.json',
 '../toxicity/translated-650000.json',
 '../toxicity/translated-950000.json',
 '../toxicity/translated-250000.json',
 '../toxicity/translated-1600000.json',
 '../toxicity/translated-0.json',
 '../toxicity/translated-1550000.json',
 '../toxicity/translated-1800000.json',
 '../toxicity/translated-450000.json',
 '../toxicity/transl

In [8]:
import json

X, Y = [], []

for file in files:
    print(file)
    with open(file) as fopen:
        f = json.load(fopen)
        for row in f:
            if len(row[1]) == 29:
                X.append(row[0])
                Y.append(row[1] + [0, 0])
        
    
len(X)

../toxicity/translated-1750000.json
../toxicity/translated-1450000.json
../toxicity/translated-700000.json
../toxicity/translated-350000.json
../toxicity/translated-600000.json
../toxicity/translated-900000.json
../toxicity/translated-1000000.json
../toxicity/translated-1100000.json
../toxicity/translated-550000.json
../toxicity/translated-150000.json
../toxicity/translated-500000.json
../toxicity/translated-1500000.json
../toxicity/translated-1150000.json
../toxicity/translated-750000.json
../toxicity/translated-850000.json
../toxicity/translated-1650000.json
../toxicity/translated-300000.json
../toxicity/translated-650000.json
../toxicity/translated-950000.json
../toxicity/translated-250000.json
../toxicity/translated-1600000.json
../toxicity/translated-0.json
../toxicity/translated-1550000.json
../toxicity/translated-1800000.json
../toxicity/translated-450000.json
../toxicity/translated-50000.json
../toxicity/translated-1050000.json
../toxicity/translated-1200000.json
../toxicity/tr

1401054

In [9]:
rejected_labels = ['black', 'white', 'jewish', 'latino']
[labels.index(l) for l in rejected_labels]
labels = [l for l in labels if l not in rejected_labels]

In [10]:
ydf = pd.DataFrame(np.array(Y))
ydf = ydf.loc[(ydf[8] == 0) & (ydf[28] == 0) & (ydf[16] == 0) & (ydf[17] == 0)]
ydf = ydf.drop([8, 28, 16, 17], axis = 1)
ix = ydf.index.tolist()
Y = ydf.values.tolist()

In [11]:
X = [X[i] for i in ix]
len(X), len(Y)

(1361040, 1361040)

In [12]:
mapping = {'severe_toxic': 'severe toxic', 'identity_hate': 'identity attack',
          'toxic': 'severe toxic', 'melayu': 'malay', 'cina': 'chinese', 'india': 'indian'}

In [13]:
def generate_onehot(tags, depth = len(labels)):
    onehot = [0] * depth
    for tag in tags:
        onehot[labels.index(tag)] = 1
    return onehot

In [14]:
with open('../toxicity/kaum.json') as fopen:
    kaum = json.load(fopen)
    
for k, v in kaum.items():
    print(k, len(v))

melayu 84851
cina 43956
india 20208


In [15]:
with open('../toxicity/weak-learning-toxicity.json') as fopen:
    scores = json.load(fopen)
    
for k, v in scores.items():
    for no in range(len(v)):
        tags = []
        for l, v_ in v[no].items():
            if round(v_) == 1:
                tags.append(mapping.get(l, l))
        tags.append(mapping[k])
        Y.append(generate_onehot(tags))
        X.append(kaum[k][no])

In [16]:
from tqdm import tqdm

for i in tqdm(range(len(X))):
    X[i] = cleaning(X[i])

100%|██████████| 1510055/1510055 [04:43<00:00, 5323.50it/s] 


In [17]:
actual_t, actual_l = [], []

for i in tqdm(range(len(X))):
    if len(X[i]) > 2:
        actual_t.append(X[i])
        actual_l.append(Y[i])

100%|██████████| 1510055/1510055 [00:01<00:00, 1319801.22it/s]


In [18]:
from prepro_utils import preprocess_text, encode_ids

def tokenize_fn(text):
    text = preprocess_text(text, lower= False)
    return encode_ids(sp_model, text)

In [19]:
from tqdm import tqdm

SEG_ID_A   = 0
SEG_ID_B   = 1
SEG_ID_CLS = 2
SEG_ID_SEP = 3
SEG_ID_PAD = 4
MAX_SEQ_LENGTH = 150

special_symbols = {
    "<unk>"  : 0,
    "<s>"    : 1,
    "</s>"   : 2,
    "<cls>"  : 3,
    "<sep>"  : 4,
    "<pad>"  : 5,
    "<mask>" : 6,
    "<eod>"  : 7,
    "<eop>"  : 8,
}

VOCAB_SIZE = 32000
UNK_ID = special_symbols["<unk>"]
CLS_ID = special_symbols["<cls>"]
SEP_ID = special_symbols["<sep>"]
MASK_ID = special_symbols["<mask>"]
EOD_ID = special_symbols["<eod>"]

input_ids, input_masks, segment_ids = [], [], []

for text in tqdm(actual_t):
    tokens_a = tokenize_fn(text)
    if len(tokens_a) > MAX_SEQ_LENGTH - 2:
        tokens_a = tokens_a[:(MAX_SEQ_LENGTH - 2)]
        
    tokens = []
    segment_id = []
    for token in tokens_a:
        tokens.append(token)
        segment_id.append(SEG_ID_A)
    tokens.append(SEP_ID)
    segment_id.append(SEG_ID_A)
    tokens.append(CLS_ID)
    segment_id.append(SEG_ID_CLS)
    
    input_id = tokens
    input_mask = [0] * len(input_id)
    if len(input_id) < MAX_SEQ_LENGTH:
        delta_len = MAX_SEQ_LENGTH - len(input_id)
        input_id = [0] * delta_len + input_id
        input_mask = [1] * delta_len + input_mask
        segment_id = [SEG_ID_PAD] * delta_len + segment_id
    
    input_ids.append(input_id)
    input_masks.append(input_mask)
    segment_ids.append(segment_id)

100%|██████████| 1502319/1502319 [09:59<00:00, 2504.29it/s]


In [20]:
kwargs = dict(
      is_training=True,
      use_tpu=False,
      use_bfloat16=False,
      dropout=0.1,
      dropatt=0.1,
      init='normal',
      init_range=0.1,
      init_std=0.05,
      clamp_len=-1)

xlnet_parameters = xlnet.RunConfig(**kwargs)
xlnet_config = xlnet.XLNetConfig(json_path='xlnet-base/config.json')




In [21]:
epoch = 10
batch_size = 50
warmup_proportion = 0.1
num_train_steps = int(len(actual_t) / batch_size * epoch)
num_warmup_steps = int(num_train_steps * warmup_proportion)
print(num_train_steps, num_warmup_steps)
learning_rate = 2e-5

training_parameters = dict(
      decay_method = 'poly',
      train_steps = num_train_steps,
      learning_rate = learning_rate,
      warmup_steps = num_warmup_steps,
      min_lr_ratio = 0.0,
      weight_decay = 0.00,
      adam_epsilon = 1e-8,
      num_core_per_host = 1,
      lr_layer_decay_rate = 1,
      use_tpu=False,
      use_bfloat16=False,
      dropout=0.0,
      dropatt=0.0,
      init='normal',
      init_range=0.1,
      init_std=0.05,
      clip = 1.0,
      clamp_len=-1,)

300463 30046


In [22]:
class Parameter:
    def __init__(self, decay_method, warmup_steps, weight_decay, adam_epsilon, 
                num_core_per_host, lr_layer_decay_rate, use_tpu, learning_rate, train_steps,
                min_lr_ratio, clip, **kwargs):
        self.decay_method = decay_method
        self.warmup_steps = warmup_steps
        self.weight_decay = weight_decay
        self.adam_epsilon = adam_epsilon
        self.num_core_per_host = num_core_per_host
        self.lr_layer_decay_rate = lr_layer_decay_rate
        self.use_tpu = use_tpu
        self.learning_rate = learning_rate
        self.train_steps = train_steps
        self.min_lr_ratio = min_lr_ratio
        self.clip = clip
        
training_parameters = Parameter(**training_parameters)

In [23]:
class Model:
    def __init__(
        self,
        dimension_output,
        learning_rate = 2e-5,
    ):
        self.X = tf.placeholder(tf.int32, [None, None])
        self.segment_ids = tf.placeholder(tf.int32, [None, None])
        self.input_masks = tf.placeholder(tf.float32, [None, None])
        self.Y = tf.placeholder(tf.float32, [None, dimension_output])
        
        xlnet_model = xlnet.XLNetModel(
            xlnet_config=xlnet_config,
            run_config=xlnet_parameters,
            input_ids=tf.transpose(self.X, [1, 0]),
            seg_ids=tf.transpose(self.segment_ids, [1, 0]),
            input_mask=tf.transpose(self.input_masks, [1, 0]))
        
        output_layer = xlnet_model.get_sequence_output()
        output_layer = tf.transpose(output_layer, [1, 0, 2])
        
        self.logits_seq = tf.layers.dense(output_layer, dimension_output)
        self.logits_seq = tf.identity(self.logits_seq, name = 'logits_seq')
        self.logits = self.logits_seq[:, 0]
        self.logits = tf.identity(self.logits, name = 'logits')
        
        self.cost = tf.reduce_mean(
            tf.nn.sigmoid_cross_entropy_with_logits(
                logits = self.logits, labels = self.Y
            )
        )
        
        self.optimizer, self.learning_rate, _ = model_utils.get_train_op(training_parameters, self.cost)
        
        correct_prediction = tf.equal(tf.round(tf.nn.sigmoid(self.logits)), tf.round(self.Y))
        all_labels_true = tf.reduce_min(tf.cast(correct_prediction, tf.float32), 1)
        self.accuracy = tf.reduce_mean(all_labels_true)

In [24]:
dimension_output = len(labels)

tf.reset_default_graph()
sess = tf.InteractiveSession()
model = Model(
    dimension_output,
    learning_rate
)

sess.run(tf.global_variables_initializer())




INFO:tensorflow:memory input None
INFO:tensorflow:Use float type <dtype: 'float32'>

Instructions for updating:
Use keras.layers.dropout instead.
Instructions for updating:
Please use `layer.__call__` method instead.
The TensorFlow contrib module will not be included in TensorFlow 2.0.
For more information, please see:
  * https://github.com/tensorflow/community/blob/master/rfcs/20180907-contrib-sunset.md
  * https://github.com/tensorflow/addons
  * https://github.com/tensorflow/io (for I/O related ops)
If you depend on functionality not listed there, please file an issue.

Instructions for updating:
Use keras.layers.Dense instead.
Instructions for updating:
Use tf.where in 2.0, which has the same broadcast rule as np.where





In [25]:
import collections
import re

def get_assignment_map_from_checkpoint(tvars, init_checkpoint):
    """Compute the union of the current variables and checkpoint variables."""
    assignment_map = {}
    initialized_variable_names = {}

    name_to_variable = collections.OrderedDict()
    for var in tvars:
        name = var.name
        m = re.match('^(.*):\\d+$', name)
        if m is not None:
            name = m.group(1)
        name_to_variable[name] = var

    init_vars = tf.train.list_variables(init_checkpoint)

    assignment_map = collections.OrderedDict()
    for x in init_vars:
        (name, var) = (x[0], x[1])
        if name not in name_to_variable:
            continue
        assignment_map[name] = name_to_variable[name]
        initialized_variable_names[name] = 1
        initialized_variable_names[name + ':0'] = 1

    return (assignment_map, initialized_variable_names)

In [26]:
tvars = tf.trainable_variables()
checkpoint = 'xlnet-base/model.ckpt'
assignment_map, initialized_variable_names = get_assignment_map_from_checkpoint(tvars, 
                                                                                checkpoint)

In [27]:
saver = tf.train.Saver(var_list = assignment_map)
saver.restore(sess, checkpoint)

INFO:tensorflow:Restoring parameters from xlnet-base/model.ckpt


In [28]:
from sklearn.model_selection import train_test_split

train_X, test_X, train_masks, test_masks, train_segments, test_segments, train_Y, test_Y = train_test_split(
    input_ids, input_masks, segment_ids, actual_l, test_size = 0.2
)

In [29]:
pad_sequences = tf.keras.preprocessing.sequence.pad_sequences

i = 0
index = 4
batch_x = train_X[i : index]
batch_y = train_Y[i : index]
batch_masks = train_masks[i : index]
batch_segments = train_segments[i : index]

sess.run(
    [model.accuracy, model.cost],
    feed_dict = {
        model.X: batch_x,
        model.Y: batch_y,
        model.segment_ids: batch_segments,
        model.input_masks: batch_masks,
    },
)

[0.0, 1.3953786]

In [30]:
from tqdm import tqdm
import time


EARLY_STOPPING, CURRENT_CHECKPOINT, CURRENT_ACC, EPOCH = 1, 0, 0, 0

while True:
    lasttime = time.time()
    if CURRENT_CHECKPOINT == EARLY_STOPPING:
        print('break epoch:%d\n' % (EPOCH))
        break

    train_acc, train_loss, test_acc, test_loss = [], [], [], []
    pbar = tqdm(
        range(0, len(train_X), batch_size), desc = 'train minibatch loop'
    )
    for i in pbar:
        index = min(i + batch_size, len(train_X))
        batch_x = train_X[i : index]
        batch_y = train_Y[i : index]
        batch_masks = train_masks[i : index]
        batch_segments = train_segments[i : index]
        acc, cost, _ = sess.run(
            [model.accuracy, model.cost, model.optimizer],
            feed_dict = {
                model.X: batch_x,
                model.Y: batch_y,
                model.segment_ids: batch_segments,
                model.input_masks: batch_masks,
            },
        )
        train_loss.append(cost)
        train_acc.append(acc)
        pbar.set_postfix(cost = cost, accuracy = acc)
        
    pbar = tqdm(range(0, len(test_X), batch_size), desc = 'test minibatch loop')
    for i in pbar:
        index = min(i + batch_size, len(test_X))
        batch_x = test_X[i : index]
        batch_y = test_Y[i : index]
        batch_masks = test_masks[i : index]
        batch_segments = test_segments[i : index]
        acc, cost = sess.run(
            [model.accuracy, model.cost],
            feed_dict = {
                model.X: batch_x,
                model.Y: batch_y,
                model.segment_ids: batch_segments,
                model.input_masks: batch_masks,
            },
        )
        test_loss.append(cost)
        test_acc.append(acc)
        pbar.set_postfix(cost = cost, accuracy = acc)
        
    train_loss = np.mean(train_loss)
    train_acc = np.mean(train_acc)
    test_loss = np.mean(test_loss)
    test_acc = np.mean(test_acc)
        
    if test_acc > CURRENT_ACC:
        print(
            'epoch: %d, pass acc: %f, current acc: %f'
            % (EPOCH, CURRENT_ACC, test_acc)
        )
        CURRENT_ACC = test_acc
        CURRENT_CHECKPOINT = 0
    else:
        CURRENT_CHECKPOINT += 1
        
    print('time taken:', time.time() - lasttime)
    print(
        'epoch: %d, training loss: %f, training acc: %f, valid loss: %f, valid acc: %f\n'
        % (EPOCH, train_loss, train_acc, test_loss, test_acc)
    )
    EPOCH += 1

train minibatch loop: 100%|██████████| 24038/24038 [5:37:03<00:00,  1.19it/s, accuracy=1, cost=0.0129]      
test minibatch loop: 100%|██████████| 6010/6010 [31:09<00:00,  3.22it/s, accuracy=0.714, cost=0.0333]
train minibatch loop:   0%|          | 0/24038 [00:00<?, ?it/s]

epoch: 0, pass acc: 0.000000, current acc: 0.908575
time taken: 22092.603753566742
epoch: 0, training loss: 0.045489, training acc: 0.869965, valid loss: 0.022408, valid acc: 0.908575



train minibatch loop: 100%|██████████| 24038/24038 [5:35:31<00:00,  1.19it/s, accuracy=1, cost=0.0129]      
test minibatch loop: 100%|██████████| 6010/6010 [30:53<00:00,  3.24it/s, accuracy=0.786, cost=0.0349]
train minibatch loop:   0%|          | 0/24038 [00:00<?, ?it/s]

epoch: 1, pass acc: 0.908575, current acc: 0.913881
time taken: 21984.56486439705
epoch: 1, training loss: 0.021535, training acc: 0.914687, valid loss: 0.021293, valid acc: 0.913881



train minibatch loop: 100%|██████████| 24038/24038 [5:35:07<00:00,  1.20it/s, accuracy=1, cost=0.0105]      
test minibatch loop: 100%|██████████| 6010/6010 [31:11<00:00,  3.21it/s, accuracy=0.786, cost=0.0325]
train minibatch loop:   0%|          | 0/24038 [00:00<?, ?it/s]

epoch: 2, pass acc: 0.913881, current acc: 0.917342
time taken: 21978.87307047844
epoch: 2, training loss: 0.020206, training acc: 0.922538, valid loss: 0.020972, valid acc: 0.917342



train minibatch loop: 100%|██████████| 24038/24038 [5:35:08<00:00,  1.20it/s, accuracy=1, cost=0.0121]      
test minibatch loop: 100%|██████████| 6010/6010 [31:13<00:00,  3.21it/s, accuracy=0.786, cost=0.0363]

time taken: 21982.28723335266
epoch: 3, training loss: 0.019263, training acc: 0.928971, valid loss: 0.021161, valid acc: 0.916643

break epoch:4






In [31]:
saver = tf.train.Saver(tf.trainable_variables())
saver.save(sess, 'xlnet-base-toxicity/model.ckpt')

'xlnet-base-toxicity/model.ckpt'

In [32]:
kwargs = dict(
      is_training=False,
      use_tpu=False,
      use_bfloat16=False,
      dropout=0.0,
      dropatt=0.0,
      init='normal',
      init_range=0.1,
      init_std=0.05,
      clamp_len=-1)

xlnet_parameters = xlnet.RunConfig(**kwargs)
xlnet_config = xlnet.XLNetConfig(json_path='xlnet-base/config.json')

In [33]:
dimension_output = len(labels)
learning_rate = 2e-5

tf.reset_default_graph()
sess = tf.InteractiveSession()
model = Model(
    dimension_output,
    learning_rate
)

sess.run(tf.global_variables_initializer())

INFO:tensorflow:memory input None
INFO:tensorflow:Use float type <dtype: 'float32'>




In [34]:
saver = tf.train.Saver(tf.trainable_variables())
saver.restore(sess, 'xlnet-base-toxicity/model.ckpt')

INFO:tensorflow:Restoring parameters from xlnet-base-toxicity/model.ckpt


In [None]:
stack = []

pbar = tqdm(range(0, len(test_X), batch_size), desc = 'test minibatch loop')
for i in pbar:
    index = min(i + batch_size, len(test_X))
    batch_x = test_X[i : index]
    batch_y = test_Y[i : index]
    batch_masks = test_masks[i : index]
    batch_segments = test_segments[i : index]
    stack.append(sess.run(tf.nn.sigmoid(model.logits),
            feed_dict = {
                model.X: batch_x,
                model.segment_ids: batch_segments,
                model.input_masks: batch_masks
            },
    ))

test minibatch loop:  45%|████▌     | 2734/6010 [1:02:21<2:20:44,  2.58s/it]

In [41]:
from sklearn import metrics

print(metrics.classification_report(np.around(np.array(test_Y)),
                                    np.around(np.concatenate(stack,axis=0)),
                                    target_names=labels,
                                    digits=5))

                                     precision    recall  f1-score   support

                       severe toxic    0.76274   0.78363   0.77305     10006
                            obscene    0.50862   0.52366   0.51603      2874
                    identity attack    0.40349   0.52707   0.45707      1404
                             insult    0.58435   0.70709   0.63989     12717
                             threat    0.29885   0.46547   0.36400       391
                              asian    0.41160   0.74425   0.53005       391
                            atheist    0.78571   0.96175   0.86486       183
                           bisexual    0.54545   0.72000   0.62069        25
                           buddhist    0.54054   0.80000   0.64516        50
                          christian    0.73638   0.92561   0.82022      4584
                             female    0.87304   0.92314   0.89739      6935
                       heterosexual    0.70130   0.81818   0.75524       13

In [42]:
strings = ','.join(
    [
        n.name
        for n in tf.get_default_graph().as_graph_def().node
        if ('Variable' in n.op
        or 'Placeholder' in n.name
        or 'logits' in n.name
        or 'alphas' in n.name
        or 'self/Softmax' in n.name)
        and 'Adam' not in n.name
        and 'beta' not in n.name
        and 'global_step' not in n.name
    ]
)
strings.split(',')

['Placeholder',
 'Placeholder_1',
 'Placeholder_2',
 'Placeholder_3',
 'model/transformer/r_w_bias',
 'model/transformer/r_r_bias',
 'model/transformer/word_embedding/lookup_table',
 'model/transformer/r_s_bias',
 'model/transformer/seg_embed',
 'model/transformer/layer_0/rel_attn/q/kernel',
 'model/transformer/layer_0/rel_attn/k/kernel',
 'model/transformer/layer_0/rel_attn/v/kernel',
 'model/transformer/layer_0/rel_attn/r/kernel',
 'model/transformer/layer_0/rel_attn/o/kernel',
 'model/transformer/layer_0/rel_attn/LayerNorm/gamma',
 'model/transformer/layer_0/ff/layer_1/kernel',
 'model/transformer/layer_0/ff/layer_1/bias',
 'model/transformer/layer_0/ff/layer_2/kernel',
 'model/transformer/layer_0/ff/layer_2/bias',
 'model/transformer/layer_0/ff/LayerNorm/gamma',
 'model/transformer/layer_1/rel_attn/q/kernel',
 'model/transformer/layer_1/rel_attn/k/kernel',
 'model/transformer/layer_1/rel_attn/v/kernel',
 'model/transformer/layer_1/rel_attn/r/kernel',
 'model/transformer/layer_1/rel

In [43]:
def freeze_graph(model_dir, output_node_names):

    if not tf.gfile.Exists(model_dir):
        raise AssertionError(
            "Export directory doesn't exists. Please specify an export "
            'directory: %s' % model_dir
        )

    checkpoint = tf.train.get_checkpoint_state(model_dir)
    input_checkpoint = checkpoint.model_checkpoint_path

    absolute_model_dir = '/'.join(input_checkpoint.split('/')[:-1])
    output_graph = absolute_model_dir + '/frozen_model.pb'
    clear_devices = True
    with tf.Session(graph = tf.Graph()) as sess:
        saver = tf.train.import_meta_graph(
            input_checkpoint + '.meta', clear_devices = clear_devices
        )
        saver.restore(sess, input_checkpoint)
        output_graph_def = tf.graph_util.convert_variables_to_constants(
            sess,
            tf.get_default_graph().as_graph_def(),
            output_node_names.split(','),
        )
        with tf.gfile.GFile(output_graph, 'wb') as f:
            f.write(output_graph_def.SerializeToString())
        print('%d ops in the final graph.' % len(output_graph_def.node))

In [44]:
freeze_graph('xlnet-base-toxicity', strings)

INFO:tensorflow:Restoring parameters from xlnet-base-toxicity/model.ckpt
INFO:tensorflow:Froze 163 variables.
INFO:tensorflow:Converted 163 variables to const ops.
7673 ops in the final graph.


In [45]:
import boto3

bucketName = 'huseinhouse-storage'
Key = 'xlnet-base-toxicity/frozen_model.pb'
outPutname = "v34/toxicity/xlnet-base-toxicity.pb"

s3 = boto3.client('s3')

s3.upload_file(Key,bucketName,outPutname)