In [1]:
# !wget https://storage.googleapis.com/xlnet/released_models/cased_L-24_H-1024_A-16.zip
# !unzip cased_L-24_H-1024_A-16.zip
# !wget https://raw.githubusercontent.com/huseinzol05/NLP-Models-Tensorflow/master/text-classification/utils.py

In [2]:
import xlnet
import numpy as np
import tensorflow as tf
from tqdm import tqdm
import model_utils

In [3]:
import sentencepiece as spm
from prepro_utils import preprocess_text, encode_ids

sp_model = spm.SentencePieceProcessor()
sp_model.Load('xlnet_cased_L-24_H-1024_A-16/spiece.model')

True

In [4]:
from utils import *
from sklearn.cross_validation import train_test_split

trainset = sklearn.datasets.load_files(container_path = 'data', encoding = 'UTF-8')
trainset.data, trainset.target = separate_dataset(trainset,1.0)
print (trainset.target_names)
print (len(trainset.data))
print (len(trainset.target))



['negative', 'positive']
10662
10662


In [5]:
from prepro_utils import preprocess_text, encode_ids

def tokenize_fn(text):
    text = preprocess_text(text, lower= False)
    return encode_ids(sp_model, text)

In [6]:
MAX_SEQ_LENGTH = 100

SEG_ID_A   = 0
SEG_ID_B   = 1
SEG_ID_CLS = 2
SEG_ID_SEP = 3
SEG_ID_PAD = 4

special_symbols = {
    "<unk>"  : 0,
    "<s>"    : 1,
    "</s>"   : 2,
    "<cls>"  : 3,
    "<sep>"  : 4,
    "<pad>"  : 5,
    "<mask>" : 6,
    "<eod>"  : 7,
    "<eop>"  : 8,
}

VOCAB_SIZE = 32000
UNK_ID = special_symbols["<unk>"]
CLS_ID = special_symbols["<cls>"]
SEP_ID = special_symbols["<sep>"]
MASK_ID = special_symbols["<mask>"]
EOD_ID = special_symbols["<eod>"]

input_ids, input_masks, segment_ids = [], [], []

for text in tqdm(trainset.data):
    tokens_a = tokenize_fn(text)
    if len(tokens_a) > MAX_SEQ_LENGTH - 2:
        tokens_a = tokens_a[:(MAX_SEQ_LENGTH - 2)]
        
    tokens = []
    segment_id = []
    for token in tokens_a:
        tokens.append(token)
        segment_id.append(SEG_ID_A)
    tokens.append(SEP_ID)
    segment_id.append(SEG_ID_A)
    tokens.append(CLS_ID)
    segment_id.append(SEG_ID_CLS)
    
    input_id = tokens
    input_mask = [0] * len(input_id)
    if len(input_id) < MAX_SEQ_LENGTH:
        delta_len = MAX_SEQ_LENGTH - len(input_id)
        input_id = [0] * delta_len + input_id
        input_mask = [1] * delta_len + input_mask
        segment_id = [SEG_ID_PAD] * delta_len + segment_id
    
    input_ids.append(input_id)
    input_masks.append(input_mask)
    segment_ids.append(segment_id)

100%|██████████| 10662/10662 [00:01<00:00, 9218.95it/s]


In [7]:
kwargs = dict(
      is_training=True,
      use_tpu=False,
      use_bfloat16=False,
      dropout=0.1,
      dropatt=0.1,
      init='normal',
      init_range=0.1,
      init_std=0.02,
      clamp_len=-1)

xlnet_parameters = xlnet.RunConfig(**kwargs)
xlnet_config = xlnet.XLNetConfig(json_path='xlnet_cased_L-24_H-1024_A-16/xlnet_config.json')

In [8]:
epoch = 10
batch_size = 10
warmup_proportion = 0.1
num_train_steps = int(len(input_ids) / batch_size * epoch)
num_warmup_steps = int(num_train_steps * warmup_proportion)
print(num_train_steps, num_warmup_steps)

training_parameters = dict(
      decay_method = 'poly',
      train_steps = num_train_steps,
      learning_rate = 5e-5,
      warmup_steps = num_warmup_steps,
      min_lr_ratio = 0.0,
      weight_decay = 0.00,
      adam_epsilon = 1e-8,
      num_core_per_host = 1,
      lr_layer_decay_rate = 1,
      use_tpu=False,
      use_bfloat16=False,
      dropout=0.1,
      dropatt=0.1,
      init='normal',
      init_range=0.1,
      init_std=0.02,
      clip = 1.0,
      clamp_len=-1,)

10662 1066


In [9]:
class Parameter:
    def __init__(self, decay_method, warmup_steps, weight_decay, adam_epsilon, 
                num_core_per_host, lr_layer_decay_rate, use_tpu, learning_rate, train_steps,
                min_lr_ratio, clip, **kwargs):
        self.decay_method = decay_method
        self.warmup_steps = warmup_steps
        self.weight_decay = weight_decay
        self.adam_epsilon = adam_epsilon
        self.num_core_per_host = num_core_per_host
        self.lr_layer_decay_rate = lr_layer_decay_rate
        self.use_tpu = use_tpu
        self.learning_rate = learning_rate
        self.train_steps = train_steps
        self.min_lr_ratio = min_lr_ratio
        self.clip = clip
        
training_parameters = Parameter(**training_parameters)

In [10]:
class Model:
    def __init__(
        self,
        dimension_output,
        learning_rate = 2e-5,
    ):
        self.X = tf.placeholder(tf.int32, [None, None])
        self.segment_ids = tf.placeholder(tf.int32, [None, None])
        self.input_masks = tf.placeholder(tf.float32, [None, None])
        self.Y = tf.placeholder(tf.int32, [None])
        
        xlnet_model = xlnet.XLNetModel(
            xlnet_config=xlnet_config,
            run_config=xlnet_parameters,
            input_ids=tf.transpose(self.X, [1, 0]),
            seg_ids=tf.transpose(self.segment_ids, [1, 0]),
            input_mask=tf.transpose(self.input_masks, [1, 0]))
        
        summary = xlnet_model.get_pooled_out("last", True)
        print(summary)
        
        self.logits = tf.layers.dense(summary, dimension_output)
        
        self.cost = tf.reduce_mean(
            tf.nn.sparse_softmax_cross_entropy_with_logits(
                logits = self.logits, labels = self.Y
            )
        )
        
        self.optimizer, self.learning_rate, _ = model_utils.get_train_op(training_parameters, self.cost)
        correct_pred = tf.equal(
            tf.argmax(self.logits, 1, output_type = tf.int32), self.Y
        )
        self.accuracy = tf.reduce_mean(tf.cast(correct_pred, tf.float32))

In [11]:
dimension_output = 2
learning_rate = 2e-5

tf.reset_default_graph()
sess = tf.InteractiveSession()
model = Model(
    dimension_output,
    learning_rate
)

sess.run(tf.global_variables_initializer())
var_lists = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, scope = 'model/transformer')

INFO:tensorflow:memory input None
INFO:tensorflow:Use float type <dtype: 'float32'>
Instructions for updating:
Colocations handled automatically by placer.
Instructions for updating:
Use keras.layers.dropout instead.
Instructions for updating:
Please use `rate` instead of `keep_prob`. Rate should be set to `rate = 1 - keep_prob`.

For more information, please see:
  * https://github.com/tensorflow/community/blob/master/rfcs/20180907-contrib-sunset.md
  * https://github.com/tensorflow/addons
If you depend on functionality not listed there, please file an issue.

Instructions for updating:
Use keras.layers.dense instead.
Tensor("model_1/sequnece_summary/dropout/dropout/mul:0", shape=(?, 1024), dtype=float32)
Instructions for updating:
Deprecated in favor of operator or tf.math.divide.
Instructions for updating:
Use tf.cast instead.


In [12]:
from tensorflow.contrib.framework.python.framework import checkpoint_utils
checkpoint_utils.list_variables('xlnet_cased_L-24_H-1024_A-16/xlnet_model.ckpt')

[('global_step', []),
 ('model/lm_loss/bias', [32000]),
 ('model/transformer/layer_0/ff/LayerNorm/beta', [1024]),
 ('model/transformer/layer_0/ff/LayerNorm/gamma', [1024]),
 ('model/transformer/layer_0/ff/layer_1/bias', [4096]),
 ('model/transformer/layer_0/ff/layer_1/kernel', [1024, 4096]),
 ('model/transformer/layer_0/ff/layer_2/bias', [1024]),
 ('model/transformer/layer_0/ff/layer_2/kernel', [4096, 1024]),
 ('model/transformer/layer_0/rel_attn/LayerNorm/beta', [1024]),
 ('model/transformer/layer_0/rel_attn/LayerNorm/gamma', [1024]),
 ('model/transformer/layer_0/rel_attn/k/kernel', [1024, 16, 64]),
 ('model/transformer/layer_0/rel_attn/o/kernel', [1024, 16, 64]),
 ('model/transformer/layer_0/rel_attn/q/kernel', [1024, 16, 64]),
 ('model/transformer/layer_0/rel_attn/r/kernel', [1024, 16, 64]),
 ('model/transformer/layer_0/rel_attn/v/kernel', [1024, 16, 64]),
 ('model/transformer/layer_1/ff/LayerNorm/beta', [1024]),
 ('model/transformer/layer_1/ff/LayerNorm/gamma', [1024]),
 ('model/tr

In [13]:
saver = tf.train.Saver(var_list = var_lists)
saver.restore(sess, 'xlnet_cased_L-24_H-1024_A-16/xlnet_model.ckpt')

Instructions for updating:
Use standard file APIs to check for files with this prefix.
INFO:tensorflow:Restoring parameters from xlnet_cased_L-24_H-1024_A-16/xlnet_model.ckpt


In [14]:
from sklearn.cross_validation import train_test_split

train_input_ids, test_input_ids, train_input_masks, test_input_masks, train_segment_ids, test_segment_ids, train_Y, test_Y = train_test_split(
    input_ids, input_masks, segment_ids, trainset.target, test_size = 0.2
)

In [15]:
from tqdm import tqdm
import time

EARLY_STOPPING, CURRENT_CHECKPOINT, CURRENT_ACC, EPOCH = 3, 0, 0, 0

while True:
    lasttime = time.time()
    if CURRENT_CHECKPOINT == EARLY_STOPPING:
        print('break epoch:%d\n' % (EPOCH))
        break

    train_acc, train_loss, test_acc, test_loss = 0, 0, 0, 0
    pbar = tqdm(
        range(0, len(train_input_ids), batch_size), desc = 'train minibatch loop'
    )
    for i in pbar:
        index = min(i + batch_size, len(train_input_ids))
        batch_x = train_input_ids[i: index]
        batch_masks = train_input_masks[i: index]
        batch_segment = train_segment_ids[i: index]
        batch_y = train_Y[i: index]
        acc, cost, _, lr = sess.run(
            [model.accuracy, model.cost, model.optimizer, model.learning_rate],
            feed_dict = {
                model.Y: batch_y,
                model.X: batch_x,
                model.segment_ids: batch_segment,
                model.input_masks: batch_masks
            },
        )
        assert not np.isnan(cost)
        train_loss += cost
        train_acc += acc
        pbar.set_postfix(cost = cost, accuracy = acc, lr = lr)
        
    pbar = tqdm(range(0, len(test_input_ids), batch_size), desc = 'test minibatch loop')
    for i in pbar:
        index = min(i + batch_size, len(test_input_ids))
        batch_x = test_input_ids[i: index]
        batch_masks = test_input_masks[i: index]
        batch_segment = test_segment_ids[i: index]
        batch_y = test_Y[i: index]
        acc, cost = sess.run(
            [model.accuracy, model.cost],
            feed_dict = {
                model.Y: batch_y,
                model.X: batch_x,
                model.segment_ids: batch_segment,
                model.input_masks: batch_masks
            },
        )
        test_loss += cost
        test_acc += acc
        pbar.set_postfix(cost = cost, accuracy = acc)

    train_loss /= len(train_input_ids) / batch_size
    train_acc /= len(train_input_ids) / batch_size
    test_loss /= len(test_input_ids) / batch_size
    test_acc /= len(test_input_ids) / batch_size

    if test_acc > CURRENT_ACC:
        print(
            'epoch: %d, pass acc: %f, current acc: %f'
            % (EPOCH, CURRENT_ACC, test_acc)
        )
        CURRENT_ACC = test_acc
        CURRENT_CHECKPOINT = 0
    else:
        CURRENT_CHECKPOINT += 1
        
    print('time taken:', time.time() - lasttime)
    print(
        'epoch: %d, training loss: %f, training acc: %f, valid loss: %f, valid acc: %f\n'
        % (EPOCH, train_loss, train_acc, test_loss, test_acc)
    )
    EPOCH += 1

train minibatch loop: 100%|██████████| 853/853 [05:20<00:00,  2.77it/s, accuracy=0.556, cost=0.748, lr=4e-5] 
test minibatch loop: 100%|██████████| 214/214 [00:29<00:00,  7.14it/s, accuracy=0, cost=0.801]  
train minibatch loop:   0%|          | 0/853 [00:00<?, ?it/s]

epoch: 0, pass acc: 0.000000, current acc: 0.486639
time taken: 350.65410113334656
epoch: 0, training loss: 0.799104, training acc: 0.498834, valid loss: 0.800484, valid acc: 0.486639



train minibatch loop: 100%|██████████| 853/853 [05:14<00:00,  2.76it/s, accuracy=0.333, cost=0.724, lr=4.67e-5]
test minibatch loop: 100%|██████████| 214/214 [00:26<00:00,  8.01it/s, accuracy=0.667, cost=0.494]
train minibatch loop:   0%|          | 0/853 [00:00<?, ?it/s]

epoch: 1, pass acc: 0.486639, current acc: 0.501016
time taken: 341.2207417488098
epoch: 1, training loss: 0.770199, training acc: 0.493180, valid loss: 0.747319, valid acc: 0.501016



train minibatch loop: 100%|██████████| 853/853 [05:14<00:00,  2.77it/s, accuracy=0.444, cost=0.831, lr=4.22e-5]
test minibatch loop: 100%|██████████| 214/214 [00:26<00:00,  8.01it/s, accuracy=0.667, cost=0.667]
train minibatch loop:   0%|          | 0/853 [00:00<?, ?it/s]

epoch: 2, pass acc: 0.501016, current acc: 0.509923
time taken: 340.930935382843
epoch: 2, training loss: 0.759579, training acc: 0.497414, valid loss: 0.732103, valid acc: 0.509923



train minibatch loop: 100%|██████████| 853/853 [05:14<00:00,  2.76it/s, accuracy=0.444, cost=0.785, lr=3.78e-5]
test minibatch loop: 100%|██████████| 214/214 [00:26<00:00,  8.01it/s, accuracy=0, cost=0.984]  
train minibatch loop:   0%|          | 0/853 [00:00<?, ?it/s]

time taken: 340.80710792541504
epoch: 3, training loss: 0.757005, training acc: 0.490262, valid loss: 0.763326, valid acc: 0.498828



train minibatch loop: 100%|██████████| 853/853 [05:14<00:00,  2.76it/s, accuracy=0.333, cost=0.778, lr=3.33e-5]
test minibatch loop: 100%|██████████| 214/214 [00:26<00:00,  8.01it/s, accuracy=0.333, cost=0.736]
train minibatch loop:   0%|          | 0/853 [00:00<?, ?it/s]

time taken: 340.8487582206726
epoch: 4, training loss: 0.749612, training acc: 0.497636, valid loss: 0.748154, valid acc: 0.507423



train minibatch loop: 100%|██████████| 853/853 [05:14<00:00,  2.77it/s, accuracy=0.333, cost=0.861, lr=2.89e-5]
test minibatch loop: 100%|██████████| 214/214 [00:26<00:00,  8.01it/s, accuracy=0.667, cost=0.576]
train minibatch loop:   0%|          | 0/853 [00:00<?, ?it/s]

epoch: 5, pass acc: 0.509923, current acc: 0.514612
time taken: 340.76790738105774
epoch: 5, training loss: 0.747597, training acc: 0.494001, valid loss: 0.728410, valid acc: 0.514612



train minibatch loop: 100%|██████████| 853/853 [05:14<00:00,  2.76it/s, accuracy=0.333, cost=0.862, lr=2.44e-5]
test minibatch loop: 100%|██████████| 214/214 [00:26<00:00,  8.01it/s, accuracy=1, cost=0.414]  
train minibatch loop:   0%|          | 0/853 [00:00<?, ?it/s]

time taken: 340.7731728553772
epoch: 6, training loss: 0.739021, training acc: 0.507953, valid loss: 0.737402, valid acc: 0.494609



train minibatch loop: 100%|██████████| 853/853 [05:14<00:00,  2.77it/s, accuracy=0.667, cost=0.637, lr=2e-5] 
test minibatch loop: 100%|██████████| 214/214 [00:26<00:00,  8.00it/s, accuracy=0.333, cost=0.84]
train minibatch loop:   0%|          | 0/853 [00:00<?, ?it/s]

time taken: 340.7534418106079
epoch: 7, training loss: 0.741228, training acc: 0.496971, valid loss: 0.750603, valid acc: 0.483982



train minibatch loop: 100%|██████████| 853/853 [05:13<00:00,  2.77it/s, accuracy=0.556, cost=0.693, lr=1.56e-5]
test minibatch loop: 100%|██████████| 214/214 [00:26<00:00,  8.01it/s, accuracy=0.333, cost=1.06]

time taken: 340.7138590812683
epoch: 8, training loss: 0.730399, training acc: 0.502820, valid loss: 0.747379, valid acc: 0.492890

break epoch:9




