In [1]:
# !wget http://qim.fs.quoracdn.net/quora_duplicate_questions.tsv
# !wget https://storage.googleapis.com/xlnet/released_models/cased_L-12_H-768_A-12.zip -O xlnet.zip
# !unzip xlnet.zip

In [2]:
import os
os.environ['CUDA_VISIBLE_DEVICES'] = '1'

In [3]:
import sentencepiece as spm
from prepro_utils import preprocess_text, encode_ids

sp_model = spm.SentencePieceProcessor()
sp_model.Load('xlnet_cased_L-12_H-768_A-12/spiece.model')

def tokenize_fn(text):
    text = preprocess_text(text, lower= False)
    return encode_ids(sp_model, text)

In [4]:
SEG_ID_A   = 0
SEG_ID_B   = 1
SEG_ID_CLS = 2
SEG_ID_SEP = 3
SEG_ID_PAD = 4

special_symbols = {
    "<unk>"  : 0,
    "<s>"    : 1,
    "</s>"   : 2,
    "<cls>"  : 3,
    "<sep>"  : 4,
    "<pad>"  : 5,
    "<mask>" : 6,
    "<eod>"  : 7,
    "<eop>"  : 8,
}

VOCAB_SIZE = 32000
UNK_ID = special_symbols["<unk>"]
CLS_ID = special_symbols["<cls>"]
SEP_ID = special_symbols["<sep>"]
MASK_ID = special_symbols["<mask>"]
EOD_ID = special_symbols["<eod>"]

In [5]:
import pandas as pd

df = pd.read_csv('quora_duplicate_questions.tsv', delimiter='\t').dropna()
df.head()

Unnamed: 0,id,qid1,qid2,question1,question2,is_duplicate
0,0,1,2,What is the step by step guide to invest in sh...,What is the step by step guide to invest in sh...,0
1,1,3,4,What is the story of Kohinoor (Koh-i-Noor) Dia...,What would happen if the Indian government sto...,0
2,2,5,6,How can I increase the speed of my internet co...,How can Internet speed be increased by hacking...,0
3,3,7,8,Why am I mentally very lonely? How can I solve...,Find the remainder when [math]23^{24}[/math] i...,0
4,4,9,10,"Which one dissolve in water quikly sugar, salt...",Which fish would survive in salt water?,0


In [6]:
left, right = df['question1'].tolist(), df['question2'].tolist()

In [7]:
from tqdm import tqdm

input_ids, input_mask, all_seg_ids = [], [], []
for i in tqdm(range(len(left))):
    tokens = tokenize_fn(left[i])
    tokens_right = tokenize_fn(right[i])
    segment_ids = [SEG_ID_A] * len(tokens)
    tokens.append(SEP_ID)
    segment_ids.append(SEG_ID_A)

    tokens.extend(tokens_right)
    segment_ids.extend([SEG_ID_B] * len(tokens_right))
    tokens.append(SEP_ID)
    segment_ids.append(SEG_ID_B)

    tokens.append(CLS_ID)
    segment_ids.append(SEG_ID_CLS)

    cur_input_ids = tokens
    cur_input_mask = [0] * len(cur_input_ids)
    assert len(tokens) == len(cur_input_mask)
    assert len(tokens) == len(segment_ids)
    input_ids.append(tokens)
    input_mask.append(cur_input_mask)
    all_seg_ids.append(segment_ids)

100%|██████████| 404287/404287 [00:58<00:00, 6877.24it/s]


In [8]:
import xlnet
import model_utils
import tensorflow as tf
import numpy as np

kwargs = dict(
      is_training=True,
      use_tpu=False,
      use_bfloat16=False,
      dropout=0.1,
      dropatt=0.1,
      init='normal',
      init_range=0.1,
      init_std=0.05,
      clamp_len=-1)

xlnet_parameters = xlnet.RunConfig(**kwargs)
xlnet_config = xlnet.XLNetConfig(json_path='xlnet_cased_L-12_H-768_A-12/xlnet_config.json')

  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])






  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])


In [9]:
epoch = 15
batch_size = 16
warmup_proportion = 0.1
num_train_steps = int(len(input_ids) / batch_size * epoch)
num_warmup_steps = int(num_train_steps * warmup_proportion)
print(num_train_steps, num_warmup_steps)

training_parameters = dict(
      decay_method = 'poly',
      train_steps = num_train_steps,
      learning_rate = 2e-5,
      warmup_steps = num_warmup_steps,
      min_lr_ratio = 0.0,
      weight_decay = 0.00,
      adam_epsilon = 1e-8,
      num_core_per_host = 1,
      lr_layer_decay_rate = 1,
      use_tpu=False,
      use_bfloat16=False,
      dropout=0.0,
      dropatt=0.0,
      init='normal',
      init_range=0.1,
      init_std=0.02,
      clip = 1.0,
      clamp_len=-1,)

379019 37901


In [10]:
class Parameter:
    def __init__(self, decay_method, warmup_steps, weight_decay, adam_epsilon, 
                num_core_per_host, lr_layer_decay_rate, use_tpu, learning_rate, train_steps,
                min_lr_ratio, clip, **kwargs):
        self.decay_method = decay_method
        self.warmup_steps = warmup_steps
        self.weight_decay = weight_decay
        self.adam_epsilon = adam_epsilon
        self.num_core_per_host = num_core_per_host
        self.lr_layer_decay_rate = lr_layer_decay_rate
        self.use_tpu = use_tpu
        self.learning_rate = learning_rate
        self.train_steps = train_steps
        self.min_lr_ratio = min_lr_ratio
        self.clip = clip
        
training_parameters = Parameter(**training_parameters)

In [11]:
class Model:
    def __init__(
        self,
        dimension_output,
        learning_rate = 2e-5,
    ):
        self.X = tf.placeholder(tf.int32, [None, None])
        self.segment_ids = tf.placeholder(tf.int32, [None, None])
        self.input_masks = tf.placeholder(tf.float32, [None, None])
        self.Y = tf.placeholder(tf.int32, [None])
        
        xlnet_model = xlnet.XLNetModel(
            xlnet_config=xlnet_config,
            run_config=xlnet_parameters,
            input_ids=tf.transpose(self.X, [1, 0]),
            seg_ids=tf.transpose(self.segment_ids, [1, 0]),
            input_mask=tf.transpose(self.input_masks, [1, 0]))
        
        summary = xlnet_model.get_pooled_out("last", True)
        print(summary)
        
        self.logits = tf.layers.dense(summary, dimension_output)
        
        self.cost = tf.reduce_mean(
            tf.nn.sparse_softmax_cross_entropy_with_logits(
                logits = self.logits, labels = self.Y
            )
        )
        self.optimizer, self.learning_rate, _ = model_utils.get_train_op(training_parameters, self.cost)
        
        correct_pred = tf.equal(
            tf.argmax(self.logits, 1, output_type = tf.int32), self.Y
        )
        self.accuracy = tf.reduce_mean(tf.cast(correct_pred, tf.float32))

In [12]:
dimension_output = 2
learning_rate = 2e-5

tf.reset_default_graph()
sess = tf.InteractiveSession()
model = Model(dimension_output, learning_rate)

sess.run(tf.global_variables_initializer())




INFO:tensorflow:memory input None
INFO:tensorflow:Use float type <dtype: 'float32'>

Instructions for updating:
Use keras.layers.dropout instead.
The TensorFlow contrib module will not be included in TensorFlow 2.0.
For more information, please see:
  * https://github.com/tensorflow/community/blob/master/rfcs/20180907-contrib-sunset.md
  * https://github.com/tensorflow/addons
  * https://github.com/tensorflow/io (for I/O related ops)
If you depend on functionality not listed there, please file an issue.

Instructions for updating:
Use keras.layers.dense instead.
Tensor("model_1/sequnece_summary/dropout/dropout/mul_1:0", shape=(?, 768), dtype=float32)
Instructions for updating:
Call initializer instance with the dtype argument instead of passing it to the constructor


Instructions for updating:
Deprecated in favor of operator or tf.math.divide.
Instructions for updating:
Use tf.where in 2.0, which has the same broadcast rule as np.where



In [13]:
import collections
import re

def get_assignment_map_from_checkpoint(tvars, init_checkpoint):
    """Compute the union of the current variables and checkpoint variables."""
    assignment_map = {}
    initialized_variable_names = {}

    name_to_variable = collections.OrderedDict()
    for var in tvars:
        name = var.name
        m = re.match('^(.*):\\d+$', name)
        if m is not None:
            name = m.group(1)
        name_to_variable[name] = var

    init_vars = tf.train.list_variables(init_checkpoint)

    assignment_map = collections.OrderedDict()
    for x in init_vars:
        (name, var) = (x[0], x[1])
        if name not in name_to_variable:
            continue
        assignment_map[name] = name_to_variable[name]
        initialized_variable_names[name] = 1
        initialized_variable_names[name + ':0'] = 1

    return (assignment_map, initialized_variable_names)

In [14]:
tvars = tf.trainable_variables()
checkpoint = 'xlnet_cased_L-12_H-768_A-12/xlnet_model.ckpt'
assignment_map, initialized_variable_names = get_assignment_map_from_checkpoint(tvars, 
                                                                                checkpoint)

In [15]:
saver = tf.train.Saver(var_list = assignment_map)
saver.restore(sess, checkpoint)

Instructions for updating:
Use standard file APIs to check for files with this prefix.
INFO:tensorflow:Restoring parameters from xlnet_cased_L-12_H-768_A-12/xlnet_model.ckpt


In [16]:
from sklearn.model_selection import train_test_split

train_X, test_X, train_Y, test_Y, train_segment, test_segment, train_mask, test_mask = \
train_test_split(input_ids, df['is_duplicate'].tolist(), input_mask, all_seg_ids, test_size = 0.2)

In [17]:
from tensorflow.keras.preprocessing.sequence import pad_sequences

batch_x = train_X[:5]
batch_x = pad_sequences(batch_x,padding='post')
batch_y = train_Y[:5]
batch_segments = train_segment[:5]
batch_segments = pad_sequences(batch_segments, padding='post', value = 4)
batch_masks = train_mask[:5]
batch_masks = pad_sequences(batch_masks, padding='post', value = 1)

In [18]:
sess.run([model.accuracy, model.cost],
        feed_dict = {model.X: batch_x,
                model.Y: batch_y,
                model.segment_ids: batch_segments,
                model.input_masks: batch_masks})

[0.8, 0.45326415]

In [19]:
from tqdm import tqdm
import time

EARLY_STOPPING, CURRENT_CHECKPOINT, CURRENT_ACC, EPOCH = 2, 0, 0, 0

while True:
    lasttime = time.time()
    if CURRENT_CHECKPOINT == EARLY_STOPPING:
        print('break epoch:%d\n' % (EPOCH))
        break
    train_acc, train_loss = [], []
    test_acc, test_loss = [], []
    
    pbar = tqdm(
        range(0, len(train_X), batch_size), desc = 'train minibatch loop'
    )
    for i in pbar:
        index = min(i + batch_size, len(train_X))
        batch_x = train_X[i: index]
        batch_x = pad_sequences(batch_x,padding='post')
        batch_y = train_Y[i: index]
        batch_segments = train_segment[i: index]
        batch_segments = pad_sequences(batch_segments, padding='post', value = 4)
        batch_masks = train_mask[i: index]
        batch_masks = pad_sequences(batch_masks, padding='post', value = 1)
        
        acc, cost, _ = sess.run(
            [model.accuracy, model.cost, model.optimizer],
            feed_dict = {
                model.X: batch_x,
                model.Y: batch_y,
                model.segment_ids: batch_segments,
                model.input_masks: batch_masks
            },
        )
        train_loss.append(cost)
        train_acc.append(acc)
        pbar.set_postfix(cost = cost, accuracy = acc)
        
    pbar = tqdm(
        range(0, len(test_X), batch_size), desc = 'test minibatch loop'
    )
    for i in pbar:
        index = min(i + batch_size, len(test_X))
        batch_x = test_X[i: index]
        batch_x = pad_sequences(batch_x,padding='post')
        batch_y = test_Y[i: index]
        batch_segments = test_segment[i: index]
        batch_segments = pad_sequences(batch_segments, padding='post', value = 4)
        batch_masks = test_mask[i: index]
        batch_masks = pad_sequences(batch_masks, padding='post', value = 1)
        
        acc, cost = sess.run(
            [model.accuracy, model.cost],
            feed_dict = {
                model.X: batch_x,
                model.Y: batch_y,
                model.segment_ids: batch_segments,
                model.input_masks: batch_masks
            },
        )
        test_loss.append(cost)
        test_acc.append(acc)
        pbar.set_postfix(cost = cost, accuracy = acc)
    
    train_loss = np.mean(train_loss)
    train_acc = np.mean(train_acc)
    test_loss = np.mean(test_loss)
    test_acc = np.mean(test_acc)
    
    if test_acc > CURRENT_ACC:
        print(
            'epoch: %d, pass acc: %f, current acc: %f'
            % (EPOCH, CURRENT_ACC, test_acc)
        )
        CURRENT_ACC = test_acc
        CURRENT_CHECKPOINT = 0
    else:
        CURRENT_CHECKPOINT += 1
        
    print('time taken:', time.time() - lasttime)
    print(
        'epoch: %d, training loss: %f, training acc: %f, valid loss: %f, valid acc: %f\n'
        % (EPOCH, train_loss, train_acc, test_loss, test_acc)
    )
    EPOCH += 1

train minibatch loop: 100%|██████████| 20215/20215 [51:46<00:00,  6.51it/s, accuracy=0.8, cost=0.374]   
test minibatch loop: 100%|██████████| 5054/5054 [04:40<00:00, 18.04it/s, accuracy=0.5, cost=0.801]  
train minibatch loop:   0%|          | 0/20215 [00:00<?, ?it/s]

epoch: 0, pass acc: 0.000000, current acc: 0.732872
time taken: 3386.655509710312
epoch: 0, training loss: 0.613765, training acc: 0.677210, valid loss: 0.541196, valid acc: 0.732872



train minibatch loop: 100%|██████████| 20215/20215 [52:06<00:00,  6.46it/s, accuracy=1, cost=0.154]     
test minibatch loop: 100%|██████████| 5054/5054 [04:40<00:00, 18.04it/s, accuracy=0.5, cost=0.739]  
train minibatch loop:   0%|          | 0/20215 [00:00<?, ?it/s]

epoch: 1, pass acc: 0.732872, current acc: 0.761451
time taken: 3406.9526970386505
epoch: 1, training loss: 0.513010, training acc: 0.754075, valid loss: 0.505185, valid acc: 0.761451



train minibatch loop: 100%|██████████| 20215/20215 [51:40<00:00,  6.52it/s, accuracy=1, cost=0.0618]    
test minibatch loop: 100%|██████████| 5054/5054 [04:39<00:00, 18.08it/s, accuracy=0.7, cost=0.514]  
train minibatch loop:   0%|          | 0/20215 [00:00<?, ?it/s]

epoch: 2, pass acc: 0.761451, current acc: 0.768540
time taken: 3380.2917671203613
epoch: 2, training loss: 0.462681, training acc: 0.790511, valid loss: 0.519211, valid acc: 0.768540



train minibatch loop: 100%|██████████| 20215/20215 [52:05<00:00,  6.47it/s, accuracy=1, cost=0.0799]    
test minibatch loop: 100%|██████████| 5054/5054 [04:40<00:00, 18.04it/s, accuracy=0.7, cost=0.731]   
train minibatch loop:   0%|          | 0/20215 [00:00<?, ?it/s]

epoch: 3, pass acc: 0.768540, current acc: 0.773956
time taken: 3405.854062795639
epoch: 3, training loss: 0.414382, training acc: 0.822904, valid loss: 0.552833, valid acc: 0.773956



train minibatch loop: 100%|██████████| 20215/20215 [52:29<00:00,  6.42it/s, accuracy=1, cost=0.0567]    
test minibatch loop: 100%|██████████| 5054/5054 [04:28<00:00, 18.85it/s, accuracy=0.7, cost=0.563]   
train minibatch loop:   0%|          | 0/20215 [00:00<?, ?it/s]

time taken: 3417.4785237312317
epoch: 4, training loss: 0.374338, training acc: 0.847520, valid loss: 0.600461, valid acc: 0.773313



train minibatch loop: 100%|██████████| 20215/20215 [55:02<00:00,  6.12it/s, accuracy=1, cost=0.0265]     
test minibatch loop: 100%|██████████| 5054/5054 [04:39<00:00, 18.07it/s, accuracy=0.6, cost=1.03]   

time taken: 3581.9256689548492
epoch: 5, training loss: 0.338192, training acc: 0.868210, valid loss: 0.716454, valid acc: 0.726400

break epoch:6




