## How-to

1. You need to use [modeling.py](modeling.py) from the extractive-summarization folder. It is an improved BERT model that accepts text longer than 512 tokens.

In [None]:
import tensorflow as tf
import numpy as np
import pickle

In [None]:
# Load the preprocessed dataset from disk.
# NOTE: pickle.load can execute arbitrary code while deserialising, so only open a
# 'dataset-bert.pkl' that you produced yourself or otherwise trust.
with open('dataset-bert.pkl', 'rb') as fopen:
    dataset = pickle.load(fopen)
# Display the available keys; later cells read the train_/test_ variants of
# 'texts', 'clss', 'labels' and 'segments'.
dataset.keys()

In [None]:
# Paths to the pretrained BERT files, all expected under ./uncased_L-12_H-768_A-12/.
BERT_VOCAB = 'uncased_L-12_H-768_A-12/vocab.txt'  # vocabulary file given to the tokenizer
BERT_INIT_CHKPNT = 'uncased_L-12_H-768_A-12/bert_model.ckpt'  # checkpoint restored into the 'bert' scope
BERT_CONFIG = 'uncased_L-12_H-768_A-12/bert_config.json'  # JSON read by modeling.BertConfig

In [None]:
import bert
from bert import run_classifier
from bert import optimization
from bert import tokenization
import modeling

In [None]:
# Validate the lower-casing flag (True) against the checkpoint path, then build the
# tokenizer with the same do_lower_case setting and the pretrained vocabulary.
tokenization.validate_case_matches_checkpoint(True,BERT_INIT_CHKPNT)
tokenizer = tokenization.FullTokenizer(
      vocab_file=BERT_VOCAB, do_lower_case=True)

In [None]:
# Model hyperparameters are read from the pretrained model's JSON config; Model below
# passes this object to modeling.BertModel.
bert_config = modeling.BertConfig.from_json_file(BERT_CONFIG)

In [None]:
# Training schedule hyperparameters.
epoch = 20
batch_size = 8
warmup_proportion = 0.1
# The training loop takes one optimizer step per minibatch and also runs the final,
# possibly smaller, minibatch of each epoch, so the number of steps per epoch is the
# ceiling of len / batch_size. Truncating the division instead under-counts the steps
# whenever the dataset size is not a multiple of batch_size, and the step count handed
# to optimization.create_optimizer would then be smaller than the steps actually run.
steps_per_epoch = (len(dataset['train_texts']) + batch_size - 1) // batch_size
num_train_steps = steps_per_epoch * epoch
num_warmup_steps = int(num_train_steps * warmup_proportion)

In [None]:
class Model:
    """TF1 graph that scores selected token positions on top of a BERT encoder.

    Building an instance adds to the default graph: six placeholders, a
    modeling.BertModel encoder, a single-unit dense head applied to the encoder
    outputs gathered at the positions in `clss`, a masked sigmoid cross-entropy
    loss (`cost`), a training op (`optimizer`) and a metric (`accuracy`).

    Reads the module-level names bert_config, num_train_steps and
    num_warmup_steps, so those must be defined before instantiation.
    """
    def __init__(
        self,
        learning_rate = 2e-5,
    ):
        """Build the graph.

        Args:
            learning_rate: learning rate passed to optimization.create_optimizer.
        """
        # Every placeholder is rank 2 with both dimensions left dynamic.
        self.X = tf.placeholder(tf.int32, [None, None])  # fed to BERT as input_ids
        self.segment_ids = tf.placeholder(tf.int32, [None, None])  # fed to BERT as token_type_ids
        self.input_masks = tf.placeholder(tf.int32, [None, None])  # fed to BERT as input_mask
        self.Y = tf.placeholder(tf.float32, [None, None])  # labels, one per gathered position
        self.mask = tf.placeholder(tf.int32, [None, None])  # 1 = position counts in the loss, 0 = ignored
        self.clss = tf.placeholder(tf.int32, [None, None])  # indices gathered along axis 1 of the encoder output
        mask = tf.cast(self.mask, tf.float32)
        
        # NOTE(review): is_training is hard-coded to True, so the same graph is used for
        # evaluation; if modeling.py enables dropout in training mode, evaluation runs
        # with dropout active - confirm.
        model = modeling.BertModel(
            config=bert_config,
            is_training=True,
            input_ids=self.X,
            input_mask=self.input_masks,
            token_type_ids=self.segment_ids,
            use_one_hot_embeddings=False)
        
        # For each batch row, pick the encoder output vectors at that row's clss indices
        # (presumably the [CLS] position of each sentence - verify against the dataset).
        outputs = tf.gather(model.get_sequence_output(), self.clss, axis = 1, batch_dims = 1)
        # One logit per gathered position; squeeze drops the trailing size-1 axis.
        self.logits = tf.layers.dense(outputs, 1)
        self.logits = tf.squeeze(self.logits, axis=-1)
        # Zero the logits at masked-out positions.
        self.logits = self.logits * mask
        # Masked mean of the element-wise sigmoid cross-entropy: masked positions add
        # nothing to the sum, and the sum is divided by the number of unmasked positions.
        crossent = tf.nn.sigmoid_cross_entropy_with_logits(logits=self.logits, labels=self.Y)
        crossent = crossent * mask
        crossent = tf.reduce_sum(crossent)
        total_size = tf.reduce_sum(mask)
        # div_no_nan yields 0 instead of NaN when the mask is all zeros.
        self.cost = tf.div_no_nan(crossent, total_size)
        
        # The trailing False is presumably the use_tpu flag of BERT's create_optimizer -
        # TODO confirm against the installed bert package.
        self.optimizer = optimization.create_optimizer(self.cost, learning_rate, 
                                                       num_train_steps, num_warmup_steps, False)
        
        # Hard 0/1 predictions from the logits.
        l = tf.round(tf.sigmoid(self.logits))
        # Mean of the predictions taken only where the label equals 1, i.e. the fraction
        # of positive-labelled positions predicted as 1 (recall on positives, not overall
        # accuracy). NOTE(review): a batch without any positive label averages an empty
        # tensor, which is expected to produce NaN - confirm.
        self.accuracy = tf.reduce_mean(tf.cast(tf.boolean_mask(l, tf.equal(self.Y, 1)), tf.float32))

In [None]:
# Clear the default graph first so that re-running this cell does not add a second
# copy of the model to the existing graph.
tf.reset_default_graph()
sess = tf.InteractiveSession()
# The learning rate given here (1e-5) overrides the class default of 2e-5.
model = Model(learning_rate = 1e-5)
sess.run(tf.global_variables_initializer())

In [None]:
# Initialise every variable again (the previous cell already did this once), then
# overwrite the trainable variables under the 'bert' scope with the pretrained
# checkpoint. The Saver is limited to that variable list, so variables outside the
# scope keep their freshly initialised values.
sess.run(tf.global_variables_initializer())
var_lists = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, scope = 'bert')
saver = tf.train.Saver(var_list = var_lists)
saver.restore(sess, BERT_INIT_CHKPNT)

In [None]:
def pad_sentence_batch(sentence_batch, pad_int):
    """Right-pad every sequence in a batch to the length of the longest one.

    Args:
        sentence_batch: a sized collection of sequences (lists or tuples of ids).
        pad_int: value appended to the sequences that are shorter than the longest.

    Returns:
        A tuple (padded_seqs, seq_lens): padded_seqs is a list of new lists that all
        share the same length, and seq_lens holds the original, unpadded lengths.
        Both are empty lists when sentence_batch is empty.
    """
    seq_lens = [len(sentence) for sentence in sentence_batch]
    # default=0 keeps an empty batch from raising ValueError inside max().
    max_sentence_len = max(seq_lens, default=0)
    # list(...) copies the sequence, so tuples are accepted and inputs are never mutated.
    padded_seqs = [
        list(sentence) + [pad_int] * (max_sentence_len - length)
        for sentence, length in zip(sentence_batch, seq_lens)
    ]
    return padded_seqs, seq_lens

In [None]:
# Short aliases for the dataset splits used by the training loop:
# texts -> model.X, clss -> model.clss, labels -> model.Y, segments -> model.segment_ids.
train_X = dataset['train_texts']
test_X = dataset['test_texts']
train_clss = dataset['train_clss']
test_clss = dataset['test_clss']
train_Y = dataset['train_labels']
test_Y = dataset['test_labels']
train_segments = dataset['train_segments']
test_segments = dataset['test_segments']

In [None]:
import tqdm

# Fine-tune for `epoch` passes over the training split; after every pass, measure the
# same loss and metric on the test split and print the per-epoch averages.
for e in range(epoch):
    train_loss, train_acc, test_loss, test_acc = [], [], [], []

    # ---- training pass: one optimizer step per minibatch ----
    pbar = tqdm.tqdm(
        range(0, len(train_X), batch_size), desc = 'minibatch loop')
    for i in pbar:
        index = min(i + batch_size, len(train_X))
        batch_x, _ = pad_sentence_batch(train_X[i : index], 0)
        batch_y, _ = pad_sentence_batch(train_Y[i : index], 0)
        batch_segments, _ = pad_sentence_batch(train_segments[i : index], 0)
        # Pad the index lists with -1 so padding can be told apart from a real index 0.
        batch_clss, _ = pad_sentence_batch(train_clss[i : index], -1)
        batch_clss = np.array(batch_clss)
        batch_x = np.array(batch_x)
        # 1 where a real index is present, 0 where the slot is padding.
        batch_mask = 1 - (batch_clss == -1)
        # Replace the -1 sentinel with a valid index; those slots are masked out anyway.
        batch_clss[batch_clss == -1] = 0
        # Input mask for BERT: 1 for every non-zero token id, 0 for padding.
        mask_src = 1 - (batch_x == 0)
        feed = {model.X: batch_x,
                model.Y: batch_y,
                model.mask: batch_mask,
                model.clss: batch_clss,
                model.segment_ids: batch_segments,
                model.input_masks: mask_src}
        accuracy, loss, _ = sess.run([model.accuracy,model.cost,model.optimizer],
                                    feed_dict = feed)
        train_loss.append(loss)
        train_acc.append(accuracy)
        pbar.set_postfix(cost = loss, accuracy = accuracy)

    # ---- evaluation pass: same preprocessing, but the optimizer op is not run ----
    # NOTE(review): Model builds BERT with is_training=True; if modeling.py enables
    # dropout in that mode, these numbers are computed with dropout active - confirm.
    pbar = tqdm.tqdm(
        range(0, len(test_X), batch_size), desc = 'minibatch loop')
    for i in pbar:
        index = min(i + batch_size, len(test_X))
        batch_x, _ = pad_sentence_batch(test_X[i : index], 0)
        batch_y, _ = pad_sentence_batch(test_Y[i : index], 0)
        batch_segments, _ = pad_sentence_batch(test_segments[i : index], 0)
        batch_clss, _ = pad_sentence_batch(test_clss[i : index], -1)
        batch_clss = np.array(batch_clss)
        batch_x = np.array(batch_x)
        batch_mask = 1 - (batch_clss == -1)
        batch_clss[batch_clss == -1] = 0
        mask_src = 1 - (batch_x == 0)
        feed = {model.X: batch_x,
                model.Y: batch_y,
                model.mask: batch_mask,
                model.clss: batch_clss,
                model.segment_ids: batch_segments,
                model.input_masks: mask_src}
        accuracy, loss = sess.run([model.accuracy,model.cost],
                                    feed_dict = feed)
        # Record the per-batch test metrics; without these appends the lists stay empty
        # and the epoch summary below prints NaN for the test averages.
        test_loss.append(loss)
        test_acc.append(accuracy)
        pbar.set_postfix(cost = loss, accuracy = accuracy)

    print('epoch %d, training avg loss %f, training avg acc %f'%(e+1,
                                                                 np.mean(train_loss),np.mean(train_acc)))
    print('epoch %d, testing avg loss %f, testing avg acc %f'%(e+1,
                                                              np.mean(test_loss),np.mean(test_acc)))