In [1]:
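# One-time setup (uncomment to run): download and unpack the pre-trained
# uncased BERT checkpoint (24 layers, hidden size 1024, 16 attention heads).
# !wget https://storage.googleapis.com/bert_models/2018_10_18/uncased_L-24_H-1024_A-16.zip
# !unzip uncased_L-24_H-1024_A-16.zip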

In [2]:
# List the unpacked checkpoint directory to confirm which files are present
# and how large they are on disk.
!ls -lh uncased_L-24_H-1024_A-16

total 1.3G
-rw-r--r-- 1 husein husein  314 Oct 18  2018 bert_config.json
-rw-r--r-- 1 husein husein 1.3G Oct 18  2018 bert_model.ckpt.data-00000-of-00001
-rw-r--r-- 1 husein husein  17K Oct 18  2018 bert_model.ckpt.index
-rw-r--r-- 1 husein husein 1.8M Oct 18  2018 bert_model.ckpt.meta
-rw-r--r-- 1 husein husein 227K Oct 18  2018 vocab.txt


In [3]:
import json

# Read the hyper-parameters shipped with the pre-trained checkpoint into a
# plain dict and echo them as the cell output.
with open('uncased_L-24_H-1024_A-16/bert_config.json') as config_file:
    config = json.loads(config_file.read())
config

{'attention_probs_dropout_prob': 0.1,
 'hidden_act': 'gelu',
 'hidden_dropout_prob': 0.1,
 'hidden_size': 1024,
 'initializer_range': 0.02,
 'intermediate_size': 4096,
 'max_position_embeddings': 512,
 'num_attention_heads': 16,
 'num_hidden_layers': 24,
 'type_vocab_size': 2,
 'vocab_size': 30522}

In [4]:
# Write a reduced configuration with half as many transformer layers
# (24 -> 12 for the config loaded above).
# A new dict is built instead of mutating `config` in place, so re-running
# this cell always derives the depth from the original value and cannot halve
# it a second time.
# NOTE: the output file keeps the name '6_bert_config.json' because
# BERT_CONFIG in a later cell points at it, even though the reduced depth
# here is 12 layers, not 6.
reduced_config = dict(config)
reduced_config['num_hidden_layers'] = config['num_hidden_layers'] // 2
with open('6_bert_config.json', 'w') as fopen:
    json.dump(reduced_config, fopen)

In [5]:
from utils import *
import tensorflow as tf
# `sklearn.cross_validation` no longer exists in current scikit-learn releases;
# `train_test_split` lives in `sklearn.model_selection`. The old location is
# kept as a fallback so the notebook still runs on the environment it was
# originally written for.
try:
    from sklearn.model_selection import train_test_split
except ImportError:
    from sklearn.cross_validation import train_test_split
import time
import random
import os



In [6]:
# Load the labelled text corpus from the `data` folder (one sub-folder per
# class), then replace the raw documents/labels with the output of
# `separate_dataset`.
# NOTE(review): `sklearn` and `separate_dataset` are not imported in any
# visible cell; presumably they come from `from utils import *` — confirm.
trainset = sklearn.datasets.load_files(
    container_path = 'data',
    encoding = 'UTF-8',
)
trainset.data, trainset.target = separate_dataset(trainset, 1.0)

# Report the class names followed by the number of documents and labels
# (the two counts are expected to match).
print(trainset.target_names)
print(len(trainset.data), len(trainset.target), sep = '\n')

['negative', 'positive']
10662
10662


In [7]:
# Locations of the BERT assets used by the rest of the notebook.
# Vocabulary file handed to the tokenizer.
BERT_VOCAB = 'uncased_L-24_H-1024_A-16/vocab.txt'
# Checkpoint prefix the pre-trained weights are restored from.
BERT_INIT_CHKPNT = 'uncased_L-24_H-1024_A-16/bert_model.ckpt'
# Reduced-depth configuration written by an earlier cell of this notebook
# (not the bert_config.json that ships with the checkpoint).
BERT_CONFIG = '6_bert_config.json'

In [8]:
import bert
from bert import run_classifier
from bert import optimization
from bert import tokenization
from bert import modeling

In [9]:
# Presumably checks that the lower-casing flag (True) agrees with the casing
# implied by the checkpoint path — confirm against the installed `bert` package.
tokenization.validate_case_matches_checkpoint(True,BERT_INIT_CHKPNT)
# Build the tokenizer from the checkpoint's vocabulary, using the same
# lower-casing setting as the check above.
tokenizer = tokenization.FullTokenizer(
      vocab_file=BERT_VOCAB, do_lower_case=True)

In [10]:
# Fixed length of every encoded example, counting the [CLS] and [SEP] markers:
# longer token sequences are truncated and shorter ones are zero-padded to it.
MAX_SEQ_LENGTH = 100

In [11]:
from tqdm import tqdm

# Encode every document as three parallel, fixed-length integer lists:
#   input_ids   - vocabulary ids of "[CLS] tokens... [SEP]", zero-padded
#   input_masks - 1 for real tokens, 0 for padding
#   segment_ids - all zeros (single-sentence input), zero-padded
input_ids, input_masks, segment_ids = [], [], []

for text in tqdm(trainset.data):
    # Keep at most MAX_SEQ_LENGTH - 2 tokens so the two markers still fit.
    wordpieces = tokenizer.tokenize(text)[:(MAX_SEQ_LENGTH - 2)]
    tokens = ["[CLS]"] + wordpieces + ["[SEP]"]
    token_ids = tokenizer.convert_tokens_to_ids(tokens)

    # Right-pad all three sequences with zeros up to MAX_SEQ_LENGTH.
    pad = [0] * (MAX_SEQ_LENGTH - len(token_ids))
    input_ids.append(token_ids + pad)
    input_masks.append([1] * len(token_ids) + pad)
    segment_ids.append([0] * len(tokens) + pad)

100%|██████████| 10662/10662 [00:03<00:00, 3476.75it/s]


In [12]:
# Load the reduced-depth configuration (the JSON file written earlier in this
# notebook) as a BertConfig object; the model cell below reads it.
bert_config = modeling.BertConfig.from_json_file(BERT_CONFIG)

In [13]:
# Sanity check: show how the first document is split into tokens.
tokenizer.tokenize(trainset.data[0])

['interesting',
 'effort',
 'particularly',
 'j',
 '##fk',
 'conspiracy',
 'nuts',
 'barry',
 '##s',
 'cold',
 '##fish',
 'act',
 'makes',
 'experience',
 'worth',
 '##while']

In [14]:
# Hyper-parameters used to size the optimizer's schedule.
# Number of epochs the step count below is computed for.
epoch = 5
# Examples per mini-batch (also used by the training loop).
batch_size = 3
# Fraction of the total step count that is treated as warm-up steps.
warmup_proportion = 0.1
# NOTE(review): the step count is based on len(input_ids), i.e. the whole
# dataset before the train/test split made in a later cell, and the training
# loop further down stops on early stopping rather than after `epoch` epochs.
# Confirm that the schedule length handed to the optimizer is what is intended.
num_train_steps = int(len(input_ids) / batch_size * epoch)
num_warmup_steps = int(num_train_steps * warmup_proportion)

In [15]:
class Model:
    """BERT encoder followed by one dense classification layer.

    Building an instance adds the whole graph to the current default graph.
    The constructor reads the module-level names `bert_config`,
    `num_train_steps` and `num_warmup_steps`, so those must be defined first.

    Attributes:
        X: int32 placeholder [None, None] fed with token ids.
        segment_ids: int32 placeholder [None, None] fed with segment ids.
        input_masks: int32 placeholder [None, None] fed with attention masks.
        Y: int32 placeholder [None] fed with class labels.
        logits: dense-layer output with `dimension_output` units.
        cost: mean sparse softmax cross-entropy between `logits` and `Y`.
        optimizer: training op returned by `optimization.create_optimizer`.
        accuracy: mean of (argmax(logits) == Y) over the fed batch.
    """

    def __init__(
        self,
        dimension_output,
        learning_rate = 2e-5,
        is_training = True,
    ):
        """Build the classification graph.

        Args:
            dimension_output: number of output classes (units of the final
                dense layer).
            learning_rate: learning rate handed to
                `optimization.create_optimizer`.
            is_training: forwarded to `modeling.BertModel`. The default True
                keeps the previous hard-coded behaviour. Pass False to build
                the encoder in inference mode (per the BERT reference
                implementation this flag controls whether dropout is applied
                — confirm against the installed `bert` package).
        """
        self.X = tf.placeholder(tf.int32, [None, None])
        self.segment_ids = tf.placeholder(tf.int32, [None, None])
        self.input_masks = tf.placeholder(tf.int32, [None, None])
        self.Y = tf.placeholder(tf.int32, [None])
        
        model = modeling.BertModel(
            config=bert_config,
            is_training=is_training,
            input_ids=self.X,
            input_mask=self.input_masks,
            token_type_ids=self.segment_ids,
            use_one_hot_embeddings=False)
        
        # One pooled vector per example, projected to the class logits.
        output_layer = model.get_pooled_output()
        self.logits = tf.layers.dense(output_layer, dimension_output)
        
        self.cost = tf.reduce_mean(
            tf.nn.sparse_softmax_cross_entropy_with_logits(
                logits = self.logits, labels = self.Y
            )
        )
        
        # The trailing False is presumably the `use_tpu` flag — confirm
        # against the installed `bert.optimization` module.
        self.optimizer = optimization.create_optimizer(self.cost, learning_rate, 
                                                       num_train_steps, num_warmup_steps, False)
        correct_pred = tf.equal(
            tf.argmax(self.logits, 1, output_type = tf.int32), self.Y
        )
        self.accuracy = tf.reduce_mean(tf.cast(correct_pred, tf.float32))

In [16]:
# Two target classes; learning rate used for fine-tuning.
dimension_output = 2
learning_rate = 2e-5

# Start from an empty default graph, open an interactive session on it and
# build the classifier inside that graph.
tf.reset_default_graph()
sess = tf.InteractiveSession()
model = Model(dimension_output = dimension_output, learning_rate = learning_rate)

# Give every variable an initial value; the pre-trained BERT weights are
# restored over these values in a later cell.
sess.run(tf.global_variables_initializer())

In [17]:
# Collect the graph's trainable variables (used below to decide what can be
# restored from the pre-trained checkpoint) and display them.
tvars = tf.trainable_variables()
tvars

[<tf.Variable 'bert/embeddings/word_embeddings:0' shape=(30522, 1024) dtype=float32_ref>,
 <tf.Variable 'bert/embeddings/token_type_embeddings:0' shape=(2, 1024) dtype=float32_ref>,
 <tf.Variable 'bert/embeddings/position_embeddings:0' shape=(512, 1024) dtype=float32_ref>,
 <tf.Variable 'bert/embeddings/LayerNorm/beta:0' shape=(1024,) dtype=float32_ref>,
 <tf.Variable 'bert/embeddings/LayerNorm/gamma:0' shape=(1024,) dtype=float32_ref>,
 <tf.Variable 'bert/encoder/layer_0/attention/self/query/kernel:0' shape=(1024, 1024) dtype=float32_ref>,
 <tf.Variable 'bert/encoder/layer_0/attention/self/query/bias:0' shape=(1024,) dtype=float32_ref>,
 <tf.Variable 'bert/encoder/layer_0/attention/self/key/kernel:0' shape=(1024, 1024) dtype=float32_ref>,
 <tf.Variable 'bert/encoder/layer_0/attention/self/key/bias:0' shape=(1024,) dtype=float32_ref>,
 <tf.Variable 'bert/encoder/layer_0/attention/self/value/kernel:0' shape=(1024, 1024) dtype=float32_ref>,
 <tf.Variable 'bert/encoder/layer_0/attention/s

In [18]:
def get_assignment_map_from_checkpoint(tvars, init_checkpoint):
    """Map checkpoint variable names onto the matching variables in `tvars`.

    Only names that occur both in the checkpoint and in `tvars` are kept
    (an intersection); checkpoint entries without a counterpart in `tvars`
    are skipped.

    Args:
        tvars: iterable of variables exposing a `.name` attribute such as
            'scope/var:0'. A trailing ':<digits>' suffix is stripped before
            matching.
        init_checkpoint: checkpoint path passed to `tf.train.list_variables`.

    Returns:
        A tuple `(assignment_map, initialized_variable_names)`:
            assignment_map: OrderedDict of checkpoint name -> variable, in the
                order the checkpoint lists its variables.
            initialized_variable_names: dict in which both 'name' and
                'name:0' are set to 1 for every matched variable.
    """
    # Imported locally so the function does not rely on these modules leaking
    # into the notebook namespace through a star import.
    import collections
    import re

    # Index the graph variables by their name without the ':<n>' suffix.
    name_to_variable = collections.OrderedDict()
    for var in tvars:
        name = var.name
        m = re.match('^(.*):\\d+$', name)
        if m is not None:
            name = m.group(1)
        name_to_variable[name] = var

    assignment_map = collections.OrderedDict()
    initialized_variable_names = {}
    # Each checkpoint entry starts with the variable name; the rest of the
    # entry is not needed here.
    for entry in tf.train.list_variables(init_checkpoint):
        name = entry[0]
        if name not in name_to_variable:
            continue
        assignment_map[name] = name_to_variable[name]
        initialized_variable_names[name] = 1
        initialized_variable_names[name + ':0'] = 1

    return (assignment_map, initialized_variable_names)

In [19]:
# Match the model's trainable variables against the pre-trained checkpoint,
# then display the names that will be restored from it.
assignment_map, initialized_variable_names = get_assignment_map_from_checkpoint(
    tvars = tvars,
    init_checkpoint = BERT_INIT_CHKPNT,
)
initialized_variable_names

{'bert/embeddings/LayerNorm/beta': 1,
 'bert/embeddings/LayerNorm/beta:0': 1,
 'bert/embeddings/LayerNorm/gamma': 1,
 'bert/embeddings/LayerNorm/gamma:0': 1,
 'bert/embeddings/position_embeddings': 1,
 'bert/embeddings/position_embeddings:0': 1,
 'bert/embeddings/token_type_embeddings': 1,
 'bert/embeddings/token_type_embeddings:0': 1,
 'bert/embeddings/word_embeddings': 1,
 'bert/embeddings/word_embeddings:0': 1,
 'bert/encoder/layer_0/attention/output/LayerNorm/beta': 1,
 'bert/encoder/layer_0/attention/output/LayerNorm/beta:0': 1,
 'bert/encoder/layer_0/attention/output/LayerNorm/gamma': 1,
 'bert/encoder/layer_0/attention/output/LayerNorm/gamma:0': 1,
 'bert/encoder/layer_0/attention/output/dense/bias': 1,
 'bert/encoder/layer_0/attention/output/dense/bias:0': 1,
 'bert/encoder/layer_0/attention/output/dense/kernel': 1,
 'bert/encoder/layer_0/attention/output/dense/kernel:0': 1,
 'bert/encoder/layer_0/attention/self/key/bias': 1,
 'bert/encoder/layer_0/attention/self/key/bias:0': 1

In [20]:
# Restore pre-trained values for exactly the variables listed in
# `assignment_map`, i.e. those present both in this graph and in the
# checkpoint. Everything else keeps the value assigned by the initializer
# run earlier.
saver = tf.train.Saver(var_list = assignment_map)
saver.restore(sess, BERT_INIT_CHKPNT)

INFO:tensorflow:Restoring parameters from uncased_L-24_H-1024_A-16/bert_model.ckpt


In [21]:
# Hold out 20% of the encoded examples for evaluation. All four parallel
# sequences are split together so ids, masks, segments and labels stay aligned.
# A fixed `random_state` makes the split identical on every run; without it
# each re-run evaluates on a different held-out set.
train_input_ids, test_input_ids, train_input_masks, test_input_masks, train_segment_ids, test_segment_ids, train_Y, test_Y = train_test_split(
    input_ids, input_masks, segment_ids, trainset.target, test_size = 0.2, random_state = 42
)

In [22]:
from tqdm import tqdm
import time

def _run_minibatches(ids, masks, segments, labels, training, desc):
    """Run one pass over a dataset in mini-batches of `batch_size`.

    Args:
        ids, masks, segments, labels: parallel sequences holding the token
            ids, attention masks, segment ids and class labels.
        training: when True the optimizer op is run as well, and a NaN loss
            aborts the pass; when False only loss and accuracy are evaluated.
        desc: label shown on the progress bar.

    Returns:
        `(mean_loss, mean_accuracy)` over the whole dataset. Every batch is
        weighted by its number of examples, so a shorter final batch does not
        skew the averages.
    """
    fetches = [model.accuracy, model.cost]
    if training:
        fetches.append(model.optimizer)

    loss_sum, acc_sum = 0.0, 0.0
    pbar = tqdm(range(0, len(ids), batch_size), desc = desc)
    for i in pbar:
        index = min(i + batch_size, len(ids))
        results = sess.run(
            fetches,
            feed_dict = {
                model.Y: labels[i: index],
                model.X: ids[i: index],
                model.segment_ids: segments[i: index],
                model.input_masks: masks[i: index]
            },
        )
        acc, cost = results[0], results[1]
        if training:
            # NOTE(review): `np` is not imported in any visible cell;
            # presumably it comes from `from utils import *` — confirm.
            assert not np.isnan(cost)
        # `acc` and `cost` are per-batch means; scale them back to sums so
        # the epoch figures are true per-example averages.
        n_examples = index - i
        loss_sum += cost * n_examples
        acc_sum += acc * n_examples
        pbar.set_postfix(cost = cost, accuracy = acc)

    return loss_sum / len(ids), acc_sum / len(ids)


# Early stopping: stop once the held-out accuracy has failed to improve for
# EARLY_STOPPING consecutive epochs. CURRENT_CHECKPOINT counts those epochs
# and CURRENT_ACC holds the best held-out accuracy seen so far.
EARLY_STOPPING, CURRENT_CHECKPOINT, CURRENT_ACC, EPOCH = 3, 0, 0, 0

while True:
    lasttime = time.time()
    if CURRENT_CHECKPOINT == EARLY_STOPPING:
        print('break epoch:%d\n' % (EPOCH))
        break

    train_loss, train_acc = _run_minibatches(
        train_input_ids, train_input_masks, train_segment_ids, train_Y,
        True, 'train minibatch loop'
    )
    test_loss, test_acc = _run_minibatches(
        test_input_ids, test_input_masks, test_segment_ids, test_Y,
        False, 'test minibatch loop'
    )

    if test_acc > CURRENT_ACC:
        print(
            'epoch: %d, pass acc: %f, current acc: %f'
            % (EPOCH, CURRENT_ACC, test_acc)
        )
        CURRENT_ACC = test_acc
        CURRENT_CHECKPOINT = 0
    else:
        CURRENT_CHECKPOINT += 1
        
    print('time taken:', time.time() - lasttime)
    print(
        'epoch: %d, training loss: %f, training acc: %f, valid loss: %f, valid acc: %f\n'
        % (EPOCH, train_loss, train_acc, test_loss, test_acc)
    )
    EPOCH += 1

train minibatch loop: 100%|██████████| 2843/2843 [21:52<00:00,  2.17it/s, accuracy=0.667, cost=0.337]
test minibatch loop: 100%|██████████| 711/711 [01:01<00:00, 11.55it/s, accuracy=0.667, cost=0.761]
train minibatch loop:   0%|          | 0/2843 [00:00<?, ?it/s]

epoch: 0, pass acc: 0.000000, current acc: 0.772152
time taken: 1374.7178251743317
epoch: 0, training loss: 0.684424, training acc: 0.715090, valid loss: 0.605673, valid acc: 0.772152



train minibatch loop: 100%|██████████| 2843/2843 [21:56<00:00,  2.17it/s, accuracy=1, cost=0.0029]   
test minibatch loop: 100%|██████████| 711/711 [01:01<00:00, 11.54it/s, accuracy=0.333, cost=2.71] 
train minibatch loop:   0%|          | 0/2843 [00:00<?, ?it/s]

epoch: 1, pass acc: 0.772152, current acc: 0.773558
time taken: 1378.1370601654053
epoch: 1, training loss: 0.519813, training acc: 0.868566, valid loss: 1.008805, valid acc: 0.773558



train minibatch loop: 100%|██████████| 2843/2843 [21:52<00:00,  2.17it/s, accuracy=1, cost=0.000309] 
test minibatch loop: 100%|██████████| 711/711 [01:01<00:00, 11.53it/s, accuracy=0.667, cost=1.95] 
train minibatch loop:   0%|          | 0/2843 [00:00<?, ?it/s]

epoch: 2, pass acc: 0.773558, current acc: 0.784341
time taken: 1374.092584848404
epoch: 2, training loss: 0.241860, training acc: 0.951460, valid loss: 1.405984, valid acc: 0.784341



train minibatch loop: 100%|██████████| 2843/2843 [21:43<00:00,  2.19it/s, accuracy=1, cost=5.52e-5]  
test minibatch loop: 100%|██████████| 711/711 [01:01<00:00, 11.60it/s, accuracy=1, cost=0.000362] 
train minibatch loop:   0%|          | 0/2843 [00:00<?, ?it/s]

epoch: 3, pass acc: 0.784341, current acc: 0.788092
time taken: 1365.3060529232025
epoch: 3, training loss: 0.089658, training acc: 0.983585, valid loss: 1.619896, valid acc: 0.788092



train minibatch loop: 100%|██████████| 2843/2843 [21:50<00:00,  2.17it/s, accuracy=1, cost=4.41e-5]  
test minibatch loop: 100%|██████████| 711/711 [01:01<00:00, 11.56it/s, accuracy=1, cost=0.000797] 
train minibatch loop:   0%|          | 0/2843 [00:00<?, ?it/s]

epoch: 4, pass acc: 0.788092, current acc: 0.792780
time taken: 1372.1305689811707
epoch: 4, training loss: 0.039045, training acc: 0.992144, valid loss: 1.938607, valid acc: 0.792780



train minibatch loop: 100%|██████████| 2843/2843 [21:55<00:00,  2.16it/s, accuracy=1, cost=4.25e-6]  
test minibatch loop: 100%|██████████| 711/711 [01:01<00:00, 11.50it/s, accuracy=1, cost=2.22e-5]  
train minibatch loop:   0%|          | 0/2843 [00:00<?, ?it/s]

epoch: 5, pass acc: 0.792780, current acc: 0.793718
time taken: 1377.0767679214478
epoch: 5, training loss: 0.015696, training acc: 0.997421, valid loss: 2.088988, valid acc: 0.793718



train minibatch loop: 100%|██████████| 2843/2843 [21:52<00:00,  2.17it/s, accuracy=1, cost=2.74e-6] 
test minibatch loop: 100%|██████████| 711/711 [01:01<00:00, 11.53it/s, accuracy=0.667, cost=2.99] 
train minibatch loop:   0%|          | 0/2843 [00:00<?, ?it/s]

epoch: 6, pass acc: 0.793718, current acc: 0.797937
time taken: 1374.0545842647552
epoch: 6, training loss: 0.006389, training acc: 0.998945, valid loss: 2.122673, valid acc: 0.797937



train minibatch loop: 100%|██████████| 2843/2843 [21:43<00:00,  2.18it/s, accuracy=1, cost=1.03e-5]  
test minibatch loop: 100%|██████████| 711/711 [01:01<00:00, 11.52it/s, accuracy=1, cost=0.000446] 
train minibatch loop:   0%|          | 0/2843 [00:00<?, ?it/s]

epoch: 7, pass acc: 0.797937, current acc: 0.800281
time taken: 1365.3034193515778
epoch: 7, training loss: 0.005304, training acc: 0.999179, valid loss: 2.101871, valid acc: 0.800281



train minibatch loop: 100%|██████████| 2843/2843 [21:50<00:00,  2.17it/s, accuracy=1, cost=3.06e-6]  
test minibatch loop: 100%|██████████| 711/711 [01:01<00:00, 11.52it/s, accuracy=1, cost=7.07e-6]  
train minibatch loop:   0%|          | 0/2843 [00:00<?, ?it/s]

time taken: 1372.6576752662659
epoch: 8, training loss: 0.008127, training acc: 0.998710, valid loss: 2.092777, valid acc: 0.799812



train minibatch loop: 100%|██████████| 2843/2843 [21:56<00:00,  2.16it/s, accuracy=1, cost=3.42e-6]  
test minibatch loop: 100%|██████████| 711/711 [01:01<00:00, 11.51it/s, accuracy=1, cost=0.00287]  
train minibatch loop:   0%|          | 0/2843 [00:00<?, ?it/s]

time taken: 1378.0080671310425
epoch: 9, training loss: 0.005311, training acc: 0.998945, valid loss: 2.096939, valid acc: 0.793249



train minibatch loop: 100%|██████████| 2843/2843 [21:51<00:00,  2.17it/s, accuracy=1, cost=8.07e-6]  
test minibatch loop: 100%|██████████| 711/711 [01:01<00:00, 11.52it/s, accuracy=1, cost=0.00344]  

time taken: 1373.6489472389221
epoch: 10, training loss: 0.008527, training acc: 0.998476, valid loss: 2.108078, valid acc: 0.795593

break epoch:11






In [23]:
# Save the fine-tuned model. Only the trainable variables are passed to the
# Saver, so anything that is not a trainable variable is left out of the
# checkpoint.
saver = tf.train.Saver(tf.trainable_variables())
saver.save(sess, 'bert/model.ckpt')

'bert/model.ckpt'

In [24]:
# List the saved checkpoint files to see how much disk space they take.
!ls -lh bert

total 706M
-rw-rw-r-- 1 husein husein   77 Jul  5 11:07 checkpoint
-rw-rw-r-- 1 husein husein 702M Jul  5 11:07 model.ckpt.data-00000-of-00001
-rw-rw-r-- 1 husein husein 8.2K Jul  5 11:07 model.ckpt.index
-rw-rw-r-- 1 husein husein 3.9M Jul  5 11:07 model.ckpt.meta


In [25]:
# Relative disk-space saving, using the data-file sizes from the two `ls`
# listings: the original checkpoint (1.3G, taken here as roughly 1300M)
# versus the reduced-depth fine-tuned checkpoint (702M).
(1300 - 702) / 1300

0.46

## We save about 46% of disk space, with an accuracy drop of around 3-5%