Fine-tune BERT on a GPU for the AI Challenger MRC (machine reading comprehension) competition. Each of the three answer alternatives is scored with a binary classifier.

In [1]:
import tensorflow as tf

# Stop immediately unless TensorFlow reports the first GPU; the rest of the
# notebook is meant to run on a GPU runtime.
device_name = tf.test.gpu_device_name()
if device_name == '/device:GPU:0':
  print('Found GPU at: {}'.format(device_name))
else:
  raise SystemError('GPU device not found')

Found GPU at: /device:GPU:0


In [2]:
from google.colab import drive

# Mount Google Drive under /content/drive/; the raw competition JSON files
# are read from there further down.
drive.mount('/content/drive/')

Go to this URL in a browser: https://accounts.google.com/o/oauth2/auth?client_id=947318989803-6bn6qk8qdgf4n4g3pfee6491hc0brc4i.apps.googleusercontent.com&redirect_uri=urn%3Aietf%3Awg%3Aoauth%3A2.0%3Aoob&scope=email%20https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fdocs.test%20https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fdrive%20https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fdrive.photos.readonly%20https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fpeopleapi.readonly&response_type=code

Enter your authorization code:
··········
Mounted at /content/drive/


In [0]:
import pandas as pd
import os
import numpy as np
import pandas as pd
from tqdm import tqdm
#import jieba
import zipfile
from matplotlib import pyplot as plt
%matplotlib inline
import sys
import datetime
import random

## Preprocess

In [0]:
!pip install jieba

Collecting jieba
[?25l  Downloading https://files.pythonhosted.org/packages/71/46/c6f9179f73b818d5827202ad1c4a94e371a29473b7f043b736b4dab6b8cd/jieba-0.39.zip (7.3MB)
[K    100% |████████████████████████████████| 7.3MB 6.0MB/s 
[?25hBuilding wheels for collected packages: jieba
  Running setup.py bdist_wheel for jieba ... [?25l- \ | / - \ | / done
[?25h  Stored in directory: /root/.cache/pip/wheels/c9/c7/63/a9ec0322ccc7c365fd51e475942a82395807186e94f0522243
Successfully built jieba
Installing collected packages: jieba
Successfully installed jieba-0.39


In [0]:
# Locations of the raw JSON-lines competition files on the mounted Drive.
_RAW_DATA_DIR = '/content/drive/My Drive/Colab Notebooks'
train_path = _RAW_DATA_DIR + '/ai_challenger_oqmrc_trainingset.json'  # train set
valid_path = _RAW_DATA_DIR + '/ai_challenger_oqmrc_validationset.json'  # validation set
test_path = _RAW_DATA_DIR + '/ai_challenger_oqmrc_testa.json'  # test set

In [0]:
# Each file holds one JSON record per line; load the three splits with the
# same reader settings.
train_set, valid_set, test_set = (
    pd.read_json(json_path, orient='records', encoding='utf-8', lines=True)
    for json_path in (train_path, valid_path, test_path)
)

In [0]:
def preprocess(text, alternatives, aug=False):
    """Segment ``text`` with jieba and return the tokens joined by spaces.

    Args:
        text: raw string to segment.
        alternatives: not used by this function; kept so the signature stays
            compatible with existing callers.
        aug: when True, every non-punctuation token is additionally replaced
            by a blank with probability 0.1 (data augmentation).

    Returns:
        A single string of space-joined tokens in which punctuation and
        whitespace tokens (and tokens dropped by augmentation) are replaced
        by ' ' and all remaining tokens are lower-cased.
    """
    # Imported locally so the function works even though the notebook's
    # top-level `import jieba` is commented out.
    import re
    import jieba

    # Tuples ask jieba to cut the pair into separate pieces, so negations
    # such as 不会 / 不能 keep 不 as its own token.
    for pair in (('不', '会'), ('不', '能'), ('不', '行'),
                 ('不', '好'), ('不', '要'), ('不', '是')):
        jieba.suggest_freq(pair, tune=True)
    jieba.suggest_freq('不', tune=True)
    jieba.suggest_freq('无法确定', tune=True)

    # The original compared tokens with `in` against this pattern text, which
    # is a substring test (so a token like "s" or "|" counted as punctuation).
    # Apply it as the regular expression it was written to be.
    punctuation = re.compile(
        r"[\s+\.\!\/_,$%^*(+\"\']+|[+——！，。？、~@#￥%……&*（）《》“”：【】]+")

    sent = jieba.lcut(text, HMM=False)
    for i, token in enumerate(sent):
        if punctuation.fullmatch(token):
            sent[i] = ' '
        elif aug and random.random() < 0.1:  # data augmentation
            sent[i] = ' '
        else:
            # str.lower() returns a new string; the original discarded it.
            sent[i] = token.lower()
    return ' '.join(sent)

In [0]:
# concatenate query and alternatives
def query_alt(query, alternatives, a):
    '''
    Rewrite a question so that it states option ``a``.

    query: line['query'] from original dataframe
    alternatives: line['alternatives'] from original dataframe, options
        separated by '|'
    a: current option in alternatives to be merged with query

    return: query and current option a concatenated (preprocessed)

    An empty query returns ``a`` on its own, and an alternatives string with
    fewer than two concrete options no longer raises IndexError: no option
    word is matched, so the rewrite uses the prefixing / 能否·是否·可否
    fallbacks only.
    '''
    # The "cannot determine" options never appear in the question itself.
    unknown_options = ('无法确认', '无法确定')
    # 能 / 可以 / 会 are treated as interchangeable; the tuple order is the
    # order in which the substitutes are looked up in the query.
    synonyms = {
        '能': ('可以', '会'),
        '可以': ('能', '会'),
        '会': ('可以', '能'),
    }

    query = query.strip()
    if not query:
        # Nothing to rewrite (and query[-1] would raise on an empty string).
        return a

    options = [m.strip() for m in alternatives.split('|')]
    for unknown in unknown_options:
        if unknown in options:
            options.remove(unknown)
    has_pair = len(options) >= 2

    if query[-1] in ('吗', '么', '嘛', '不'):
        # Question ends with a particle: drop it and swap the option word
        # found in the question for `a`.
        query = query[:-1]
        match = None
        if has_pair:
            first, second = options[0], options[1]
            if first in second:
                long, short = second, first
            else:
                long, short = first, second
            if long in query:
                match = long
            elif short in query:
                match = short
            else:
                for substitute in synonyms.get(short, ()):
                    if substitute in query:
                        match = substitute
                        break

        if match:
            query = query.replace(match, a)
        else:
            # No option word found in the question: prefix the option.
            query = a + query
        return query

    # Otherwise both the positive and the negative word in the question
    # have to be replaced.
    if has_pair and options[0] in query and options[1] in query:
        first, second = options[0], options[1]
        if first + second in query:  # e.g. 有没有, 会不会
            query = query.replace(first + second, a)
        elif second + first in query:
            query = query.replace(second + first, a)
        elif a == first:  # "is A better or B better": keep only the chosen side
            query = query.replace(second, ' ')
        elif a == second:
            query = query.replace(first, ' ')
        else:  # the "cannot determine" option
            query = query.replace(first, ' ')
            query = query.replace(second, a)
    else:
        # The two option words do not both appear in full: replace the first
        # of these question markers that is present.
        # NOTE(review): when none is present the query is returned unchanged,
        # so every option yields the same text - confirm this is intended.
        for marker in ('能否', '是否', '可否'):
            if marker in query:
                query = query.replace(marker, a)
                break

    return query

In [0]:
# Expand every training question into one TSV row per alternative; the row
# whose option equals the (stripped) answer is labelled 1, the others 0.
with open('train.tsv', 'w', encoding='utf-8') as fw:
    fw.write('\t'.join(('id', 'passage', 'query', 'option', 'label')) + '\n')
    for row_idx in tqdm(range(train_set.shape[0])):
        record = train_set.iloc[row_idx]
        passage = record['passage']
        for option in record['alternatives'].split('|'):
            option = option.strip()
            statement = query_alt(query=record['query'],
                                  alternatives=record['alternatives'],
                                  a=option)
            label = '1' if option == record['answer'].strip() else '0'
            fields = (str(record['query_id']), passage, statement, option, label)
            fw.write('\t'.join(fields) + '\n')

100%|██████████| 250000/250000 [01:43<00:00, 2409.21it/s]


In [0]:
# Expand every validation question into one TSV row per alternative; the row
# whose option equals the (stripped) answer is labelled 1, the others 0.
with open('valid.tsv', 'w', encoding='utf-8') as fw:
    fw.write('\t'.join(('id', 'passage', 'query', 'option', 'label')) + '\n')
    for row_idx in tqdm(range(valid_set.shape[0])):
        record = valid_set.iloc[row_idx]
        passage = record['passage']
        for option in record['alternatives'].split('|'):
            option = option.strip()
            statement = query_alt(query=record['query'],
                                  alternatives=record['alternatives'],
                                  a=option)
            label = '1' if option == record['answer'].strip() else '0'
            fields = (str(record['query_id']), passage, statement, option, label)
            fw.write('\t'.join(fields) + '\n')

100%|██████████| 30000/30000 [00:12<00:00, 2386.52it/s]


In [0]:
# Expand every test question into one TSV row per alternative. The test file
# is written without a label column.
with open('test.tsv', 'w', encoding='utf-8') as fw:
    fw.write('\t'.join(('id', 'passage', 'query', 'option')) + '\n')
    for row_idx in tqdm(range(test_set.shape[0])):
        record = test_set.iloc[row_idx]
        passage = record['passage']
        for option in record['alternatives'].split('|'):
            option = option.strip()
            statement = query_alt(query=record['query'],
                                  alternatives=record['alternatives'],
                                  a=option)
            fields = (str(record['query_id']), passage, statement, option)
            fw.write('\t'.join(fields) + '\n')

100%|██████████| 10000/10000 [00:03<00:00, 2587.17it/s]


## BERT files

In [7]:
# Download the pre-trained weights and configuration files for the Chinese
# BERT model (chinese_L-12_H-768_A-12).
!wget https://storage.googleapis.com/bert_models/2018_11_03/chinese_L-12_H-768_A-12.zip

--2018-12-05 00:52:04--  https://storage.googleapis.com/bert_models/2018_11_03/chinese_L-12_H-768_A-12.zip
Resolving storage.googleapis.com (storage.googleapis.com)... 108.177.97.128, 2404:6800:4008:c00::80
Connecting to storage.googleapis.com (storage.googleapis.com)|108.177.97.128|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 381892918 (364M) [application/zip]
Saving to: ‘chinese_L-12_H-768_A-12.zip’


2018-12-05 00:52:14 (41.2 MB/s) - ‘chinese_L-12_H-768_A-12.zip’ saved [381892918/381892918]



In [0]:
# Directory that receives the unpacked BERT archive.
repo = 'model_repo'
# Extract the downloaded archive into `repo`.
with zipfile.ZipFile("chinese_L-12_H-768_A-12.zip","r") as zip_ref:
    zip_ref.extractall(repo)

In [9]:
# List the extracted files (config, checkpoint and vocabulary).
!ls 'model_repo/chinese_L-12_H-768_A-12'

bert_config.json		     bert_model.ckpt.index  vocab.txt
bert_model.ckpt.data-00000-of-00001  bert_model.ckpt.meta


In [0]:
import modeling
import optimization
import run_classifier
import tokenization
import tensorflow as tf

# Hyper parameter

In [11]:
# Directory layout: the pre-trained BERT files and the fine-tuning outputs
# both live under `repo`.
repo = 'model_repo'
BERT_MODEL = 'chinese_L-12_H-768_A-12'
BERT_PRETRAINED_DIR = '{}/{}'.format(repo, BERT_MODEL)
OUTPUT_DIR = '{}/outputs'.format(repo)
print('***** Model output directory: {} *****'.format(OUTPUT_DIR))
print('***** BERT pretrained directory: {} *****'.format(BERT_PRETRAINED_DIR))

***** Model output directory: model_repo/outputs *****
***** BERT pretrained directory: model_repo/chinese_L-12_H-768_A-12 *****


In [0]:
# Model Hyper Parameters
BATCH_SIZE = 24  # default batch size of the tf.data input functions below
epoch = 2  # number of passes over the training examples
LEARNING_RATE = 5e-5  # initial learning rate handed to optimization.create_optimizer
WARMUP_PROPORTION = 0.1  # fraction of the training steps used to derive num_warmup_steps
MAX_SEQ_LENGTH = 200  # sequence length used for feature conversion and TFRecord parsing
# Files shipped inside the pre-trained BERT archive.
VOCAB_FILE = os.path.join(BERT_PRETRAINED_DIR, 'vocab.txt')
CONFIG_FILE = os.path.join(BERT_PRETRAINED_DIR, 'bert_config.json')
INIT_CHECKPOINT = os.path.join(BERT_PRETRAINED_DIR, 'bert_model.ckpt')

# Data generation

## Load tsv files

In [0]:
# NOTE: these assignments rebind train_path / valid_path / test_path, which
# earlier pointed at the raw JSON files, to the generated TSV files.
train_path = 'train.tsv' # train set
valid_path = 'valid.tsv' # validation set
test_path = 'test.tsv' # test set
# The first line of each TSV is the header row written by the cells above.
train = pd.read_csv(train_path, sep='\t', header=0)
valid = pd.read_csv(valid_path, sep='\t', header=0)
test = pd.read_csv(test_path, sep='\t', header=0)
print (train.shape, valid.shape, test.shape)

(750000, 5) (90000, 5) (30000, 4)


In [0]:
# Show the three rows generated for the first training question.
print (train.head(3))

   id                                            passage          query  \
0   1  孩子是父母的一面镜子，由于儿童的世界观尚未形成，他们的模仿带有很大的盲目性，所以还是父母带好...  你的孩子无法确定保姆带大的   
1   1  孩子是父母的一面镜子，由于儿童的世界观尚未形成，他们的模仿带有很大的盲目性，所以还是父母带好...     你的孩子是保姆带大的   
2   1  孩子是父母的一面镜子，由于儿童的世界观尚未形成，他们的模仿带有很大的盲目性，所以还是父母带好...    你的孩子不是保姆带大的   

  option  label  
0   无法确定      1  
1      是      0  
2     不是      0  


## Create BERT input examples

In [0]:
def create_examples(set_type):
    """Generate data for the BERT model from the TSV-based frames.

    Reads the module-level ``train`` / ``valid`` / ``test`` DataFrames in
    groups of three consecutive rows and turns each row into a
    ``run_classifier.InputExample`` with the passage as ``text_a`` and the
    rewritten query as ``text_b``.

    Args:
        set_type: 'train', 'valid', or anything else for the test frame.

    Returns:
        A list of ``run_classifier.InputExample``. Test examples always carry
        the label '0'; the other sets use the row's ``label`` column.
    """
    if set_type == 'train':
        lines = train
    elif set_type == 'valid':
        lines = valid
    else:
        lines = test

    shuffle_groups = set_type == 'train'
    unlabelled = set_type == 'test'

    examples = []
    for start in range(0, lines.shape[0], 3):
        group = lines.iloc[start:start + 3]
        if shuffle_groups:
            # Shuffle the three rows of a training group so their order
            # within the group varies.
            group = group.sample(frac=1)
        for position in range(3):
            row = group.iloc[position]
            guid = '%s-%s-%d' % (set_type, row['id'], position)
            text_a = tokenization.convert_to_unicode(row['passage'])
            text_b = tokenization.convert_to_unicode(row['query'])
            if unlabelled:
                label = '0'
            else:
                label = tokenization.convert_to_unicode(str(row['label']))
            examples.append(
                run_classifier.InputExample(
                    guid=guid, text_a=text_a, text_b=text_b, label=label))
    return examples

In [0]:
def create_examples(set_type):
    """Generate data for the BERT model from the raw JSON frames.

    Reads the module-level ``train_set`` / ``valid_set`` / ``test_set``
    DataFrames and emits one ``run_classifier.InputExample`` per alternative,
    with passage + query as ``text_a`` and the alternative as ``text_b``.

    Args:
        set_type: 'train', 'valid', or anything else for the test frame.

    Returns:
        A list of ``run_classifier.InputExample``. The label is '1' when the
        alternative equals the answer (never for the test set) and '0'
        otherwise.
    """
    if set_type == 'train':
        lines = train_set
    elif set_type == 'valid':
        lines = valid_set
    else:
        lines = test_set

    examples = []
    for row_idx in range(lines.shape[0]):
        line = lines.iloc[row_idx]
        # Strip surrounding whitespace, as the TSV writers do, so that an
        # option like " 是" still matches the answer "是".
        options = [o.strip() for o in line['alternatives'].split('|')]
        if set_type == 'train':
            # Randomise the option order for training questions.
            random.shuffle(options)
        text_a = tokenization.convert_to_unicode(line['passage'] + line['query'])
        for opt_idx, option in enumerate(options):
            guid = '%s-%s-%d' % (set_type, line['query_id'], opt_idx)
            text_b = tokenization.convert_to_unicode(option)
            # The answer column is only consulted for labelled sets.
            if set_type != 'test' and option == line['answer'].strip():
                label = '1'
            else:
                label = '0'
            examples.append(
                run_classifier.InputExample(
                    guid=guid, text_a=text_a, text_b=text_b, label=label))
    return examples

In [0]:
# Binary labels: '1' marks the correct alternative, '0' the others.
label_list = ['0', '1']
# do_lower_case=False: the tokenizer is asked not to lower-case the input.
tokenizer = tokenization.FullTokenizer(vocab_file=VOCAB_FILE, do_lower_case=False)
# NOTE(review): create_examples is defined twice in this notebook; whichever
# definition was executed last is the one used here.
valid_examples = create_examples('valid')
train_examples = create_examples('train')

## Convert training examples to features, write into tf record files

In [15]:
# Tokenize the training examples and serialise them to a TFRecord file.
print('Please wait...')
train_file = 'model_repo/train.tf_record'
run_classifier.file_based_convert_examples_to_features(
    train_examples, label_list, MAX_SEQ_LENGTH, tokenizer, train_file)
print (f'train data written into tfrecord file: {train_file}')

Please wait...
INFO:tensorflow:Writing example 0 of 750000
INFO:tensorflow:*** Example ***
INFO:tensorflow:guid: train-1-0
INFO:tensorflow:tokens: [CLS] 孩 子 是 父 母 的 一 面 镜 子 ， 由 于 儿 童 的 世 界 观 尚 未 形 成 ， 他 们 的 模 仿 带 有 很 大 的 盲 目 性 ， 所 以 还 是 父 母 带 好 。 除 非 万 不 得 已 ， 绝 对 不 能 把 上 早 教 课 等 教 育 问 题 交 给 保 姆 ， 她 们 负 责 生 活 起 居 就 好 了 ， 树 立 孩 子 一 生 的 良 好 习 惯 ， 家 长 们 可 千 万 不 能 大 意 。 你 的 孩 子 是 保 姆 带 大 的 么 [SEP] 是 [SEP]
INFO:tensorflow:input_ids: 101 2111 2094 3221 4266 3678 4638 671 7481 7262 2094 8024 4507 754 1036 4997 4638 686 4518 6225 2213 3313 2501 2768 8024 800 812 4638 3563 820 2372 3300 2523 1920 4638 4683 4680 2595 8024 2792 809 6820 3221 4266 3678 2372 1962 511 7370 7478 674 679 2533 2347 8024 5318 2190 679 5543 2828 677 3193 3136 6440 5023 3136 5509 7309 7579 769 5314 924 1990 8024 1961 812 6566 6569 4495 3833 6629 2233 2218 1962 749 8024 3409 4989 2111 2094 671 4495 4638 5679 1962 739 2679 8024 2157 7270 812 1377 1283 674 679 5543 1920 2692 511 872 4638 2111 2094 3221 924 1990 2372 1920 463

## Convert evaluation examples to features, write into tf record files

In [16]:
# Tokenize the validation examples and serialise them to a TFRecord file.
print('Please wait...')
eval_file = 'model_repo/valid.tf_record'
run_classifier.file_based_convert_examples_to_features(
    valid_examples, label_list, MAX_SEQ_LENGTH, tokenizer, eval_file)
print (f'valid data written into tfrecord file: {eval_file}')

Please wait...
INFO:tensorflow:Writing example 0 of 90000
INFO:tensorflow:*** Example ***
INFO:tensorflow:guid: valid-250001-0
INFO:tensorflow:tokens: [CLS] 动 漫 好 看 的 [UNK] ： 爱 的 魔 法 ， [UNK] 的 作 品 ， 喧 嚣 学 院 ， 草 莓 100 % ， 双 恋 ， 爱 丽 丝 学 园 ， 灼 眼 的 夏 娜 ， 我 的 女 神 ， 赐 予 护 女 神 的 祝 福 , 旋 风 管 家 ， 全 金 属 狂 潮 ， 初 音 岛 ， 命 运 之 夜 ， 心 跳 回 忆 。 有 没 有 好 看 的 h [SEP] 有 [SEP]
INFO:tensorflow:input_ids: 101 1220 4035 1962 4692 4638 100 8038 4263 4638 7795 3791 8024 100 4638 868 1501 8024 1602 1709 2110 7368 8024 5770 5803 8135 110 8024 1352 2605 8024 4263 714 692 2110 1736 8024 4133 4706 4638 1909 2025 8024 2769 4638 1957 4868 8024 6606 750 2844 1957 4868 4638 4867 4886 117 3181 7599 5052 2157 8024 1059 7032 2247 4312 4060 8024 1159 7509 2270 8024 1462 6817 722 1915 8024 2552 6663 1726 2554 511 3300 3766 3300 1962 4692 4638 150 102 3300 102 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 

## Get batch input from tf record files

In [0]:
def file_based_input_fn_builder(input_file, seq_length, is_training,
                                drop_remainder):
    """Creates an `input_fn` closure to be passed to Estimator.

    Args:
        input_file: path of the TFRecord file to read.
        seq_length: length of the fixed-size ``input_ids`` / ``input_mask`` /
            ``segment_ids`` features stored in the file.
        is_training: when True the dataset repeats indefinitely.
        drop_remainder: default for whether the final short batch is dropped.

    Returns:
        A function that builds the batched ``tf.data`` dataset. All of its
        arguments have defaults, so it can be called without arguments.
    """
    # Previously the global MAX_SEQ_LENGTH was used and `seq_length` ignored.
    name_to_features = {
          "input_ids": tf.FixedLenFeature([seq_length], tf.int64),
          "input_mask": tf.FixedLenFeature([seq_length], tf.int64),
          "segment_ids": tf.FixedLenFeature([seq_length], tf.int64),
          "label_ids": tf.FixedLenFeature([], tf.int64),
      }

    def _decode_record(record, name_to_features):
        """Decodes a record to a TensorFlow example."""
        example = tf.parse_single_example(record, name_to_features)
        return example

    # The builder's `drop_remainder` is now the default here; it used to be
    # discarded in favour of a hard-coded False.
    def input_data (input_file=input_file, batch_size=BATCH_SIZE,
                    drop_remainder=drop_remainder):
        """Builds the dataset of parsed, batched feature dicts."""
        d = tf.data.TFRecordDataset(input_file)
        if is_training:
            d = d.repeat()
            # Shuffling is currently disabled:
            #d = d.shuffle(buffer_size=100)
        d = d.apply(
            tf.data.experimental.map_and_batch(
                lambda record: _decode_record(record, name_to_features),
                batch_size=batch_size,
                drop_remainder=drop_remainder))
        return d

    return input_data

In [0]:
    # Module-level counterparts of the helpers nested in
    # file_based_input_fn_builder; the graph-building cells further down call
    # `input_data` directly.
    # Schema of one serialised example in the TFRecord files.
    name_to_features = {
          "input_ids": tf.FixedLenFeature([MAX_SEQ_LENGTH], tf.int64),
          "input_mask": tf.FixedLenFeature([MAX_SEQ_LENGTH], tf.int64),
          "segment_ids": tf.FixedLenFeature([MAX_SEQ_LENGTH], tf.int64),
          "label_ids": tf.FixedLenFeature([], tf.int64),
      }
    def _decode_record(record, name_to_features):
        """Decodes a record to a TensorFlow example."""
        example = tf.parse_single_example(record, name_to_features)
        return example
    def input_data (input_file, batch_size=BATCH_SIZE, drop_remainder=False):
        """Builds a shuffled, batched dataset of parsed examples from `input_file`."""
        d = tf.data.TFRecordDataset(input_file)
        # Shuffle with a buffer of 100 records; there is no repeat(), so the
        # dataset is exhausted after one pass.
        d = d.shuffle(buffer_size=100)
        d = d.apply(
            tf.data.experimental.map_and_batch(
                lambda record: _decode_record(record, name_to_features),
                batch_size=batch_size,
                drop_remainder=drop_remainder))
        return d

In [18]:
# Total optimisation steps: examples per batch-sized step, times the number
# of epochs.
num_train_steps = int(len(train_examples) / BATCH_SIZE * epoch)
#num_train_steps = 1e30
# Warm-up steps derived from the configured proportion of the total.
num_warmup_steps = int(num_train_steps * WARMUP_PROPORTION)
print (num_train_steps, num_warmup_steps)

62500 6250


# Build the model

## Fine tuning model

In [0]:
def create_model(bert_config, is_training, input_ids, input_mask, segment_ids,
                 labels, num_labels, use_one_hot_embeddings=False):
    """Creates mrc model.

    Every input row gets a single score from a linear layer on BERT's pooled
    output. Scores and labels are then regrouped into rows of ``num_labels``
    entries and a softmax cross-entropy is taken over each group (the
    original notes assume groups of 3, one entry per alternative).

    Returns:
        ``(loss, per_example_loss, logits, probabilities)`` where ``logits``
        and ``probabilities`` have ``num_labels`` columns.
    """
    bert = modeling.BertModel(
        config=bert_config,
        is_training=is_training,
        input_ids=input_ids,
        input_mask=input_mask,
        token_type_ids=segment_ids,
        use_one_hot_embeddings=use_one_hot_embeddings)

    pooled = bert.get_pooled_output()
    if is_training:
        # keep_prob=0.9, i.e. 0.1 dropout
        pooled = tf.nn.dropout(pooled, keep_prob=0.9)

    hidden_size = pooled.shape[-1].value

    # One output unit: a single score per input row.
    output_weights = tf.get_variable(
        "output_weights", [1, hidden_size],
        initializer=tf.truncated_normal_initializer(stddev=0.02))
    output_bias = tf.get_variable(
        "output_bias", [1], initializer=tf.zeros_initializer())

    row_scores = tf.nn.bias_add(
        tf.matmul(pooled, output_weights, transpose_b=True), output_bias)

    # Regroup consecutive rows so the softmax runs across each group.
    logits = tf.reshape(row_scores, shape=[-1, num_labels])
    probabilities = tf.nn.softmax(logits, axis=-1)
    log_probs = tf.nn.log_softmax(logits, axis=-1)

    # The per-row labels, regrouped the same way, act as the target vector.
    one_hot_labels = tf.reshape(tf.to_float(labels), shape=[-1, num_labels])

    per_example_loss = -tf.reduce_sum(one_hot_labels * log_probs, axis=-1)
    loss = tf.reduce_mean(per_example_loss)

    return (loss, per_example_loss, logits, probabilities)

In [0]:
def create_model(bert_config, is_training, input_ids, input_mask, segment_ids,
                 labels, num_labels, use_one_hot_embeddings=False):
  """Creates a classification model.

  A linear layer with ``num_labels`` outputs is applied to BERT's pooled
  output and trained with softmax cross-entropy against ``labels``.

  Returns:
    ``(loss, per_example_loss, logits, probabilities)``.
  """
  bert = modeling.BertModel(
      config=bert_config,
      is_training=is_training,
      input_ids=input_ids,
      input_mask=input_mask,
      token_type_ids=segment_ids,
      use_one_hot_embeddings=use_one_hot_embeddings)

  # The pooled output represents the entire segment; for token-level output
  # use get_sequence_output() instead.
  pooled = bert.get_pooled_output()
  hidden_size = pooled.shape[-1].value

  output_weights = tf.get_variable(
      "output_weights", [num_labels, hidden_size],
      initializer=tf.truncated_normal_initializer(stddev=0.02))
  output_bias = tf.get_variable(
      "output_bias", [num_labels], initializer=tf.zeros_initializer())

  with tf.variable_scope("loss"):
    if is_training:
      # keep_prob=0.9, i.e. 0.1 dropout
      pooled = tf.nn.dropout(pooled, keep_prob=0.9)

    logits = tf.nn.bias_add(
        tf.matmul(pooled, output_weights, transpose_b=True), output_bias)
    probabilities = tf.nn.softmax(logits, axis=-1)
    log_probs = tf.nn.log_softmax(logits, axis=-1)

    targets = tf.one_hot(labels, depth=num_labels, dtype=tf.float32)
    per_example_loss = -tf.reduce_sum(targets * log_probs, axis=-1)
    loss = tf.reduce_mean(per_example_loss)

    return (loss, per_example_loss, logits, probabilities)

## Build the model function for estimator

In [0]:
def model_fn_builder(bert_config, num_labels, init_checkpoint, learning_rate,
                     num_train_steps, num_warmup_steps):
    """Returns `model_fn` closure for Estimator.

    Args:
        bert_config: BERT configuration object passed on to create_model.
        num_labels: forwarded to create_model.
        init_checkpoint: checkpoint used to initialise matching variables;
            falsy to skip initialisation.
        learning_rate: initial learning rate for the optimizer.
        num_train_steps: total number of training steps for the optimizer
            schedule.
        num_warmup_steps: accepted for interface compatibility.
            NOTE(review): the optimizer is created with zero warm-up steps,
            as in the original code - confirm whether this value should be
            honoured.
    """

    def model_fn(features, labels, mode, params):  # pylint: disable=unused-argument
        """The `model_fn` for Estimator."""

        tf.logging.info("*** Features ***")
        for name in sorted(features.keys()):
            tf.logging.info("  name = %s, shape = %s" % (name, features[name].shape))

        input_ids = features["input_ids"]
        input_mask = features["input_mask"]
        segment_ids = features["segment_ids"]
        label_ids = features["label_ids"]

        is_training = (mode == tf.estimator.ModeKeys.TRAIN)

        (total_loss, per_example_loss, logits, probabilities) = create_model(
            bert_config, is_training, input_ids, input_mask, segment_ids, label_ids,
            num_labels, False)

        tvars = tf.trainable_variables()
        initialized_variable_names = {}

        if init_checkpoint:
            (assignment_map, initialized_variable_names) = modeling.get_assignment_map_from_checkpoint(tvars, init_checkpoint)
            tf.train.init_from_checkpoint(init_checkpoint, assignment_map)

        tf.logging.info("**** Trainable Variables ****")
        for var in tvars:
            init_string = ""
            if var.name in initialized_variable_names:
                init_string = ", *INIT_FROM_CKPT*"
            tf.logging.info("  name = %s, shape = %s%s", var.name, var.shape, init_string)

        output_spec = None
        if mode == tf.estimator.ModeKeys.TRAIN:
            # Use the builder's learning_rate argument rather than the global
            # constant, so the parameter actually takes effect.
            train_op = optimization.create_optimizer(
                total_loss, learning_rate, num_train_steps,
                num_warmup_steps=0, use_tpu=False)

            output_spec = tf.estimator.EstimatorSpec(
                  mode=mode,
                  loss=total_loss,
                  train_op=train_op)
        elif mode == tf.estimator.ModeKeys.EVAL:

            def metric_fn(per_example_loss, label_ids, probabilities):
                """Builds the accuracy and mean-loss evaluation metrics."""
                predictions = tf.argmax(probabilities, axis=-1, output_type=tf.int32)
                # First option is always the answer, so the target index is 0
                # for every row.
                # NOTE(review): assumes the examples are arranged that way -
                # confirm against the example builder in use.
                target_ids = tf.zeros_like(predictions)
                # The original passed the undefined name `label` here.
                accuracy = tf.metrics.accuracy(target_ids, predictions)
                loss = tf.metrics.mean(per_example_loss)
                return {
                    "eval_accuracy": accuracy,
                    "eval_loss": loss,
                }

            eval_metrics = metric_fn(per_example_loss, label_ids, probabilities)
            output_spec = tf.estimator.EstimatorSpec(
                  mode=mode,
                  loss=total_loss,
                  eval_metric_ops=eval_metrics)
        else:
            pred = {'class_ids': tf.argmax(probabilities, axis=-1, output_type=tf.int32),
                'probabilities': probabilities,
                'logits': logits,}
            output_spec = tf.estimator.EstimatorSpec(mode=mode, predictions=pred)
        return output_spec

    return model_fn


In [0]:
# Bind the BERT config, checkpoint and schedule settings into a model_fn.
model_fn = model_fn_builder(
      bert_config=modeling.BertConfig.from_json_file(CONFIG_FILE),
      num_labels=len(label_list),
      init_checkpoint=INIT_CHECKPOINT,
      learning_rate=LEARNING_RATE,
      num_train_steps=num_train_steps,
      num_warmup_steps=num_warmup_steps)

In [0]:
# Checkpoint into OUTPUT_DIR every 1000 steps, keeping the two most recent.
run_config = tf.estimator.RunConfig(
      model_dir=OUTPUT_DIR,
      save_checkpoints_steps=1000,
      keep_checkpoint_max=2)

estimator = tf.estimator.Estimator(
      model_fn=model_fn,
      config=run_config)

In [0]:
# Estimator-based training run over the training TFRecord file.
tf.logging.info("***** Running training *****")
tf.logging.info("  Num examples = %d", len(train_examples))
tf.logging.info("  Batch size = %d", BATCH_SIZE)
tf.logging.info("  Num epochs = %d", epoch)
train_input_fn = file_based_input_fn_builder(
    input_file=train_file,
    seq_length=MAX_SEQ_LENGTH,
    is_training=True,
    drop_remainder=False)
# The training input repeats indefinitely, so `steps` bounds the run.
estimator.train(input_fn=train_input_fn, steps=num_train_steps)

## Training graph

In [0]:
def train_model(train_iter):
    """Build the training sub-graph on top of a dataset iterator.

    Args:
        train_iter: iterator whose ``get_next()`` yields a dict with
            ``input_ids``, ``input_mask``, ``segment_ids`` and ``label_ids``.

    Returns:
        ``(loss, train_op)``: the batch loss tensor and the optimizer step.
    """
    batch = train_iter.get_next()
    bert_config = modeling.BertConfig.from_json_file(CONFIG_FILE)

    loss, per_example_loss, logits, probabilities = create_model(
        bert_config=bert_config,
        is_training=True,
        input_ids=batch['input_ids'],
        input_mask=batch['input_mask'],
        segment_ids=batch['segment_ids'],
        labels=batch['label_ids'],
        num_labels=2)

    tvars = tf.trainable_variables()
    initialized_variable_names = {}

    # Initialise every variable that also exists in the pre-trained checkpoint.
    if INIT_CHECKPOINT:
        assignment_map, initialized_variable_names = (
            modeling.get_assignment_map_from_checkpoint(tvars, INIT_CHECKPOINT))
        tf.train.init_from_checkpoint(INIT_CHECKPOINT, assignment_map)

    tf.logging.info("**** Trainable Variables ****")
    for var in tvars:
        if var.name in initialized_variable_names:
            init_string = ", *INIT_FROM_CKPT*"
        else:
            init_string = ""
        tf.logging.info("  name = %s, shape = %s%s", var.name, var.shape,
                        init_string)

    # The optimizer is created without warm-up steps.
    train_op = optimization.create_optimizer(
        loss,
        init_lr=LEARNING_RATE,
        num_train_steps=num_train_steps,
        num_warmup_steps=0,
        use_tpu=False)

    return loss, train_op

In [21]:
# Build the training graph: input pipeline, model, variable initializer and
# a saver that keeps the two most recent checkpoints.
train_graph = tf.Graph()
with train_graph.as_default():
    train_batched = input_data(train_file)
    train_iter = train_batched.make_initializable_iterator()
    # NOTE: this rebinds `train` (the TSV DataFrame loaded earlier) to the
    # (loss, train_op) pair returned by train_model.
    train = train_model(train_iter)
        
    initializer = tf.global_variables_initializer()
    train_saver = tf.train.Saver(max_to_keep=2)

INFO:tensorflow:**** Trainable Variables ****
INFO:tensorflow:  name = bert/embeddings/word_embeddings:0, shape = (21128, 768), *INIT_FROM_CKPT*
INFO:tensorflow:  name = bert/embeddings/token_type_embeddings:0, shape = (2, 768), *INIT_FROM_CKPT*
INFO:tensorflow:  name = bert/embeddings/position_embeddings:0, shape = (512, 768), *INIT_FROM_CKPT*
INFO:tensorflow:  name = bert/embeddings/LayerNorm/beta:0, shape = (768,), *INIT_FROM_CKPT*
INFO:tensorflow:  name = bert/embeddings/LayerNorm/gamma:0, shape = (768,), *INIT_FROM_CKPT*
INFO:tensorflow:  name = bert/encoder/layer_0/attention/self/query/kernel:0, shape = (768, 768), *INIT_FROM_CKPT*
INFO:tensorflow:  name = bert/encoder/layer_0/attention/self/query/bias:0, shape = (768,), *INIT_FROM_CKPT*
INFO:tensorflow:  name = bert/encoder/layer_0/attention/self/key/kernel:0, shape = (768, 768), *INIT_FROM_CKPT*
INFO:tensorflow:  name = bert/encoder/layer_0/attention/self/key/bias:0, shape = (768,), *INIT_FROM_CKPT*
INFO:tensorflow:  name = ber

## Evaluation graph

In [0]:
def accu(prob):
    """Return the fraction of rows in ``prob`` whose largest entry is at index 0.

    Args:
        prob: 2-D array-like of per-row scores.

    Returns:
        A float between 0 and 1.
    """
    top_index = np.argmax(prob, axis=-1)
    hits = sum(1 for idx in top_index if idx == 0)
    return hits / len(top_index)

In [0]:
def eval_model(eval_iter):
    """Build the evaluation sub-graph on top of a dataset iterator.

    Args:
        eval_iter: iterator whose ``get_next()`` yields a dict with
            ``input_ids``, ``input_mask``, ``segment_ids`` and ``label_ids``.

    Returns:
        ``(per_example_loss, probabilities)`` tensors for one batch.
    """
    e = eval_iter.get_next()

    (loss, per_example_loss, logits, probabilities) = create_model(
        bert_config=modeling.BertConfig.from_json_file(CONFIG_FILE),
        # Evaluation must not run in training mode: the original passed True,
        # which left dropout switched on while scoring.
        is_training=False,
        input_ids = e['input_ids'],
        input_mask = e['input_mask'],
        segment_ids = e['segment_ids'],
        labels = e['label_ids'],
        num_labels=2)

    return per_example_loss, probabilities

In [0]:
# Build a separate evaluation graph; its saver is used to restore the
# checkpoints written by the training session.
eval_graph = tf.Graph()
with eval_graph.as_default():
    eval_batched = input_data(eval_file)
    eval_iter = eval_batched.make_initializable_iterator()
    
    # NOTE: the name `eval` shadows the Python builtin for the rest of the
    # notebook; the training loop refers to it by this name.
    eval = eval_model(eval_iter)
    
    eval_saver = tf.train.Saver()

# Training and Evaluation

In [25]:
# Train the model.
# Each epoch: run the training iterator to exhaustion, save a checkpoint,
# restore it into the evaluation graph and score the validation set.
print('***** Started training at {} *****'.format(datetime.datetime.now()))
train_sess = tf.Session(graph=train_graph)
eval_sess = tf.Session(graph=eval_graph)

checkpoint_path = 'model_repo/tmp-model.ckpt'
try:
    train_sess.run(initializer)
    print ("===================================")

    for i in range(epoch):

        # training
        print (f'epoch{i+1} training......')
        train_sess.run(train_iter.initializer)
        n_batch=0
        cost = None  # stays None if the iterator yields no batch at all
        while True:
            try:
                cost, _ = train_sess.run(train)
            except tf.errors.OutOfRangeError:
                print (f'epoch{i+1} finished: training cost={cost}')
                break
            n_batch+=1
            print (n_batch, cost)
            if n_batch % 100 == 0:
                print (f'{n_batch*BATCH_SIZE*100.0/len(train_examples)}%, cost={cost}')
        model_ckpt = train_saver.save(train_sess, checkpoint_path, global_step=i+1)

        # evaluation
        print (f'epoch{i+1} evaluation......')
        eval_saver.restore(eval_sess, model_ckpt)
        eval_sess.run(eval_iter.initializer)
        batch_losses = []
        batch_probs = []
        while True:
            try:
                batch_loss, batch_prob = eval_sess.run(eval)
            except tf.errors.OutOfRangeError:
                break
            batch_losses.extend(batch_loss)
            batch_probs.extend(batch_prob)
        eval_loss = np.mean(batch_losses)
        # extend() stored one probability row per entry, so stack them back
        # into a 2-D array. np.concatenate would flatten the rows into one
        # vector and accu() would then fail on the scalar argmax.
        eval_prob = np.stack(batch_probs, axis=0)
        eval_acc = accu(eval_prob)
        print (f'epoch{i+1} finished: evaluation cost={eval_loss}, evaluation accuracy={eval_acc}')
    print('***** Finished training at {} *****'.format(datetime.datetime.now()))
finally:
    # Release both sessions even if training is interrupted.
    train_sess.close()
    eval_sess.close()

***** Started training at 2018-12-05 01:28:21.331055 *****
epoch1 training......
1 0.7100766
2 0.5527292
3 0.9276956
4 0.84787446
5 0.6477666
6 0.6642615
7 0.8681152
8 0.56843954
9 0.4942313
10 0.69983727
11 0.5391748
12 0.4755589
13 0.7439442
14 0.85343045
15 0.6308963
16 0.72356415
17 0.61843485
18 0.5852565
19 0.7466466
20 0.6056261
21 0.6265531
22 0.6306996
23 0.7080157
24 0.62331194
25 0.6553756
26 0.6933019
27 0.690361
28 0.6253683
29 0.55079305
30 0.5859081
31 0.92501813
32 0.7544012
33 0.63634616
34 0.5654758
35 0.6654814
36 0.49668956
37 0.7664671
38 0.6859761
39 0.6048573
40 0.9424925
41 0.60335785
42 0.77994365
43 0.6096688
44 0.6652221
45 0.623316
46 0.57830817
47 0.66376305
48 0.73002005
49 0.61531097
50 0.60055643
51 0.69060564
52 0.5927017
53 0.58770996
54 0.53505105
55 0.5810701
56 0.8200994
57 0.60753447
58 0.7437249
59 0.7850701
60 0.67178124
61 0.59115106
62 0.647585
63 0.6443059
64 0.64919907
65 0.68287927
66 0.70230275
67 0.7128133
68 0.64887166
69 0.65219784
70 0.

KeyboardInterrupt: ignored

# Prediction