In [1]:
import os

# Configure the process environment in one shot. Both variables are written
# before TensorFlow is imported by a later cell:
#   * CUDA_VISIBLE_DEVICES = '1'  -> only CUDA device index 1 is exposed
#   * TF_CPP_MIN_LOG_LEVEL = '3'  -> quiets TensorFlow's native (C++) logging
os.environ.update(
    {
        'CUDA_VISIBLE_DEVICES': '1',
        'TF_CPP_MIN_LOG_LEVEL': '3',
    }
)

In [2]:
import bert
from bert import tokenization
from bert import modeling
import numpy as np
import json
import tensorflow as tf
import itertools
import collections
import re
import random
import sentencepiece as spm
from tqdm import tqdm
import bert_utils as squad_utils




In [3]:
import tensorflow as tf
import logging

# Keep the notebook output readable: set AutoGraph's verbosity level to 1 and
# restrict both the TF2 logger and the TF1-compat logging shim to ERROR.
# The three calls are independent settings, so their order does not matter.
tf.autograph.set_verbosity(1)
tf.get_logger().setLevel(logging.ERROR)
tf.compat.v1.logging.set_verbosity(tf.compat.v1.logging.ERROR)

In [4]:
# Load the trained SentencePiece model from the working directory.
sp_model = spm.SentencePieceProcessor()
sp_model.Load('sp10m.cased.bert.model')

# Parse the companion vocab file into `v`, a dict mapping the first
# tab-separated column of every line to the second column (kept as a string;
# presumably piece -> score, confirm against the file).
#
# The file is decoded explicitly as UTF-8: the pieces are not plain ASCII, so
# relying on the platform's default encoding can fail or corrupt them.
# Iterating line by line (instead of `read().split('\n')[:-1]`) also keeps the
# final entry when the file does not end with a newline, and blank lines are
# skipped rather than raising IndexError.
with open('sp10m.cased.bert.vocab', encoding = 'utf-8') as fopen:
    rows = [line.rstrip('\n').split('\t') for line in fopen if line.strip()]
v = {row[0]: row[1] for row in rows}


class Tokenizer:
    """Small adapter exposing a BERT-tokenizer-like API over SentencePiece.

    Parameters
    ----------
    v : dict
        Vocabulary mapping loaded from the ``.vocab`` file; stored as
        ``self.vocab`` and not otherwise used by this class.
    sp_model : object
        A loaded SentencePiece processor. It must provide ``EncodeAsPieces``,
        ``PieceToId`` and ``IdToPiece``.
    """

    def __init__(self, v, sp_model):
        self.vocab = v
        self.sp_model = sp_model

    def tokenize(self, string):
        """Split ``string`` into SentencePiece pieces.

        If a callable named ``encode_pieces`` exists in the defining
        namespace it is used exactly as before
        (``return_unicode = False, sample = False``). The visible notebook
        never imports or defines that helper, so calling it unconditionally
        raised ``NameError``; in that case the method now falls back to the
        processor's own deterministic ``EncodeAsPieces``.

        NOTE(review): the fallback is plain SentencePiece encoding. If the
        rest of the pipeline relies on extra post-processing done by an
        external ``encode_pieces``, import that helper so it is picked up.
        """
        # Look the helper up lazily so a missing import does not crash here.
        encoder = globals().get('encode_pieces')
        if encoder is not None:
            return encoder(
                self.sp_model, string, return_unicode = False, sample = False
            )
        return self.sp_model.EncodeAsPieces(string)

    def convert_tokens_to_ids(self, tokens):
        """Return the vocabulary id of every piece in ``tokens`` (a list)."""
        return [self.sp_model.PieceToId(piece) for piece in tokens]

    def convert_ids_to_tokens(self, ids):
        """Return the piece string of every id in ``ids`` (a list)."""
        return [self.sp_model.IdToPiece(i) for i in ids]


# Single shared tokenizer instance handed to the feature-conversion cells.
tokenizer = Tokenizer(v = v, sp_model = sp_model)

In [5]:
# Parse the training split into example objects via bert_utils.
# NOTE(review): absolute, machine-specific path - adjust when running elsewhere.
train_file = '/home/husein/pure-text/ms-train-2.0.json'
train_examples = squad_utils.read_squad_examples(
    is_training=True,
    input_file=train_file,
)

# Last expression of the cell: shows how many examples were read.
len(train_examples)

100%|██████████| 442/442 [00:00<00:00, 619.45it/s] 


130318

In [6]:
# Parse the dev split into example objects via bert_utils (is_training=False).
# NOTE(review): absolute, machine-specific path - adjust when running elsewhere.
test_file = '/home/husein/pure-text/ms-dev-2.0.json'
test_examples = squad_utils.read_squad_examples(
    is_training=False,
    input_file=test_file,
)

# Last expression of the cell: shows how many examples were read.
len(test_examples)

100%|██████████| 35/35 [00:00<00:00, 1276.22it/s]


11858

In [7]:
# Feature-conversion settings, reused by the dev-set cell below.
# Presumably the usual SQuAD sliding-window knobs (total sequence length,
# window stride, question length cap) - see bert_utils for exact semantics.
max_seq_length, doc_stride, max_query_length = 384, 128, 64

# Convert the training examples to model features. The recorded run of this
# cell took close to three hours, so avoid re-running it needlessly.
train_features = squad_utils.convert_examples_to_features(
    tokenizer=tokenizer,
    examples=train_examples,
    is_training=True,
    do_lower_case=False,
    max_seq_length=max_seq_length,
    max_query_length=max_query_length,
    doc_stride=doc_stride,
)

100%|██████████| 130318/130318 [2:51:49<00:00, 12.64it/s]  


In [8]:
# Convert the dev examples with the same length/stride settings as the
# training split, but with is_training=False.
test_features = squad_utils.convert_examples_to_features(
    tokenizer=tokenizer,
    examples=test_examples,
    is_training=False,
    do_lower_case=False,
    max_seq_length=max_seq_length,
    max_query_length=max_query_length,
    doc_stride=doc_stride,
)

100%|██████████| 11858/11858 [15:50<00:00, 12.48it/s] 


In [9]:
import pickle

# Persist each split as a two-element list [features, examples] so the slow
# conversion above does not have to be repeated. Train is written first.
for pkl_path, payload in (
    ('bert-squad-train.pkl', [train_features, train_examples]),
    ('bert-squad-test.pkl', [test_features, test_examples]),
):
    with open(pkl_path, 'wb') as fopen:
        pickle.dump(payload, fopen)