In [1]:
from bert import tokenization

In [2]:
tokenizer = tokenization.FullTokenizer(
      vocab_file='MASS.wordpiece', do_lower_case=False)




In [3]:
tokenizer.tokenize('saya suka makan anjeng')

['saya', 'suka', 'makan', 'an', '##jen', '##g']

In [4]:
import random

rng = random.Random(12345)

In [5]:
max_seq_length = 300
dupe_factor = 5
short_seq_prob = 0.1
masked_lm_prob = 0.2
max_predictions_per_seq = 20
eos_id = 1

vocab_words = list(tokenizer.vocab.keys())

In [6]:
from glob import glob

files = glob('../pure-text/*.txt')
files

['../pure-text/dumping-instagram.txt',
 '../pure-text/parliament-text.txt',
 '../pure-text/dumping-parliament.txt',
 '../pure-text/dumping-twitter.txt',
 '../pure-text/normalization-100000.txt',
 '../pure-text/iium.wordpiece-vocab.txt',
 '../pure-text/dumping-iium.txt',
 '../pure-text/twitter-normalization-100000.txt',
 '../pure-text/dumping-wiki.txt',
 '../pure-text/dumping-news.txt',
 '../pure-text/dumping-watpadd.txt',
 '../pure-text/dumping-pdf.txt',
 '../pure-text/dumping-common-crawl.txt']

In [7]:
import tensorflow as tf
from tqdm import tqdm

def create_single_instances(
    input_files,
    tokenizer,
):
    all_documents = []
    for input_file in input_files:
        with open(input_file,'r') as f:
            for line in tqdm(f):
                line = tokenization.convert_to_unicode(line)
                if not line:
                    break
                line = line.strip()
                if line:
                    tokens = tokenizer.tokenize(line)
                    if len(tokens) > 10:
                        all_documents.append(tokens)

    all_documents = [x for x in all_documents if x]
    return all_documents

In [8]:
def process_MaskedLanguagePairDataset(row):
    max_num_tokens = max_seq_length
    target_seq_length = max_num_tokens
    if rng.random() < short_seq_prob:
        target_seq_length = rng.randint(2, max_num_tokens)
    tokens = row[:max_num_tokens]
    
    output_tokens = list(tokens)
    
    cand_indexes = []
    for (i, token) in enumerate(tokens):
        if token == '[CLS]' or token == '[SEP]':
            continue
        if (
            len(cand_indexes) >= 1
            and token.startswith('##')
        ):
            cand_indexes[-1].append(i)
        else:
            cand_indexes.append([i])

    num_to_predict = min(
        max_predictions_per_seq,
        max(1, int(round(len(tokens) * masked_lm_prob))),
    )
    start = random.randint(1, len(cand_indexes) - num_to_predict)

    label, input_decoder = [], []
    covered_indexes = set()
    for index_set in cand_indexes[start:]:
        if len(label) >= num_to_predict:
            break
        is_any_index_covered = False
        for index in index_set:
            if index in covered_indexes:
                is_any_index_covered = True
                break
        if is_any_index_covered:
            continue
        for index in index_set:
            covered_indexes.add(index)

            masked_token = None
            # 80% of the time, replace with [MASK]
            if rng.random() < 0.8:
                masked_token = '[MASK]'
            else:
                # 10% of the time, keep original
                if rng.random() < 0.5:
                    masked_token = tokens[index]
                # 10% of the time, replace with random word
                else:
                    masked_token = vocab_words[
                        rng.randint(0, len(vocab_words) - 1)
                    ]

            output_tokens[index] = masked_token
            label.append(tokens[index])
            input_decoder.append(tokens[index - 1])
            
    return output_tokens, input_decoder, label

In [22]:
import json

def create_pair_instances(
    input_files,
    tokenizer,
):
    all_documents = []
    for input_file in input_files:
        with open(input_file) as fopen:
            data = json.load(fopen)
        for i in tqdm(range(len(data['left']))):
            line_l = tokenization.convert_to_unicode(data['left'][i])
            line_r = tokenization.convert_to_unicode(data['right'][i])
            
            line_l = line_l.strip()
            line_r = line_r.strip()
            if line_l and line_r:
                tokens_l = tokenizer.tokenize(line_l)
                tokens_r = tokenizer.tokenize(line_r)
                if len(tokens_l) < max_seq_length and len(tokens_r) < max_seq_length:
                    all_documents.append((tokens_l, tokens_r))

    return all_documents

In [10]:
# r_pair = create_pair_instances(['en-ms.json'], tokenizer)

In [11]:
def process_NoisyLanguagePairDataset(row):
    max_num_tokens = max_seq_length
    target_seq_length = max_num_tokens
    if rng.random() < short_seq_prob:
        target_seq_length = rng.randint(2, max_num_tokens)
    tokens_l = row[0][:max_num_tokens]
    tokens_r = row[1][:max_num_tokens]
    
    cand_indexes_l = []
    for (i, token) in enumerate(tokens_l):
        if token == '[CLS]' or token == '[SEP]':
            continue
        if (
            len(cand_indexes_l) >= 1
            and token.startswith('##')
        ):
            cand_indexes_l[-1].append(i)
        else:
            cand_indexes_l.append([i])
            
    cand_indexes_r = []
    for (i, token) in enumerate(tokens_r):
        if token == '[CLS]' or token == '[SEP]':
            continue
        if (
            len(cand_indexes_r) >= 1
            and token.startswith('##')
        ):
            cand_indexes_r[-1].append(i)
        else:
            cand_indexes_r.append([i])
            
    rng.shuffle(cand_indexes_l)
    rng.shuffle(cand_indexes_r)

    num_to_predict = min(
        max_predictions_per_seq,
        max(1, int(round(len(tokens_l) * masked_lm_prob))),
    )
    
    masked_lms = []
    output_tokens_l = list(tokens_l)
    covered_indexes = set()
    for index_set in cand_indexes_l:
        if len(masked_lms) >= num_to_predict:
            break
        is_any_index_covered = False
        for index in index_set:
            if index in covered_indexes:
                is_any_index_covered = True
                break
        if is_any_index_covered:
            continue
        for index in index_set:
            covered_indexes.add(index)

            masked_token = None
            # 80% of the time, replace with [MASK]
            if rng.random() < 0.8:
                masked_token = '[MASK]'
            else:
                # 10% of the time, keep original
                if rng.random() < 0.5:
                    masked_token = tokens_l[index]
                # 10% of the time, replace with random word
                else:
                    masked_token = vocab_words[
                        rng.randint(0, len(vocab_words) - 1)
                    ]

            output_tokens_l[index] = masked_token
            masked_lms.append(tokens_l[index])
    
    num_to_predict = min(
        max_predictions_per_seq,
        max(1, int(round(len(tokens_r) * masked_lm_prob))),
    )
    
    output_tokens_r = list(tokens_r)
    label, input_decoder = [], []
    covered_indexes = set()
    for index_set in cand_indexes_r:
        if len(label) >= num_to_predict:
            break
        is_any_index_covered = False
        for index in index_set:
            if index in covered_indexes:
                is_any_index_covered = True
                break
        if is_any_index_covered:
            continue
        for index in index_set:
            covered_indexes.add(index)

            masked_token = None
            # 80% of the time, replace with [MASK]
            if rng.random() < 0.8:
                masked_token = '[MASK]'
            else:
                # 10% of the time, keep original
                if rng.random() < 0.5:
                    masked_token = tokens_r[index]
                # 10% of the time, replace with random word
                else:
                    masked_token = vocab_words[
                        rng.randint(0, len(vocab_words) - 1)
                    ]

            output_tokens_r[index] = masked_token
            label.append(tokens_r[index])
            
    return output_tokens_l, output_tokens_r, tokens_r

In [12]:
import collections
import tensorflow as tf

maxlen = max_seq_length

def create_int_feature(values):
    feature = tf.train.Feature(
        int64_list = tf.train.Int64List(value = list(values))
    )
    return feature

def to_tfrecord(rows, filename):
    input_encoders, input_decoders, labels = [], [], []
    
    for i in tqdm(range(len(rows))):
        input_encoder, input_decoder, label = rows[0]
        input_encoder = tokenizer.convert_tokens_to_ids(input_encoder)
        input_decoder = tokenizer.convert_tokens_to_ids(input_decoder) + [eos_id]
        label = tokenizer.convert_tokens_to_ids(label) + [eos_id]
        input_encoder = input_encoder + [0] * (maxlen - len(input_encoder))
        input_decoder = input_decoder + [0] * (maxlen - len(input_decoder))
        label = label + [0] * (maxlen - len(label))
        input_encoders.append(input_encoder)
        input_decoders.append(input_decoder)
        labels.append(label)
    
    r = tf.python_io.TFRecordWriter(f'{filename}.tfrecord')
    for i in tqdm(range(len(labels))):
        features = collections.OrderedDict()
        features['input_encoder'] = create_int_feature(input_encoders[i])
        features['input_decoder'] = create_int_feature(input_decoders[i])
        features['y'] = create_int_feature(labels[i])
        tf_example = tf.train.Example(
            features = tf.train.Features(feature = features)
        )
        r.write(tf_example.SerializeToString())
    r.close()


In [20]:
import os

def single_pair(files, dupe_factor = 3):
    for file in files:
        print(file)
        f = os.path.split(file)[1]
        r = create_single_instances([file], tokenizer = tokenizer)
        results = []
        for row in r:
            for _ in range(dupe_factor):
                try:
                    results.append(process_MaskedLanguagePairDataset(row))
                except:
                    pass
        to_tfrecord(results, f)
        
def double_pair(files, dupe_factor = 20):
    for file in files:
        print(file)
        f = os.path.split(file)[1]
        r = create_pair_instances([file], tokenizer = tokenizer)
        results = []
        for row in r:
            for _ in range(dupe_factor):
                try:
                    results.append(process_NoisyLanguagePairDataset(row))
                except:
                    pass
        to_tfrecord(results, f)

In [15]:
selected = ['../pure-text/dumping-parliament.txt',
           '../pure-text/dumping-iium.txt',
           '../pure-text/dumping-wiki.txt',
           '../pure-text/dumping-news.txt',
           '../pure-text/dumping-watpadd.txt',
           '../pure-text/dumping-pdf.txt']

pairs = ['en-ms.json', 'ms-en.json']

In [16]:
single_pair(selected)

573it [00:00, 5720.27it/s]

../pure-text/dumping-parliament.txt


960868it [02:04, 7706.33it/s]
100%|██████████| 1219653/1219653 [00:36<00:00, 33380.67it/s]
100%|██████████| 1219653/1219653 [04:41<00:00, 4330.52it/s]
444it [00:00, 4432.95it/s]

../pure-text/dumping-iium.txt


1139838it [03:22, 5637.71it/s]
100%|██████████| 2102775/2102775 [01:02<00:00, 33632.71it/s]
100%|██████████| 2102775/2102775 [08:06<00:00, 4326.24it/s]
279it [00:00, 2785.73it/s]

../pure-text/dumping-wiki.txt


2050801it [09:44, 3509.09it/s]
100%|██████████| 4765857/4765857 [02:38<00:00, 30111.50it/s]
100%|██████████| 4765857/4765857 [18:30<00:00, 4290.66it/s]
263it [00:00, 2618.23it/s]

../pure-text/dumping-news.txt


1897306it [11:02, 2863.58it/s]
100%|██████████| 4799394/4799394 [02:07<00:00, 37591.00it/s]
100%|██████████| 4799394/4799394 [18:31<00:00, 4319.05it/s]
557it [00:00, 5569.73it/s]

../pure-text/dumping-watpadd.txt


1500469it [04:04, 6138.31it/s]
100%|██████████| 3223416/3223416 [01:34<00:00, 34251.87it/s]
100%|██████████| 3223416/3223416 [12:20<00:00, 4351.92it/s]
611it [00:00, 6104.36it/s]

../pure-text/dumping-pdf.txt


651956it [01:51, 5866.14it/s]
100%|██████████| 1325601/1325601 [00:40<00:00, 32677.50it/s]
100%|██████████| 1325601/1325601 [05:04<00:00, 4359.22it/s]
