In [4]:
import json

from tqdm import tqdm
import sys
import numpy as np

from colorama import Fore
from transformers import BertTokenizer

In [5]:
# Load the pretrained BERT tokenizer from transformers; it is used in a later
# cell to write its vocabulary files to disk via save_pretrained().
slow_tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")

In [6]:
slow_tokenizer

PreTrainedTokenizer(name_or_path='bert-base-uncased', vocab_size=30522, model_max_len=512, is_fast=False, padding_side='right', special_tokens={'unk_token': '[UNK]', 'sep_token': '[SEP]', 'pad_token': '[PAD]', 'cls_token': '[CLS]', 'mask_token': '[MASK]'})

In [8]:
# Write the tokenizer files (including vocab.txt) into a local
# "bert-base-uncased" directory; vocab.txt is read back from there when the
# BertWordPieceTokenizer is constructed in a later cell.
slow_tokenizer.save_pretrained("bert-base-uncased")

('bert-base-uncased\\tokenizer_config.json',
 'bert-base-uncased\\special_tokens_map.json',
 'bert-base-uncased\\vocab.txt',
 'bert-base-uncased\\added_tokens.json')

In [9]:
from tokenizers import BertWordPieceTokenizer


# tokenizer = BertWordPieceTokenizer("week2/tokenzier/vocab.txt", lowercase=True)


In [11]:
# Build the tokenizers-library WordPiece tokenizer from the vocab.txt written by
# slow_tokenizer.save_pretrained("bert-base-uncased") in an earlier cell.
# Its encodings expose the per-token character .offsets that Sample.preprocess relies on.
tokenizer = BertWordPieceTokenizer("bert-base-uncased/vocab.txt", lowercase=True)

In [12]:
slow_tokenizer

PreTrainedTokenizer(name_or_path='bert-base-uncased', vocab_size=30522, model_max_len=512, is_fast=False, padding_side='right', special_tokens={'unk_token': '[UNK]', 'sep_token': '[SEP]', 'pad_token': '[PAD]', 'cls_token': '[CLS]', 'mask_token': '[MASK]'})

In [13]:
tokenizer

Tokenizer(vocabulary_size=30522, model=BertWordPiece, unk_token=[UNK], sep_token=[SEP], cls_token=[CLS], pad_token=[PAD], mask_token=[MASK], clean_text=True, handle_chinese_chars=True, strip_accents=None, lowercase=True, wordpieces_prefix=##)

In [14]:
# Directory holding the shared-task JSON files (relative to the working directory).
DATASET_DIR = "AAAI-21-SDU-shared-task-2-AD-master/dataset"


def _load_json(file_name):
    """Parse one JSON file located in DATASET_DIR and return the decoded object."""
    # Decode explicitly as UTF-8: without `encoding`, open() uses the platform
    # locale, which can mis-decode or reject non-ASCII text in the JSON files.
    with open(DATASET_DIR + "/" + file_name, encoding="utf-8") as f:
        return json.load(f)


train = _load_json("train.json")
diction = _load_json("diction.json")
dev = _load_json("dev.json")
predictions = _load_json("predictions.json")

In [15]:
print(len(train))

50034


In [16]:
print(len(dev))

6189


In [17]:
print(len(predictions))

6189


In [18]:
def normalize(list_token):
    """Return a new list containing the lower-cased form of every token.

    Args:
        list_token: iterable of strings.

    Returns:
        A list of the same length with ``str.lower()`` applied to each item.
    """
    return [token.lower() for token in list_token]

In [19]:
# Enrich every training sample in place with:
#   tokens          lower-cased tokens
#   text            the tokens joined by single spaces
#   start_char_idx  character offset of the acronym token inside `text`
#   len_acronym     character length of the acronym token
for s in train:
    lowered = normalize(s["tokens"])
    s["tokens"] = lowered
    s["text"] = " ".join(lowered)
    acronym_pos = s["acronym"]
    # Every token before the acronym contributes its length plus one joining space.
    s["start_char_idx"] = sum(len(lowered[pos]) + 1 for pos in range(acronym_pos))
    s["len_acronym"] = len(lowered[acronym_pos])
    

In [20]:
# Enrich every dev sample in place with:
#   tokens          lower-cased tokens
#   text            the tokens joined by single spaces
#   start_char_idx  character offset of the acronym token inside `text`
#   len_acronym     character length of the acronym token
for s in dev:
    lowered = normalize(s["tokens"])
    s["tokens"] = lowered
    s["text"] = " ".join(lowered)
    acronym_pos = s["acronym"]
    # Every token before the acronym contributes its length plus one joining space.
    s["start_char_idx"] = sum(len(lowered[pos]) + 1 for pos in range(acronym_pos))
    s["len_acronym"] = len(lowered[acronym_pos])
    

In [21]:
class Sample:
    """One (context, expansion) pair encoded as fixed-length model inputs.

    After ``preprocess()`` either ``skip`` is True (the sample could not be
    encoded) or the following attributes are populated:

    * ``start_token_idx`` / ``end_token_idx``: first and last context-token
      indices whose character span overlaps the acronym.
    * ``input_ids``, ``token_type_ids``, ``attention_mask``: lists of length
      ``max_seq_lenght``.

    Args:
        tokenizer: object whose ``encode(text)`` returns an encoding exposing
            ``.ids`` (token ids) and ``.offsets`` ((start, end) character spans).
        expansion: candidate expansion text.
        context: sentence text containing the acronym.
        start_char_idx: character offset of the acronym inside ``context``.
        len_acronym: number of characters of the acronym.
        max_seq_lenght: total sequence length after padding (the misspelled
            name is kept because it is part of the public signature).
    """

    def __init__(self, tokenizer, expansion, context, start_char_idx, len_acronym, max_seq_lenght=384):
        self.tokenizer = tokenizer
        self.expansion = expansion
        self.context = context
        self.start_char_idx = start_char_idx
        self.len_acronym = len_acronym
        self.max_seq_lenght = max_seq_lenght
        self.skip = False

        self.start_token_idx = -1
        self.end_token_idx = -1

        # Defined up front so the attributes exist even when the sample is skipped.
        self.input_ids = None
        self.token_type_ids = None
        self.attention_mask = None

    def preprocess(self):
        """Tokenize the sample and fill in the token span and model inputs.

        Sets ``self.skip = True`` and returns early when the acronym span lies
        outside the context, when no token overlaps the acronym, or when the
        combined sequence is longer than ``max_seq_lenght``.
        """
        # end_char_idx is an exclusive bound, so a span ending exactly at
        # len(context) (acronym is the last token) is valid; only a span that
        # runs past the end of the context is rejected.
        end_char_idx = self.start_char_idx + self.len_acronym
        if end_char_idx > len(self.context):
            self.skip = True
            return

        # Tokenize only after the cheap bounds check has passed.
        tokenized_expansion = self.tokenizer.encode(self.expansion)
        tokenized_context = self.tokenizer.encode(self.context)

        # Per-character mask: 1 for characters that belong to the acronym.
        is_char_in_context = [0] * len(self.context)
        for idx in range(self.start_char_idx, end_char_idx):
            is_char_in_context[idx] = 1

        # Collect the context tokens whose character span touches the acronym.
        arc_token_idx = [
            idx
            for idx, (start, end) in enumerate(tokenized_context.offsets)
            if any(is_char_in_context[start:end])
        ]
        if not arc_token_idx:
            self.skip = True
            return
        self.start_token_idx = arc_token_idx[0]
        self.end_token_idx = arc_token_idx[-1]

        # The first expansion token is dropped before concatenation
        # (presumably the leading [CLS] - confirm against the tokenizer).
        expansion_ids = tokenized_expansion.ids[1:]
        input_ids = tokenized_context.ids + expansion_ids
        token_type_ids = [0] * len(tokenized_context.ids) + [1] * len(expansion_ids)
        attention_mask = [1] * len(input_ids)

        padding_length = self.max_seq_lenght - len(input_ids)
        if padding_length < 0:
            # Over-long sequences are dropped rather than truncated.
            self.skip = True
            return
        if padding_length > 0:
            # Pads with id 0 - assumes 0 is the tokenizer's pad id (TODO confirm).
            input_ids = input_ids + ([0] * padding_length)
            token_type_ids = token_type_ids + ([0] * padding_length)
            attention_mask = attention_mask + ([0] * padding_length)

        self.input_ids = input_ids
        self.token_type_ids = token_type_ids
        self.attention_mask = attention_mask

        

In [22]:
def create_examples(raw_data, desc, tokenizer):
    """Build and preprocess one ``Sample`` per item of ``raw_data``.

    Args:
        raw_data: sized iterable of dicts providing the keys ``"expansion"``,
            ``"text"``, ``"start_char_idx"`` and ``"len_acronym"``.
        desc: label shown in front of the progress bar.
        tokenizer: tokenizer handed to every ``Sample``.

    Returns:
        A list with one ``Sample`` per input item, in input order. Samples that
        could not be encoded are still included, flagged with ``skip=True``.
    """
    examples = []
    # The context-manager form closes the bar even if building a sample raises,
    # so a failed run does not leave a dangling progress bar behind.
    with tqdm(total=len(raw_data), desc=desc,
              position=0, leave=True,
              file=sys.stdout,
              bar_format="{l_bar}%s{bar}%s{r_bar}" % (Fore.BLUE, Fore.RESET)) as p_bar:
        for item in raw_data:
            expansion = item["expansion"]
            context = item["text"]
            start_char_idx = item["start_char_idx"]
            len_acronym = item["len_acronym"]
            example = Sample(tokenizer, expansion, context, start_char_idx, len_acronym)
            example.preprocess()
            examples.append(example)
            p_bar.update(1)
    return examples

In [23]:
# Encode every training sample; samples that cannot be encoded stay in the list
# but are flagged with skip=True.
examples = create_examples(train, "Creating training points", tokenizer)

Creating training points: 100%|[34m██████████[39m| 50034/50034 [00:24<00:00, 2022.38it/s]


In [24]:
def create_inputs_targets(examples):
    """Stack the usable examples into model inputs and span targets.

    Examples whose ``skip`` attribute is not ``False`` are left out.

    Args:
        examples: iterable of objects exposing ``skip`` plus the attributes
            ``input_ids``, ``token_type_ids``, ``attention_mask``,
            ``start_token_idx`` and ``end_token_idx``.

    Returns:
        ``(x, y)`` where ``x`` is ``[input_ids, token_type_ids, attention_mask]``
        and ``y`` is ``[start_token_idx, end_token_idx]``, each entry a NumPy
        array with one row/element per kept example.
    """
    field_names = (
        "input_ids",
        "token_type_ids",
        "attention_mask",
        "start_token_idx",
        "end_token_idx",
    )
    kept = [example for example in examples if example.skip is False]
    arrays = {
        name: np.array([getattr(example, name) for example in kept])
        for name in field_names
    }

    x = [arrays[name] for name in ("input_ids", "token_type_ids", "attention_mask")]
    y = [arrays[name] for name in ("start_token_idx", "end_token_idx")]
    return x, y

In [25]:
# Stack the non-skipped samples: x_train = [input_ids, token_type_ids, attention_mask],
# y_train = [start_token_idx, end_token_idx].
x_train, y_train = create_inputs_targets(examples)

In [26]:
x_train[2].shape

(49880, 384)

In [79]:
from transformers import BertPreTrainedModel