In [2]:
from transformers import (AdamW, get_linear_schedule_with_warmup, AutoModelForMaskedLM, AutoConfig, AutoTokenizer, DataCollatorWithPadding)
from torch.utils.data import Dataset, DataLoader

from tqdm import tqdm
import torch.nn.functional as F
import torch
import pickle
import json
import os

2022-05-25 08:32:44.799373: I tensorflow/stream_executor/platform/default/dso_loader.cc:49] Successfully opened dynamic library libcudart.so.10.1


In [3]:
# load pretrained-model
model_name = "bert-base-uncased"
model = AutoModelForMaskedLM.from_pretrained(model_name)
tokenizer = AutoTokenizer.from_pretrained(model_name)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

model.to(device)

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForMaskedLM: ['cls.seq_relationship.weight', 'cls.seq_relationship.bias']
- This IS expected if you are initializing BertForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


BertForMaskedLM(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0): BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=Tr

In [4]:
# tokenizer test
input_text = "[CLS] She goes school by [HL] bus [/HL] [SEP]"
tokenized_text = tokenizer.tokenize(input_text,add_special_tokens=False)
print(tokenized_text)

['[CLS]', 'she', 'goes', 'school', 'by', '[', 'h', '##l', ']', 'bus', '[', '/', 'h', '##l', ']', '[SEP]']


In [5]:
# add [HL], [/HL] token 
added_token_num = tokenizer.add_special_tokens({"additional_special_tokens":["[HL]","[/HL]"]})
tokenized_text = tokenizer.tokenize(input_text,add_special_tokens=False)
print(tokenized_text)

# add token number
print(model.get_input_embeddings())
model.resize_token_embeddings(tokenizer.vocab_size + added_token_num)
print(model.get_input_embeddings()) # 2개 증가 된 것을 확인 가능함

['[CLS]', 'she', 'goes', 'school', 'by', '[HL]', 'bus', '[/HL]', '[SEP]']
Embedding(30522, 768, padding_idx=0)
Embedding(30524, 768)


In [6]:
# load data
file_path = "./data/squad_nqg/train.json"
with open(file_path, 'r') as file:
    data = json.load(file)

In [7]:
# sampling data
print(data[0])

input_text = data[0]['context']
target_text = data[0]['question']

# input_text = "[CLS] Jane's favorite food is [HL] chicken [/HL] [SEP]"
# target_text = "What is Jane's favorite food"

{'context': "Heresy is any provocative belief or theory that is strongly at variance with established beliefs or customs. A heretic is a proponent of such claims or beliefs. Heresy is distinct from both apostasy, which is the explicit renunciation of one's religion, principles or cause, and blasphemy, which is an impious utterance or action concerning God or sacred things.", 'question': 'What is heresy mainly at odds with?', 'answers': [{'answer_start': 77, 'text': 'established beliefs or customs'}]}


In [8]:
# add [HL], [/HL]

cls_token = tokenizer.cls_token
sep_token = tokenizer.sep_token
mask_token = tokenizer.mask_token


answer_start = data[0]['answers'][0]['answer_start']
answer = data[0]['answers'][0]['text']

add_hl = f"{cls_token} {data[0]['context'][:answer_start]} [HL] {data[0]['context'][answer_start:answer_start+len(answer)]} [/HL] {data[0]['context'][answer_start+len(answer):]} {sep_token}"
print(add_hl)

[CLS] Heresy is any provocative belief or theory that is strongly at variance with  [HL] established beliefs or customs [/HL] . A heretic is a proponent of such claims or beliefs. Heresy is distinct from both apostasy, which is the explicit renunciation of one's religion, principles or cause, and blasphemy, which is an impious utterance or action concerning God or sacred things. [SEP]


In [9]:
tokenized_sentence = tokenizer(
            "[CLS] Jane's favorite food is [HL] chicken [/HL] [SEP]",
            padding=True,  # 문장의 길이가 짧다면 padding
            truncation=True,  # 문장이 길다면 truncate
            max_length=512,
            return_token_type_ids=True,  # roberta 모델에서는 False
            return_tensors="pt",  # Tensor로 반환!
        )

tokenized_sentence

{'input_ids': tensor([[  101,   101,  4869,  1005,  1055,  5440,  2833,  2003, 30522,  7975,
         30523,   102,   102]]), 'token_type_ids': tensor([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]])}

In [10]:
cls_token = tokenizer.cls_token
sep_token = tokenizer.sep_token
mask_token = tokenizer.mask_token

train_pickle = './squad_train.pickle'

if os.path.isfile(train_pickle):
    with open(train_pickle, 'rb') as f:
        example_list = pickle.load(f)
    print('*** finished to load train pickle file!')
    
else:       
    example_list = []

    for d in tqdm(data[:len(data)], desc = "making pickle file...: "):
        example_pair = dict()
        target_text = d['question']
        answer_start = d['answers'][0]['answer_start']
        answers = d['answers'][0]['text']
        input_text = f"{cls_token} {d['context'][:answer_start]} [HL] {d['context'][answer_start:answer_start+len(answer)]} [/HL] {d['context'][answer_start+len(answer):]} {sep_token}"
        tokenized_target = tokenizer.tokenize(target_text) # tokenize question
        tokenized_text = tokenizer.tokenize(input_text,add_special_tokens=False)
        if len(tokenized_target + tokenized_text) + 2 >= 512:
            continue
        
        for i in range(0,len(tokenized_target)+1):
            # tokenized
            
            tokenized_text.extend(tokenized_target[:i]) # tokenized_context + tokenized_question[:i]
            tokenized_text.append(mask_token) # tokenized_context + tokenized_question[:i] + [MASK]
            indexed_tokens = tokenizer.convert_tokens_to_ids(tokenized_text)
            #tokens_tensor = torch.tensor([indexed_tokens]).to(device)
        
            # label
            #loss_ids = [0] * (len(tokenizer.convert_tokens_to_ids(tokenized_text))-1)
            loss_ids = indexed_tokens.copy()
            
            if i == len(tokenized_target):
                loss_ids.append(tokenizer.convert_tokens_to_ids(tokenizer.tokenize(sep_token))[0])
            else:
                loss_ids.append(tokenizer.convert_tokens_to_ids(tokenized_target[i]))

            loss_tensors = torch.tensor([loss_ids]).to(device)
            input_ids = tokenizer.convert_tokens_to_ids(tokenized_text)
            decodes_ids = tokenizer.decode(input_ids)
            example_pair[decodes_ids] = loss_tensors
            # print(tokenized_text,loss_ids,loss_ids[-1])
            # print(len(indexed_tokens),len(loss_tensors))
            
        example_list.append(example_pair)
        
    with open(train_pickle,'wb') as f:
        pickle.dump(example_list, f, pickle.HIGHEST_PROTOCOL)

In [None]:
len(example_list)

75612

In [None]:
# 문장 list
train_sentences = []
# 정답 list
train_label = []

for examples in tqdm(example_list):
    train_sentences.extend(list(examples.keys()))
    train_label.extend(list(examples.values()))
        
print(len(train_sentences), len(train_label))

100%|██████████| 75612/75612 [00:00<00:00, 220049.48it/s]

1001690 1001690





In [None]:
train_sentences[0]

"[CLS] heresy is any provocative belief or theory that is strongly at variance with [HL] established beliefs or customs [/HL]. a heretic is a proponent of such claims or beliefs. heresy is distinct from both apostasy, which is the explicit renunciation of one's religion, principles or cause, and blasphemy, which is an impious utterance or action concerning god or sacred things. [SEP] [MASK]"

In [None]:
def tokenized_dataset(data):  # data: 문장 list
    tokenized_sentence = tokenizer(
        data,
        padding=True,  # 문장의 길이가 짧다면 padding
        truncation=True,  # 문장이 길다면 truncate
        max_length=512,
        return_token_type_ids=True,  # roberta 모델에서는 False
        return_tensors="pt",  # Tensor로 반환!
        add_special_tokens=False
    )
    return tokenized_sentence

train_tokenized = tokenized_dataset(train_sentences)


NameError: name 'train_sentences' is not defined

In [None]:
pad_len = len(train_tokenized['input_ids'][11781])

NameError: name 'train_tokenized' is not defined

In [None]:
train_label_list = []
for tl in tqdm(train_label):
    target = torch.zeros(pad_len)
    try:
        target[:len(tl[0])]=tl[0]
    except:
        target = tl[0][:-1]
    train_label_list.append(target.tolist())
    
train_label_list = torch.tensor(train_label_list).int()
    

100%|██████████| 30945/30945 [00:02<00:00, 11788.00it/s]


In [None]:
train_tokenized['input_ids']

tensor([[  101, 28354,  2003,  ...,     0,     0,     0],
        [  101, 28354,  2003,  ...,     0,     0,     0],
        [  101, 28354,  2003,  ...,     0,     0,     0],
        ...,
        [  101,  2076,  2023,  ...,     0,     0,     0],
        [  101,  2076,  2023,  ...,     0,     0,     0],
        [  101,  2076,  2023,  ...,     0,     0,     0]])

In [None]:
train_label_list

tensor([[  101, 28354,  2003,  ...,     0,     0,     0],
        [  101, 28354,  2003,  ...,     0,     0,     0],
        [  101, 28354,  2003,  ...,     0,     0,     0],
        ...,
        [  101,  2076,  2023,  ...,     0,     0,     0],
        [  101,  2076,  2023,  ...,     0,     0,     0],
        [  101,  2076,  2023,  ...,     0,     0,     0]], dtype=torch.int32)

In [None]:
train_tokenized['labels'] = train_label_list.clone()

In [None]:
train_tokenized.keys()

dict_keys(['input_ids', 'token_type_ids', 'attention_mask', 'labels'])

In [None]:
num_train_epochs = 10
weight_decay = 0.0
learning_rate = 5e-5
adam_epsilon = 1e-8
max_grad_norm = 1.0
warmup_steps = 0.0
batch_size = 8
max_steps = -1
gradient_accumulation_steps = 1


In [None]:
class SquadDataset(Dataset):
    def __init__(self, data):  # tokenized 된 것과 라벨이 들어옴
        self.data = data

    def __len__(self):  # data의 전체 길이
        return len(self.data)

    def __getitem__(self, idx):
        item = {key: val[idx].clone().detach() for key, val in self.data.items()}
        #item["labels"] = torch.tensor(self.labels[idx])

        return item

In [None]:
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)
train_dataloader = DataLoader(
    SquadDataset(train_tokenized), shuffle=False, batch_size=8, collate_fn=data_collator
)
t_total = len(train_tokenized) // gradient_accumulation_steps * num_train_epochs

In [None]:
for batch in train_dataloader:
    print(batch)
    break
{k: v.shape for k, v in batch.items()}

{'input_ids': tensor([[  101, 28354,  2003,  ...,     0,     0,     0],
        [  101, 28354,  2003,  ...,     0,     0,     0],
        [  101, 28354,  2003,  ...,     0,     0,     0],
        [  101, 28354,  2003,  ...,     0,     0,     0]]), 'token_type_ids': tensor([[0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0]]), 'attention_mask': tensor([[1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0]]), 'labels': tensor([[  101, 28354,  2003,  ...,     0,     0,     0],
        [  101, 28354,  2003,  ...,     0,     0,     0],
        [  101, 28354,  2003,  ...,     0,     0,     0],
        [  101, 28354,  2003,  ...,     0,     0,     0]])}


{'input_ids': torch.Size([4, 467]),
 'token_type_ids': torch.Size([4, 467]),
 'attention_mask': torch.Size([4, 467]),
 'labels': torch.Size([4, 467])}

In [None]:
no_decay = ["bias", "LayerNorm.weight"]
optimizer_grouped_parameters = [
{
"params": [p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay)],
"weight_decay": weight_decay,
},
{"params": [p for n, p in model.named_parameters() if any(nd in n for nd in no_decay)], "weight_decay": weight_decay},
]
optimizer = AdamW(optimizer_grouped_parameters, lr=learning_rate, eps=adam_epsilon)
scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=warmup_steps, num_training_steps=t_total)



In [None]:
model.train()

for epoch in range(num_train_epochs):
    eveloss = 0
    train_loader = tqdm(train_dataloader, desc='Loading train dataset')
    for j, batch in enumerate(train_loader):
        batch = {k: v.to(device) for k, v in batch.items()}
        outputs = model(**batch)#k,labels=v)
        loss = outputs.loss
        eveloss += loss.mean().item()
        loss.backward()
        optimizer.step()
        optimizer.zero_grad()
        train_loader.set_description("Loss %.04f | step %d" % (loss, j))
            
    torch.save(model.state_dict(), f'model_weights_{epoch}_{eveloss}.pth')
    print("epoch "+ str(epoch) + " : " + str(eveloss))

Loss 15.3917 | step 0: 100%|██████████| 1/1 [00:00<00:00,  5.94it/s]


epoch 0 : 15.39171028137207


Loss 10.5941 | step 0: 100%|██████████| 1/1 [00:00<00:00,  6.07it/s]


epoch 1 : 10.594099998474121


Loss 7.8155 | step 0: 100%|██████████| 1/1 [00:00<00:00,  6.05it/s]


epoch 2 : 7.815539836883545


Loss 6.4056 | step 0: 100%|██████████| 1/1 [00:00<00:00,  6.10it/s]


epoch 3 : 6.405588150024414


Loss 5.7463 | step 0: 100%|██████████| 1/1 [00:00<00:00,  6.09it/s]


epoch 4 : 5.746278762817383
