# 질문-응답 task를 위한 BERT fine-tunning

In [1]:
# 라이브러리 임포트
from transformers import AutoModelForQuestionAnswering,AutoTokenizer, Trainer, TrainingArguments

import torch
import numpy as np

## datasets 로딩
datasets 라이브러리를 이용하여 데이터를 가져옵니다.

In [2]:
from datasets import load_dataset
datasets = load_dataset('squad')

Reusing dataset squad (/home/ubuntu/.cache/huggingface/datasets/squad/plain_text/1.0.0/d6ec3ceb99ca480ce37cdd35555d6cb2511d223b9150cce08a837ef62ffea453)


  0%|          | 0/2 [00:00<?, ?it/s]

In [3]:
datasets

DatasetDict({
    train: Dataset({
        features: ['id', 'title', 'context', 'question', 'answers'],
        num_rows: 87599
    })
    validation: Dataset({
        features: ['id', 'title', 'context', 'question', 'answers'],
        num_rows: 10570
    })
})

# 데이터 전처리

In [36]:
# 데이터 셋을 학습 데이터 셋과 검증 데이터 셋으로 나눔 
train_set = datasets['train']
test_set = datasets['validation']

In [37]:
train_contexts, train_questions,train_answers, = train_set['context'], train_set['question'], train_set['answers']
val_contexts, val_questions,val_answers, = test_set['context'], test_set['question'], test_set['answers']

In [38]:
# train_contexts 출력해보기
print(train_contexts[0])

Architecturally, the school has a Catholic character. Atop the Main Building's gold dome is a golden statue of the Virgin Mary. Immediately in front of the Main Building and facing it, is a copper statue of Christ with arms upraised with the legend "Venite Ad Me Omnes". Next to the Main Building is the Basilica of the Sacred Heart. Immediately behind the basilica is the Grotto, a Marian place of prayer and reflection. It is a replica of the grotto at Lourdes, France where the Virgin Mary reputedly appeared to Saint Bernadette Soubirous in 1858. At the end of the main drive (and in a direct line that connects through 3 statues and the Gold Dome), is a simple, modern stone statue of Mary.


In [39]:
# train_questions 출력해보기
print(train_questions[0])

To whom did the Virgin Mary allegedly appear in 1858 in Lourdes France?


In [40]:
# train_answers 출력해보기
print(train_answers[0])

{'text': ['Saint Bernadette Soubirous'], 'answer_start': [515]}


- train_answers를 살펴보면 `answer_start` 밖에 존재하지 않습니다.
- 모델에 학습을 하기위해서는 `answer_end`도 필요하기 때문에 answer의 text를 이용해 추출해줍니다.

In [42]:
def add_end_idx(answers, contexts):
    """
    answers의 text를 이용하여 answer_end 추출
    """
    
    for answer, context in zip(answers, contexts):
        # 참고할 text 지정
        gold_text = answer['text'][0]
        
        # start_idx 추출
        start_idx = answer['answer_start'][0]
        
        # text와 start_idx를 이용하여 end_idx 추출
        end_idx = start_idx + len(gold_text)
        
        # answer_end 변수를 만들어 줍니다.
        answer['answer_end'] = end_idx
        
'''        # ...however, sometimes squad answers are off by a character or two
        if context[start_idx:end_idx] == gold_text:
            # if the answer is not off :)
            
        else:
            # this means the answer is off by 1-2 tokens
            for n in [1, 2]:
                if context[start_idx-n:end_idx-n] == gold_text:
                    answer['answer_start'] = start_idx - n
                    answer['answer_end'] = end_idx - n'''
        
            
# and apply the function to our two answer lists
add_end_idx(train_answers, train_contexts)
add_end_idx(val_answers, val_contexts)

In [None]:
def add_end_idx(answers, contexts):
    """
    answers의 text를 이용하여 answer_end 추출
    """
    
    for answer, context in zip(answers, contexts):
        # 참고할 text 지정
        gold_text = answer['text'][0]
        
        # start_idx 추출
        start_idx = answer['answer_start'][0]
        
        # text와 start_idx를 이용하여 end_idx 추출
        end_idx = start_idx + len(gold_text)
        
        # answer_end 변수를 만들어 줍니다.
        answer['answer_end'] = end_idx
        
'''        # ...however, sometimes squad answers are off by a character or two
        if context[start_idx:end_idx] == gold_text:
            # if the answer is not off :)
            
        else:
            # this means the answer is off by 1-2 tokens
            for n in [1, 2]:
                if context[start_idx-n:end_idx-n] == gold_text:
                    answer['answer_start'] = start_idx - n
                    answer['answer_end'] = end_idx - n'''
        
            
# and apply the function to our two answer lists
add_end_idx(train_answers, train_contexts)
add_end_idx(val_answers, val_contexts)

In [43]:
# answer_end가 잘 추가 되었는지 확인 해봅니다.
train_answers[:5]

[{'text': ['Saint Bernadette Soubirous'],
  'answer_start': [515],
  'answer_end': 541},
 {'text': ['a copper statue of Christ'],
  'answer_start': [188],
  'answer_end': 213},
 {'text': ['the Main Building'], 'answer_start': [279], 'answer_end': 296},
 {'text': ['a Marian place of prayer and reflection'],
  'answer_start': [381],
  'answer_end': 420},
 {'text': ['a golden statue of the Virgin Mary'],
  'answer_start': [92],
  'answer_end': 126}]

In [44]:
# tokenizer 불러오기 
# xlm-roberta-base 모델을 학습할 때 사용된 tokenizer 를 불러 옵니다.

tokenizer = AutoTokenizer.from_pretrained('xlm-roberta-base')

# tokenize
train_encodings = tokenizer(train_contexts, train_questions, truncation=True, padding=True)
val_encodings = tokenizer(val_contexts, val_questions, truncation=True, padding=True)

tokenizer를 이용해 encoding 된 데이터에 start_position과 end_position을 추가해 줍니다.


In [46]:
def add_token_positions(encodings, answers):
    
    start_positions = []
    end_positions = []
    for i in range(len(answers)):
        # char_to_token method를 이용하여 시작/종료 위치 토큰 추가
        start_positions.append(encodings.char_to_token(i, answers[i]['answer_start'][0]))
        end_positions.append(encodings.char_to_token(i, answers[i]['answer_end']))

        # start position 이 None 이라면, encoding 과정에서 잘린 것입니다. 
        # 그럴땐 max_length로 지정 해줍니다.
        if start_positions[-1] is None:
            start_positions[-1] = tokenizer.model_max_length
        # end position이 None이라면 None이 아닐때까지 한칸씩 앞으로 당겨줍니다.
        shift = 1
        while end_positions[-1] is None:
            end_positions[-1] = encodings.char_to_token(i, answers[i]['answer_end'] - shift)
            shift += 1
            
    # 최종적으로 encodings obejct를 업데이트 해줍니다.    
    encodings.update({'start_positions': start_positions, 'end_positions': end_positions})

# apply function to our data
add_token_positions(train_encodings, train_answers)
add_token_positions(val_encodings, val_answers)

In [47]:
# encodings에 잘 추가 된 것을 확인할 수 있습니다.
train_encodings.keys()

dict_keys(['input_ids', 'attention_mask', 'start_positions', 'end_positions'])

## 모델 학습

In [50]:
import torch

class SquadDataset(torch.utils.data.Dataset):
    def __init__(self, encodings):
        self.encodings = encodings

    def __getitem__(self, idx):
        return {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}

    def __len__(self):
        return len(self.encodings.input_ids)

# build datasets for both our training and validation sets
train_dataset = SquadDataset(train_encodings)
val_dataset = SquadDataset(val_encodings)

In [65]:
model = AutoModelForQuestionAnswering.from_pretrained('xlm-roberta-base')

Some weights of the model checkpoint at xlm-roberta-base were not used when initializing XLMRobertaForQuestionAnswering: ['lm_head.layer_norm.weight', 'lm_head.layer_norm.bias', 'lm_head.dense.bias', 'lm_head.dense.weight', 'lm_head.decoder.weight', 'lm_head.bias']
- This IS expected if you are initializing XLMRobertaForQuestionAnswering from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing XLMRobertaForQuestionAnswering from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of XLMRobertaForQuestionAnswering were not initialized from the model checkpoint at xlm-roberta-base and are newly initialized: ['qa_outputs.weight', 'qa_outputs.bias']
You should probably TRAIN this model on a down-stream tas

In [66]:
from torch.utils.data import DataLoader
from transformers import AdamW
from tqdm import tqdm

# device 지정
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')

model.to(device)

optim = AdamW(model.parameters(), lr=5e-5)

# 모델 학습을 위해 dataloader 생성
train_loader = DataLoader(train_dataset, batch_size=16, shuffle=True)

for epoch in range(3):
    # 모델을 train 모드로 지정
    model.train()
    # tqdm progress bar를 위한 설정
    loop = tqdm(train_loader, leave=True)
    
    for batch in loop:
        
        optim.zero_grad()
        
        # batch에서 데이터를 가져와 device로 보냄
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        start_positions = batch['start_positions'].to(device)
        end_positions = batch['end_positions'].to(device)
        
        # 모델을 통해 outputs 생성
        outputs = model(input_ids, attention_mask=attention_mask,
                        start_positions=start_positions,
                        end_positions=end_positions)
        # loss 추출
        loss = outputs[0]
        # loss 계산
        loss.backward()
        
        # 파라미터 업데이트
        optim.step()
        
        # progress bar에 표시
        loop.set_description(f'Epoch {epoch}')
        loop.set_postfix(loss=loss.item())

Epoch 0: 100%|██████████| 5475/5475 [48:10<00:00,  1.89it/s, loss=0.654]
Epoch 1: 100%|██████████| 5475/5475 [48:12<00:00,  1.89it/s, loss=0.84] 
Epoch 2: 100%|██████████| 5475/5475 [48:12<00:00,  1.89it/s, loss=0.835]


In [57]:
# 모델을 평가 모드로 변경
# 모델 추론 시 불필요한 기능 (batch normalization, drop out등)을 사용하지 않는다
model.eval()

# 평가를 위한 dataloader 생성
val_loader = DataLoader(val_dataset, batch_size=16)


acc = []


for batch in val_loader:
    
    with torch.no_grad():
        
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        
        # 정확도 계산시 사용 예정
        start_true = batch['start_positions'].to(device)
        end_true = batch['end_positions'].to(device)
        
        # 모델을 이용해 예측 실행
        outputs = model(input_ids, attention_mask=attention_mask)
        
        # 정확도 계산
        start_pred = torch.argmax(outputs['start_logits'], dim=1)
        end_pred = torch.argmax(outputs['end_logits'], dim=1)
        
        # 시작지점이 같은지, 종료 지점이 같은지 확인해서 정확도 계산
        acc.append(((start_pred == start_true).sum()/len(start_pred)).item())
        acc.append(((end_pred == end_true).sum()/len(end_pred)).item())
# 전체 정확도 계산
acc = sum(acc)/len(acc)

In [67]:
acc

0.7087840393478679

# reference
- https://towardsdatascience.com/how-to-fine-tune-a-q-a-transformer-86f91ec92997
- https://huggingface.co/docs/transformers/index
- https://github.com/huggingface/transformers/tree/master/notebooks
    