In [1]:
from google.colab import drive
drive.mount('/content/drive')

Go to this URL in a browser: https://accounts.google.com/o/oauth2/auth?client_id=947318989803-6bn6qk8qdgf4n4g3pfee6491hc0brc4i.apps.googleusercontent.com&redirect_uri=urn%3aietf%3awg%3aoauth%3a2.0%3aoob&scope=email%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdocs.test%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive.photos.readonly%20https%3a%2f%2fwww.googleapis.com%2fauth%2fpeopleapi.readonly&response_type=code

Enter your authorization code:
4/3wFEu8i6FB6Bet9970nkjkB5_Iy3Z5uR7MWgiqXHnHBz_l7NmcWfFDA
Mounted at /content/drive


In [2]:
!pip install -r "/content/drive/My Drive/Colab Notebooks/reformer/requirements.txt"

Collecting transformers
[?25l  Downloading https://files.pythonhosted.org/packages/ae/05/c8c55b600308dc04e95100dc8ad8a244dd800fe75dfafcf1d6348c6f6209/transformers-3.1.0-py3-none-any.whl (884kB)
[K     |████████████████████████████████| 890kB 8.5MB/s 
[?25hCollecting reformer_pytorch
  Downloading https://files.pythonhosted.org/packages/a5/75/aaf6162bd305b7ee88136425b725df482b6cb812783ce82748bd469ea21a/reformer_pytorch-1.1.3-py3-none-any.whl
Collecting tokenizers==0.8.1.rc2
[?25l  Downloading https://files.pythonhosted.org/packages/80/83/8b9fccb9e48eeb575ee19179e2bdde0ee9a1904f97de5f02d19016b8804f/tokenizers-0.8.1rc2-cp36-cp36m-manylinux1_x86_64.whl (3.0MB)
[K     |████████████████████████████████| 3.0MB 30.7MB/s 
Collecting sentencepiece!=0.1.92
[?25l  Downloading https://files.pythonhosted.org/packages/d4/a4/d0a884c4300004a78cca907a6ff9a5e9fe4f090f5d95ab341c53d28cbc58/sentencepiece-0.1.91-cp36-cp36m-manylinux1_x86_64.whl (1.1MB)
[K     |████████████████████████████████| 1.1MB 5

## 필요 패키지 선언

In [3]:
from __future__ import absolute_import, division, print_function
import warnings
warnings.filterwarnings("ignore")

import argparse
import logging
import os
import random
import sys
sys.path.append('/content/drive/My Drive/Colab Notebooks/reformer/')
from io import open

import numpy as np
import torch
from torch.utils.data import (DataLoader, RandomSampler,SequentialSampler, TensorDataset)
from tqdm import tqdm, trange

from model.mrc import ReformerMRCModel
from transformers.optimization import AdamW
from util.schedule import WarmupLinearSchedule
from transformers import BertTokenizer
from util.korquad_utils import (read_squad_examples, convert_examples_to_features, RawResult, write_predictions,evaluate)

if sys.version_info[0] == 2:
    import cPickle as pickle
else:
    import pickle


def count_parameters(model):
    return sum(p.numel() for p in model.parameters() if p.requires_grad)


logging.basicConfig(format='%(asctime)s - %(levelname)s - %(name)s -   %(message)s',
                    datefmt='%m/%d/%Y %H:%M:%S',
                    level=logging.INFO)
logger = logging.getLogger(__name__)

## 파일경로 및 Hyper-parameters 초기화

#### PATH

In [4]:
gdrive_path = "/content/drive/My Drive/Colab Notebooks/reformer"
output_dir = gdrive_path
checkpoint = os.path.join(gdrive_path, "checkpoints/epoch27-reformer-small.pt")
train_file = os.path.join(gdrive_path, "finetuning/data/korquad/KorQuAD_v1.0_train.json")
vocab_file = os.path.join(gdrive_path, "ko_vocab_32k.txt")

#### Hyper Parameteer

In [5]:
doc_stride = 128 
max_query_length = 96 
max_answer_length = 30 
n_best_size = 20 

train_batch_size = 64 
learning_rate = 5e-5
warmup_proportion = 0.1 
num_train_epochs = 5.0

max_grad_norm = 1.0
adam_epsilon = 1e-6
weight_decay = 0.01

#### Device 설정

In [6]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
n_gpu = torch.cuda.device_count()
logger.info("device: {} n_gpu: {}".format(device, n_gpu))

SEED = 42
random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)

if n_gpu > 0:
        torch.cuda.manual_seed_all(SEED)

if not os.path.exists(output_dir):
    os.makedirs(output_dir)

09/09/2020 03:59:22 - INFO - __main__ -   device: cuda n_gpu: 1


In [7]:
# 1. Wordpiece Tokenizer
wordpiece_vocab_path = f"{gdrive_path}/data/vocab.txt"
tokenizer = BertTokenizer(vocab_file=wordpiece_vocab_path, do_lower_case=False)

# 2. Model Hyperparameter
max_seq_length = 512
batch_size = 128
dim = 512
depth = 6
heads = 8
causal = False

# 3. Prepare model
model = ReformerMRCModel(
    num_tokens=tokenizer.vocab_size,
    dim=dim,
    depth=depth,
    heads=heads,
    max_seq_len=max_seq_length,
    causal=causal  # auto-regressive 학습을 위한 설정
)


### Train Koquad

In [None]:

model.reformer.load_state_dict(torch.load(checkpoint, map_location=device),strict=False)
num_params = count_parameters(model)
logger.info("Total Parameter: %d" % num_params)
model.to(device)

cached_train_features_file = train_file + '_{0}_{1}_{2}'.format(str(max_seq_length), str(doc_stride),
                                                                      str(max_query_length))
train_examples = read_squad_examples(input_file=train_file, is_training=True, version_2_with_negative=False)
try:
    with open(cached_train_features_file, "rb") as reader:
        train_features = pickle.load(reader)
except:
    train_features = convert_examples_to_features(
        examples=train_examples,
        tokenizer=tokenizer,
        max_seq_length=max_seq_length,
        doc_stride=doc_stride,
        max_query_length=max_query_length,
        is_training=True)
    logger.info("  Saving train features into cached file %s", cached_train_features_file)
    with open(cached_train_features_file, "wb") as writer:
        pickle.dump(train_features, writer)

num_train_optimization_steps = int(len(train_features) / train_batch_size) * num_train_epochs

# Prepare optimizer
param_optimizer = list(model.named_parameters())
no_decay = ['bias', 'LayerNorm.weight']
optimizer_grouped_parameters = [
    {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)],
      'weight_decay': 0.01},
    {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)], 'weight_decay': 0.0}
]

optimizer = AdamW(optimizer_grouped_parameters,
                  lr=learning_rate,
                  eps=adam_epsilon)
scheduler = WarmupLinearSchedule(optimizer,
                                  warmup_steps=num_train_optimization_steps*0.1,
                                  t_total=num_train_optimization_steps)

logger.info("***** Running training *****")
logger.info("  Num orig examples = %d", len(train_examples))
logger.info("  Num split examples = %d", len(train_features))
logger.info("  Batch size = %d", train_batch_size)
logger.info("  Num steps = %d", num_train_optimization_steps)
num_train_step = num_train_optimization_steps

all_input_ids = torch.tensor([f.input_ids for f in train_features], dtype=torch.long)
all_input_mask = torch.tensor([f.input_mask for f in train_features], dtype=torch.long)
all_segment_ids = torch.tensor([f.segment_ids for f in train_features], dtype=torch.long)
all_start_positions = torch.tensor([f.start_position for f in train_features], dtype=torch.long)
all_end_positions = torch.tensor([f.end_position for f in train_features], dtype=torch.long)
train_data = TensorDataset(all_input_ids, all_input_mask, all_segment_ids,
                            all_start_positions, all_end_positions)

train_sampler = RandomSampler(train_data)
train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=train_batch_size)

model.train()
global_step = 0
epoch = 0
for i in range(int(num_train_epochs)):
    iter_bar = tqdm(train_dataloader, desc=f"Epoch-{i} Train(XX Epoch) Step(XX/XX) (Mean loss=X.X) (loss=X.X)")
    tr_step, total_loss, mean_loss = 0, 0., 0.
    for step, batch in enumerate(iter_bar):
        if n_gpu == 1:
            batch = tuple(t.to(device) for t in batch)  # multi-gpu does scattering it-self
        input_ids, input_mask, segment_ids, start_positions, end_positions = batch
        loss = model(input_ids, start_positions, end_positions)

        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), max_grad_norm)

        scheduler.step()
        optimizer.step()
        optimizer.zero_grad()
        global_step += 1
        tr_step += 1
        total_loss += loss.item()
        mean_loss = total_loss / tr_step
        iter_bar.set_description(f"Epoch-{i} Train Step(%d / %d) (Mean loss=%5.5f) (loss=%5.5f)" %
                                  (global_step, num_train_step, mean_loss, loss.item()))

    logger.info("** ** * Saving file * ** **")
    model_checkpoint = "korquad_%d.bin" % (epoch)
    logger.info(model_checkpoint)
    output_model_file = os.path.join(output_dir,model_checkpoint)

    torch.save(model.state_dict(), output_model_file)
    epoch += 1


09/07/2020 10:58:53 - INFO - __main__ -   Total Parameter: 29687810
09/07/2020 10:59:07 - INFO - __main__ -   ***** Running training *****
09/07/2020 10:59:07 - INFO - __main__ -     Num orig examples = 60407
09/07/2020 10:59:07 - INFO - __main__ -     Num split examples = 62787
09/07/2020 10:59:07 - INFO - __main__ -     Batch size = 64
09/07/2020 10:59:07 - INFO - __main__ -     Num steps = 4905
Epoch-0 Train Step(982 / 4905) (Mean loss=2.75833) (loss=2.07941): 100%|██████████| 982/982 [36:56<00:00,  2.26s/it]
09/07/2020 11:36:05 - INFO - __main__ -   ** ** * Saving file * ** **
09/07/2020 11:36:05 - INFO - __main__ -   korquad_0.bin
Epoch-1 Train Step(1964 / 4905) (Mean loss=1.12940) (loss=1.04347): 100%|██████████| 982/982 [36:55<00:00,  2.26s/it]
09/07/2020 12:13:03 - INFO - __main__ -   ** ** * Saving file * ** **
09/07/2020 12:13:03 - INFO - __main__ -   korquad_1.bin
Epoch-2 Train Step(2946 / 4905) (Mean loss=0.84125) (loss=1.32237): 100%|██████████| 982/982 [36:57<00:00,  2.26

## Evaluation

In [9]:
checkpoint = os.path.join(gdrive_path, "korquad_3.bin")
predict = os.path.join(gdrive_path, "finetuning/data/korquad/KorQuAD_v1.0_train.json")

predict_batch_size = 16 #@param {type: "integer"}

### Model load

In [10]:
model.load_state_dict(torch.load(checkpoint))
num_params = count_parameters(model)
logger.info("Total Parameter: %d" % num_params)

09/09/2020 04:10:43 - INFO - __main__ -   Total Parameter: 29687810


In [13]:
eval_examples = read_squad_examples(input_file=predict,
                               is_training=False,
                               version_2_with_negative=False)
eval_features = convert_examples_to_features(examples=eval_examples,
                                        tokenizer=tokenizer,
                                        max_seq_length=max_seq_length,
                                        doc_stride=doc_stride,
                                        max_query_length=max_query_length,
                                        is_training=False)

09/09/2020 04:11:15 - INFO - util.korquad_utils -   *** Example ***
09/09/2020 04:11:15 - INFO - util.korquad_utils -   unique_id: 1000000000
09/09/2020 04:11:15 - INFO - util.korquad_utils -   example_index: 0
09/09/2020 04:11:15 - INFO - util.korquad_utils -   doc_span_index: 0
09/09/2020 04:11:15 - INFO - util.korquad_utils -   tokens: [CLS] 바 ##그 ##너는 괴 ##테 ##의 파 ##우스 ##트를 읽고 무엇을 쓰고 ##자 했 ##는가 ? [SEP] 18 ##3 ##9년 바 ##그 ##너는 괴 ##테 ##의 파 ##우스 ##트 ##을 처음 읽고 그 내용 ##에 마음이 끌려 이를 소재로 해서 하나의 교 ##향 ##곡을 쓰 ##려는 뜻을 갖는 ##다 . 이 시기 바 ##그 ##너는 18 ##3 ##8년 ##에 빛 독 ##촉 ##으로 산 ##전 ##수 ##전을 다 걲 ##은 상황이라 좌절 ##과 실망 ##에 가득 ##했으며 메 ##피스 ##토 ##펠 ##레스 ##를 만나는 파 ##우스 ##트의 심 ##경에 공감 ##했다고 한다 . 또한 파리 ##에서 아 ##브 ##네 ##크의 지휘 ##로 파리 음악 ##원 관 ##현 ##악 ##단이 연주 ##하는 베 ##토 ##벤 ##의 교 ##향 ##곡 9 ##번을 듣고 깊은 감 ##명을 받았 ##는데 , 이것이 이듬해 1월 ##에 파 ##우스 ##트의 서 ##곡으로 쓰여 ##진 이 작품에 조금이라도 영향을 끼 ##쳤 ##으 ##리라는 것은 의심 ##할 여지가 없다 . 여기 ##의 라 ##단 ##조 조성 ##의 경우에도 그의 전기 ##에 적혀 있는 것처럼 단순한 정신적 피로 ##나 실 ##의가 반영 ##된 것이 아니라 베 ##토 ##벤 ##의 합 ##창 ##

In [14]:
all_input_ids = torch.tensor([f.input_ids for f in eval_features], dtype=torch.long)
all_input_mask = torch.tensor([f.input_mask for f in eval_features], dtype=torch.long)
all_segment_ids = torch.tensor([f.segment_ids for f in eval_features], dtype=torch.long)
all_example_index = torch.arange(all_input_ids.size(0), dtype=torch.long)
dataset = TensorDataset(all_input_ids, all_input_mask, all_segment_ids, all_example_index)
sampler = SequentialSampler(dataset)
dataloader = DataLoader(dataset, sampler=sampler, batch_size=predict_batch_size)

logger.info("***** Evaluating *****")
logger.info("  Num features = %d", len(dataset))
logger.info("  Batch size = %d", predict_batch_size)

09/09/2020 04:15:42 - INFO - __main__ -   ***** Evaluating *****
09/09/2020 04:15:42 - INFO - __main__ -     Num features = 62787
09/09/2020 04:15:42 - INFO - __main__ -     Batch size = 16


In [15]:
model.eval()
model.to(device)
all_results = []
random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)
if n_gpu > 0:
    torch.cuda.manual_seed_all(SEED)

## KorQuAD1.0 검증

In [16]:
logger.info("Start evaluating!")
for input_ids, input_mask, segment_ids, example_indices in tqdm(dataloader, desc="Evaluating", leave=True, position=0):
    input_ids = input_ids.to(device)
    input_mask = input_mask.to(device)
    segment_ids = segment_ids.to(device)
    with torch.no_grad():
        batch_start_logits, batch_end_logits = model(input_ids)
    for i, example_index in enumerate(example_indices):
        start_logits = batch_start_logits[i].detach().cpu().tolist()
        end_logits = batch_end_logits[i].detach().cpu().tolist()
        eval_feature = eval_features[example_index.item()]
        unique_id = int(eval_feature.unique_id)
        all_results.append(RawResult(unique_id=unique_id,
                                     start_logits=start_logits,
                                     end_logits=end_logits))
output_prediction_file = os.path.join(output_dir, "predictions.json")
output_nbest_file = os.path.join(output_dir, "nbest_predictions.json")
write_predictions(eval_examples, eval_features, all_results,
                    n_best_size, max_answer_length,
                    False, output_prediction_file, output_nbest_file,
                    None, False, False, 0.0)

09/09/2020 04:19:53 - INFO - __main__ -   Start evaluating!
Evaluating: 100%|██████████| 3925/3925 [07:21<00:00,  8.88it/s]
09/09/2020 04:27:15 - INFO - util.korquad_utils -   Writing predictions to: /content/drive/My Drive/Colab Notebooks/reformer/predictions.json
09/09/2020 04:27:15 - INFO - util.korquad_utils -   Writing nbest to: /content/drive/My Drive/Colab Notebooks/reformer/nbest_predictions.json


## 결과 확인

In [17]:
import json

expected_version = 'KorQuAD_v1.0'
with open(predict) as dataset_file:
    dataset_json = json.load(dataset_file)
    read_version = "_".join(dataset_json['version'].split("_")[:-1])
    if (read_version != expected_version):
        logger.info('Evaluation expects ' + expected_version +
                    ', but got dataset with ' + read_version,
                    file=sys.stderr)
    dataset = dataset_json['data']
with open(os.path.join(output_dir, "predictions.json")) as prediction_file:
    predictions = json.load(prediction_file)
logger.info(json.dumps(evaluate(dataset, predictions)))

09/09/2020 04:29:34 - INFO - __main__ -   {"exact_match": 56.75832271094409, "f1": 84.96701596465331}
