In [1]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [2]:
!pip install -r "/content/drive/My Drive/Colab Notebooks/reformer-language-model/requirements.txt"



## 필요 패키지 선언

In [3]:
from __future__ import absolute_import, division, print_function
import warnings
warnings.filterwarnings("ignore")

import argparse
import logging
import os
import random
import sys
sys.path.append('/content/drive/My Drive/Colab Notebooks/reformer-language-model/')
from io import open
import json

import numpy as np
import torch
import torch.nn as nn
from torch.utils.data import (DataLoader, RandomSampler,SequentialSampler, TensorDataset)
from tqdm import tqdm, trange

from reformer_pytorch import ReformerLM
from util.arg import ElectraConfig
from model.electra import Electra
from model.electra_discriminator import DiscriminatorMRCModel
from transformers.optimization import AdamW
from util.schedule import WarmupLinearSchedule
from transformers import BertTokenizer
from util.korquad_utils import (read_squad_examples, convert_examples_to_features, RawResult, write_predictions,evaluate)

if sys.version_info[0] == 2:
    import cPickle as pickle
else:
    import pickle


def count_parameters(model):
    return sum(p.numel() for p in model.parameters() if p.requires_grad)


logging.basicConfig(format='%(asctime)s - %(levelname)s - %(name)s -   %(message)s',
                    datefmt='%m/%d/%Y %H:%M:%S',
                    level=logging.INFO)
logger = logging.getLogger(__name__)

## 파일경로 및 Hyper-parameters 초기화

#### PATH

In [4]:
gdrive_path = "/content/drive/My Drive/Colab Notebooks/reformer-language-model"
output_dir = f'{gdrive_path}/korquad'
train_file = f'{gdrive_path}/data/korquad/KorQuAD_v1.0_train.json'
dev_file = f'{gdrive_path}/data/korquad/KorQuAD_v1.0_dev.json'
config_path =f'{gdrive_path}/config/electra/electra-korquad-finetuning.json'
model_name = "reformer-electra"

#### Hyper Parameteer

In [5]:
doc_stride = 64 
max_query_length = 96 
max_answer_length = 30 
n_best_size = 10 

train_batch_size = 64
learning_rate = 5e-4
warmup_proportion = 0.1 
num_train_epochs = 20.0

max_grad_norm = 1.0
adam_epsilon = 1e-6
weight_decay = 0.01

#### Device 설정

In [6]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
n_gpu = torch.cuda.device_count()
logger.info("device: {} n_gpu: {}".format(device, n_gpu))

SEED = 42
random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)

if n_gpu > 0:
        torch.cuda.manual_seed_all(SEED)

if not os.path.exists(output_dir):
    os.makedirs(output_dir)

01/15/2021 12:45:36 - INFO - __main__ -   device: cuda n_gpu: 1


#### Eval Method KorQuAD

In [7]:
def eval(model, epoch,eval_examples, eval_features,predict_batch_size):
  predict = dev_file

  all_input_ids = torch.tensor([f.input_ids for f in eval_features], dtype=torch.long)
  all_input_mask = torch.tensor([f.input_mask for f in eval_features], dtype=torch.long)
  all_segment_ids = torch.tensor([f.segment_ids for f in eval_features], dtype=torch.long)
  all_example_index = torch.arange(all_input_ids.size(0), dtype=torch.long)
  dataset = TensorDataset(all_input_ids, all_input_mask, all_segment_ids, all_example_index)
  sampler = SequentialSampler(dataset)
  dataloader = DataLoader(dataset, sampler=sampler, batch_size=predict_batch_size)

  logger.info("***** Evaluating *****")
  logger.info("  Num features = %d", len(dataset))
  logger.info("  Batch size = %d", predict_batch_size)

  model.eval()
  model.to(device)
  all_results = []
  random.seed(SEED)
  np.random.seed(SEED)
  torch.manual_seed(SEED)
  if n_gpu > 0:
    torch.cuda.manual_seed_all(SEED)

  logger.info("Start evaluating!")
  for input_ids, input_mask, segment_ids, example_indices in dataloader: #tqdm(dataloader, desc="Evaluating", leave=True,position=1):
    input_ids = input_ids.to(device)
    with torch.no_grad():
      batch_start_logits, batch_end_logits = model(input_ids)
    for i, example_index in enumerate(example_indices):
      start_logits = batch_start_logits[i].detach().cpu().tolist()
      end_logits = batch_end_logits[i].detach().cpu().tolist()
      eval_feature = eval_features[example_index.item()]
      unique_id = int(eval_feature.unique_id)
      all_results.append(RawResult(unique_id=unique_id,
                                   start_logits=start_logits,
                                   end_logits=end_logits))
  output_prediction_file = os.path.join(output_dir, f"{model_name}_predictions_{epoch}.json")
  output_nbest_file = os.path.join(output_dir, f"{model_name}_nbest_predictions_{epoch}.json")
  write_predictions(eval_examples, eval_features, all_results,
                    n_best_size, max_answer_length,
                    False, output_prediction_file, output_nbest_file,
                    None, False, False, 0.0)

  with open(predict) as dataset_file:
    dataset_json = json.load(dataset_file)
    dataset = dataset_json['data']

  with open(os.path.join(output_dir, f"{model_name}_predictions_{epoch}.json")) as prediction_file:
    predictions = json.load(prediction_file)
  logger.info(json.dumps(evaluate(dataset, predictions)))

### Model Load

In [9]:
# 1. Config
train_config, gen_config, disc_config = ElectraConfig(config_path = config_path).get_config()

# 2. Tokenizer
tokenizer = BertTokenizer(vocab_file=train_config.vocab_path, do_lower_case=False)

# Generator
generator = ReformerLM(
    num_tokens= tokenizer.vocab_size,
    emb_dim= gen_config.emb_dim,
    dim= gen_config.emb_dim,  # smaller hidden dimension
    heads= gen_config.heads,  # less heads
    ff_mult= gen_config.ff_mult,  # smaller feed forward intermediate dimension
    dim_head= gen_config.dim_head,
    depth= gen_config.depth,
    max_seq_len= train_config.max_len
)

discriminator = ReformerLM(
    num_tokens= tokenizer.vocab_size,
    emb_dim= disc_config.emb_dim,
    dim= disc_config.dim,
    dim_head= disc_config.dim_head,
    heads= disc_config.heads,
    depth= disc_config.depth,
    ff_mult= disc_config.ff_mult,
    max_seq_len= train_config.max_len,
    return_embeddings=True,
)
# 4.2 weight tie the token and positional embeddings of generator and discriminator
# 제너레이터와 디스크리미네이터의 토큰, 포지션 임베딩을 공유한다(tie).
generator.token_emb = discriminator.token_emb
generator.pos_emb = discriminator.pos_emb
# weight tie any other embeddings if available, token type embeddings, etc.
# 다른 임베딩 웨이트도 있다면 공유 필요.

# 4.3 instantiate electra
# 엘렉트라 모델 초기화
discriminator_with_adapter = nn.Sequential(discriminator, nn.Linear(disc_config.dim, 1))

electra = Electra(
    generator,
    discriminator_with_adapter,
    mask_token_id = tokenizer.mask_token_id,           # the token id reserved for masking
    pad_token_id = tokenizer.pad_token_id,             # the token id for padding
    mask_prob = 0.15,                                  # masking probability for masked language modeling
    mask_ignore_token_ids = tokenizer.all_special_ids  # ids of tokens to ignore for mask modeling ex. (cls, sep)
)
electra.load_state_dict(torch.load(train_config.checkpoint_path, map_location=device),strict=False)
print(f'Electra Model Load {train_config.checkpoint_path}')
electra_discriminator = electra.discriminator[0]

model = DiscriminatorMRCModel(discriminator=electra_discriminator,dim=disc_config.dim)


Electra Model Load /content/drive/My Drive/Colab Notebooks/reformer-language-model/checkpoints/reformer-electra.pth


In [12]:
#electra.discriminator[0]

##### Eval Data

In [13]:
eval_examples = read_squad_examples(input_file=dev_file,
                                    is_training=False,
                                    version_2_with_negative=False)
eval_features = convert_examples_to_features(examples=eval_examples,
                                              tokenizer=tokenizer,
                                              max_seq_length=train_config.max_len,
                                              doc_stride=doc_stride,
                                              max_query_length=max_query_length,
                                              is_training=False)

01/15/2021 12:47:54 - INFO - util.korquad_utils -   *** Example ***
01/15/2021 12:47:54 - INFO - util.korquad_utils -   unique_id: 1000000000
01/15/2021 12:47:54 - INFO - util.korquad_utils -   example_index: 0
01/15/2021 12:47:54 - INFO - util.korquad_utils -   doc_span_index: 0
01/15/2021 12:47:54 - INFO - util.korquad_utils -   tokens: [CLS] 임 ##종 ##석이 여 ##의 ##도 농 ##민 폭력 시위 ##를 주도 ##한 혐 ##의로 지명 ##수 ##배 된 날 ##은 ? [SEP] 1989년 2월 15일 여 ##의 ##도 농 ##민 폭력 시위 ##를 주도 ##한 혐 ##의 ( 폭력 ##행 ##위 ##등 ##처 ##벌 ##에 ##관 ##한 ##법 ##률 ##위 ##반 ) 으로 지명 ##수 ##배 ##되었다 . 1989년 3월 12일 서울 ##지방 ##검 ##찰 ##청 공 ##안 ##부는 임 ##종 ##석의 사전 ##구 ##속 ##영 ##장을 발 ##부 ##받았다 . 같은 해 6월 30일 평양 ##축 ##전에 임 ##수 ##경을 대표 ##로 파견 ##하여 국가 ##보 ##안 ##법 ##위 ##반 혐 ##의가 추가되었다 . 경찰 ##은 12월 18일 ~ 20일 사이 서울 경 ##희 ##대학교 ##에서 임 ##종 ##석이 성 ##명 발표 ##를 추진 ##하고 있다는 첩 ##보를 입수 ##했고 , 12월 18일 오전 7 ##시 40 ##분 경 가스 ##총 ##과 전자 ##봉 ##으로 무장 ##한 특 ##공 ##조 및 대공 ##과 직원 12 ##명 등 22 ##명의 사 ##복 경찰 ##을 승 ##용 ##차 8 ##대에 나누 ##어 경 ##희 ##대학교 ##에 투입 ##했다 . 1989년 12월 18일 

In [14]:
print(f'checkporint - {train_config.checkpoint_path}')

checkporint - /content/drive/My Drive/Colab Notebooks/reformer-language-model/checkpoints/reformer-electra.pth


### Train Koquad

In [None]:
start_epoch = 0
# checkpoint_path =f'{output_dir}/one_epoch_electra_korquad_{start_epoch-1}.bin'
# if os.path.exists(checkpoint_path):
#   model.load_state_dict(torch.load(checkpoint_path))
# else:
#   start_epoch = 0 

num_params = count_parameters(model)
logger.info("Total Parameter: %d" % num_params)
model.to(device)

cached_train_features_file = train_file + '_{0}_{1}_{2}'.format(str(train_config.max_len), str(doc_stride),
                                                                      str(max_query_length))
train_examples = read_squad_examples(input_file=train_file, is_training=True, version_2_with_negative=False)
try:
    with open(cached_train_features_file, "rb") as reader:
        train_features = pickle.load(reader)
except:
    train_features = convert_examples_to_features(
        examples=train_examples,
        tokenizer=tokenizer,
        max_seq_length=train_config.max_len,
        doc_stride=doc_stride,
        max_query_length=max_query_length,
        is_training=True)
    logger.info("  Saving train features into cached file %s", cached_train_features_file)
    with open(cached_train_features_file, "wb") as writer:
        pickle.dump(train_features, writer)

num_train_optimization_steps = int(len(train_features) / train_batch_size) * num_train_epochs

# Prepare optimizer
param_optimizer = list(model.named_parameters())
no_decay = ['bias', 'LayerNorm.weight']
optimizer_grouped_parameters = [
    {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)],
      'weight_decay': 0.01},
    {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)], 'weight_decay': 0.0}
]

optimizer = AdamW(optimizer_grouped_parameters,
                  lr=learning_rate,
                  eps=adam_epsilon)
scheduler = WarmupLinearSchedule(optimizer,
                                  warmup_steps=num_train_optimization_steps*0.1,
                                  t_total=num_train_optimization_steps)

logger.info("***** Running training *****")
logger.info("  Num orig examples = %d", len(train_examples))
logger.info("  Num split examples = %d", len(train_features))
logger.info("  Batch size = %d", train_batch_size)
logger.info("  Num steps = %d", num_train_optimization_steps)
num_train_step = num_train_optimization_steps

all_input_ids = torch.tensor([f.input_ids for f in train_features], dtype=torch.long)
all_input_mask = torch.tensor([f.input_mask for f in train_features], dtype=torch.long)
all_segment_ids = torch.tensor([f.segment_ids for f in train_features], dtype=torch.long)
all_start_positions = torch.tensor([f.start_position for f in train_features], dtype=torch.long)
all_end_positions = torch.tensor([f.end_position for f in train_features], dtype=torch.long)
train_data = TensorDataset(all_input_ids, all_input_mask, all_segment_ids,
                            all_start_positions, all_end_positions)

train_sampler = RandomSampler(train_data)
train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=train_batch_size)

model.train()
global_step = 0
epoch = start_epoch
for i in range(start_epoch, int(num_train_epochs)):
    iter_bar = tqdm(train_dataloader, desc=f"Epoch-{i} Train(XX Epoch) Step(XX/XX) (Mean loss=X.X) (loss=X.X)")
    tr_step, total_loss, mean_loss = 0, 0., 0.
    for step, batch in enumerate(iter_bar):
        if n_gpu == 1:
            batch = tuple(t.to(device) for t in batch)  # multi-gpu does scattering it-self
        
        input_ids, input_mask, segment_ids, start_positions, end_positions = batch

        loss = model(input_ids, start_positions, end_positions)
        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), max_grad_norm)

        scheduler.step()
        optimizer.step()
        optimizer.zero_grad()
        global_step += 1
        tr_step += 1
        total_loss += loss.item()
        mean_loss = total_loss / tr_step
        iter_bar.set_description(f"Epoch-{i} Train Step(%d / %d) (Mean loss=%5.5f) (loss=%5.5f)" %
                                  (global_step, num_train_step, mean_loss, loss.item()))

    logger.info("** ** * Saving file * ** **")
    model_checkpoint = f"{model_name}_{epoch}.bin"
    logger.info(model_checkpoint)
    output_model_file = os.path.join(output_dir, model_checkpoint)
    # 평가
    eval(model, epoch,eval_examples, eval_features,train_batch_size)
    model.train()

    torch.save(model.state_dict(), output_model_file)
    epoch += 1


01/15/2021 12:48:26 - INFO - __main__ -   Total Parameter: 97270274
01/15/2021 12:48:41 - INFO - __main__ -   ***** Running training *****
01/15/2021 12:48:41 - INFO - __main__ -     Num orig examples = 60407
01/15/2021 12:48:41 - INFO - __main__ -     Num split examples = 64830
01/15/2021 12:48:41 - INFO - __main__ -     Batch size = 64
01/15/2021 12:48:41 - INFO - __main__ -     Num steps = 20240
Epoch-0 Train Step(1013 / 20240) (Mean loss=4.95631) (loss=4.78148): 100%|██████████| 1013/1013 [2:06:11<00:00,  7.47s/it]
01/15/2021 14:54:54 - INFO - __main__ -   ** ** * Saving file * ** **
01/15/2021 14:54:54 - INFO - __main__ -   reformer-electra_0.bin
01/15/2021 14:54:54 - INFO - __main__ -   ***** Evaluating *****
01/15/2021 14:54:54 - INFO - __main__ -     Num features = 6892
01/15/2021 14:54:54 - INFO - __main__ -     Batch size = 64
01/15/2021 14:54:54 - INFO - __main__ -   Start evaluating!
01/15/2021 14:58:27 - INFO - util.korquad_utils -   Writing predictions to: /content/drive/

## Evaluation

In [None]:
# electra_korquad_15.bin
num =11
checkpoint = os.path.join(gdrive_path, f"one_epoch_electra_korquad_{num}.bin")
predict = os.path.join(gdrive_path, "finetuning/data/korquad/KorQuAD_v1.0_dev.json")
prediction_name = f"electra_predictions_{num}.json"
nbest_name = f"one_nbest_predictions_{num}.json"

predict_batch_size = 16 #@param {type: "integer"}

### Model load

In [None]:
model.load_state_dict(torch.load(checkpoint))
num_params = count_parameters(model)
logger.info("Total Parameter: %d" % num_params)

In [None]:
eval_examples = read_squad_examples(input_file=predict,
                               is_training=False,
                               version_2_with_negative=False)
eval_features = convert_examples_to_features(examples=eval_examples,
                                        tokenizer=tokenizer,
                                        max_seq_length=train_config.max_len,
                                        doc_stride=doc_stride,
                                        max_query_length=max_query_length,
                                        is_training=False)

In [None]:
all_input_ids = torch.tensor([f.input_ids for f in eval_features], dtype=torch.long)
all_input_mask = torch.tensor([f.input_mask for f in eval_features], dtype=torch.long)
all_segment_ids = torch.tensor([f.segment_ids for f in eval_features], dtype=torch.long)
all_example_index = torch.arange(all_input_ids.size(0), dtype=torch.long)
dataset = TensorDataset(all_input_ids, all_input_mask, all_segment_ids, all_example_index)
sampler = SequentialSampler(dataset)
dataloader = DataLoader(dataset, sampler=sampler, batch_size=predict_batch_size)

logger.info("***** Evaluating *****")
logger.info("  Num features = %d", len(dataset))
logger.info("  Batch size = %d", predict_batch_size)

In [None]:
model.eval()
model.to(device)
all_results = []
random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)
if n_gpu > 0:
    torch.cuda.manual_seed_all(SEED)

## KorQuAD1.0 검증

In [None]:
logger.info("Start evaluating!")
for input_ids, input_mask, segment_ids, example_indices in tqdm(dataloader, desc="Evaluating", leave=True, position=0):
    input_ids = input_ids.to(device)
    input_mask = input_mask.to(device)
    segment_ids = segment_ids.to(device)
    with torch.no_grad():
        batch_start_logits, batch_end_logits = model(input_ids)
    for i, example_index in enumerate(example_indices):
        start_logits = batch_start_logits[i].detach().cpu().tolist()
        end_logits = batch_end_logits[i].detach().cpu().tolist()
        eval_feature = eval_features[example_index.item()]
        unique_id = int(eval_feature.unique_id)
        all_results.append(RawResult(unique_id=unique_id,
                                     start_logits=start_logits,
                                     end_logits=end_logits))
output_prediction_file = os.path.join(output_dir, prediction_name)
output_nbest_file = os.path.join(output_dir, nbest_name)
write_predictions(eval_examples, eval_features, all_results,
                    n_best_size, max_answer_length,
                    False, output_prediction_file, output_nbest_file,
                    None, False, False, 0.0)

## 결과 확인

In [None]:
import json

expected_version = 'KorQuAD_v1.0'
with open(predict) as dataset_file:
    dataset_json = json.load(dataset_file)
    read_version = "_".join(dataset_json['version'].split("_")[:-1])
    if (read_version != expected_version):
        logger.info('Evaluation expects ' + expected_version +
                    ', but got dataset with ' + read_version,
                    file=sys.stderr)
    dataset = dataset_json['data']
with open(os.path.join(output_dir, prediction_name)) as prediction_file:
    predictions = json.load(prediction_file)
logger.info(json.dumps(evaluate(dataset, predictions)))