In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
!pip install transformers

Collecting transformers
[?25l  Downloading https://files.pythonhosted.org/packages/81/91/61d69d58a1af1bd81d9ca9d62c90a6de3ab80d77f27c5df65d9a2c1f5626/transformers-4.5.0-py3-none-any.whl (2.1MB)
[K     |████████████████████████████████| 2.2MB 8.3MB/s 
[?25hCollecting sacremoses
[?25l  Downloading https://files.pythonhosted.org/packages/08/cd/342e584ee544d044fb573ae697404ce22ede086c9e87ce5960772084cad0/sacremoses-0.0.44.tar.gz (862kB)
[K     |████████████████████████████████| 870kB 44.2MB/s 
Collecting tokenizers<0.11,>=0.10.1
[?25l  Downloading https://files.pythonhosted.org/packages/ae/04/5b870f26a858552025a62f1649c20d29d2672c02ff3c3fb4c688ca46467a/tokenizers-0.10.2-cp37-cp37m-manylinux2010_x86_64.whl (3.3MB)
[K     |████████████████████████████████| 3.3MB 42.8MB/s 
Building wheels for collected packages: sacremoses
  Building wheel for sacremoses (setup.py) ... [?25l[?25hdone
  Created wheel for sacremoses: filename=sacremoses-0.0.44-cp37-none-any.whl size=886084 sha256=a998a

## 필요 패키지 선언

In [3]:
from __future__ import absolute_import, division, print_function
import warnings
warnings.filterwarnings("ignore")

import argparse
import logging
import os
import random
import sys
sys.path.append('/content/drive/My Drive/Colab Notebooks/transformer-electra/')

from io import open
import json

import numpy as np
import torch
import torch.nn as nn
from torch.utils.data import (DataLoader, RandomSampler,SequentialSampler, TensorDataset)
from tqdm import tqdm, trange

from model.electra_v2 import Electra,ElectraMRCModel
from example.language_model.common.arg import ElectraConfig
from transformers.optimization import AdamW
from example.language_model.korquad.schedule import WarmupLinearSchedule
from transformers import BertTokenizer
from example.language_model.korquad.korquad_utils import (read_squad_examples, convert_examples_to_features, RawResult, write_predictions,evaluate)

if sys.version_info[0] == 2:
    import cPickle as pickle
else:
    import pickle


def count_parameters(model):
    return sum(p.numel() for p in model.parameters() if p.requires_grad)


logging.basicConfig(format='%(asctime)s - %(levelname)s - %(name)s -   %(message)s',
                    datefmt='%m/%d/%Y %H:%M:%S',
                    level=logging.INFO)
logger = logging.getLogger(__name__)

## 파일경로 및 Hyper-parameters 초기화

#### PATH

In [4]:
gdrive_path = "/content/drive/My Drive/Colab Notebooks/transformer-electra"
output_dir = f'{gdrive_path}/korquad'
train_file = f'{gdrive_path}/data/korquad/KorQuAD_v1.0_train.json'
dev_file = f'{gdrive_path}/data/korquad/KorQuAD_v1.0_dev.json'
config_path =f'{gdrive_path}/config/electra/small/electra-train-v3.json'
checkpoint_path=f'{gdrive_path}/checkpoints/transformer-electra-small-v4.pth'
model_name = "transformer-electra-small-v4"

#### Hyper Parameteer

In [5]:
doc_stride = 64 
max_query_length = 96 
max_answer_length = 30 
n_best_size = 10 

train_batch_size = 32
learning_rate = 4e-4
warmup_proportion = 0.1 
num_train_epochs = 5.0

max_grad_norm = 1.0
adam_epsilon = 1e-6
weight_decay = 0.01

#### Device 설정

In [6]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
n_gpu = torch.cuda.device_count()
logger.info("device: {} n_gpu: {}".format(device, n_gpu))

SEED = 42
random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)

if n_gpu > 0:
        torch.cuda.manual_seed_all(SEED)

if not os.path.exists(output_dir):
    os.makedirs(output_dir)

04/12/2021 07:02:22 - INFO - __main__ -   device: cuda n_gpu: 1


#### Eval Method KorQuAD

In [7]:
def eval(model, epoch,eval_examples, eval_features,predict_batch_size):
  predict = dev_file

  all_input_ids = torch.tensor([f.input_ids for f in eval_features], dtype=torch.long)
  all_input_mask = torch.tensor([f.input_mask for f in eval_features], dtype=torch.long)
  all_segment_ids = torch.tensor([f.segment_ids for f in eval_features], dtype=torch.long)
  all_example_index = torch.arange(all_input_ids.size(0), dtype=torch.long)
  dataset = TensorDataset(all_input_ids, all_input_mask, all_segment_ids, all_example_index)
  sampler = SequentialSampler(dataset)
  dataloader = DataLoader(dataset, sampler=sampler, batch_size=predict_batch_size)

  logger.info("***** Evaluating *****")
  logger.info("  Num features = %d", len(dataset))
  logger.info("  Batch size = %d", predict_batch_size)

  model.eval()
  model.to(device)
  all_results = []
  random.seed(SEED)
  np.random.seed(SEED)
  torch.manual_seed(SEED)
  if n_gpu > 0:
    torch.cuda.manual_seed_all(SEED)

  logger.info("Start evaluating!")
  for input_ids, input_mask, segment_ids, example_indices in dataloader: #tqdm(dataloader, desc="Evaluating", leave=True,position=1):
    input_ids = input_ids.to(device)
    with torch.no_grad():
      batch_start_logits, batch_end_logits = model(input_ids)
    for i, example_index in enumerate(example_indices):
      start_logits = batch_start_logits[i].detach().cpu().tolist()
      end_logits = batch_end_logits[i].detach().cpu().tolist()
      eval_feature = eval_features[example_index.item()]
      unique_id = int(eval_feature.unique_id)
      all_results.append(RawResult(unique_id=unique_id,
                                   start_logits=start_logits,
                                   end_logits=end_logits))
  output_prediction_file = os.path.join(output_dir, f"{model_name}_predictions_{epoch}.json")
  output_nbest_file = os.path.join(output_dir, f"{model_name}_nbest_predictions_{epoch}.json")
  write_predictions(eval_examples, eval_features, all_results,
                    n_best_size, max_answer_length,
                    False, output_prediction_file, output_nbest_file,
                    None, False, False, 0.0)

  with open(predict) as dataset_file:
    dataset_json = json.load(dataset_file)
    dataset = dataset_json['data']

  with open(os.path.join(output_dir, f"{model_name}_predictions_{epoch}.json")) as prediction_file:
    predictions = json.load(prediction_file)
  logger.info(json.dumps(evaluate(dataset, predictions)))

### Model Load

In [8]:
# 1. Config
train_config, gen_config, disc_config = ElectraConfig(config_path = config_path).get_config()

# 2. Tokenizer
tokenizer = BertTokenizer(vocab_file=train_config.vocab_path, do_lower_case=False)

# 3. Electra
electra = Electra(
    config=train_config,
    gen_config=gen_config,
    disc_config=disc_config,
    num_tokens=tokenizer.vocab_size,
  )

# 4. Electra weight 
electra.tie_embedding_weight()
electra.cuda()

checkpoint = torch.load(checkpoint_path, map_location=device)
electra.load_state_dict(checkpoint['model_state_dict'])

print(f'Electra Model Load {checkpoint_path}')

model = ElectraMRCModel(electra=electra.discriminator,
                        dim=disc_config.dim)


Electra Model Load /content/drive/My Drive/Colab Notebooks/transformer-electra/checkpoints/transformer-electra-small-v4.pth


In [9]:
# electra.discriminator

##### Eval Data

In [10]:
eval_examples = read_squad_examples(input_file=dev_file,
                                    is_training=False,
                                    version_2_with_negative=False)
eval_features = convert_examples_to_features(examples=eval_examples,
                                              tokenizer=tokenizer,
                                              max_seq_length=train_config.max_seq_len,
                                              doc_stride=doc_stride,
                                              max_query_length=max_query_length,
                                              is_training=False)

04/12/2021 07:02:42 - INFO - example.language_model.korquad.korquad_utils -   *** Example ***
04/12/2021 07:02:42 - INFO - example.language_model.korquad.korquad_utils -   unique_id: 1000000000
04/12/2021 07:02:42 - INFO - example.language_model.korquad.korquad_utils -   example_index: 0
04/12/2021 07:02:42 - INFO - example.language_model.korquad.korquad_utils -   doc_span_index: 0
04/12/2021 07:02:42 - INFO - example.language_model.korquad.korquad_utils -   tokens: [CLS] 임 ##종 ##석이 여의도 농민 폭력 시위 ##를 주도 ##한 혐의로 지명 ##수 ##배 된 날 ##은 ? [SEP] 1989년 2월 15일 여의도 농민 폭력 시위 ##를 주도 ##한 혐의 ( 폭력 ##행위 ##등 ##처 ##벌 ##에 ##관 ##한 ##법 ##률 ##위 ##반 ) 으로 지명 ##수 ##배 ##되었다 . 1989년 3월 12일 서울 ##지방 ##검 ##찰 ##청 공 ##안 ##부는 임 ##종 ##석의 사전 ##구 ##속 ##영 ##장을 발 ##부 ##받았다 . 같은 해 6월 30일 평양 ##축 ##전에 임 ##수 ##경을 대표 ##로 파견 ##하여 국가 ##보 ##안 ##법 ##위 ##반 혐의 ##가 추가 ##되었다 . 경찰은 12월 18일 ~ 20일 사이 서울 경희 ##대학교 ##에서 임 ##종 ##석이 성명 발표 ##를 추진하고 있다는 첩 ##보를 입수 ##했고 , 12월 18일 오전 7시 40 ##분 경 가스 ##총 ##과 전자 ##봉 ##으로 무장 ##한 특 ##공 ##조 및 대공 ##과 직원 12 

In [11]:
# print(f'checkporint - {train_config.checkpoint_path}')

### Train Koquad

In [12]:
start_epoch = 0
# checkpoint_path =f'{output_dir}/one_epoch_electra_korquad_{start_epoch-1}.bin'
# if os.path.exists(checkpoint_path):
#   model.load_state_dict(torch.load(checkpoint_path))
# else:
#   start_epoch = 0 

num_params = count_parameters(model)
logger.info("Total Parameter: %d" % num_params)
model.to(device)

cached_train_features_file = train_file + '_{0}_{1}_{2}'.format(str(train_config.max_seq_len), str(doc_stride),
                                                                      str(max_query_length))
train_examples = read_squad_examples(input_file=train_file, is_training=True, version_2_with_negative=False)
try:
    with open(cached_train_features_file, "rb") as reader:
        train_features = pickle.load(reader)
except:
    train_features = convert_examples_to_features(
        examples=train_examples,
        tokenizer=tokenizer,
        max_seq_length=train_config.max_seq_len,
        doc_stride=doc_stride,
        max_query_length=max_query_length,
        is_training=True)
    logger.info("  Saving train features into cached file %s", cached_train_features_file)
    with open(cached_train_features_file, "wb") as writer:
        pickle.dump(train_features, writer)

num_train_optimization_steps = int(len(train_features) / train_batch_size) * num_train_epochs

# Prepare optimizer
param_optimizer = list(model.named_parameters())
no_decay = ['bias', 'LayerNorm.weight']
optimizer_grouped_parameters = [
    {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)],
      'weight_decay': 0.01},
    {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)], 'weight_decay': 0.0}
]

optimizer = AdamW(optimizer_grouped_parameters,
                  lr=learning_rate,
                  eps=adam_epsilon)
scheduler = WarmupLinearSchedule(optimizer,
                                  warmup_steps=num_train_optimization_steps*0.1,
                                  t_total=num_train_optimization_steps)

logger.info("***** Running training *****")
logger.info("  Num orig examples = %d", len(train_examples))
logger.info("  Num split examples = %d", len(train_features))
logger.info("  Batch size = %d", train_batch_size)
logger.info("  Num steps = %d", num_train_optimization_steps)
num_train_step = num_train_optimization_steps

all_input_ids = torch.tensor([f.input_ids for f in train_features], dtype=torch.long)
all_input_mask = torch.tensor([f.input_mask for f in train_features], dtype=torch.long)
all_segment_ids = torch.tensor([f.segment_ids for f in train_features], dtype=torch.long)
all_start_positions = torch.tensor([f.start_position for f in train_features], dtype=torch.long)
all_end_positions = torch.tensor([f.end_position for f in train_features], dtype=torch.long)
train_data = TensorDataset(all_input_ids, all_input_mask, all_segment_ids,
                            all_start_positions, all_end_positions)

train_sampler = RandomSampler(train_data)
train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=train_batch_size)

model.train()
global_step = 0
epoch = start_epoch
for i in range(start_epoch, int(num_train_epochs)):
    iter_bar = tqdm(train_dataloader, desc=f"Epoch-{i} Train(XX Epoch) Step(XX/XX) (Mean loss=X.X) (loss=X.X)")
    tr_step, total_loss, mean_loss = 0, 0., 0.
    for step, batch in enumerate(iter_bar):
        if n_gpu == 1:
            batch = tuple(t.to(device) for t in batch)  # multi-gpu does scattering it-self
        
        input_ids, input_mask, segment_ids, start_positions, end_positions = batch
        loss = model(input_ids, input_mask.unsqueeze(1), start_positions, end_positions)
        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), max_grad_norm)

        scheduler.step()
        optimizer.step()
        optimizer.zero_grad()
        global_step += 1
        tr_step += 1
        total_loss += loss.item()
        mean_loss = total_loss / tr_step
        iter_bar.set_description(f"Epoch-{i} Train Step(%d / %d) (Mean loss=%5.5f) (loss=%5.5f)" %
                                  (global_step, num_train_step, mean_loss, loss.item()))

    logger.info("** ** * Saving file * ** **")
    model_checkpoint = f"{model_name}_{epoch}.bin"
    logger.info(model_checkpoint)
    output_model_file = os.path.join(output_dir, model_checkpoint)
    # 평가
    eval(model, epoch,eval_examples, eval_features,train_batch_size)
    model.train()

    torch.save(model.state_dict(), output_model_file)
    epoch += 1


04/12/2021 07:03:27 - INFO - __main__ -   Total Parameter: 12458498
04/12/2021 07:03:48 - INFO - __main__ -   ***** Running training *****
04/12/2021 07:03:48 - INFO - __main__ -     Num orig examples = 60407
04/12/2021 07:03:48 - INFO - __main__ -     Num split examples = 63907
04/12/2021 07:03:48 - INFO - __main__ -     Batch size = 32
04/12/2021 07:03:48 - INFO - __main__ -     Num steps = 9985
Epoch-0 Train Step(1998 / 9985) (Mean loss=2.25559) (loss=0.55059): 100%|██████████| 1998/1998 [14:23<00:00,  2.31it/s]
04/12/2021 07:18:13 - INFO - __main__ -   ** ** * Saving file * ** **
04/12/2021 07:18:13 - INFO - __main__ -   transformer-electra-small-v4_0.bin
04/12/2021 07:18:13 - INFO - __main__ -   ***** Evaluating *****
04/12/2021 07:18:13 - INFO - __main__ -     Num features = 6735
04/12/2021 07:18:13 - INFO - __main__ -     Batch size = 32
04/12/2021 07:18:13 - INFO - __main__ -   Start evaluating!
04/12/2021 07:18:38 - INFO - example.language_model.korquad.korquad_utils -   Writi