In [None]:
import os
import copy
import torch
import random
import evaluate
import datasets
import numpy as np
import pandas as pd
import pytorch_lightning as pl

from torch.utils.data import Dataset
from sklearn.model_selection import train_test_split
#from sklearn.metrics import accuracy_score, f1_score


from transformers import (
    AutoTokenizer,
    AutoConfig,
    AutoModelForCausalLM,
    GPT2LMHeadModel
)
from rouge_score import rouge_scorer, scoring
from transformers import EarlyStoppingCallback
from transformers.optimization import get_cosine_with_hard_restarts_schedule_with_warmup
from transformers import Trainer, TrainingArguments, Seq2SeqTrainer, Seq2SeqTrainingArguments

In [None]:
class summ_dataset(Dataset):
    """Expose a mapping of pre-tokenized columns as a torch ``Dataset``.

    ``document`` must support ``document["input_ids"][idx]`` and
    ``document["labels"][idx]``; ``tokenizer`` only needs a ``pad_token_id``.
    """

    def __init__(self, document, tokenizer):
        self.dataset = document
        self.tokenizer = tokenizer

    def __len__(self):
        # One sample per encoded input row.
        return len(self.dataset["input_ids"])

    def __getitem__(self, idx):
        ids = torch.LongTensor(self.dataset["input_ids"][idx])
        targets = torch.LongTensor(self.dataset["labels"][idx])

        # Padding positions must not take part in attention, so the mask is
        # True wherever the input id differs from the pad id.
        mask = ids.ne(self.tokenizer.pad_token_id)

        return {"input_ids": ids, "labels": targets, "attention_mask": mask}

In [None]:
def load_data(dataset_dir):
    """Read the CSV at ``dataset_dir`` and return it as a pandas DataFrame."""
    return pd.read_csv(dataset_dir)


def _encode_documents(document_text, doc_tokenizer, doc_max_length):
    """Tokenize every document to ``doc_max_length - 1`` ids and append the bos id."""
    # Resulting row layout: <pad> <pad> d_1 d_2 ... d_n <bos>
    # (the padding side is whatever doc_tokenizer was configured with).
    return [
        doc_tokenizer(
            text,
            padding='max_length',
            truncation=True,
            max_length=doc_max_length - 1,
            add_special_tokens=True,
        )['input_ids'] + [doc_tokenizer.bos_token_id]
        for text in document_text.values
    ]


def tokenized_dataset(dataset, doc_tokenizer, sum_tokenizer, doc_max_length, sum_max_len, mode="train"):
    """Tokenize dialogues (and, for training, summaries) for a causal LM.

    Training and inference are encoded differently:
    - ``mode="train"``: document and summary are concatenated,
      ``<pad>.. d_1..d_n <bos> s_1..s_m <eos> <pad>..``.
    - ``mode="infer"``: only the document (ending with ``<bos>``) is encoded so
      the model can generate the summary; a ``summary`` column is not required.

    Args:
        dataset: DataFrame with a ``dialogue`` column (plus ``summary`` for train).
        doc_tokenizer: tokenizer used for the documents.
        sum_tokenizer: tokenizer used for the summaries.
        doc_max_length: length of the document part, including the bos token.
        sum_max_len: length of the summary part.
        mode: ``"train"`` or ``"infer"``.

    Returns:
        dict with ``"input_ids"`` and ``"labels"`` (lists of lists of ints).

    Raises:
        ValueError: if ``mode`` is neither ``"train"`` nor ``"infer"``.
    """
    # Fail loudly instead of hitting an unbound local at the return statement.
    if mode not in ("train", "infer"):
        raise ValueError(f"mode must be 'train' or 'infer', got {mode!r}")

    documents = _encode_documents(dataset['dialogue'], doc_tokenizer, doc_max_length)

    if mode == "infer":
        # Only the document is given; the trailing bos token starts generation.
        # Labels are placeholder rows of -100, one slot per summary position.
        labels = [[-100] * sum_max_len for _ in documents]
        return {"input_ids": documents, "labels": labels}

    # Summary followed by the eos token to mark the end of generation:
    # s_1 s_2 ... s_m <eos> <pad> <pad>
    # NOTE(review): eos is appended to the text before truncation, so a summary
    # longer than sum_max_len tokens presumably loses its eos -- confirm.
    summaries = [
        sum_tokenizer(
            text + sum_tokenizer.eos_token,
            padding='max_length',
            truncation=True,
            max_length=sum_max_len,
            add_special_tokens=True,
        )['input_ids']
        for text in dataset['summary'].values
    ]

    # Join document and summary into the model input.
    input_ids = [doc_ids + sum_ids for doc_ids, sum_ids in zip(documents, summaries)]

    # The document is not something to generate, so its labels are -100; padded
    # summary positions are also set to -100 so they are not trained on.
    # Input : <pad> <pad> d_1  d_2  ... d_n  <bos> s_1 s_2 ... s_m <eos> <pad> <pad>
    # Label : -100  -100  -100 -100 ... -100 -100  s_1 s_2 ... s_m <eos> -100  -100
    #
    # Q. Shouldn't the labels be shifted by one, since the model learns to
    #    predict the next token?
    # A. The transformers GPT-2 implementation
    #    (https://github.com/huggingface/transformers/blob/main/src/transformers/models/gpt2/modeling_gpt2.py#L1103-L1104)
    #    takes logits[:-1] and labels[1:] when computing the loss, so the shift
    #    happens inside the model and must not be applied here.
    #
    # NOTE(review): if pad_token_id equals the eos id, the eos label is masked
    # as well -- verify for the tokenizer in use.
    pad_id = sum_tokenizer.pad_token_id
    labels = [
        [-100] * len(doc_ids) + [-100 if token == pad_id else token for token in sum_ids]
        for doc_ids, sum_ids in zip(documents, summaries)
    ]
    return {"input_ids": input_ids, "labels": labels}

In [None]:
def prepare_dataset(dataset_dir, doc_tokenizer,sum_tokenizer,doc_max_len, sum_max_len):
    """Build the training and validation datasets.

    Reads ``train_translated.csv`` and ``dev_translated.csv`` from
    ``dataset_dir``, tokenizes both in the default (train) mode and wraps each
    result in a ``summ_dataset``. Returns ``(train_dataset, val_dataset)``.
    """
    csv_names = ("train_translated.csv", "dev_translated.csv")

    # Load both splits first, then tokenize, then wrap.
    frames = [load_data(os.path.join(dataset_dir, csv_name)) for csv_name in csv_names]
    encoded_splits = [
        tokenized_dataset(frame, doc_tokenizer, sum_tokenizer, doc_max_len, sum_max_len)
        for frame in frames
    ]
    train_split, val_split = [
        summ_dataset(encoded, doc_tokenizer) for encoded in encoded_splits
    ]

    return train_split, val_split

In [None]:
def compute_metrics(args, pred):
    """Compute teacher-forced ROUGE-2 (F-measure, in percent) on the summary span.

    Args:
        args: config object providing ``model_name`` and ``doc_max_len``.
        pred: evaluation output with ``predictions`` (logits, or a tuple whose
            first element is the logits) and ``label_ids``.

    Returns:
        ``{'Rouge-2': <float rounded to 4 decimals>}``.
    """
    tokenizer = AutoTokenizer.from_pretrained(args.model_name)

    # Unwrap a tuple BEFORE taking the argmax; a tuple has no ``.argmax``.
    logits = pred.predictions
    if isinstance(logits, tuple):
        logits = logits[0]
    preds = logits.argmax(-1)
    labels = pred.label_ids

    # A causal LM's logits at position t predict the token at position t + 1,
    # so predictions are taken one step earlier than the labels they match.
    summary_labels = labels[:, args.doc_max_len:]
    summary_preds = preds[:, args.doc_max_len - 1:-1]

    # Positions labelled -100 are not part of the reference; replace them with
    # the pad id on both sides so decoding (skip_special_tokens) drops them.
    scored = summary_labels != -100
    summary_preds = np.where(scored, summary_preds, tokenizer.pad_token_id)
    summary_labels = np.where(scored, summary_labels, tokenizer.pad_token_id)

    decoded_preds = tokenizer.batch_decode(summary_preds, skip_special_tokens=True)
    decoded_labels = tokenizer.batch_decode(summary_labels, skip_special_tokens=True)

    # ``datasets.load_metric`` is deprecated; ``evaluate`` provides the same metric.
    # NOTE(review): the stemmer (and presumably the default ROUGE tokenizer) is
    # English-oriented -- confirm it suits the language of the summaries.
    metric = evaluate.load("rouge")
    result = metric.compute(predictions=decoded_preds, references=decoded_labels, use_stemmer=True)

    # Depending on the library version the value is either a plain float or an
    # aggregate score exposing ``.mid.fmeasure``; accept both.
    rouge2 = result['rouge2']
    if hasattr(rouge2, "mid"):
        rouge2 = rouge2.mid.fmeasure

    return {
        'Rouge-2' : round(float(rouge2) * 100, 4)
        }

In [None]:
def load_tokenizer_and_model_for_train(args):
    """Load the two tokenizers and the causal LM named by ``args.model_name``.

    Returns ``(doc_tokenizer, sum_tokenizer, model)``: the document tokenizer
    pads on the left, the summary tokenizer pads on the right.
    """
    checkpoint = args.model_name

    # Same vocabulary, opposite padding sides.
    doc_tokenizer, sum_tokenizer = [
        AutoTokenizer.from_pretrained(checkpoint, padding_side=side)
        for side in ("left", "right")
    ]

    model = AutoModelForCausalLM.from_pretrained(
        checkpoint,
        config=AutoConfig.from_pretrained(checkpoint),
    )

    return doc_tokenizer, sum_tokenizer, model

In [None]:
def load_trainer_for_train(args, model, summ_train_dataset, summ_val_dataset):
    """Configure the Hugging Face ``Trainer`` used for training.

    Args:
        args: config object (save_path, save_limit, save_step, epochs, lr,
            batch_size, warmup_steps, weight_decay, logging_steps, eval_steps).
        model: the model to train.
        summ_train_dataset: training dataset (must support ``len``).
        summ_val_dataset: evaluation dataset.

    Returns:
        A ``Trainer`` with early stopping, an AdamW optimizer and a
        cosine-with-hard-restarts learning-rate schedule.
    """
    import math

    training_args = TrainingArguments(
        # Mixed precision only where CUDA exists, so the CPU fallback still runs.
        fp16=torch.cuda.is_available(),
        gradient_accumulation_steps=4,
        # NOTE(review): plain string concatenation, no path separator is added
        # between save_path and "results"/"logs" -- confirm this is intended.
        output_dir=args.save_path + "results",  # output directory
        save_total_limit=args.save_limit,  # number of total save model.
        save_steps=args.save_step,  # model saving step.
        num_train_epochs=args.epochs,  # total number of training epochs
        learning_rate=args.lr,  # learning_rate
        per_device_train_batch_size=args.batch_size,  # batch size per device during training
        per_device_eval_batch_size=1,  # batch size for evaluation
        warmup_steps=args.warmup_steps,  # number of warmup steps for learning rate scheduler
        weight_decay=args.weight_decay,  # strength of weight decay
        logging_dir=args.save_path + "logs",  # directory for storing logs
        logging_steps=args.logging_steps,  # log saving step.
        evaluation_strategy="steps",  # evaluation strategy to adopt during training
            # `no`: No evaluation during training.
            # `steps`: Evaluate every `eval_steps`.
            # `epoch`: Evaluate every end of epoch.
        eval_steps=args.eval_steps,  # evaluation step.
        load_best_model_at_end=True,
    )

    ## Add callback & optimizer & scheduler
    early_stopping = EarlyStoppingCallback(
        early_stopping_patience=5, early_stopping_threshold=0.001
    )

    optimizer = torch.optim.AdamW(
        model.parameters(),
        lr=args.lr,
        betas=(0.9, 0.999),
        eps=1e-08,
        weight_decay=args.weight_decay,
        amsgrad=False,
    )

    # The scheduler is stepped once per optimizer update, not once per sample:
    # samples -> batches -> accumulated updates. Rounding up keeps the schedule
    # from reaching its end before training does.
    # NOTE(review): multi-process distributed runs presumably also need the
    # world size factored in -- confirm if that setup is used.
    batches_per_epoch = math.ceil(len(summ_train_dataset) / training_args.train_batch_size)
    updates_per_epoch = max(
        math.ceil(batches_per_epoch / training_args.gradient_accumulation_steps), 1
    )
    total_updates = math.ceil(updates_per_epoch * args.epochs)

    scheduler = get_cosine_with_hard_restarts_schedule_with_warmup(
        optimizer,
        num_warmup_steps=args.warmup_steps,
        num_training_steps=total_updates,
    )

    trainer = Trainer(
        model=model,  # the instantiated Transformers model to be trained
        args=training_args,  # training arguments, defined above
        train_dataset=summ_train_dataset,  # training dataset
        eval_dataset=summ_val_dataset,  # evaluation dataset
        compute_metrics=lambda p: compute_metrics(args, p),
        callbacks=[early_stopping],
        optimizers=(optimizer, scheduler),
    )

    return trainer

In [None]:
def train(args):
    """Run the full training pipeline and save the resulting model.

    Seeds the RNGs, loads tokenizers and model, builds the datasets and the
    trainer, trains, then writes the model with ``save_pretrained``.

    Args:
        args: config object. ``seed`` and ``model_dir`` are optional and fall
            back to ``42`` and ``"./best_model"``; the remaining fields are
            read by the helper functions called here.
    """
    # Take the seed from the config rather than repeating the literal here.
    pl.seed_everything(seed=getattr(args, "seed", 42), workers=False)

    device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
    print("device:", device)

    doc_tokenizer, sum_tokenizer , model = load_tokenizer_and_model_for_train(args)
    model.to(device)

    summ_train_dataset, summ_val_dataset = prepare_dataset(args.dataset_dir,doc_tokenizer, sum_tokenizer,args.doc_max_len,args.sum_max_len)
    trainer = load_trainer_for_train(args, model, summ_train_dataset, summ_val_dataset)
    trainer.train()

    # Save to the configured directory so the config stays the single source of truth.
    model.save_pretrained(getattr(args, "model_dir", "./best_model"))


In [None]:
class args():
    """Container for the arguments used by training (train) and inference (infer)."""

    # --- data and model ---
    dataset_dir = "../dataset"
    model_type = "gpt2"
    model_name = 'MrBananaHuman/kogpt2_small'
    model_dir = "./best_model"

    # --- sequence lengths ---
    doc_max_len = 512
    sum_max_len = 128

    # --- optimization ---
    seed = 42
    epochs = 20 # 10
    batch_size = 2
    lr = 3e-5
    weight_decay = 0.01
    warmup_steps = 5
    scheduler = "linear"

    # --- checkpointing, logging and evaluation cadence ---
    save_path = "./GPT2"
    save_limit = 5
    save_step = 400
    logging_steps = 200
    eval_steps = 200
    
# Start training, passing the `args` class itself as the configuration object.
train(args)