# 환경 세팅

In [None]:
# Install the third-party packages this notebook imports.
# NOTE(review): no versions are pinned. The Trainer arguments used further down
# (`gpus`, `progress_bar_refresh_rate`) and the `validation_epoch_end` hook appear to
# belong to older pytorch_lightning 1.x releases - confirm and pin a compatible version.
!pip install transformers pytorch_lightning pickle5

In [None]:
import pickle5 as pickle
import numpy as np
import pandas as pd
import os
from sklearn.model_selection import train_test_split

import torch
from torch.optim import AdamW
from torch.utils.data import Dataset, DataLoader

import pytorch_lightning as pl
from pytorch_lightning.callbacks import ModelCheckpoint, LearningRateMonitor
from pytorch_lightning.callbacks.early_stopping import EarlyStopping
from pytorch_lightning.loggers import TensorBoardLogger
from pytorch_lightning import loggers as pl_loggers

from transformers import PreTrainedTokenizerFast
from transformers import BartForConditionalGeneration
from transformers.optimization import get_cosine_schedule_with_warmup

import logging

logger = logging.getLogger()
logger.setLevel(logging.INFO)

In [None]:
# Mount Google Drive at /content/drive (Colab-only helper).
from google.colab import drive
drive.mount('/content/drive')

In [None]:
# NOTE(review): no target directory is given here - presumably this should change into the
# project folder that contains ./data (the paths below are relative); confirm the path.
cd 

# Make input data

In [None]:
# Read the pickled dataset from disk.
# NOTE: unpickling can execute arbitrary code, so only load files from a trusted source.
with open('./data/data.pickle', mode='rb') as pickle_file:
    data = pickle.load(pickle_file)

In [None]:
# Build the (text, summary) frame, drop exact duplicate rows, then shuffle the rows
# reproducibly (fixed seed) and renumber the index from zero.
df = (
    pd.DataFrame(data, columns=['text', 'summary'])
    .drop_duplicates()
    .sample(frac=1, random_state=42)
    .reset_index(drop=True)
)

# Split into train / validation / test by fixed row positions.
doc_len = len(df)

train = df.iloc[:7000]
val = df.iloc[7000:8000]
test = df.iloc[8000:]


In [None]:
# Load the AI Hub data.
# NOTE: unpickling can execute arbitrary code, so only load files from a trusted source.
with open('./data/aihub.pickle', mode='rb') as aihub_file:
    data_ai = pickle.load(aihub_file)

ai_df = pd.DataFrame(data_ai, columns=['text', 'summary'])

# Only the training split is extended; row indices are kept as they are, not renumbered.
train = pd.concat([train, ai_df])

In [None]:
# Report the (rows, columns) shape of the full frame and of each split.
for split_label, split_frame in (
    ("total_shape:", df),
    ("train_shape:", train),
    ("val_shape:", val),
    ("test_shape:", test),
):
    print(split_label, split_frame.shape)

In [None]:
# Preview the first rows of the training frame.
train.head()

In [None]:
class SummaryDataset(Dataset):
    """Map-style dataset turning (text, summary) rows into fixed-length id arrays.

    Every item is a dict with ``input_ids``, ``decoder_input_ids`` and ``labels``;
    each value is an integer array of exactly ``max_len`` entries.
    """

    def __init__(self,
                 data: pd.DataFrame,
                 tokenizer: PreTrainedTokenizerFast,
                 max_len: int = 512,
                 ignore_index=-100):
        """Keep the frame and tokenizer and cache the two fill values.

        Args:
            data: Frame with a ``text`` and a ``summary`` column.
            tokenizer: Object providing ``encode``, ``pad_token_id`` and ``eos_token_id``.
            max_len: Length every returned sequence is padded or cut to.
            ignore_index: Value written into label positions past the end of the summary.
        """
        super().__init__()
        self.data = data
        self.len = self.data.shape[0]
        self.tokenizer = tokenizer
        self.max_len = max_len
        self.ignore_index = ignore_index
        self.pad_index = self.tokenizer.pad_token_id

    def _fit_to_max_len(self, inputs, fill_value):
        """Right-fill ``inputs`` with ``fill_value`` up to ``max_len``, or cut it to ``max_len``."""
        missing = self.max_len - len(inputs)
        if missing > 0:
            filler = np.array([fill_value] * missing)
            return np.concatenate([inputs, filler])
        # Already long enough: keep only the first ``max_len`` entries.
        return inputs[:self.max_len]

    def add_padding_data(self, inputs):
        """Pad ``inputs`` with the tokenizer's pad id, or truncate it, to ``max_len``."""
        return self._fit_to_max_len(inputs, self.pad_index)

    def add_ignored_data(self, inputs):
        """Pad ``inputs`` with ``ignore_index``, or truncate it, to ``max_len``."""
        return self._fit_to_max_len(inputs, self.ignore_index)

    def __getitem__(self, idx):
        """Encode the row at position ``idx`` into encoder ids, decoder ids and labels."""
        row = self.data.iloc[idx]
        eos_id = self.tokenizer.eos_token_id

        source_ids = self.add_padding_data(self.tokenizer.encode(row['text']))

        summary_ids = self.tokenizer.encode(row['summary'])
        # Decoder input is the summary with the EOS id in front; the labels are the
        # summary with the EOS id at the end (i.e. the decoder input shifted by one).
        shifted_ids = self.add_padding_data([eos_id] + summary_ids)
        target_ids = self.add_ignored_data(summary_ids + [eos_id])

        return {
            'input_ids': np.array(source_ids, dtype=np.int_),
            'decoder_input_ids': np.array(shifted_ids, dtype=np.int_),
            'labels': np.array(target_ids, dtype=np.int_),
        }

    def __len__(self):
        """Return the number of rows in the wrapped frame."""
        return self.len


In [None]:
class SummaryDataModule(pl.LightningDataModule):
    """Lightning data module that wraps the train/val/test frames in ``SummaryDataset``s."""

    def __init__(self,
                 train_df: pd.DataFrame,
                 val_df: pd.DataFrame,
                 test_df: pd.DataFrame,
                 tokenizer: PreTrainedTokenizerFast,
                 max_len: int = 512,
                 batch_size: int = 8,
                 num_workers: int = 2):
        """Remember the frames, the tokenizer and the loader settings.

        Args:
            train_df: Frame used for the training dataset.
            val_df: Frame used for the validation dataset.
            test_df: Frame used for the test dataset.
            tokenizer: Tokenizer handed to every ``SummaryDataset``.
            max_len: Sequence length handed to every ``SummaryDataset``.
            batch_size: Batch size of all three dataloaders.
            num_workers: Worker count of all three dataloaders.
        """
        super().__init__()
        self.train_df = train_df
        self.val_df = val_df
        self.test_df = test_df
        self.tokenizer = tokenizer
        self.max_len = max_len
        self.batch_size = batch_size
        self.num_workers = num_workers

    def _make_dataset(self, frame):
        """Wrap ``frame`` in a ``SummaryDataset`` using the shared tokenizer and length."""
        return SummaryDataset(frame, self.tokenizer, self.max_len)

    def _make_loader(self, dataset, shuffle):
        """Create a ``DataLoader`` over ``dataset`` with the shared batch/worker settings."""
        return DataLoader(dataset,
                          batch_size=self.batch_size,
                          num_workers=self.num_workers,
                          shuffle=shuffle)

    def setup(self, stage):
        """Build all three datasets; ``stage`` is accepted but not consulted."""
        self.train = self._make_dataset(self.train_df)
        self.val = self._make_dataset(self.val_df)
        self.test = self._make_dataset(self.test_df)

    def train_dataloader(self):
        """Return the shuffled training dataloader."""
        return self._make_loader(self.train, shuffle=True)

    def val_dataloader(self):
        """Return the validation dataloader (original order)."""
        return self._make_loader(self.val, shuffle=False)

    def test_dataloader(self):
        """Return the test dataloader (original order)."""
        return self._make_loader(self.test, shuffle=False)

In [None]:
# Load the pretrained KoBART tokenizer and hand the three splits to the data module.
tokenizer = PreTrainedTokenizerFast.from_pretrained('gogamza/kobart-base-v1')
data_module = SummaryDataModule(
    train_df=train,
    val_df=val,
    test_df=test,
    tokenizer=tokenizer,
    max_len=512,
    batch_size=8,
    num_workers=2,
)

# Model

In [None]:
class KoBARTConditionalGeneration(pl.LightningModule):
    """Lightning wrapper that fine-tunes ``gogamza/kobart-base-v1`` as a seq2seq model."""

    def __init__(self):
        """Load the pretrained BART model and its tokenizer and cache the pad token id."""
        super().__init__()
        self.model = BartForConditionalGeneration.from_pretrained('gogamza/kobart-base-v1')
        self.model.train()
        self.bos_token = '<s>'
        self.eos_token = '</s>'

        self.tokenizer = PreTrainedTokenizerFast.from_pretrained('gogamza/kobart-base-v1')
        self.pad_token_id = self.tokenizer.pad_token_id

    def forward(self, inputs):
        """Run the BART model on one batch and return its output object.

        Args:
            inputs: Mapping with ``input_ids``, ``decoder_input_ids`` and ``labels``
                tensors (as produced by ``SummaryDataset``).

        Returns:
            The model output (``return_dict=True``); it carries the loss because
            ``labels`` are passed.
        """
        # Mask is 1.0 wherever the token differs from the pad id, 0.0 on padding.
        attention_mask = inputs['input_ids'].ne(self.pad_token_id).float()
        decoder_attention_mask = inputs['decoder_input_ids'].ne(self.pad_token_id).float()

        return self.model(input_ids=inputs['input_ids'],
                          attention_mask=attention_mask,
                          decoder_input_ids=inputs['decoder_input_ids'],
                          decoder_attention_mask=decoder_attention_mask,
                          labels=inputs['labels'], return_dict=True)

    def setup_steps(self, stage=None):
        """Return the number of batches in one pass over the training dataloader.

        ``stage`` is accepted for backward compatibility and is not used.
        """
        # NOTE(review): this reaches into private Trainer attributes, which may differ
        # between pytorch_lightning versions - confirm against the installed release.
        train_loader = self.trainer._data_connector._train_dataloader_source.dataloader()
        return len(train_loader)

    def configure_optimizers(self):
        """Create AdamW plus a per-step cosine schedule with linear warmup.

        Returns:
            ``([optimizer], [lr_scheduler_config])`` in the format Lightning expects.
        """
        lr = 3e-5
        # Must match the ``max_epochs`` given to the Trainer so the schedule spans the run.
        max_epochs = 30
        warmup_ratio = 0.1

        # Biases and LayerNorm parameters are excluded from weight decay.
        param_optimizer = list(self.model.named_parameters())
        no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
        optimizer_grouped_parameters = [
            {'params': [p for n, p in param_optimizer if not any(
                nd in n for nd in no_decay)], 'weight_decay': 0.01},
            {'params': [p for n, p in param_optimizer if any(
                nd in n for nd in no_decay)], 'weight_decay': 0.0}
        ]

        optimizer = AdamW(optimizer_grouped_parameters, lr=lr)

        # len(dataloader) is already the number of batches (= scheduler steps) per epoch,
        # so it must NOT be divided by the batch size again. Doing so made the schedule
        # horizon far too short: the cosine curve finished early and, being stepped past
        # its end, let the learning rate climb back up.
        # Assumes one optimizer step per batch (no gradient accumulation).
        steps_per_epoch = self.setup_steps()
        logging.info(f'steps per epoch : {steps_per_epoch}')
        num_train_steps = steps_per_epoch * max_epochs
        logging.info(f'num_train_steps : {num_train_steps}')
        num_warmup_steps = int(num_train_steps * warmup_ratio)
        logging.info(f'num_warmup_steps : {num_warmup_steps}')
        scheduler = get_cosine_schedule_with_warmup(optimizer,
                                                    num_warmup_steps=num_warmup_steps,
                                                    num_training_steps=num_train_steps)
        # 'interval': 'step' makes Lightning advance the scheduler after every batch.
        lr_scheduler = {'scheduler': scheduler,
                        'monitor': 'loss', 'interval': 'step',
                        'frequency': 1}

        return [optimizer], [lr_scheduler]

    def training_step(self, batch, batch_idx):
        """Compute and log the training loss for one batch."""
        outs = self(batch)
        loss = outs.loss
        # ``batch`` is a dict, so len(batch) would be the number of keys; the real
        # batch size is the first dimension of any of its tensors.
        self.log('train_loss', loss, prog_bar=True, logger=True,
                 batch_size=batch['input_ids'].size(0))
        return loss

    def validation_step(self, batch, batch_idx):
        """Return the loss of one validation batch."""
        outs = self(batch)
        loss = outs['loss']
        return (loss)

    def validation_epoch_end(self, outputs):
        """Log the mean of the per-batch validation losses as ``val_loss``."""
        losses = list(outputs)
        self.log('val_loss', torch.stack(losses).mean(), prog_bar=True)

In [None]:
EPOCH = 30

# Keep only the single best checkpoint, ranked by the lowest logged 'val_loss'.
# NOTE(review): "bast" in the file name looks like a typo for "best"; left unchanged
# because it only affects the name of the file written to disk.
checkpoint_callback = ModelCheckpoint(dirpath="checkpoints",
                                      filename="bast-checkpoint",
                                      save_top_k=1,
                                      verbose=True,
                                      monitor="val_loss",
                                      mode="min")

# Stop when 'val_loss' has not improved by at least 0.001 for 3 validation checks.
early_stop_callback = EarlyStopping(monitor="val_loss", min_delta=0.001, patience=3)

# NOTE: this rebinds the module-level name `logger` (previously the root logging logger).
logger = TensorBoardLogger("lightning_logs", name="summary")

# Callback instances must be registered through `callbacks=`. `checkpoint_callback=` is
# only an on/off flag, so passing the list there never installed these two callbacks and
# left `trainer.checkpoint_callback.best_model_path` without the configured checkpoint.
trainer = pl.Trainer(logger=logger,
                     callbacks=[checkpoint_callback, early_stop_callback],
                     max_epochs=EPOCH,
                     gpus=1,
                     progress_bar_refresh_rate=30)

model = KoBARTConditionalGeneration()

In [None]:
# Fine-tune the model using the dataloaders provided by the data module.
trainer.fit(model, data_module)

In [None]:
# Show TensorBoard inline, reading the logs under ./lightning_logs.
%load_ext tensorboard
%tensorboard --logdir ./lightning_logs

In [None]:
# Restore the module from the trainer's best checkpoint, freeze it, and export the
# wrapped BART model to ./model in the Hugging Face `save_pretrained` format.
best_checkpoint_path = trainer.checkpoint_callback.best_model_path
trained_model = KoBARTConditionalGeneration.load_from_checkpoint(best_checkpoint_path)
trained_model.freeze()

trained_model.model.save_pretrained("./model")

In [None]:
def summarize(text):
    """Generate a summary of ``text`` with the model saved under ``./model``.

    The model is loaded from disk on every call and the module-level ``tokenizer``
    is used for encoding and decoding.

    Args:
        text: Source document to summarize.

    Returns:
        The decoded summary string with special tokens removed.
    """
    summarizer = BartForConditionalGeneration.from_pretrained('./model')

    # Wrap the encoded text in BOS ... EOS and add a batch dimension of one.
    token_ids = [tokenizer.bos_token_id, *tokenizer.encode(text), tokenizer.eos_token_id]
    batch = torch.tensor([token_ids])

    generated = summarizer.generate(batch, num_beams=4, max_length=512, eos_token_id=1)
    return tokenizer.decode(generated.squeeze().tolist(), skip_special_tokens=True)
  

In [None]:
# Take the second row of the test split and summarize its text with the exported model.
sample_row = test.iloc[1]
text = sample_row["text"]
model_summary = summarize(text)

In [None]:
# Source text of the sample row.
print(text)

In [None]:
# Reference summary stored with the sample row.
print(sample_row["summary"])

In [None]:
# Summary generated by the fine-tuned model for the same row.
print(model_summary)