<a href="https://colab.research.google.com/github/newfull5/AI-Project/blob/master/summarization.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install transformers datasets torch wandb tqdm

- model = t5-base
- dataset = CNN_dailymail 3.0.0
  - max_token_length = about 3,500
  

- config
  - max_length_truncate = 1024
  


In [2]:
from datasets import load_dataset
from tqdm import tqdm

In [8]:
# length 0 ~ 500 => 1900
# length 500 ~ 1000 => 5400
# length 1000 ~ 1500 => 2800
# length 1500 ~ 2000 -> 1150
# length 2000 ~  -> 500

In [40]:
# predict

outputs[0].shape

torch.Size([1, 42, 32128])

In [43]:
# logits

len(outputs[1])

24

In [39]:
outputs[2].shape

torch.Size([1, 787, 1024])

In [3]:
import argparse

def _get_parser():  
    parser = argparse.ArgumentParser()
    parser.add_argument()
    return parser
   

args = argparse.Namespace(  
  model_name="t5-large", 
  tokenizer_name="t5-large",
  dataset_name=['cnn_dailymail', '3.0.0'],
  batch_size = 4
)

In [4]:
from torch.utils.data import Dataset
from transformers import AutoTokenizer
from datasets import load_dataset

class Dataset(Dataset):
  def __init__(self, args, stage):
    super().__init__()
    self.stage = stage
    self.args = args
    self.input_ids, self.attention_mask, self.decoder_input_ids, self.labels = self._get_data(args, stage)

  def _get_data(self, args, stage):
    dataset = load_dataset(args.dataset_name[0], args.dataset_name[1])
    tokenizer = AutoTokenizer.from_pretrained(args.tokenizer_name)
    input_ids = []
    attention_mask = []
    decoder_input_ids = []
    labels = []

    if stage in ['train', 'validation', 'test']:
      for train_set in dataset[stage]:
        inputs = tokenizer(
            text=train_set['article'],
            max_length=1024,
            padding='max_length',
            truncation=True,
            return_tensors='pt'
        )

        input_ids.append(inputs['input_ids'])
        attention_mask.append(inputs['attention_mask'])
        
        outputs = tokenizer(
            text=train_set['highlights'],
            max_length=512,
            padding='max_length',
            truncation=True,
            return_tensors='pt'
        )

        decoder_input_ids.append(outputs['input_ids'][:, :-1].contiguous())
        label = outputs['input_ids'][:, 1:].clone().detach()
        label[label == tokenizer.pad_token_id] = -100
        labels.append(label)
    else:
      raise Exception("you can set stage only 'train', 'test' or 'valid'")

    return input_ids, attention_mask, decoder_input_ids, labels
      
  def __len__(self):
    return len(self.labels)

  def __getitem__(self, idx):
    return self.input_ids[idx], self.attention_mask[idx], self.decoder_input_ids[idx], self.labels[idx]

In [5]:
from transformers import AutoModelForSeq2SeqLM
import torch
from torch import nn

class Model(nn.Module):
  def __init__(self, args):
    super(Model, self).__init__()
    self.model = AutoModelForSeq2SeqLM.from_pretrained(args.model_name)
    self.device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    self.model.to(self.device)

  def forward(self, batch):
    input_ids, attention_mask, decoder_input_ids, labels = batch
    outputs = self.model(
        input_ids=self._move_to_cuda(input_ids).squeeze(),
        attention_mask=self._move_to_cuda(attention_mask),
        decoder_input_ids=self._move_to_cuda(decoder_input_ids).squeeze(),
        labels=self._move_to_cuda(labels)    
    )
    return outputs

  def _move_to_cuda(self, inputs):
    if torch.is_tensor(inputs):
      return inputs.to(self.device)
    elif isinstance(inputs, list):
      return [self._move_to_cuda(x) for x in inputs]
    elif isinstance(inputs, dict):
      return {key: self._move_to_cuda(value) for key, value in inputs.items()}
    else:
      return inputs

  def save(self, save_dir):
    self.model.save_pretrained(save_dir)

  def load(self, save_dir):
    self.model.load_state_dict(
        torch.load(f"{save_dir}/pytorch_model.bin", map_location=torch.device(self.device))
    )

In [7]:
args

Namespace(batch_size=4, dataset_name=['cnn_dailymail', '3.0.0'], model_name='t5-large', tokenizer_name='t5-large')

In [6]:
from torch.utils.data import DataLoader

test_loader = DataLoader(
    dataset = Dataset(args, 'test'),
    batch_size = args.batch_size,
    shuffle = True
)

In [7]:
model = Model(args)

In [8]:
batch = [n for n in test_loader][0]

In [14]:
batch

[tensor([[[20855, 10213,    19,  ...,    63,     6,     1]],
 
         [[ 1589,  7223, 21608,  ...,     0,     0,     0]],
 
         [[   71,  4169,    18,  ...,     0,     0,     0]],
 
         [[    3, 29541,    13,  ...,     0,     0,     0]]]),
 tensor([[[1, 1, 1,  ..., 1, 1, 1]],
 
         [[1, 1, 1,  ..., 0, 0, 0]],
 
         [[1, 1, 1,  ..., 0, 0, 0]],
 
         [[1, 1, 1,  ..., 0, 0, 0]]]),
 tensor([[[23270, 20767,   106,  ...,     0,     0,     0]],
 
         [[13439, 10715,  7975,  ...,     0,     0,     0]],
 
         [[  549, 24947,  2365,  ...,     0,     0,     0]],
 
         [[    3, 29541,   130,  ...,     0,     0,     0]]]),
 tensor([[[20767,   106,     6,  ...,  -100,  -100,  -100]],
 
         [[10715,  7975,  7511,  ...,  -100,  -100,  -100]],
 
         [[24947,  2365,    10,  ...,  -100,  -100,  -100]],
 
         [[29541,   130,   646,  ...,  -100,  -100,  -100]]])]

In [None]:
class Trainer:
  