<a href="https://colab.research.google.com/github/newfull5/AI-Project/blob/master/summarization.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install transformers datasets torch wandb tqdm evaluate

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting evaluate
  Downloading evaluate-0.4.0-py3-none-any.whl (81 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m81.4/81.4 KB[0m [31m8.1 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: evaluate
Successfully installed evaluate-0.4.0


- model = t5-large
- dataset = CNN_dailymail 3.0.0
  - max_token_length = about 3,500
  

- config
  - max_length_truncate = 1024
  


In [None]:
from datasets import load_dataset
from tqdm import tqdm

In [None]:
# length 0 ~ 500 => 1900
# length 500 ~ 1000 => 5400
# length 1000 ~ 1500 => 2800
# length 1500 ~ 2000 => 1150
# length 2000 ~ => 500

In [None]:
import argparse

def _get_parser():  
    parser = argparse.ArgumentParser()
    parser.add_argument()
    return parser
   

# Run configuration shared by the dataset, model and trainer cells.
args = argparse.Namespace(**{
  "model_name": "t5-large",
  "tokenizer_name": "t5-large",
  # [dataset path, dataset configuration]
  "dataset_name": ['cnn_dailymail', '3.0.0'],
  "batch_size": 4,
  "lr": 3e-5,
  # number of optimizer steps between validation runs
  "val_check_interval": 2000,
})

In [None]:
from torch.utils.data import Dataset
from transformers import AutoTokenizer
from datasets import load_dataset

class Dataset(Dataset):
  """Pre-tokenized article/summary pairs for one split of the configured dataset.

  Every example of the split is tokenized when the object is constructed and
  kept in memory as four parallel lists. The class deliberately keeps the name
  ``Dataset`` (shadowing the imported torch base class) because other cells
  instantiate it under that name.

  Each item is a tuple ``(input_ids, attention_mask, decoder_input_ids,
  labels)``. Because every example is tokenized on its own with
  ``return_tensors='pt'``, each tensor carries a leading dimension of size 1.

  Args:
    args: namespace providing ``dataset_name`` (``[path, config]``) and
      ``tokenizer_name``.
    stage: split to load; one of ``'train'``, ``'validation'`` or ``'test'``.

  Raises:
    ValueError: if ``stage`` is not one of the supported splits.
  """

  # Split names accepted by ``stage``.
  _STAGES = ('train', 'validation', 'test')

  def __init__(self, args, stage):
    super().__init__()
    self.stage = stage
    self.args = args
    self.input_ids, self.attention_mask, self.decoder_input_ids, self.labels = self._get_data(args, stage)

  def _get_data(self, args, stage):
    """Load the requested split and tokenize every article/highlights pair.

    Returns:
      Four lists of equal length: encoder input ids, encoder attention masks,
      decoder input ids and labels (padding positions set to -100).
    """
    # Validate before touching the network/disk so a typo fails immediately.
    # ValueError is a subclass of the Exception raised previously.
    if stage not in self._STAGES:
      raise ValueError("you can set stage only 'train', 'validation' or 'test'")

    # Only the requested split is needed, so ask for it directly.
    split = load_dataset(args.dataset_name[0], args.dataset_name[1], split=stage)
    tokenizer = AutoTokenizer.from_pretrained(args.tokenizer_name)
    input_ids = []
    attention_mask = []
    decoder_input_ids = []
    labels = []

    for example in split:
      inputs = tokenizer(
          text=example['article'],
          max_length=1024,
          padding='max_length',
          truncation=True,
          return_tensors='pt'
      )

      input_ids.append(inputs['input_ids'])
      attention_mask.append(inputs['attention_mask'])

      outputs = tokenizer(
          text=example['highlights'],
          max_length=256,
          padding='max_length',
          truncation=True,
          return_tensors='pt'
      )

      # Teacher forcing: the decoder is fed tokens [0, n-1) and is trained to
      # predict tokens [1, n).
      # NOTE(review): with this split the first target token is never a
      # prediction target. This presumably expects a tokenizer that prepends a
      # start token — confirm it matches the decoder-start convention of the
      # configured model.
      decoder_input_ids.append(outputs['input_ids'][:, :-1].contiguous())
      label = outputs['input_ids'][:, 1:].clone().detach()
      # -100 marks padding positions so they are excluded from the loss.
      label[label == tokenizer.pad_token_id] = -100
      labels.append(label)

    return input_ids, attention_mask, decoder_input_ids, labels

  def __len__(self):
    """Number of tokenized examples in the split."""
    return len(self.labels)

  def __getitem__(self, idx):
    """Return ``(input_ids, attention_mask, decoder_input_ids, labels)`` for ``idx``."""
    return self.input_ids[idx], self.attention_mask[idx], self.decoder_input_ids[idx], self.labels[idx]

In [None]:
from transformers import AutoModelForSeq2SeqLM
import torch
from torch import nn

class Model(nn.Module):
  """Thin wrapper around a pretrained seq2seq LM that handles device placement.

  Args:
    args: namespace providing ``model_name``, the checkpoint passed to
      ``AutoModelForSeq2SeqLM.from_pretrained``.
  """

  def __init__(self, args):
    super().__init__()
    self.model = AutoModelForSeq2SeqLM.from_pretrained(args.model_name)
    self.device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    self.model.to(self.device)

  def forward(self, batch):
    """Run the wrapped model on one batch.

    Args:
      batch: ``(input_ids, attention_mask, decoder_input_ids, labels)``. Each
        tensor may be ``(batch, seq)`` or carry extra singleton dimensions
        such as ``(batch, 1, seq)``.

    Returns:
      Whatever the wrapped model returns for these keyword arguments.
    """
    input_ids, attention_mask, decoder_input_ids, labels = batch
    outputs = self.model(
        input_ids=self._to_2d(input_ids),
        attention_mask=self._to_2d(attention_mask),
        decoder_input_ids=self._to_2d(decoder_input_ids),
        labels=self._to_2d(labels)
    )
    return outputs

  def _to_2d(self, tensor):
    """Move ``tensor`` to the model device and flatten it to ``(batch, seq)``.

    A bare ``squeeze()`` would also remove the batch dimension when the batch
    holds a single example; reshaping on the last dimension keeps it.
    """
    tensor = self._move_to_cuda(tensor)
    return tensor.reshape(-1, tensor.size(-1))

  def _move_to_cuda(self, inputs):
    """Recursively move tensors (also inside lists/dicts) to ``self.device``.

    Non-tensor leaves are returned unchanged.
    """
    if torch.is_tensor(inputs):
      return inputs.to(self.device)
    elif isinstance(inputs, list):
      return [self._move_to_cuda(x) for x in inputs]
    elif isinstance(inputs, dict):
      return {key: self._move_to_cuda(value) for key, value in inputs.items()}
    else:
      return inputs

  def save(self, save_dir):
    """Write the wrapped model to ``save_dir`` via ``save_pretrained``."""
    self.model.save_pretrained(save_dir)

  def load(self, save_dir):
    """Load weights previously written by :meth:`save` into the wrapped model.

    The weights are copied into the existing parameters, so objects holding
    references to them (e.g. an optimizer) stay valid.
    """
    import os

    weights_path = os.path.join(save_dir, "pytorch_model.bin")
    if os.path.isfile(weights_path):
      state_dict = torch.load(weights_path, map_location=self.device)
    else:
      # ``save_pretrained`` does not always produce ``pytorch_model.bin``
      # (other serialization formats / sharded checkpoints); let the library
      # resolve whatever layout is present in the directory.
      state_dict = AutoModelForSeq2SeqLM.from_pretrained(save_dir).state_dict()
    self.model.load_state_dict(state_dict)

In [None]:
args

Namespace(batch_size=4, dataset_name=['cnn_dailymail', '3.0.0'], model_name='t5-large', tokenizer_name='t5-large')

In [None]:
from torch.utils.data import DataLoader

# Batches over the test split; shuffling makes the batch inspected below a random sample.
test_loader = DataLoader(Dataset(args, 'test'), batch_size=args.batch_size, shuffle=True)

In [None]:
# Build the wrapper; its constructor loads the pretrained checkpoint named by args.model_name.
model = Model(args)

Downloading:   0%|          | 0.00/2.95G [00:00<?, ?B/s]

In [None]:
# Take a single batch for inspection without iterating (and collating) the whole loader.
batch = next(iter(test_loader))

In [None]:
batch

[tensor([[[   94,  1416,   114,  ...,    53,   590,     1]],
 
         [[  555,  9745, 17021,  ...,     0,     0,     0]],
 
         [[   71,  2095, 13100,  ...,     0,     0,     0]],
 
         [[ 9765, 13450,   808,  ...,     0,     0,     0]]]),
 tensor([[[1, 1, 1,  ..., 1, 1, 1]],
 
         [[1, 1, 1,  ..., 0, 0, 0]],
 
         [[1, 1, 1,  ..., 0, 0, 0]],
 
         [[1, 1, 1,  ..., 0, 0, 0]]]),
 tensor([[[   71,   388,  1938,  ...,     0,     0,     0]],
 
         [[    3, 10038,  5911,  ...,     0,     0,     0]],
 
         [[30214,  2715,   480,  ...,     0,     0,     0]],
 
         [[ 9765, 13450,    65,  ...,     0,     0,     0]]]),
 tensor([[[  388,  1938,    16,  ...,  -100,  -100,  -100]],
 
         [[10038,  5911,   100,  ...,  -100,  -100,  -100]],
 
         [[ 2715,   480,  8489,  ...,  -100,  -100,  -100]],
 
         [[13450,    65,   118,  ...,  -100,  -100,  -100]]])]

In [None]:
# Forward pass on the sampled batch; the result is inspected via .loss and .logits below.
outputs = model(batch)

In [None]:
# Unpacking the model output yields its field names (printed in the next cell), not the tensors.
a,b,c,d = outputs

In [None]:
print(a,b,c,d)

loss logits past_key_values encoder_last_hidden_state


In [None]:
from torch.cuda.amp import GradScaler, autocast
from tqdm import tqdm
import wandb

class Trainer:
  def __init__(self, args, model, train_loader, validation_loader):
    self.args = args
    self.model = model
    self.train_loader = train_loader
    self.valid_loader = validation_loader
    self.global_steps = 0
    self.optimizer = torch.optim.SGD(self.model.parameters(), lr=args.lr)
    self.device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    self.val_check_interval = args.val_check_interval
    self.global_steps = 0

  def training_phase(self):
    self.model.train()
    scaler = GradScaler()
    total_train_loss = 0
    train_steps = 0

    for batch in tqdm(self.train_loader):
      self.optimizer.zero_grad()
      with autocast(device_type=self.device, dtype=torch.float16):
        outputs = model(batch)
      
      scaler.scale(outputs.loss).backward()
      scaler.step(self.optimizer)
      scaler.update()
      self.global_steps += 1
      totla_train_loss += float(outputs.loss)

      if self.global_step % self.val_check_step == 0 and self.global_step != 0:
        wandb.log({
            'train_loss': (total_train_loss/self.val_check_interval)
            })
        
        total_train_loss = 0
        model.save(f"./cnn_daily_summrization/{self.global_step}/")
        self.valid_phase()

  def valid_phase(self):
    self.model_eval()


  

In [None]:
outputs.logits.shape

torch.Size([4, 511, 32128])

In [None]:
# Tokenizer for args.tokenizer_name, used in the following cells to decode token ids.
tokenizer = AutoTokenizer.from_pretrained(args.tokenizer_name)

For now, this behavior is kept to avoid breaking backwards compatibility when padding/encoding with `truncation is True`.
- Be aware that you SHOULD NOT rely on t5-large automatically truncating your input to 512 when padding/encoding.
- If you want to encode/pad to sequences longer than 512 you can either instantiate this tokenizer with `model_max_length` or pass `max_length` when encoding/padding.


In [None]:
float(outputs.loss)

7.111657619476318

In [None]:
# Logits are (batch, seq, vocab) scores rather than token ids: take the arg-max
# id at every position of the first example and decode that id sequence.
tokenizer.decode(outputs.logits[0].argmax(dim=-1))

TypeError: ignored

In [None]:
tokenizer.decode(3)

''

In [None]:
batch[-1][0]

tensor([[  388,  1938,    16,     8, 28355,    13,  5186,  1117, 18838,     3,
            60,     7,  8312,    53,     3,     9,   628,  2009,    19,    30,
             8,   512,    21, 15287,   770,     3,     5,    37,   785,   718,
           901, 18413,     9,     6,  2561,     7,    30,     3,  4552,  9704,
            13,   878,    77,   929,    15, 12574,  1161,   222,    11, 26533,
             7,   147,  7366, 14604,    13,  2608,  6849,     3,     5,     3,
         14454,    57, 13188,  5417, 16634,     6,    34,   751,  3746,  2548,
            13,     3, 16768,     7,  1384,    13,     8,  2929,    21,     8,
           538,    16,  1412,     3,     5,    94,  8125,     7,  1296,  8458,
             6,  1296, 14704,     6,   874, 19615,   725,     6,     3,     9,
           204,  5548,  4782, 25974,    11,     3,     9,     3, 17801,  5612,
           910, 14604,    45,     8,   629,     3,     5,     1,  -100,  -100,
          -100,  -100,  -100,  -100,  -100,  -100,  

In [None]:
outputs.logits[0].argmax(dim=1)

tensor([   71,     3,     3,    91,    91, 32077, 32077,    13, 18838,    13,
            5, 32077,     3,    53, 32077,    60, 32077, 32077,    11,     3,
            8,     3,    13,    13,     3,     3,     5,     3,    16,    19,
          491,     3,     9,    19,    47,     7,   219,     3,     9,    64,
           13,     3,    64,    64,    15,    64,    64,   222,    64,     3,
            7,     3,  7366,  2286,    13,   193,   229,     6,     5,    64,
            5,    11,    64,     6,     3,     7,     3,    31,     3,     7,
           21,     3,     9,     7,     3,    13,     3,     3,    21,     3,
            3,     3,     3,     3,     5,     3,     7,     7,     3,     3,
           64,  1296,  8458,     6,   305, 19615,   725,    64,    11,     9,
           64,  5548,    11,    11,    11,     3,     9,     3,     7,    18,
            6,     3,   193,     9,     3,     9,     9, 32077, 32077,     3,
            3,     3,     3,     3,     3,     3,     3,     3, 

In [None]:
# `evaluate` is installed by the first cell but was never imported, hence the NameError.
import evaluate

evaluate.load('rouge')

NameError: ignored