In [None]:
# Install the third-party packages this notebook imports (no versions are pinned).
!pip install sentencepiece
!pip install transformers
!pip install rouge_score
!pip install evaluate
!pip install datasets
!pip install rouge
!pip install wandb
# Run the Weights & Biases CLI login before any run is created.
!wandb login
!pip install pytorch_lightning

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting sentencepiece
  Downloading sentencepiece-0.1.97-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.3/1.3 MB[0m [31m31.1 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: sentencepiece
Successfully installed sentencepiece-0.1.97
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting transformers
  Downloading transformers-4.27.4-py3-none-any.whl (6.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m6.8/6.8 MB[0m [31m30.9 MB/s[0m eta [36m0:00:00[0m
Collecting huggingface-hub<1.0,>=0.11.0
  Downloading huggingface_hub-0.13.3-py3-none-any.whl (199 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m199.8/199.8 KB[0m [31m4.9 MB/s[0m eta [36m0:00:00[0m
Collecting tokenizers!=0.11.3,<0.14

In [None]:
import pandas as pd
import evaluate # library that makes evaluating and comparing models and reporting their performance easier and more standardized.
from datasets import load_dataset, load_metric
import os
import numpy as np
import torch
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader, RandomSampler, SequentialSampler
from transformers import T5Tokenizer, T5ForConditionalGeneration, AdamW
from torch import cuda
from rouge import Rouge
import wandb
import time
from tqdm import tqdm
import pytorch_lightning as pl
from pytorch_lightning.loggers import WandbLogger
from pytorch_lightning.callbacks import ModelCheckpoint

In [None]:
# Fetch the arXiv summarization corpus; the result is indexed by split name below.
dataset = load_dataset(path='ccdv/arxiv-summarization')



Downloading and preparing dataset arxiv-summarization/section to /root/.cache/huggingface/datasets/ccdv___arxiv-summarization/section/1.0.0/fa2c9abf4312afb8660ef8e041d576b8e3943ea96ae771bd3cd091b5798e7cc3...


Downloading data:   0%|          | 0.00/3.36G [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/102M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/102M [00:00<?, ?B/s]

Generating train split: 0 examples [00:00, ? examples/s]

Generating validation split: 0 examples [00:00, ? examples/s]

Generating test split: 0 examples [00:00, ? examples/s]

Dataset arxiv-summarization downloaded and prepared to /root/.cache/huggingface/datasets/ccdv___arxiv-summarization/section/1.0.0/fa2c9abf4312afb8660ef8e041d576b8e3943ea96ae771bd3cd091b5798e7cc3. Subsequent calls will reuse this data.


  0%|          | 0/3 [00:00<?, ?it/s]

In [None]:
# Work on the first 100 rows of the train and test splits only, materialised as
# pandas DataFrames for the dataset wrapper defined below.
first_100_rows = range(100)

train_df = pd.DataFrame(dataset['train'].select(first_100_rows))
test_df = pd.DataFrame(dataset['test'].select(first_100_rows))

In [None]:
class SummarizationDataset(Dataset):
  """Map-style dataset that tokenizes (article, abstract) rows on access.

  Args:
    dataframe: pandas DataFrame with 'article' and 'abstract' columns.
    tokenizer: callable tokenizer that returns 'input_ids' and 'attention_mask'
      tensors and exposes `pad_token_id`.
    source_len: length the tokenized article is truncated/padded to.
    target_len: length the tokenized abstract is truncated/padded to.
  """

  def __init__(self, dataframe, tokenizer, source_len, target_len):
    self.tokenizer = tokenizer
    self.data = dataframe
    self.text_max_token_len = source_len
    self.summary_max_token_len = target_len

  def __len__(self):
    """Return the number of rows in the wrapped DataFrame."""
    return len(self.data)

  def _encode(self, text, max_length):
    """Tokenize `text`, truncating and padding it to exactly `max_length` tokens."""
    # Use the tokenizer handed to the constructor rather than a notebook-level global.
    return self.tokenizer(text, max_length=max_length, return_attention_mask=True, add_special_tokens=True, truncation=True, padding="max_length", return_tensors='pt')

  def __getitem__(self, index):
    """Return the raw texts plus flattened model inputs for the row at position `index`.

    Returns:
      dict with keys 'text', 'summary', 'text_input_ids', 'text_attention_mask',
      'labels' and 'labels_attention_mask'.
    """
    data_row = self.data.iloc[index]

    source_text = data_row['article']
    target_text = data_row['abstract']

    text_encoding = self._encode(source_text, self.text_max_token_len)
    summary_encoding = self._encode(target_text, self.summary_max_token_len)

    text_input_ids = text_encoding['input_ids'].flatten()  # token ids that will be fed to the model
    text_attention_mask = text_encoding['attention_mask'].flatten()  # marks which tokens are attended to

    # Padded target positions are relabelled to -100. The pad id is read from the
    # tokenizer instead of assuming it is 0.
    labels = summary_encoding['input_ids'].flatten()
    labels = labels.masked_fill(labels == self.tokenizer.pad_token_id, -100)

    labels_attention_mask = summary_encoding['attention_mask'].flatten()

    return {
        'text' : source_text,
        'summary' : target_text,
        'text_input_ids': text_input_ids,
        'text_attention_mask': text_attention_mask,
        'labels': labels,
        'labels_attention_mask': labels_attention_mask
    }

In [None]:
class SummarizationDataModule(pl.LightningDataModule):
  """Lightning data module that wraps the summarization DataFrames in DataLoaders.

  Args:
    train_df: DataFrame used for training.
    test_df: DataFrame used for testing (and for validation when `val_df` is None).
    tokenizer: tokenizer forwarded to every SummarizationDataset.
    batch_size: batch size shared by all loaders.
    text_max_token_len: token length for the source text.
    summary_max_token_len: token length for the summary.
    val_df: optional separate validation DataFrame; defaults to reusing `test_df`.
    num_workers: number of DataLoader worker processes (default 2).
  """

  def __init__(self, train_df, test_df, tokenizer, batch_size, text_max_token_len, summary_max_token_len, val_df = None, num_workers = 2):
    super().__init__()
    self.train_df = train_df
    self.test_df = test_df
    self.val_df = val_df

    self.batch_size = batch_size
    self.num_workers = num_workers
    self.tokenizer = tokenizer
    self.text_max_token_len = text_max_token_len
    self.summary_max_token_len = summary_max_token_len

  def _build_dataset(self, dataframe):
    """Wrap `dataframe` in a SummarizationDataset using this module's tokenizer and lengths."""
    return SummarizationDataset(dataframe, self.tokenizer, self.text_max_token_len, self.summary_max_token_len)

  def _build_loader(self, dataset, shuffle):
    """Create a DataLoader over `dataset` with the module-wide batch size and worker count."""
    return DataLoader(dataset, batch_size = self.batch_size, shuffle = shuffle, num_workers = self.num_workers)

  def setup(self, stage = None):
    """Build the train, validation and test datasets."""
    self.train_dataset = self._build_dataset(self.train_df)
    self.test_dataset = self._build_dataset(self.test_df)
    # Without a dedicated validation frame, validate on the test dataset.
    if self.val_df is None:
      self.val_dataset = self.test_dataset
    else:
      self.val_dataset = self._build_dataset(self.val_df)

  def train_dataloader(self):
    """Shuffled loader over the training dataset."""
    return self._build_loader(self.train_dataset, shuffle = True)

  def val_dataloader(self):
    """Unshuffled loader over the validation dataset."""
    return self._build_loader(self.val_dataset, shuffle = False)

  def test_dataloader(self):
    """Unshuffled loader over the test dataset."""
    return self._build_loader(self.test_dataset, shuffle = False)

In [None]:

# Experiment configuration read by the cells below.
model_params = {
    "MODEL": "t5-large",             # checkpoint name passed to from_pretrained
    "BATCH_SIZE": 32,                # batch size; suggested values are 32/64/128
    "TRAIN_EPOCHS": 100,             # passed to the Trainer as max_epochs
    "LEARNING_RATE": 1e-4,           # learning rate
    "MAX_SOURCE_TEXT_LENGTH": 100,   # token length of the tokenized article
    "MAX_TARGET_TEXT_LENGTH": 50,    # token length of the tokenized abstract
    "SEED": 42,                      # set seed for reproducibility
    'device': 'cuda'
}

# Apply the configured seed: declaring it in the dict alone has no effect on any RNG.
pl.seed_everything(model_params["SEED"], workers=True)

tokenizer = T5Tokenizer.from_pretrained(model_params["MODEL"])

Downloading (…)ve/main/spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/1.21k [00:00<?, ?B/s]

For now, this behavior is kept to avoid breaking backwards compatibility when padding/encoding with `truncation is True`.
- Be aware that you SHOULD NOT rely on t5-small automatically truncating your input to 512 when padding/encoding.
- If you want to encode/pad to sequences longer than 512 you can either instantiate this tokenizer with `model_max_length` or pass `max_length` when encoding/padding.


In [None]:
# Bundle the sampled frames, the tokenizer and the configured sizes into the data module.
data_module = SummarizationDataModule(
    train_df=train_df,
    test_df=test_df,
    tokenizer=tokenizer,
    batch_size=model_params['BATCH_SIZE'],
    text_max_token_len=model_params['MAX_SOURCE_TEXT_LENGTH'],
    summary_max_token_len=model_params["MAX_TARGET_TEXT_LENGTH"],
)

In [None]:
# Start the Weights & Biases run, recording the configured number of epochs.
run_config = {'epochs': model_params['TRAIN_EPOCHS']}
wandb.init(project="t5_1", config=run_config)

[34m[1mwandb[0m: Currently logged in as: [33mmariya_goltsova[0m. Use [1m`wandb login --relogin`[0m to force relogin


In [None]:
class SummarizationModel(pl.LightningModule):
  """Lightning module that fine-tunes T5 for summarization and logs loss and ROUGE.

  Reads the notebook-level `model_params` dict and `tokenizer` object.
  """

  # Value the dataset writes into padded label positions.
  IGNORE_INDEX = -100

  def __init__(self):
    super().__init__()
    self.model = T5ForConditionalGeneration.from_pretrained(model_params["MODEL"], return_dict = True)

  def _decode(self, sequences):
    """Decode a batch of token-id sequences into plain strings without special tokens."""
    return [tokenizer.decode(ids, skip_special_tokens=True, clean_up_tokenization_spaces=True) for ids in sequences]

  def forward(self, input_ids, attention_mask, decoder_attention_mask, labels = None):
    """Compute the loss for a batch and score generated summaries against the labels.

    Args:
      input_ids: token ids of the source texts.
      attention_mask: attention mask for `input_ids`.
      decoder_attention_mask: attention mask for the target sequence.
      labels: target token ids with padded positions set to -100.

    Returns:
      Tuple of (loss, logits, rouge) where `rouge` is the averaged score dict
      returned by `Rouge.get_scores`.
    """
    output = self.model(
        input_ids,
        attention_mask = attention_mask,
        labels = labels,
        decoder_attention_mask = decoder_attention_mask
    )

    # NOTE(review): generation runs on every step, including training steps.
    generated_ids = self.model.generate(
                input_ids = input_ids,
                attention_mask = attention_mask,
                max_length=150, # based on the config
                num_beams=2,
                repetition_penalty=2.5,
                length_penalty=1.0,
                early_stopping=True
                )
    preds = self._decode(generated_ids)

    # -100 is not a vocabulary id, so restore the pad id before decoding the
    # references; pad tokens are then dropped by skip_special_tokens.
    reference_ids = labels.masked_fill(labels == self.IGNORE_INDEX, tokenizer.pad_token_id)
    target = self._decode(reference_ids)

    rouge = Rouge().get_scores(preds, target, avg=True)
    return output.loss, output.logits, rouge

  def _shared_step(self, batch, stage):
    """Run `forward` on one batch and log its loss and ROUGE under the `stage` prefix."""
    loss, _, rouge = self(
        input_ids = batch['text_input_ids'],
        attention_mask = batch['text_attention_mask'],
        decoder_attention_mask = batch['labels_attention_mask'],
        labels = batch['labels']
    )

    for metric in ['rouge-1', 'rouge-2', 'rouge-l']:
      for stat in ['p', 'r', 'f']:
        self.log(f'{stage} {metric} {stat}', rouge[metric][stat], prog_bar=True, logger=True)

    self.log(f'{stage}_loss', loss, prog_bar=True, logger=True) # for wandb log
    return loss

  def training_step(self, batch, batch_idx):
    """Log and return the training loss for one batch."""
    return self._shared_step(batch, 'train')

  def validation_step(self, batch, batch_idx):
    """Log and return the validation loss for one batch."""
    return self._shared_step(batch, 'val')

  def test_step(self, batch, batch_idx):
    """Log and return the test loss for one batch."""
    return self._shared_step(batch, 'test')

  def configure_optimizers(self):
    """Create the AdamW optimizer with the learning rate from `model_params`."""
    return AdamW(self.parameters(), lr=model_params["LEARNING_RATE"])

In [None]:
model = SummarizationModel()
# Use the configured device only when CUDA is actually present; an unconditional
# .to('cuda') raises on a machine without a GPU.
target_device = model_params['device'] if torch.cuda.is_available() else 'cpu'
model = model.to(target_device)

In [None]:
# Keep a single checkpoint: the one with the lowest monitored 'val_loss'.
checkpoint_callback = ModelCheckpoint(
    dirpath='checkpoints',
    filename='best-checkpoint',
    save_top_k=1,
    verbose=True,
    monitor='val_loss',
    mode='min',
)

logger = WandbLogger()
# Adding a gpu to the Trainer did not work, and accelerator='gpu' raises
# "MisconfigurationException: No supported gpu backend found!", so no accelerator is passed.
trainer = pl.Trainer(
    max_epochs=model_params['TRAIN_EPOCHS'],
    logger=logger,
    callbacks=[checkpoint_callback],
)

  rank_zero_warn(
INFO:pytorch_lightning.utilities.rank_zero:GPU available: False, used: False
INFO:pytorch_lightning.utilities.rank_zero:TPU available: False, using: 0 TPU cores
INFO:pytorch_lightning.utilities.rank_zero:IPU available: False, using: 0 IPUs
INFO:pytorch_lightning.utilities.rank_zero:HPU available: False, using: 0 HPUs


In [None]:
# Train the model using the loaders supplied by the data module.
trainer.fit(model, datamodule=data_module)

INFO:pytorch_lightning.callbacks.model_summary:
  | Name  | Type                       | Params
-----------------------------------------------------
0 | model | T5ForConditionalGeneration | 60.5 M
-----------------------------------------------------
60.5 M    Trainable params
0         Non-trainable params
60.5 M    Total params
242.026   Total estimated model params size (MB)


Sanity Checking: 0it [00:00, ?it/s]

Training: 0it [00:00, ?it/s]

In [None]:
# Evaluate the model on the data module's test loader.
trainer.test(model, datamodule=data_module)

In [None]:
# Restore the checkpoint recorded as best by the Trainer's checkpoint callback,
# then freeze the restored module.
best_checkpoint_path = trainer.checkpoint_callback.best_model_path
trained_model = SummarizationModel.load_from_checkpoint(best_checkpoint_path)
trained_model.freeze()

In [None]:
def summarize(text):
  """Generate a summary of `text` with the restored `trained_model`.

  Args:
    text: source text to summarize; it is truncated/padded to
      model_params['MAX_SOURCE_TEXT_LENGTH'] tokens.

  Returns:
    The decoded generations joined into a single string.
  """
  text_encoding = tokenizer(text, max_length= model_params['MAX_SOURCE_TEXT_LENGTH'], return_attention_mask = True, add_special_tokens=True, truncation=True, padding="max_length", return_tensors='pt')

  # Move the inputs to the module's device so generation also works when the
  # restored model is on a GPU.
  device = trained_model.device
  generated_ids = trained_model.model.generate(
      input_ids = text_encoding['input_ids'].to(device),
      attention_mask = text_encoding['attention_mask'].to(device),
      max_length = model_params['MAX_TARGET_TEXT_LENGTH'],
      num_beams = 2,
      repetition_penalty = 2.5,
      length_penalty = 1.0,
      early_stopping = True
  )

  preds = [tokenizer.decode(gen_id, skip_special_tokens=True, clean_up_tokenization_spaces=True) for gen_id in generated_ids]

  return "".join(preds)

In [None]:
# Summarize the test article stored under index label 0.
text = test_df.loc[0, 'article']
model_summary = summarize(text)