### Load Dataset

In [1]:
from datasets import load_dataset

In [2]:
dataset = load_dataset("CarperAI/openai_summarize_tldr", cache_dir="cache")

Found cached dataset parquet (D:/Code/summarization_lora/cache/CarperAI___parquet/CarperAI--openai_summarize_tldr-536d9955f5e6f921/0.0.0/2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934492fdef7d8d6f236ec)


  0%|          | 0/3 [00:00<?, ?it/s]

In [3]:
dataset

DatasetDict({
    train: Dataset({
        features: ['prompt', 'label'],
        num_rows: 116722
    })
    test: Dataset({
        features: ['prompt', 'label'],
        num_rows: 6553
    })
    valid: Dataset({
        features: ['prompt', 'label'],
        num_rows: 6447
    })
})

### Process Dataset

In [4]:
from transformers import AutoTokenizer

In [5]:
tokenizer = AutoTokenizer.from_pretrained("t5-small", cache_dir="cache")

In [6]:
MAX_SOURCE_LENGTH = 512
MAX_TARGET_LENGTH = 512

In [7]:
def tokenize(batch):
    tokenized_input = tokenizer(batch['prompt'], truncation=True, max_length=MAX_SOURCE_LENGTH, padding="max_length")
    tokenized_output = tokenizer(batch['label'], truncation=True, max_length=MAX_SOURCE_LENGTH, padding="max_length")
    return {"input_ids": tokenized_input["input_ids"], "attention_mask": tokenized_input["attention_mask"], "labels": tokenized_output["input_ids"]}

In [8]:
tokenized_dataset = dataset.map(tokenize, remove_columns=dataset['train'].column_names, batched=True, batch_size=512)
tokenized_dataset

Loading cached processed dataset at D:\Code\summarization_lora\cache\CarperAI___parquet\CarperAI--openai_summarize_tldr-536d9955f5e6f921\0.0.0\2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934492fdef7d8d6f236ec\cache-fb1ccf883e812870.arrow
Loading cached processed dataset at D:\Code\summarization_lora\cache\CarperAI___parquet\CarperAI--openai_summarize_tldr-536d9955f5e6f921\0.0.0\2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934492fdef7d8d6f236ec\cache-94000f9ce8270320.arrow
Loading cached processed dataset at D:\Code\summarization_lora\cache\CarperAI___parquet\CarperAI--openai_summarize_tldr-536d9955f5e6f921\0.0.0\2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934492fdef7d8d6f236ec\cache-602b54ebd6988695.arrow


DatasetDict({
    train: Dataset({
        features: ['input_ids', 'attention_mask', 'labels'],
        num_rows: 116722
    })
    test: Dataset({
        features: ['input_ids', 'attention_mask', 'labels'],
        num_rows: 6553
    })
    valid: Dataset({
        features: ['input_ids', 'attention_mask', 'labels'],
        num_rows: 6447
    })
})

### Load Model

In [9]:
from transformers import T5ForConditionalGeneration

In [19]:
model = T5ForConditionalGeneration.from_pretrained("t5-small", cache_dir="cache")

In [11]:
model_size = sum(t.numel() for t in model.parameters())
print(f"Model size: {model_size/1000**2:.1f}M parameters")

Model size: 60.5M parameters


### Training

In [12]:
from transformers import Trainer, TrainingArguments

In [17]:
training_args = TrainingArguments(
    output_dir="models",
    logging_dir="logs",
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    evaluation_strategy="steps",
    eval_steps=500,
    logging_steps=500,
    gradient_accumulation_steps=8,
    num_train_epochs=10,
    weight_decay=0.1,
    warmup_steps=1000,
    lr_scheduler_type="cosine",
    learning_rate=1e-3,
    save_steps=1000,
    save_total_limit=1,
    fp16=True,
    report_to="tensorboard"
)

In [20]:
trainer = Trainer(
    model=model,
    tokenizer=tokenizer,
    args=training_args,
    train_dataset=tokenized_dataset["train"],
    eval_dataset=tokenized_dataset["valid"]
)

trainer.train()



Step,Training Loss,Validation Loss
500,1.5666,0.193968
1000,0.2024,0.190083
1500,0.1987,0.186475
2000,0.194,0.184552
2500,0.19,0.18287
3000,0.1897,0.181831
3500,0.1887,0.181482
4000,0.1841,0.180699
4500,0.1827,0.179669
5000,0.1831,0.178697


TrainOutput(global_step=18230, training_loss=0.20809626513892848, metrics={'train_runtime': 19213.7937, 'train_samples_per_second': 60.749, 'train_steps_per_second': 0.949, 'total_flos': 1.5789867815613235e+17, 'train_loss': 0.20809626513892848, 'epoch': 10.0})

In [27]:
output_model_path = "models/fully_supervised_t5-small"
model.save_pretrained(output_model_path)

### Evaluation

In [1]:
from datasets import load_dataset

In [2]:
dataset = load_dataset("CarperAI/openai_summarize_tldr", split="test", cache_dir="cache")

Found cached dataset parquet (D:/Code/summarization_lora/cache/CarperAI___parquet/CarperAI--openai_summarize_tldr-536d9955f5e6f921/0.0.0/2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934492fdef7d8d6f236ec)


In [3]:
dataset

Dataset({
    features: ['prompt', 'label'],
    num_rows: 6553
})

In [4]:
from transformers import AutoTokenizer

In [5]:
tokenizer = AutoTokenizer.from_pretrained("t5-small", cache_dir="cache")

In [6]:
MAX_SOURCE_LENGTH = 512

In [9]:
def tokenize(batch):
    tokenized_input = tokenizer(batch['prompt'], truncation=True, max_length=MAX_SOURCE_LENGTH, padding="max_length")
    return {"input_ids": tokenized_input["input_ids"], "attention_mask": tokenized_input["attention_mask"]}

In [10]:
tokenized_dataset = dataset.map(tokenize, batched=True, batch_size=512)
tokenized_dataset

Map:   0%|          | 0/6553 [00:00<?, ? examples/s]

Dataset({
    features: ['prompt', 'label', 'input_ids', 'attention_mask'],
    num_rows: 6553
})

In [11]:
from transformers import T5ForConditionalGeneration

In [12]:
model = T5ForConditionalGeneration.from_pretrained("models/fully_supervised_t5-small")

In [13]:
device = "cuda"
model = model.to(device)

In [14]:
import torch

In [15]:
input_ids = torch.tensor(tokenized_dataset["input_ids"]).squeeze().to(device)
attention_masks = torch.tensor(tokenized_dataset["attention_mask"]).squeeze().to(device)

In [16]:
from torch.utils.data import TensorDataset, DataLoader

In [17]:
dataset = TensorDataset(input_ids, attention_masks)
dataloader = DataLoader(dataset, batch_size=32, shuffle=False)

In [18]:
from tqdm import tqdm

In [19]:
predicted_summaries = []
for batch in tqdm(dataloader):
    input_ids, attention_mask = batch
    output_ids = model.generate(input_ids, attention_mask=attention_mask, max_new_tokens=50, pad_token_id=tokenizer.eos_token_id)
#     print(output_ids.shape)
    output_ids = output_ids[:, -50:]
#     print(output_ids[0])
    output = tokenizer.batch_decode(output_ids, skip_special_tokens=True)
    predicted_summaries += output

100%|████████████████████████████████████████████████████████████████████████████████| 205/205 [01:59<00:00,  1.71it/s]


In [20]:
len(predicted_summaries)

6553

In [21]:
predicted_summaries[0]

'Met a guy on Facebook, slept with him, he said he wanted to play it by ear, not slap labels. Is this a fwb-type situation, or a fw'

In [22]:
import evaluate

In [23]:
rouge_score = evaluate.load("rouge")

In [24]:
result = rouge_score.compute(predictions=predicted_summaries, references=tokenized_dataset['label'])

In [25]:
result

{'rouge1': 0.3366446030616901,
 'rouge2': 0.12388634748208141,
 'rougeL': 0.2625270246587339,
 'rougeLsum': 0.26232444486870243}