### Load Dataset

In [1]:
from datasets import load_dataset

In [2]:
dataset = load_dataset("CarperAI/openai_summarize_tldr", cache_dir="cache")

Found cached dataset parquet (D:/Code/summarization_lora/cache/CarperAI___parquet/CarperAI--openai_summarize_tldr-536d9955f5e6f921/0.0.0/2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934492fdef7d8d6f236ec)


  0%|          | 0/3 [00:00<?, ?it/s]

In [3]:
dataset

DatasetDict({
    train: Dataset({
        features: ['prompt', 'label'],
        num_rows: 116722
    })
    test: Dataset({
        features: ['prompt', 'label'],
        num_rows: 6553
    })
    valid: Dataset({
        features: ['prompt', 'label'],
        num_rows: 6447
    })
})

### Process Dataset

In [4]:
from transformers import AutoTokenizer

In [5]:
tokenizer = AutoTokenizer.from_pretrained("t5-small", cache_dir="cache")

In [6]:
MAX_SOURCE_LENGTH = 512
MAX_TARGET_LENGTH = 512

In [7]:
def tokenize(batch):
    tokenized_input = tokenizer(batch['prompt'], truncation=True, max_length=MAX_SOURCE_LENGTH, padding="max_length")
    tokenized_output = tokenizer(batch['label'], truncation=True, max_length=MAX_SOURCE_LENGTH, padding="max_length")
    return {"input_ids": tokenized_input["input_ids"], "attention_mask": tokenized_input["attention_mask"], "labels": tokenized_output["input_ids"]}

In [8]:
tokenized_dataset = dataset.map(tokenize, remove_columns=dataset['train'].column_names, batched=True, batch_size=512)
tokenized_dataset

Map:   0%|          | 0/116722 [00:00<?, ? examples/s]

Map:   0%|          | 0/6553 [00:00<?, ? examples/s]

Map:   0%|          | 0/6447 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['input_ids', 'attention_mask', 'labels'],
        num_rows: 116722
    })
    test: Dataset({
        features: ['input_ids', 'attention_mask', 'labels'],
        num_rows: 6553
    })
    valid: Dataset({
        features: ['input_ids', 'attention_mask', 'labels'],
        num_rows: 6447
    })
})

### Load Model

In [9]:
from transformers import T5ForConditionalGeneration

In [30]:
model = T5ForConditionalGeneration.from_pretrained("t5-small", cache_dir="cache")

In [11]:
model_size = sum(t.numel() for t in model.parameters())
print(f"Model size: {model_size/1000**2:.1f}M parameters")

Model size: 60.5M parameters


In [31]:
for param in model.parameters():
    param.requires_grad = False

In [25]:
from peft import LoraConfig, TaskType

peft_config = LoraConfig(task_type=TaskType.SEQ_2_SEQ_LM, inference_mode=False, r=4, lora_alpha=32, lora_dropout=0.1)

In [32]:
from peft import get_peft_model

model = get_peft_model(model, peft_config)
model.print_trainable_parameters()

trainable params: 147,456 || all params: 60,654,080 || trainable%: 0.24310977925969696


### Training

In [27]:
from transformers import Trainer, TrainingArguments

In [33]:
training_args = TrainingArguments(
    output_dir="models_lora/",
    logging_dir="logs_lora",
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    evaluation_strategy="steps",
    eval_steps=500,
    logging_steps=500,
    gradient_accumulation_steps=8,
    num_train_epochs=10,
    weight_decay=0.1,
    warmup_steps=1000,
    lr_scheduler_type="cosine",
    learning_rate=1e-3,
    save_steps=1000,
    save_total_limit=1,
    fp16=True,
    report_to="tensorboard"
)

In [34]:
trainer = Trainer(
    model=model,
    tokenizer=tokenizer,
    args=training_args,
    train_dataset=tokenized_dataset["train"],
    eval_dataset=tokenized_dataset["valid"]
)

trainer.train()

Step,Training Loss,Validation Loss
500,3.8355,0.211508
1000,0.2205,0.204638
1500,0.2159,0.201346
2000,0.2129,0.199704
2500,0.2122,0.198657
3000,0.2107,0.197405
3500,0.2094,0.19693
4000,0.2082,0.196448
4500,0.2078,0.195619
5000,0.2075,0.19526


TrainOutput(global_step=18230, training_loss=0.3059346205836394, metrics={'train_runtime': 21103.6621, 'train_samples_per_second': 55.309, 'train_steps_per_second': 0.864, 'total_flos': 1.5842716015814246e+17, 'train_loss': 0.3059346205836394, 'epoch': 10.0})

In [35]:
output_model_path = "models_lora/lora_t5-small"
model.save_pretrained(output_model_path)

### Evaluation

In [1]:
from datasets import load_dataset

In [36]:
dataset = load_dataset("CarperAI/openai_summarize_tldr", split="test", cache_dir="cache")

Found cached dataset parquet (D:/Code/summarization_lora/cache/CarperAI___parquet/CarperAI--openai_summarize_tldr-536d9955f5e6f921/0.0.0/2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934492fdef7d8d6f236ec)


In [37]:
dataset

Dataset({
    features: ['prompt', 'label'],
    num_rows: 6553
})

In [4]:
from transformers import AutoTokenizer

In [5]:
tokenizer = AutoTokenizer.from_pretrained("t5-small", cache_dir="cache")

In [6]:
MAX_SOURCE_LENGTH = 512

In [38]:
def tokenize(batch):
    tokenized_input = tokenizer(batch['prompt'], truncation=True, max_length=MAX_SOURCE_LENGTH, padding="max_length")
    return {"input_ids": tokenized_input["input_ids"], "attention_mask": tokenized_input["attention_mask"]}

In [39]:
tokenized_dataset = dataset.map(tokenize, batched=True, batch_size=512)
tokenized_dataset

Map:   0%|          | 0/6553 [00:00<?, ? examples/s]

Dataset({
    features: ['prompt', 'label', 'input_ids', 'attention_mask'],
    num_rows: 6553
})

In [11]:
from transformers import T5ForConditionalGeneration

In [12]:
model = T5ForConditionalGeneration.from_pretrained("models/fully_supervised_t5-small")

In [40]:
device = "cuda"
model = model.to(device)

In [41]:
import torch

In [42]:
input_ids = torch.tensor(tokenized_dataset["input_ids"]).squeeze().to(device)
attention_masks = torch.tensor(tokenized_dataset["attention_mask"]).squeeze().to(device)

In [43]:
from torch.utils.data import TensorDataset, DataLoader

In [44]:
dataset = TensorDataset(input_ids, attention_masks)
dataloader = DataLoader(dataset, batch_size=32, shuffle=False)

In [45]:
from tqdm import tqdm

In [47]:
predicted_summaries = []
for batch in tqdm(dataloader):
    input_ids, attention_mask = batch
    output_ids = model.generate(input_ids=input_ids, attention_mask=attention_mask, max_new_tokens=50, pad_token_id=tokenizer.eos_token_id)
#     print(output_ids.shape)
    output_ids = output_ids[:, -50:]
#     print(output_ids[0])
    output = tokenizer.batch_decode(output_ids, skip_special_tokens=True)
    predicted_summaries += output

100%|████████████████████████████████████████████████████████████████████████████████| 205/205 [02:46<00:00,  1.23it/s]


In [48]:
len(predicted_summaries)

6553

In [49]:
predicted_summaries[0]

"I'm a fwb-type guy, and I'm not sure if I'm interested in having a fuck buddy."

In [50]:
import evaluate

In [51]:
rouge_score = evaluate.load("rouge")

Using the latest cached version of the module from C:\Users\win10\.cache\huggingface\modules\evaluate_modules\metrics\evaluate-metric--rouge\b01e0accf3bd6dd24839b769a5fda24e14995071570870922c71970b3a6ed886 (last modified on Mon Jun 12 00:11:24 2023) since it couldn't be found locally at evaluate-metric--rouge, or remotely on the Hugging Face Hub.


In [52]:
result = rouge_score.compute(predictions=predicted_summaries, references=tokenized_dataset['label'])

In [53]:
result

{'rouge1': 0.30313930448031134,
 'rouge2': 0.10134628813814994,
 'rougeL': 0.23678244191545478,
 'rougeLsum': 0.2368504241795525}