In [1]:
from datasets import load_dataset

dataset = load_dataset("../data/datasets/summary_title_tom_scott")

dataset

Using custom data configuration summary_title_tom_scott-b63a11504cc18d0d


Downloading and preparing dataset csv/summary_title_tom_scott to C:/Users/Marvin Kosmider/.cache/huggingface/datasets/csv/summary_title_tom_scott-b63a11504cc18d0d/0.0.0/652c3096f041ee27b04d2232d41f10547a8fecda3e284a79a0ec4053c916ef7a...


Downloading data files:   0%|          | 0/1 [00:00<?, ?it/s]

Extracting data files:   0%|          | 0/1 [00:00<?, ?it/s]

0 tables [00:00, ? tables/s]

Dataset csv downloaded and prepared to C:/Users/Marvin Kosmider/.cache/huggingface/datasets/csv/summary_title_tom_scott-b63a11504cc18d0d/0.0.0/652c3096f041ee27b04d2232d41f10547a8fecda3e284a79a0ec4053c916ef7a. Subsequent calls will reuse this data.


  0%|          | 0/1 [00:00<?, ?it/s]

DatasetDict({
    train: Dataset({
        features: ['id', 'title', 'summary'],
        num_rows: 322
    })
})

In [2]:
def add_prefix(examples):
    examples["summary"] = "headline: " + examples["summary"]
    return examples

prefixed_dataset = dataset.map(add_prefix)
prefixed_dataset["train"][0]

  0%|          | 0/322 [00:00<?, ?ex/s]

{'id': 0,
 'title': "Keeping the world's longest railroad tunnel safe",
 'summary': "headline: The Gotthard Base Tunnel is the longest and deepest rail tunnel in the world, 57 kilometers through the Alps. Around the turn of the century, there were two devastating fires in road tunnels under those mountains, and the authorities here in Switzerland want to make sure that nothing like that can ever happen again. We're going to see the control center that watches over the tunnel, and also visit one of the intervention centers, with the firefighters and first responders that'll step in if the worst were to happen. And then I want to show you what I reckon is the most important bit of safety equipment, and most people will probably never notice it. It isn't anywhere near the tunnel. But before that, we start at the control centers, just over there. There are different sections to control the traffic. The signal box controls the track. The movement authority is given by the signal box, and by

In [3]:
split_datasets = prefixed_dataset["train"].train_test_split(train_size=0.8, seed=20)
split_datasets

DatasetDict({
    train: Dataset({
        features: ['id', 'title', 'summary'],
        num_rows: 257
    })
    test: Dataset({
        features: ['id', 'title', 'summary'],
        num_rows: 65
    })
})

In [4]:
split_datasets["validation"] = split_datasets.pop("test")

In [5]:
from transformers import AutoTokenizer

# tokenizer = AutoTokenizer.from_pretrained("bigscience/T0_3B", return_tensors="pt")
tokenizer = AutoTokenizer.from_pretrained("Michau/t5-base-en-generate-headline", return_tensors="pt")

https://huggingface.co/course/chapter7/4?fw=pt

In [6]:
def preprocess_function(examples):
    inputs = examples["summary"]
    targets = examples["title"]
    model_inputs = tokenizer(
        inputs, text_target=targets, padding="max_length", truncation=True
    )
    return model_inputs

In [7]:
tokenized_datasets = split_datasets.map(
    preprocess_function,
    batched=True,
    remove_columns=split_datasets["train"].column_names,
)

tokenized_datasets

  0%|          | 0/1 [00:00<?, ?ba/s]

  0%|          | 0/1 [00:00<?, ?ba/s]

DatasetDict({
    train: Dataset({
        features: ['input_ids', 'attention_mask', 'labels'],
        num_rows: 257
    })
    validation: Dataset({
        features: ['input_ids', 'attention_mask', 'labels'],
        num_rows: 65
    })
})

In [8]:
from transformers import AutoModelForSeq2SeqLM

# model = AutoModelForSeq2SeqLM.from_pretrained("bigscience/T0_3B")
model = AutoModelForSeq2SeqLM.from_pretrained("Michau/t5-base-en-generate-headline")

In [9]:
from transformers import DataCollatorForSeq2Seq

data_collator = DataCollatorForSeq2Seq(tokenizer, model=model)

In [10]:
import evaluate

metric = evaluate.load("sacrebleu")

import numpy as np

def compute_metrics(eval_preds):
    preds, labels = eval_preds
    # In case the model returns more than the prediction logits
    if isinstance(preds, tuple):
        preds = preds[0]

    decoded_preds = tokenizer.batch_decode(preds, skip_special_tokens=True)

    # Replace -100s in the labels as we can't decode them
    labels = np.where(labels != -100, labels, tokenizer.pad_token_id)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

    # Some simple post-processing
    decoded_preds = [pred.strip() for pred in decoded_preds]
    decoded_labels = [[label.strip()] for label in decoded_labels]

    result = metric.compute(predictions=decoded_preds, references=decoded_labels)
    return {"bleu": result["score"]}

In [11]:
from transformers import Seq2SeqTrainingArguments

args = Seq2SeqTrainingArguments(
    f"t5-headline-tom-scott",
    evaluation_strategy="no",
    save_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=32,
    per_device_eval_batch_size=64,
    weight_decay=0.01,
    save_total_limit=3,
    num_train_epochs=3,
    predict_with_generate=True,
)

In [12]:
from transformers import Seq2SeqTrainer

trainer = Seq2SeqTrainer(
    model,
    args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["validation"],
    data_collator=data_collator,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)

In [13]:
trainer.evaluate()

***** Running Evaluation *****
  Num examples = 65
  Batch size = 64
You're using a T5TokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


RuntimeError: CUDA out of memory. Tried to allocate 768.00 MiB (GPU 0; 2.00 GiB total capacity; 1.49 GiB already allocated; 0 bytes free; 1.57 GiB reserved in total by PyTorch) If reserved memory is >> allocated memory try setting max_split_size_mb to avoid fragmentation.  See documentation for Memory Management and PYTORCH_CUDA_ALLOC_CONF

In [88]:
trainer.train()

***** Running training *****
  Num examples = 5
  Num Epochs = 3
  Instantaneous batch size per device = 32
  Total train batch size (w. parallel, distributed & accumulation) = 32
  Gradient Accumulation steps = 1
  Total optimization steps = 3


RuntimeError: [enforce fail at C:\actions-runner\_work\pytorch\pytorch\builder\windows\pytorch\c10\core\impl\alloc_cpu.cpp:81] data. DefaultCPUAllocator: not enough memory: you tried to allocate 167772160 bytes.