In [17]:
import pandas as pd
from datasets import load_dataset
# Keep only the model input/target columns and rewrite the CSV without an index
# column. Reading WITHOUT index_col keeps this cell re-runnable: once the file has
# been rewritten, index_col=0 would consume 'input_text' as the index and the
# column selection below would raise KeyError. Selecting by name drops any leftover
# index column either way, so the file written is the same.
df = pd.read_csv('headlines.csv')
df[['input_text', 'target_text']].to_csv('headlines.csv', index=False)
# NOTE(review): the dataset is loaded from 'cleaned2.csv', not from the
# 'headlines.csv' written above -- confirm which file is the intended source.
# A fixed seed makes the 80/20 train/test split reproducible across kernel restarts.
dataset = load_dataset('csv', data_files='cleaned2.csv')['train'].train_test_split(test_size=0.2, seed=42)

Generating train split: 0 examples [00:00, ? examples/s]

In [18]:
# Display the split layout (column names and row counts) produced by train_test_split.
dataset

DatasetDict({
    train: Dataset({
        features: ['input_text', 'target_text'],
        num_rows: 912
    })
    test: Dataset({
        features: ['input_text', 'target_text'],
        num_rows: 228
    })
})

In [19]:
def preprocess_data(examples):
    """Tokenize one batch of examples for seq2seq fine-tuning.

    Args:
        examples: Batch mapping with "input_text" and "target_text" entries
            (presumably lists of strings, as passed by a batched ``Dataset.map``).

    Returns:
        dict with "input_ids" and "attention_mask" for the inputs (truncated to
        512 tokens) and "labels" holding the target token ids (truncated to 128
        tokens).

    Sequences are intentionally left unpadded. Padding here would stretch every
    example to the longest one in the whole map batch; the notebook's
    DataCollatorForSeq2Seq(padding='longest') pads each training batch instead and,
    by default, fills label padding with -100 so it is ignored by the loss.

    NOTE(review): relies on the notebook-level ``tokenizer``, which is created in a
    later cell -- on a fresh kernel that cell has to run before this function is used.
    """
    input_encodings = tokenizer(examples["input_text"], truncation=True, max_length=512)
    label_encodings = tokenizer(examples["target_text"], truncation=True, max_length=128)

    return {
        "input_ids": input_encodings["input_ids"],
        "attention_mask": input_encodings["attention_mask"],
        # No pad tokens are present at this point, so no -100 masking is needed here.
        "labels": label_encodings["input_ids"],
    }

# Tokenize both splits with the same batched preprocessing function
# (train is mapped first, then test).
train_dataset, test_dataset = (
    dataset[split_name].map(preprocess_data, batched=True)
    for split_name in ("train", "test")
)


Map:   0%|          | 0/912 [00:00<?, ? examples/s]

Map:   0%|          | 0/228 [00:00<?, ? examples/s]

In [20]:
# from transformers import AutoModelForSeq2SeqLM, Seq2SeqTrainingArguments, Trainer
from transformers import T5ForConditionalGeneration, T5Tokenizer, Trainer, TrainingArguments, DataCollatorForSeq2Seq

# Fine-tune the pretrained checkpoint on the tokenized train split.
model_name = "t5-small"
tokenizer = T5Tokenizer.from_pretrained(model_name)
model = T5ForConditionalGeneration.from_pretrained(model_name)

# Collator configured to pad each batch to its longest sequence.
data_collator = DataCollatorForSeq2Seq(tokenizer, model=model, padding='longest')

# NOTE(review): the logged run ended at global_step=684, so warmup_steps=500 spans
# most of training -- confirm this is intended.
training_args = TrainingArguments(
    output_dir="./results",
    logging_dir="./logs",
    logging_steps=10,
    num_train_epochs=3,
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    warmup_steps=500,
    weight_decay=0.01,
)

# NOTE(review): no evaluation schedule is set above, so eval_dataset is presumably
# only used if trainer.evaluate() is called explicitly -- verify.
trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    tokenizer=tokenizer,
)

# Last expression of the cell: the returned TrainOutput is displayed.
trainer.train()

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)


Step,Training Loss
10,0.8003
20,0.7755
30,0.7954
40,0.7919
50,0.6822
60,0.7569
70,0.634
80,0.6449
90,0.5559
100,0.5788


TrainOutput(global_step=684, training_loss=0.23181695314614395, metrics={'train_runtime': 329.6272, 'train_samples_per_second': 8.3, 'train_steps_per_second': 2.075, 'total_flos': 108484912742400.0, 'train_loss': 0.23181695314614395, 'epoch': 3.0})

In [None]:
# Report the number of examples per split: train on the first line, test on the second.
print(len(dataset["train"]), len(dataset["test"]), sep="\n")


In [21]:
# Save the fine-tuned weights and the tokenizer files into the same directory so it
# can be reloaded later with from_pretrained.
model_save_path = "t5-small-headline"
model.save_pretrained(model_save_path)
# Last expression of the cell: the tuple of written tokenizer file paths is displayed.
tokenizer.save_pretrained(model_save_path)

('t5-small-headline/tokenizer_config.json',
 't5-small-headline/special_tokens_map.json',
 't5-small-headline/spiece.model',
 't5-small-headline/added_tokens.json')

In [22]:
from transformers import T5ForConditionalGeneration, T5Tokenizer
# Reload the saved checkpoint from disk, replacing the in-memory `model` and `tokenizer`.
# The directory is the same literal that model_save_path was set to when saving.
model = T5ForConditionalGeneration.from_pretrained("t5-small-headline")
tokenizer = T5Tokenizer.from_pretrained("t5-small-headline")

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [None]:
def generate_headline(text, max_length=128, num_beams=4):
    """Generate a headline for ``text`` using the notebook-level model and tokenizer.

    Args:
        text: Source text; it is truncated to 512 tokens before generation.
        max_length: ``max_length`` passed to ``model.generate`` (default 128).
        num_beams: ``num_beams`` passed to ``model.generate`` (default 4).

    Returns:
        The first generated sequence, decoded to a string with special tokens
        skipped.
    """
    input_ids = tokenizer.encode(text, return_tensors="pt", max_length=512, truncation=True)
    # Move the inputs to the model's device so generation also works when the model
    # is not on the CPU (e.g. it may sit on a GPU after training).
    input_ids = input_ids.to(model.device)
    outputs = model.generate(input_ids, max_length=max_length, num_beams=num_beams, early_stopping=True)
    return tokenizer.decode(outputs[0], skip_special_tokens=True)