In [1]:
pip install transformers datasets torch accelerate

Note: you may need to restart the kernel to use updated packages.


In [2]:
import torch

if torch.backends.mps.is_available():
    device = torch.device("mps")
else:
    device = torch.device("cpu")

print(f"Using device: {device}")

Using device: mps


In [3]:
from datasets import Dataset, DatasetDict

# Sample data
data = {
    'train': [
        {'html': 'Dinosaurs are a diverse group of reptiles[note 1] of the clade Dinosauria. They first appeared during the Triassic period, between 243 and 233.23 million years ago (mya), although the exact origin and timing of the evolution of dinosaurs is a subject of active research. They became the dominant terrestrial vertebrates after the Triassic–Jurassic extinction event 201.3 mya and their dominance continued throughout the Jurassic and Cretaceous periods. The fossil record shows that birds are feathered dinosaurs, having evolved from earlier theropods during the Late Jurassic epoch, and are the only dinosaur lineage known to have survived the Cretaceous–Paleogene extinction event approximately 66 mya. Dinosaurs can therefore be divided into avian dinosaurs—birds—and the extinct non-avian dinosaurs, which are all dinosaurs other than birds.', 'title': 'Dinosaurs'},
        {'html': 'The aircraft involved was a Dornier 228 which belonged to the Malawi Army Air Wing of the Malawi Defence Force. It had previously been used to transport President Lazarus Chakwera several times and had conducted its previous flight hours before the crash.[1]  On 10 June 2024, the aircraft, carrying Vice-President Saulos Chilima, former First Lady Patricia Shanil Muluzi, and seven other occupants,[2] including members of Chilima\'s staff and security detail[3] and three military crew,[4] left Kamuzu International Airport in the capital Lilongwe at 9:17 a.m. CAT, and was scheduled to arrive at Mzuzu Airport in the Northern Region at 10:02 a.m.[5] The passengers were on their way to attend the funeral for former government minister Ralph Kasambara, and were to return to Lilongwe afterward.[6][7]', 'title': 'Dornier 228 Plane Crash'},
    ],
    'validation': [
        {'html': 'Nowadays, AI is able to do many things such as generating images that never existed, coding, talking to people and much more. It can also make texts shorter or in other words it can summarize texts. In this article, This project is done using TensorFlow with Seq2Seq model. There are several processes and steps which will be fully covered with the full implementation and the source code. Before getting to code, there are some theories that should be understood before coding.', 'title': 'Utilizing AI'}
    ]
}

# Create DatasetDict
dataset_dict = DatasetDict({
    'train': Dataset.from_list(data['train']),
    'validation': Dataset.from_list(data['validation'])
})

  from .autonotebook import tqdm as notebook_tqdm


In [4]:
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("google/long-t5-tglobal-base")

def preprocess_function(examples):
    inputs = examples['html']
    targets = examples['title']
    model_inputs = tokenizer(inputs, max_length=4096, truncation=True, return_tensors="pt")
    labels = tokenizer(targets, max_length=128, truncation=True, return_tensors="pt").input_ids
    model_inputs['labels'] = labels
    model_inputs = {k: v.to(device) for k, v in model_inputs.items()}
    return model_inputs

tokenized_datasets = dataset_dict.map(preprocess_function, batched=True)


Map:   0%|          | 0/2 [00:00<?, ? examples/s]


ValueError: Unable to create tensor, you should probably activate truncation and/or padding with 'padding=True' 'truncation=True' to have batched tensors with the same length. Perhaps your features (`input_ids` in this case) have excessive nesting (inputs type `list` where type `int` is expected).

In [None]:
from transformers import AutoModelForSeq2SeqLM, Seq2SeqTrainer, Seq2SeqTrainingArguments

model = AutoModelForSeq2SeqLM.from_pretrained("google/long-t5-tglobal-base")
model.to(device)

training_args = Seq2SeqTrainingArguments(
    output_dir="./results",
    evaluation_strategy="epoch",
    learning_rate=5e-5,
    per_device_train_batch_size=1,
    per_device_eval_batch_size=1,
    weight_decay=0.01,
    save_total_limit=3,
    num_train_epochs=3,
    predict_with_generate=True,
)

trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["validation"],
    tokenizer=tokenizer,
)

trainer.train()


In [None]:
def generate_title(html_content):
    inputs = tokenizer(html_content, return_tensors="pt", max_length=4096, truncation=True).to(device)
    output = model.generate(**inputs)
    title = tokenizer.decode(output[0], skip_special_tokens=True)
    return title

# Example usage
html_content = 'This is example content.'
generated_title = generate_title(html_content)
print("Generated Title:", generated_title)