In [None]:
!pip install transformers datasets evaluate accelerate

# Load dataset

In [None]:
from datasets import load_dataset
# Directory that is listed below to discover the dataset files (one per split).
root = (
    '/kaggle/input/newspaper-text-summarization-cnn-dailymail/cnn_dailymail/'
)

In [None]:
import pandas as pd
import os

In [None]:
# Map every CSV found in `root` to a split name (the file name up to its first
# '.', e.g. "train.csv" -> "train"). Each path is joined with `root` exactly
# once: the previous version joined `root` with an already-prefixed path, which
# only resolved correctly because `root` happens to be absolute.
data_files = {
    file_name.split('.')[0]: os.path.join(root, file_name)
    # sorted() keeps the split order independent of the OS listing order
    for file_name in sorted(os.listdir(root))
    # anything that is not a CSV cannot be parsed by the "csv" loader below
    if file_name.lower().endswith('.csv')
}

In [None]:
# Inspect the split-name -> file-path mapping collected from `root`.
data_files

In [None]:
# Load all collected CSV files with the generic "csv" builder; the keys of
# `data_files` are used to address the individual splits afterwards.
datasets = load_dataset(
    "csv",
    data_files=data_files,
)

In [None]:
# Keep a seeded (seed=42) shuffled subset of at most 30000 training rows.
# The size is capped at the split length so select() cannot index past the end
# when fewer than 30000 rows are available.
train_subset_size = min(30000, len(datasets['train']))
datasets['train'] = datasets['train'].shuffle(seed=42).select(range(train_subset_size))

In [None]:
# Show the loaded splits after the train split has been subsampled.
datasets

# Preprocess

In [None]:
from transformers import AutoTokenizer

# Tokenizer belonging to the pretrained "t5-small" checkpoint.
tokenizer = AutoTokenizer.from_pretrained(
    "t5-small",
)

In [None]:
# Task prefix prepended to every article before tokenization.
prefix = "summarize: "


def preprocess_function(examples, max_input_length=1024, max_target_length=128):
    """Tokenize a batch of articles and their reference summaries.

    Args:
        examples: Batch mapping with an "article" list (source texts) and a
            "highlights" list (target summaries).
        max_input_length: Truncation length, in tokens, for the prefixed
            articles.
        max_target_length: Truncation length, in tokens, for the summaries.

    Returns:
        The tokenizer output for the prefixed articles, with the token ids of
        the summaries stored under the "labels" key.
    """
    inputs = [prefix + doc for doc in examples["article"]]
    model_inputs = tokenizer(inputs, max_length=max_input_length, truncation=True)

    # `text_target` replaces the deprecated `as_target_tokenizer()` context
    # manager. A separate call is kept so the summaries get their own
    # truncation length.
    labels = tokenizer(
        text_target=examples["highlights"],
        max_length=max_target_length,
        truncation=True,
    )

    model_inputs["labels"] = labels["input_ids"]
    return model_inputs

In [None]:
# Number of examples handed to preprocess_function per call.
batch_size = 256

# Tokenize every split in batches; the raw 'id', 'article' and 'highlights'
# columns are dropped from the result.
tokenizer_datasets = datasets.map(
    preprocess_function,
    batched=True,
    batch_size=batch_size,
    remove_columns=['id', 'article', 'highlights'],
)

In [None]:
# Show the tokenized splits produced by the `map` call.
tokenizer_datasets

In [None]:
from transformers import AutoModelForSeq2SeqLM, Seq2SeqTrainingArguments, Seq2SeqTrainer

# Start from the pretrained "t5-small" sequence-to-sequence weights.
model = AutoModelForSeq2SeqLM.from_pretrained(
    "t5-small",
)

In [None]:
from transformers import DataCollatorForSeq2Seq

# Collator used by the trainer to assemble batches; it is given both the
# tokenizer and the model.
data_collator = DataCollatorForSeq2Seq(
    tokenizer=tokenizer,
    model=model,
)

In [None]:
import dataclasses

# Newer transformers releases name the evaluation-schedule argument
# `eval_strategy`; older ones only accept `evaluation_strategy`. Use whichever
# field the installed version declares so this cell runs on both.
_training_arg_names = {
    field.name for field in dataclasses.fields(Seq2SeqTrainingArguments)
}
_eval_strategy_key = (
    "eval_strategy" if "eval_strategy" in _training_arg_names else "evaluation_strategy"
)

training_args = Seq2SeqTrainingArguments(
    output_dir="./results",
    learning_rate=2e-5,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    weight_decay=0.01,
    save_total_limit=1,
    num_train_epochs=15,
    fp16=True,
    report_to='none',
    # evaluate once per epoch
    **{_eval_strategy_key: "epoch"},
)


In [None]:
import inspect

# Recent transformers releases take the tokenizer through `processing_class`;
# `tokenizer` is the older, deprecated keyword. Use whichever one the installed
# Seq2SeqTrainer accepts.
_trainer_params = inspect.signature(Seq2SeqTrainer.__init__).parameters
_tokenizer_key = "processing_class" if "processing_class" in _trainer_params else "tokenizer"

trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=tokenizer_datasets["train"],
    # NOTE(review): evaluation during training runs on the "test" split;
    # confirm whether a separate validation split should be used instead.
    eval_dataset=tokenizer_datasets["test"],
    data_collator=data_collator,
    **{_tokenizer_key: tokenizer},
)

trainer.train()

In [None]:
# Save the fine-tuned model together with the tokenizer files, so that
# 'pretrained_model' can be reloaded on its own with from_pretrained().
model.save_pretrained('pretrained_model')
tokenizer.save_pretrained('pretrained_model')

In [3]:
from transformers import T5Tokenizer, T5ForConditionalGeneration

# Directory of the saved checkpoint; tokenizer and model are both read from it.
checkpoint_dir = '/kaggle/input/21312/keras/default/1/results/checkpoint-28125'

# Tokenizer first, then the model weights, from the same checkpoint.
tokenizer = T5Tokenizer.from_pretrained(checkpoint_dir)
model = T5ForConditionalGeneration.from_pretrained(checkpoint_dir)


You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565


In [4]:
# Small sample input; it starts with the "summarize: " task prefix.
input_text = (
    "summarize: The quick brown fox jumps over the lazy dog. "
    "This text is part of an example to demonstrate how summarization works using T5."
)


In [5]:
# Encode the sample text as PyTorch tensors, truncated to at most 512 tokens,
# and keep only the token ids.
encoded_input = tokenizer(
    input_text,
    return_tensors="pt",
    truncation=True,
    max_length=512,
)
input_ids = encoded_input.input_ids


In [7]:
from transformers import T5Tokenizer, T5ForConditionalGeneration

# Fine-tuned checkpoint directory; tokenizer and model are both loaded from it.
checkpoint_dir = '/kaggle/input/21312/keras/default/1/results/checkpoint-28125'
tokenizer = T5Tokenizer.from_pretrained(checkpoint_dir)
model = T5ForConditionalGeneration.from_pretrained(checkpoint_dir)

# Text to summarize (starts with the "summarize: " task prefix)
input_text = (
    "summarize: The Transformers library provides thousands of pretrained models to perform tasks on texts such as classification, "
    "information extraction, question answering, summarization, translation, text generation, and more. It is maintained by Hugging Face."
)

# Encode as PyTorch tensors, truncated to at most 512 tokens
encoded_input = tokenizer(
    input_text,
    return_tensors="pt",
    truncation=True,
    max_length=512,
)
input_ids = encoded_input.input_ids

# Beam-search settings for the summary: 4 beams, 20 to 100 tokens
generation_kwargs = dict(
    max_length=100,
    min_length=20,
    length_penalty=2.0,
    num_beams=4,
    early_stopping=True,
)
outputs = model.generate(input_ids, **generation_kwargs)

# Turn the first generated sequence back into text and show it
summary = tokenizer.decode(outputs[0], skip_special_tokens=True)
print("Summary:", summary)


Summary: Transformers library provides thousands of pretrained models to perform tasks on texts such as classification, information extraction, question answering, summarization, translation.
