<a href="https://colab.research.google.com/github/neel26desai/news_summarization/blob/main/SummarizationHuggingFace_Training.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install sacremoses==0.0.53
!pip install datasets
!pip install transformers
!pip install transformers[torch]


In [None]:
!pip install accelerate -U
! pip install evaluate rouge_score

In [None]:
from datasets import load_dataset
from transformers import pipeline

In [None]:
# Load the XSum dataset into `xsum_dataset`.
# cache_dir is a Google Drive path, so this assumes Drive is already mounted at
# /content/drive -- no mount call is visible in this notebook.
xsum_dataset = load_dataset(
    "xsum",
    version="1.2.0",
    cache_dir='/content/drive/MyDrive/Hugging_Face/data'
)  # Note: cache_dir is given so previously downloaded data can be reused (the output below shows a fresh download).
xsum_dataset
# Displaying the object lists the features and `num_rows`
# of each dataset split (train / validation / test).

Downloading builder script:   0%|          | 0.00/5.76k [00:00<?, ?B/s]

Downloading readme:   0%|          | 0.00/6.24k [00:00<?, ?B/s]

Downloading data files:   0%|          | 0/2 [00:00<?, ?it/s]

Downloading data:   0%|          | 0.00/255M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/1.00M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/204045 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/11332 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/11334 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['document', 'summary', 'id'],
        num_rows: 204045
    })
    validation: Dataset({
        features: ['document', 'summary', 'id'],
        num_rows: 11332
    })
    test: Dataset({
        features: ['document', 'summary', 'id'],
        num_rows: 11334
    })
})

In [None]:
import pickle
# Save the loaded dataset as a pickle file on Drive; the next cell reloads it
# from the same path. Assumes Drive is mounted at /content/drive.
with open('/content/drive/MyDrive/xsum.pkl', 'wb') as f:
    pickle.dump(xsum_dataset, f)

In [None]:
# The dataset has already been saved to Drive, so reload it from the pickle file.
# pickle.load can execute arbitrary code: only open pickle files you created yourself.
import pickle
with open('/content/drive/MyDrive/xsum.pkl','rb') as f:
  xsum_dataset = pickle.load(f)


In [None]:
from transformers import AutoTokenizer
# Checkpoint name; also passed to the data collator in a later cell.
checkpoint = "t5-small"
# Tokenizer used by preprocess_function, compute_metrics and the trainer below.
tokenizer = AutoTokenizer.from_pretrained(checkpoint)

tokenizer_config.json:   0%|          | 0.00/2.32k [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.39M [00:00<?, ?B/s]

In [None]:
# Task prefix prepended to every source document before tokenization.
prefix = "summarize: "


def preprocess_function(examples, max_input_length=1024, max_target_length=250):
    """Tokenize a batch of rows for seq2seq fine-tuning.

    Args:
        examples: Batch mapping with a "document" list (source texts) and a
            "summary" list (reference summaries), as passed by
            ``Dataset.map(..., batched=True)``.
        max_input_length: Token cap applied to each prefixed document.
        max_target_length: Token cap applied to each reference summary.

    Returns:
        The tokenizer output for the prefixed documents, with an added
        "labels" entry holding the token ids of the summaries.
    """
    inputs = [prefix + doc for doc in examples["document"]]
    model_inputs = tokenizer(inputs, max_length=max_input_length, truncation=True)

    # `text_target` makes the tokenizer treat the summaries as targets.
    labels = tokenizer(
        text_target=examples["summary"],
        max_length=max_target_length,
        truncation=True,
    )

    model_inputs["labels"] = labels["input_ids"]
    return model_inputs

In [None]:
# Tokenize the dataset in batches; each batch is passed through preprocess_function.
tokenized_xsum = xsum_dataset.map(preprocess_function, batched=True)

Map:   0%|          | 0/204045 [00:00<?, ? examples/s]

Map:   0%|          | 0/11332 [00:00<?, ? examples/s]

Map:   0%|          | 0/11334 [00:00<?, ? examples/s]

In [None]:
# Save the tokenized dataset dict to Drive; the next cell reloads it from the same path.
# Relies on `pickle` having been imported by an earlier cell.
with open('/content/drive/MyDrive/tokenized.pkl', 'wb') as f:
    pickle.dump(tokenized_xsum, f)

In [None]:
# The tokenized version has already been created, so simply load it from Drive.
# pickle.load can execute arbitrary code: only open pickle files you created yourself.
with open('/content/drive/MyDrive/tokenized.pkl','rb') as f:
  tokenized_xsum = pickle.load(f)

In [None]:
from transformers import DataCollatorForSeq2Seq

# Collator handed to Seq2SeqTrainer below as `data_collator`.
# NOTE(review): `model` receives `checkpoint`, which is the string "t5-small" here,
# not a loaded model object -- confirm this is the intended argument.
data_collator = DataCollatorForSeq2Seq(tokenizer=tokenizer, model=checkpoint)

In [None]:
import evaluate

# ROUGE metric object; compute_metrics calls rouge.compute(...) on the decoded text.
rouge = evaluate.load("rouge")

Downloading builder script:   0%|          | 0.00/6.27k [00:00<?, ?B/s]

In [None]:
import numpy as np


def compute_metrics(eval_pred):
    """Compute ROUGE scores and mean generated length for an evaluation pass.

    Args:
        eval_pred: ``(predictions, labels)`` pair of token-id arrays. The
            predictions may also arrive as a tuple whose first element holds
            the generated ids. Positions equal to -100 are treated as padding.

    Returns:
        Dict of the ROUGE results plus ``"gen_len"`` (mean number of non-pad
        tokens per prediction), every value rounded to 4 decimals.
    """
    predictions, labels = eval_pred
    if isinstance(predictions, tuple):
        predictions = predictions[0]

    pad_id = tokenizer.pad_token_id
    # -100 is not a valid token id; swap it for the pad id in BOTH arrays
    # before decoding, so the tokenizer never sees a negative id.
    predictions = np.where(predictions != -100, predictions, pad_id)
    labels = np.where(labels != -100, labels, pad_id)

    decoded_preds = tokenizer.batch_decode(predictions, skip_special_tokens=True)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

    result = rouge.compute(predictions=decoded_preds, references=decoded_labels, use_stemmer=True)

    # Counted after the -100 replacement so the sentinel is not treated as a generated token.
    prediction_lens = [np.count_nonzero(pred != pad_id) for pred in predictions]
    result["gen_len"] = np.mean(prediction_lens)

    return {k: round(v, 4) for k, v in result.items()}

In [None]:
from transformers import AutoModelForSeq2SeqLM, Seq2SeqTrainingArguments, Seq2SeqTrainer

# Load the pretrained seq2seq weights from `checkpoint`, the same name the
# tokenizer was loaded from, so model and tokenizer cannot drift apart.
model = AutoModelForSeq2SeqLM.from_pretrained(checkpoint)

config.json:   0%|          | 0.00/1.21k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/242M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

In [None]:
# Fine-tuning configuration. Checkpoints and logs go to a Drive folder, so this
# assumes Drive is mounted at /content/drive.
training_args = Seq2SeqTrainingArguments(
    output_dir='/content/drive/MyDrive/fine_tuned_t5/my_awesome_xsum_model/results',
    evaluation_strategy="epoch",   # evaluate once at the end of every epoch
    learning_rate=2e-5,
    per_device_train_batch_size=32,
    per_device_eval_batch_size=32,
    weight_decay=0.01,
    save_total_limit=3,            # keep at most three checkpoints on disk
    num_train_epochs=4,
    predict_with_generate=True,    # compute_metrics decodes generated token ids
    fp16=True,                     # mixed precision
)

# Evaluate on the dedicated "validation" split during training; the "test"
# split stays untouched so it can still serve as an unbiased final benchmark.
trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_xsum["train"],
    eval_dataset=tokenized_xsum["validation"],
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)



In [None]:
# Run fine-tuning; the table in the output shows one row of metrics per epoch.
trainer.train()

You're using a T5TokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Epoch,Training Loss,Validation Loss,Rouge1,Rouge2,Rougel,Rougelsum,Gen Len
1,2.7068,2.482651,0.2862,0.0793,0.225,0.225,18.8269
2,2.646,2.437687,0.2915,0.0828,0.2292,0.2292,18.8369
3,2.6185,2.41763,0.2949,0.0852,0.232,0.232,18.8364
4,2.6059,2.411622,0.2963,0.086,0.2331,0.2331,18.8359




TrainOutput(global_step=25508, training_loss=2.671520611363854, metrics={'train_runtime': 8843.375, 'train_samples_per_second': 92.293, 'train_steps_per_second': 2.884, 'total_flos': 2.2051178810022298e+17, 'train_loss': 2.671520611363854, 'epoch': 4.0})

In [None]:
# Drive directory the fine-tuned model is written to; later cells reload from it.
model_dir = '/content/drive/MyDrive/fine_tuned_t5/model'
trainer.save_model(model_dir)

In [None]:
# Reload the fine-tuned model from the directory written by trainer.save_model.
new_model = AutoModelForSeq2SeqLM.from_pretrained(model_dir)

In [None]:
# Sample input that already carries the "summarize: " prefix.
# NOTE(review): no later cell visible in this notebook references `text`.
text = "summarize: The Inflation Reduction Act lowers prescription drug costs, health care costs, and energy costs. It's the most aggressive action on tackling the climate crisis in American history, which will lift up American workers and create good-paying, union jobs across the country. It'll lower the deficit and ask the ultra-wealthy and corporations to pay their fair share. And no one making under $400,000 per year will pay a penny more in taxes."

In [None]:
# Reload the tokenizer from the saved model directory.
# The name `tokernizer2` (sic) is kept because the pipeline cell below references it.
tokernizer2 = AutoTokenizer.from_pretrained(model_dir)

In [None]:
from transformers import pipeline

# Build a summarization pipeline from the reloaded fine-tuned model and tokenizer.
summarizer = pipeline("summarization", model=new_model, tokenizer=tokernizer2)
# Reuse the exact training prefix ("summarize: ", with the trailing space) so the
# inference input matches what the model saw during fine-tuning.
# NOTE(review): the pipeline may already prepend a task prefix taken from the
# model config -- confirm the prefix is not applied twice.
summarizer(prefix + xsum_dataset['train'][0]['document'])

Token indices sequence length is longer than the specified maximum sequence length for this model (543 > 512). Running this sequence through the model will result in indexing errors




In [None]:
# Show the prefixed input text, built with the same prefix used for training.
prefix + xsum_dataset['train'][0]['document']



In [None]:
# Upload the fine-tuned model to the Hugging Face Hub.
from huggingface_hub import notebook_login

# Prompts for a Hub access token inside the notebook.
notebook_login()

hub_repo_id = 'neel26d/newstuned_t5_summarizer'
# `new_model` is the fine-tuned model reloaded from `model_dir`.
new_model.push_to_hub(hub_repo_id)
# Push the tokenizer as well so the Hub repository can be loaded on its own.
tokenizer.push_to_hub(hub_repo_id)