In [1]:
from datasets import load_dataset

dataset  = load_dataset("ILSUM/ILSUM-1.0", "English")

README.md:   0%|          | 0.00/4.78k [00:00<?, ?B/s]

train.csv:   0%|          | 0.00/46.5M [00:00<?, ?B/s]

test.csv:   0%|          | 0.00/16.8M [00:00<?, ?B/s]

val.csv:   0%|          | 0.00/3.37M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/12565 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/4487 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/898 [00:00<?, ? examples/s]

In [2]:
dataset

DatasetDict({
    train: Dataset({
        features: ['id', 'Article', 'Heading', 'Summary'],
        num_rows: 12565
    })
    test: Dataset({
        features: ['id', 'Article', 'Heading', 'Summary'],
        num_rows: 4487
    })
    validation: Dataset({
        features: ['id', 'Article', 'Heading', 'Summary'],
        num_rows: 898
    })
})

In [3]:
# Filter Samples to fit total tokens limits (facebook/bart-base - 1024)
# Limiting no of words to 800 itself because 1 word can be divided into multiple tokens sometimes
def filterSample(sample):
    return len(sample['Article'].split()) < 800 and len(sample['Summary'].split()) < 200

dataset = dataset.filter(filterSample)

Filter:   0%|          | 0/12565 [00:00<?, ? examples/s]

Filter:   0%|          | 0/4487 [00:00<?, ? examples/s]

Filter:   0%|          | 0/898 [00:00<?, ? examples/s]

In [4]:
# Load tokenizer and model

import tensorflow as tf
from transformers import TFAutoModelForSeq2SeqLM, AutoTokenizer, DataCollatorForSeq2Seq

modelCheckpoint = "facebook/bart-base"
tokenizer = AutoTokenizer.from_pretrained(modelCheckpoint)      # Tokenizing the input using prerained model
model = TFAutoModelForSeq2SeqLM.from_pretrained(modelCheckpoint)  # TensorFlow version of the facebook/bart-base model for sequence-to-sequence learning (Seq2SeqLM).

2025-04-10 15:08:05.236015: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1744297685.404835      31 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1744297685.453918      31 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


config.json:   0%|          | 0.00/1.72k [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/558M [00:00<?, ?B/s]

I0000 00:00:1744297708.188287      31 gpu_device.cc:2022] Created device /job:localhost/replica:0/task:0/device:GPU:0 with 15513 MB memory:  -> device: 0, name: Tesla P100-PCIE-16GB, pci bus id: 0000:00:04.0, compute capability: 6.0
All PyTorch model weights were used when initializing TFBartForConditionalGeneration.

All the weights of TFBartForConditionalGeneration were initialized from the PyTorch model.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFBartForConditionalGeneration for predictions without further training.


In [5]:
maxInputLen = 1024
maxTargetLen = 128

def preprocessInput(sample):
    # pretrained tokenizer ( convert text to numerical token IDs)
    modelInputs = tokenizer(sample['Article'], max_length = maxInputLen, padding = "max_length", truncation = True)
    # Tokenizes the summary
    labels = tokenizer(sample["Summary"], max_length = maxTargetLen, padding = "max_length", truncation = True)
    modelInputs["labels"] = labels["input_ids"]
    return modelInputs

tokenizedDataset = dataset.map(preprocessInput, batched = True, remove_columns = ["id", "Heading", "Article", "Summary"])

Map:   0%|          | 0/10709 [00:00<?, ? examples/s]

Map:   0%|          | 0/3789 [00:00<?, ? examples/s]

Map:   0%|          | 0/770 [00:00<?, ? examples/s]

In [6]:
# Automatically batches, pads, and formats examples
#Prepares them as TensorFlow (or PyTorch) model-ready inputs
dataCollator = DataCollatorForSeq2Seq(tokenizer = tokenizer, model = model, return_tensors = "tf")

In [7]:
tfTrain = tokenizedDataset["train"].to_tf_dataset(
    columns = ["input_ids", "attention_mask", "labels"],
    shuffle = True,
    batch_size = 2,
    collate_fn = dataCollator
)

tfVal = tokenizedDataset["validation"].to_tf_dataset(
    columns = ["input_ids", "attention_mask", "labels"],
    shuffle = False,
    batch_size = 2,
    collate_fn = dataCollator
)

In [8]:
# Compilation
from transformers import create_optimizer

optimizer, schedule = create_optimizer(init_lr = 2e-5, num_warmup_steps = 0, num_train_steps = 1000)
model.compile(optimizer = optimizer)

In [9]:
# Fine tune the model
model.fit(tfTrain, validation_data = tfVal, epochs = 3)

Epoch 1/3


I0000 00:00:1744297797.477979     116 service.cc:148] XLA service 0x791bf60db1f0 initialized for platform CUDA (this does not guarantee that XLA will be used). Devices:
I0000 00:00:1744297797.478740     116 service.cc:156]   StreamExecutor device (0): Tesla P100-PCIE-16GB, Compute Capability 6.0
I0000 00:00:1744297797.546216     116 cuda_dnn.cc:529] Loaded cuDNN version 90300
I0000 00:00:1744297797.663836     116 device_compiler.h:188] Compiled cluster using XLA!  This line is logged at most once for the lifetime of the process.


Epoch 2/3
Epoch 3/3


<tf_keras.src.callbacks.History at 0x791c4bc18550>

In [10]:
model.save_pretrained("./tf-bart-ilsum")
tokenizer.save_pretrained("./tf-bart-ilsum")

Non-default generation parameters: {'early_stopping': True, 'num_beams': 4, 'no_repeat_ngram_size': 3, 'forced_bos_token_id': 0}


('./tf-bart-ilsum/tokenizer_config.json',
 './tf-bart-ilsum/special_tokens_map.json',
 './tf-bart-ilsum/vocab.json',
 './tf-bart-ilsum/merges.txt',
 './tf-bart-ilsum/added_tokens.json',
 './tf-bart-ilsum/tokenizer.json')

* ROUGE( Recall-Oriented Understudy for Gisting Evaluation) evaluation — it’s the most widely used metric for evaluating text summarization tasks.
* ROUGE measures overlap between generated and reference summaries

In [17]:
# ROUGE Evaluation (on validation set)
import evaluate

rouge = evaluate.load("rouge")

Downloading builder script:   0%|          | 0.00/6.27k [00:00<?, ?B/s]

In [18]:
def generate_summary(example):
    input_ids = tokenizer(example["Article"], return_tensors="tf", max_length=1024, truncation=True).input_ids
    output_ids = model.generate(input_ids, max_length=128, num_beams=4, early_stopping=True)
    summary = tokenizer.decode(output_ids[0], skip_special_tokens=True)
    return {"Generated Summary": summary}

In [19]:
# Evaluate on small subset
val_subset = dataset["validation"].select(range(50))
preds = []
refs = []

for ex in val_subset:
    result = generate_summary(ex)
    preds.append(result["Generated Summary"])
    refs.append(ex["Summary"])

scores = rouge.compute(predictions=preds, references=refs)
print(scores)

{'rouge1': 0.5288508848630459, 'rouge2': 0.4172175906247514, 'rougeL': 0.4865132082229662, 'rougeLsum': 0.4890689951063759}
