<a href="https://colab.research.google.com/github/pb-roshith/Fine-Tuning-BART-for-Text-Summarization/blob/main/Text_Summarization_using_BART_Transformer.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!pip install --upgrade datasets



In [2]:
from datasets import load_dataset

# Clear the cache
import shutil
shutil.rmtree('/root/.cache/huggingface/datasets')

# Try loading the dataset again
ds = load_dataset("knkarthick/dialogsum")


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


README.md:   0%|          | 0.00/4.65k [00:00<?, ?B/s]

train.csv:   0%|          | 0.00/11.3M [00:00<?, ?B/s]

validation.csv:   0%|          | 0.00/442k [00:00<?, ?B/s]

test.csv:   0%|          | 0.00/1.35M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/12460 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/500 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/1500 [00:00<?, ? examples/s]

In [3]:
ds

DatasetDict({
    train: Dataset({
        features: ['id', 'dialogue', 'summary', 'topic'],
        num_rows: 12460
    })
    validation: Dataset({
        features: ['id', 'dialogue', 'summary', 'topic'],
        num_rows: 500
    })
    test: Dataset({
        features: ['id', 'dialogue', 'summary', 'topic'],
        num_rows: 1500
    })
})

In [4]:
ds['train'][1]['dialogue']

"#Person1#: Hello Mrs. Parker, how have you been?\n#Person2#: Hello Dr. Peters. Just fine thank you. Ricky and I are here for his vaccines.\n#Person1#: Very well. Let's see, according to his vaccination record, Ricky has received his Polio, Tetanus and Hepatitis B shots. He is 14 months old, so he is due for Hepatitis A, Chickenpox and Measles shots.\n#Person2#: What about Rubella and Mumps?\n#Person1#: Well, I can only give him these for now, and after a couple of weeks I can administer the rest.\n#Person2#: OK, great. Doctor, I think I also may need a Tetanus booster. Last time I got it was maybe fifteen years ago!\n#Person1#: We will check our records and I'll have the nurse administer and the booster as well. Now, please hold Ricky's arm tight, this may sting a little."

In [5]:
ds['train'][1]['summary']

'Mrs Parker takes Ricky for his vaccines. Dr. Peters checks the record and then gives Ricky a vaccine.'

**without fine - tuning**

In [6]:
!pip install transformers



In [7]:
from transformers import pipeline

pipe = pipeline("summarization", model="facebook/bart-large-cnn")

config.json:   0%|          | 0.00/1.58k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.63G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/363 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

Device set to use cuda:0


In [8]:
article_1 = ds['train'][1]['dialogue']

In [9]:
pipe(article_1, max_length=20, min_length=10, do_sample=False)

[{'summary_text': 'Ricky has received his Polio, Tetanus and Hepatitis B shots.'}]

**with fine - tuning**

In [10]:
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM

tokenizer = AutoTokenizer.from_pretrained("facebook/bart-large-cnn")
model = AutoModelForSeq2SeqLM.from_pretrained("facebook/bart-large-cnn")

In [11]:
def preprocess_function(batch):
  source = batch['dialogue']
  target = batch['summary']
  source_ids = tokenizer(source, truncation=True, padding='max_length', max_length=128)
  target_ids = tokenizer(target, truncation=True, padding='max_length', max_length=128)

  labels = target_ids['input_ids']
  labels = [[(label if label != tokenizer.pad_token_id else -100) for label in labels_example] for labels_example in labels]

  return {
    'input_ids': source_ids['input_ids'],
    'attention_mask': source_ids['attention_mask'],
    'labels': labels
  }

In [12]:
df_source = ds.map(preprocess_function, batched=True)

Map:   0%|          | 0/12460 [00:00<?, ? examples/s]

Map:   0%|          | 0/500 [00:00<?, ? examples/s]

Map:   0%|          | 0/1500 [00:00<?, ? examples/s]

In [21]:
from transformers import TrainingArguments, Trainer

training_args = TrainingArguments(
    output_dir='/content',
    num_train_epochs=1,
    per_device_train_batch_size=4,
    remove_unused_columns=True,
    run_name="my_unique_run_name",
    report_to="none"
)

In [22]:
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=df_source['train'],
    eval_dataset=df_source['test']
)

In [23]:
import os
os.environ["WANDB_DISABLED"] = "true"

In [24]:
trainer.train()

Step,Training Loss
500,0.9204
1000,1.5747
1500,1.5035
2000,1.4629
2500,1.4076
3000,1.3797




TrainOutput(global_step=3115, training_loss=1.373433635093236, metrics={'train_runtime': 2381.6491, 'train_samples_per_second': 5.232, 'train_steps_per_second': 1.308, 'total_flos': 3375265417789440.0, 'train_loss': 1.373433635093236, 'epoch': 1.0})

In [25]:
eval_results = trainer.evaluate()

**saving the model**

In [26]:
model.save_pretrained('/content/model_directory')
tokenizer.save_pretrained('/content/model_directory')

('/content/model_directory/tokenizer_config.json',
 '/content/model_directory/special_tokens_map.json',
 '/content/model_directory/vocab.json',
 '/content/model_directory/merges.txt',
 '/content/model_directory/added_tokens.json',
 '/content/model_directory/tokenizer.json')

In [27]:
tokenizer = AutoTokenizer.from_pretrained('/content/model_directory')
model = AutoModelForSeq2SeqLM.from_pretrained('/content/model_directory')



In [42]:
from transformers import GenerationConfig

generation_config = GenerationConfig(
    max_length=150,
    min_length=40,
    length_penalty=2.0,
    num_beams=4,
    early_stopping=True,
    pad_token_id=tokenizer.pad_token_id,
    eos_token_id=tokenizer.eos_token_id,
    bos_token_id=tokenizer.bos_token_id,
    decoder_start_token_id=tokenizer.bos_token_id,  # safer to use bos_token_id here
    forced_bos_token_id=tokenizer.bos_token_id,
    forced_eos_token_id=tokenizer.eos_token_id
)


In [47]:
def summarize(text):
    inputs = tokenizer(text, return_tensors='pt', max_length=1024, truncation=True, padding=True)
    summary_ids = model.generate(
        inputs['input_ids'],
        generation_config=generation_config
    )
    return tokenizer.decode(summary_ids[0], skip_special_tokens=True)

In [51]:
text = "The Kutch conflict was a brief but significant military confrontation between India and Pakistan in 1965, centered around the Rann of Kutch, a disputed region located in the western part of India.Operation Desert Hawk was the codename for the military operation launched by the Pakistan Army in this area. At the time, the Rann of Kutch was under Indian control as per the long-standing status quo, but its boundary remained one of the few unresolved territorial disputes stemming from the 1947 Partition of India."

In [52]:
text

'The Kutch conflict was a brief but significant military confrontation between India and Pakistan in 1965, centered around the Rann of Kutch, a disputed region located in the western part of India.Operation Desert Hawk was the codename for the military operation launched by the Pakistan Army in this area. At the time, the Rann of Kutch was under Indian control as per the long-standing status quo, but its boundary remained one of the few unresolved territorial disputes stemming from the 1947 Partition of India.'

In [53]:
summarize(text)

' the Rann of Kutch was a disputed by India and Pakistan in 1965, a disputed region located in the western part of India in the western part of India in 1965.'