# Import Libraries

In [21]:
from datasets import load_dataset
from transformers import AutoModelForSeq2SeqLM
from transformers import AutoTokenizer
from transformers import GenerationConfig
import torch
import torchvision

In [22]:
import pandas as pd

# Load Dataset

In [23]:
df = pd.read_csv('data/news_summary.csv',encoding='cp1252',engine='python')

In [24]:
df.head(5)

Unnamed: 0,author,date,headlines,read_more,text,ctext
0,Chhavi Tyagi,"03 Aug 2017,Thursday",Daman & Diu revokes mandatory Rakshabandhan in...,http://www.hindustantimes.com/india-news/raksh...,The Administration of Union Territory Daman an...,The Daman and Diu administration on Wednesday ...
1,Daisy Mowke,"03 Aug 2017,Thursday",Malaika slams user who trolled her for 'divorc...,http://www.hindustantimes.com/bollywood/malaik...,Malaika Arora slammed an Instagram user who tr...,"From her special numbers to TV?appearances, Bo..."
2,Arshiya Chopra,"03 Aug 2017,Thursday",'Virgin' now corrected to 'Unmarried' in IGIMS...,http://www.hindustantimes.com/patna/bihar-igim...,The Indira Gandhi Institute of Medical Science...,The Indira Gandhi Institute of Medical Science...
3,Sumedha Sehra,"03 Aug 2017,Thursday",Aaj aapne pakad liya: LeT man Dujana before be...,http://indiatoday.intoday.in/story/abu-dujana-...,Lashkar-e-Taiba's Kashmir commander Abu Dujana...,Lashkar-e-Taiba's Kashmir commander Abu Dujana...
4,Aarushi Maheshwari,"03 Aug 2017,Thursday",Hotel staff to get training to spot signs of s...,http://indiatoday.intoday.in/story/sex-traffic...,Hotels in Maharashtra will train their staff t...,Hotels in Mumbai and other Indian cities are t...


In [25]:
dataset = df[['text','ctext']].dropna()

In [26]:
import pandas as pd
from datasets import Dataset, DatasetDict

# Assuming you have loaded your pandas DataFrame into a variable 'df'
data = pd.DataFrame({
    'id': range(len(df)),
    'dialogue': df['ctext'],
    'summary': df['text']
})

data = data[data['dialogue'].notna()]
# Split the data into train, test, and validation sets
train_data = data.loc[:3158]
test_data = data.loc[3160:3600]
validation_data = data.loc[3601:4155]

# Convert each split into a Hugging Face Dataset
train_dataset = Dataset.from_pandas(train_data)
test_dataset = Dataset.from_pandas(test_data)
validation_dataset = Dataset.from_pandas(validation_data)

# Create a DatasetDict with these datasets
dataset = DatasetDict({
    'train': train_dataset,
    'test': test_dataset,
    'validation': validation_dataset
})

example full and summary

In [27]:
example_indices = [40, 200]

dash_line = '-'.join('' for x in range(100))

for i, index in enumerate(example_indices):
    print(dash_line)
    print('Example ', i + 1)
    print(dash_line)
    print('INPUT DIALOGUE:')
    print(dataset['test'][index]['dialogue'])
    print(dash_line)
    print('BASELINE HUMAN SUMMARY:')
    print(dataset['test'][index]['summary'])
    print(dash_line)
    print()

---------------------------------------------------------------------------------------------------
Example  1
---------------------------------------------------------------------------------------------------
INPUT DIALOGUE:
A high alert has been sounded in Punjab's Pathankot district after a suspicious bag was spotted near Mamoon military station last night.A local resident informed the police about the bag on Sunday.Five shirts and two trousers were found in a wheat flour bag, with 'Jammu' on written on it, at a secluded place near the Defence Road.  Army, SWAT commandoes have launched a search operation after the bag was found hidden and stuffed in a gunny sack.Earlier in May, security alert was sounded in Pathankot after two suspicious bags were found lying unattended just a few yards away from the military base. The police had recovered mobile tower batteries from the bags.Punjab:High alert in Pathankot, search Op being conducted by police SWAT team & Army after a suspicious bag

 # Model Development

In [28]:
model_name='google/flan-t5-base'

model = AutoModelForSeq2SeqLM.from_pretrained(model_name)

In [29]:
tokenizer = AutoTokenizer.from_pretrained(model_name, use_fast=True)

## Without Prompt Engineering

In [30]:
for i, index in enumerate(example_indices):
    dialogue = dataset['test'][index]['dialogue']
    summary = dataset['test'][index]['summary']
    
    inputs = tokenizer(dialogue, return_tensors='pt')
    output = tokenizer.decode(
        model.generate(
            inputs["input_ids"], 
            max_new_tokens=50,
        )[0], 
        skip_special_tokens=True
    )
    
    print(dash_line)
    print('Example ', i + 1)
    print(dash_line)
    print(f'INPUT PROMPT:\n{dialogue}')
    print(dash_line)
    print(f'BASELINE HUMAN SUMMARY:\n{summary}')
    print(dash_line)
    print(f'MODEL GENERATION - WITHOUT PROMPT ENGINEERING:\n{output}\n')

---------------------------------------------------------------------------------------------------
Example  1
---------------------------------------------------------------------------------------------------
INPUT PROMPT:
A high alert has been sounded in Punjab's Pathankot district after a suspicious bag was spotted near Mamoon military station last night.A local resident informed the police about the bag on Sunday.Five shirts and two trousers were found in a wheat flour bag, with 'Jammu' on written on it, at a secluded place near the Defence Road.  Army, SWAT commandoes have launched a search operation after the bag was found hidden and stuffed in a gunny sack.Earlier in May, security alert was sounded in Pathankot after two suspicious bags were found lying unattended just a few yards away from the military base. The police had recovered mobile tower batteries from the bags.Punjab:High alert in Pathankot, search Op being conducted by police SWAT team & Army after a suspicious bag c

## Zero shot interference with an instruct prompt

In [31]:
for i, index in enumerate(example_indices):
    dialogue = dataset['test'][index]['dialogue']
    summary = dataset['test'][index]['summary']

    prompt = f"""
Summarize the news.

{dialogue}

Summary:
    """

    # Input constructed prompt instead of the dialogue.
    inputs = tokenizer(prompt, return_tensors='pt')
    output = tokenizer.decode(
        model.generate(
            inputs["input_ids"], 
            max_new_tokens=50,
        )[0], 
        skip_special_tokens=True
    )
    
    print(dash_line)
    print('Example ', i + 1)
    print(dash_line)
    print(f'INPUT PROMPT:\n{prompt}')
    print(dash_line)
    print(f'BASELINE HUMAN SUMMARY:\n{summary}')
    print(dash_line)    
    print(f'MODEL GENERATION - ZERO SHOT:\n{output}\n')

---------------------------------------------------------------------------------------------------
Example  1
---------------------------------------------------------------------------------------------------
INPUT PROMPT:

Summarize the news.

A high alert has been sounded in Punjab's Pathankot district after a suspicious bag was spotted near Mamoon military station last night.A local resident informed the police about the bag on Sunday.Five shirts and two trousers were found in a wheat flour bag, with 'Jammu' on written on it, at a secluded place near the Defence Road.  Army, SWAT commandoes have launched a search operation after the bag was found hidden and stuffed in a gunny sack.Earlier in May, security alert was sounded in Pathankot after two suspicious bags were found lying unattended just a few yards away from the military base. The police had recovered mobile tower batteries from the bags.Punjab:High alert in Pathankot, search Op being conducted by police SWAT team & Army af

## Zero shot with template instruct prompt from FLAN T-5 template prompt

In [32]:
for i, index in enumerate(example_indices):
    dialogue = dataset['test'][index]['dialogue']
    summary = dataset['test'][index]['summary']
        
    prompt = f"""
Dialogue:

{dialogue}

What was going on?
"""

    inputs = tokenizer(prompt, return_tensors='pt')
    output = tokenizer.decode(
        model.generate(
            inputs["input_ids"], 
            max_new_tokens=50,
        )[0], 
        skip_special_tokens=True
    )

    print(dash_line)
    print('Example ', i + 1)
    print(dash_line)
    print(f'INPUT PROMPT:\n{prompt}')
    print(dash_line)
    print(f'BASELINE HUMAN SUMMARY:\n{summary}\n')
    print(dash_line)
    print(f'MODEL GENERATION - ZERO SHOT:\n{output}\n')

---------------------------------------------------------------------------------------------------
Example  1
---------------------------------------------------------------------------------------------------
INPUT PROMPT:

Dialogue:

A high alert has been sounded in Punjab's Pathankot district after a suspicious bag was spotted near Mamoon military station last night.A local resident informed the police about the bag on Sunday.Five shirts and two trousers were found in a wheat flour bag, with 'Jammu' on written on it, at a secluded place near the Defence Road.  Army, SWAT commandoes have launched a search operation after the bag was found hidden and stuffed in a gunny sack.Earlier in May, security alert was sounded in Pathankot after two suspicious bags were found lying unattended just a few yards away from the military base. The police had recovered mobile tower batteries from the bags.Punjab:High alert in Pathankot, search Op being conducted by police SWAT team & Army after a susp

### One shot interference

In [33]:
def make_prompt(example_indices_full, example_index_to_summarize):
    prompt = ''
    for index in example_indices_full:
        dialogue = dataset['test'][index]['dialogue']
        summary = dataset['test'][index]['summary']
        
        # The stop sequence '{summary}\n\n\n' is important for FLAN-T5. Other models may have their own preferred stop sequence.
        prompt += f"""
Dialogue:

{dialogue}

What was going on?
{summary}


"""
    
    dialogue = dataset['test'][example_index_to_summarize]['dialogue']
    
    prompt += f"""
Dialogue:

{dialogue}

What was going on?
"""
        
    return prompt

In [34]:
example_indices_full = [40]
example_index_to_summarize = 200

one_shot_prompt = make_prompt(example_indices_full, example_index_to_summarize)

print(one_shot_prompt)


Dialogue:

A high alert has been sounded in Punjab's Pathankot district after a suspicious bag was spotted near Mamoon military station last night.A local resident informed the police about the bag on Sunday.Five shirts and two trousers were found in a wheat flour bag, with 'Jammu' on written on it, at a secluded place near the Defence Road.  Army, SWAT commandoes have launched a search operation after the bag was found hidden and stuffed in a gunny sack.Earlier in May, security alert was sounded in Pathankot after two suspicious bags were found lying unattended just a few yards away from the military base. The police had recovered mobile tower batteries from the bags.Punjab:High alert in Pathankot, search Op being conducted by police SWAT team & Army after a suspicious bag containing 3 uniforms was found pic.twitter.com/WbeKEq6N6pIn January last year, seven jawans were killed and over 37 people were injured in a terror attack on Pathankot air base.In 2015, three heavily-armed terrori

In [35]:
summary = dataset['test'][example_index_to_summarize]['summary']

inputs = tokenizer(one_shot_prompt, return_tensors='pt')
output = tokenizer.decode(
    model.generate(
        inputs["input_ids"],
        max_new_tokens=50,
    )[0], 
    skip_special_tokens=True
)

print(dash_line)
print(f'BASELINE HUMAN SUMMARY:\n{summary}\n')
print(dash_line)
print(f'MODEL GENERATION - ONE SHOT:\n{output}')

Token indices sequence length is longer than the specified maximum sequence length for this model (616 > 512). Running this sequence through the model will result in indexing errors


---------------------------------------------------------------------------------------------------
BASELINE HUMAN SUMMARY:
Filmmaker Rohit Shetty has said he's happy and relieved that the Rajinikanth-Akshay Kumar starrer '2.0' isn't releasing on Diwali, the same day as his film 'Golmaal Again'. He added, "We know if we release a film with another big film, business does get affected." Earlier, Aamir's 'Secret Superstar', 'Golmaal Again' and '2.0' were scheduled to release on Diwali this year.

---------------------------------------------------------------------------------------------------
MODEL GENERATION - ONE SHOT:
Rohit Shetty says he is happy that his upcoming film "Golmaal 4" will not face competition from Ranjinikanths "2.0" at the box office as now both the movies are releasing on separate


## A few shot interference

In [36]:
example_indices_full = [40, 80, 120]
example_index_to_summarize = 200

few_shot_prompt = make_prompt(example_indices_full, example_index_to_summarize)

print(few_shot_prompt)


Dialogue:

A high alert has been sounded in Punjab's Pathankot district after a suspicious bag was spotted near Mamoon military station last night.A local resident informed the police about the bag on Sunday.Five shirts and two trousers were found in a wheat flour bag, with 'Jammu' on written on it, at a secluded place near the Defence Road.  Army, SWAT commandoes have launched a search operation after the bag was found hidden and stuffed in a gunny sack.Earlier in May, security alert was sounded in Pathankot after two suspicious bags were found lying unattended just a few yards away from the military base. The police had recovered mobile tower batteries from the bags.Punjab:High alert in Pathankot, search Op being conducted by police SWAT team & Army after a suspicious bag containing 3 uniforms was found pic.twitter.com/WbeKEq6N6pIn January last year, seven jawans were killed and over 37 people were injured in a terror attack on Pathankot air base.In 2015, three heavily-armed terrori

In [37]:
summary = dataset['test'][example_index_to_summarize]['summary']

inputs = tokenizer(few_shot_prompt, return_tensors='pt')
output = tokenizer.decode(
    model.generate(
        inputs["input_ids"],
        max_new_tokens=50,
    )[0], 
    skip_special_tokens=True
)

print(dialogue)
print(dash_line)
print(f'BASELINE HUMAN SUMMARY:\n{summary}\n')
print(dash_line)
print(f'MODEL GENERATION - FEW SHOT:\n{output}')

Mumbai, May 11 (PTI) Director Rohit Shetty says he is happy that his upcoming film "Golmaal 4" will not have to face competition from Ranjinikanths "2.0" at the box office as now both the movies are releasing on separate dates. Earlier Rohits film starring Ajay Devgn, Tabu, Parineeti Chopra and Arshad Warsi, was set to release this Diwali alongside "2.0". "We tried to do that (referring to pushing ahead the release date) but we were not getting the right date. If we come on solo week or normal week then it is ok. But when you clash (at the box office) obviously the window is not that big as far as business is concerned," Rohit told PTI. 
---------------------------------------------------------------------------------------------------
BASELINE HUMAN SUMMARY:
Filmmaker Rohit Shetty has said he's happy and relieved that the Rajinikanth-Akshay Kumar starrer '2.0' isn't releasing on Diwali, the same day as his film 'Golmaal Again'. He added, "We know if we release a film with another big 

## General Configuration Parameter for Interfence

In [38]:
#generation_config = GenerationConfig(max_new_tokens=50)
#generation_config = GenerationConfig(max_new_tokens=10)
#generation_config = GenerationConfig(max_new_tokens=50, do_sample=True, temperature=0.1)
#generation_config = GenerationConfig(max_new_tokens=50, do_sample=True, temperature=0.5)
generation_config = GenerationConfig(max_new_tokens=50, do_sample=True, temperature=1.0)

inputs = tokenizer(few_shot_prompt, return_tensors='pt')
output = tokenizer.decode(
    model.generate(
        inputs["input_ids"],
        generation_config=generation_config,
    )[0], 
    skip_special_tokens=True
)
print(dash_line)
print(f'dialogue:\n{dialogue}')
print(dash_line)
print(f'MODEL GENERATION - FEW SHOT:\n{output}')
print(dash_line)
print(f'BASELINE HUMAN SUMMARY:\n{summary}\n')

---------------------------------------------------------------------------------------------------
dialogue:
Mumbai, May 11 (PTI) Director Rohit Shetty says he is happy that his upcoming film "Golmaal 4" will not have to face competition from Ranjinikanths "2.0" at the box office as now both the movies are releasing on separate dates. Earlier Rohits film starring Ajay Devgn, Tabu, Parineeti Chopra and Arshad Warsi, was set to release this Diwali alongside "2.0". "We tried to do that (referring to pushing ahead the release date) but we were not getting the right date. If we come on solo week or normal week then it is ok. But when you clash (at the box office) obviously the window is not that big as far as business is concerned," Rohit told PTI. 
---------------------------------------------------------------------------------------------------
MODEL GENERATION - FEW SHOT:
"Golmaal 4" is set to release on June 15 alongside "2.0".
---------------------------------------------------------

# Fine Tuning Model

In [39]:
import time
import torch
from transformers import Trainer, TrainingArguments
from datasets import load_dataset
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer, GenerationConfig, TrainingArguments, Trainer
import torch
import time
import evaluate
import pandas as pd
import numpy as np

In [40]:
model_name='google/flan-t5-base'

original_model = AutoModelForSeq2SeqLM.from_pretrained(model_name, torch_dtype=torch.bfloat16)
tokenizer = AutoTokenizer.from_pretrained(model_name)

In [41]:
def print_number_of_trainable_model_parameters(model):
    trainable_model_params = 0
    all_model_params = 0
    for _, param in model.named_parameters():
        all_model_params += param.numel()
        if param.requires_grad:
            trainable_model_params += param.numel()
    return f"trainable model parameters: {trainable_model_params}\nall model parameters: {all_model_params}\npercentage of trainable model parameters: {100 * trainable_model_params / all_model_params:.2f}%"

print(print_number_of_trainable_model_parameters(original_model))

trainable model parameters: 247577856
all model parameters: 247577856
percentage of trainable model parameters: 100.00%


In [42]:
def tokenize_function(example):
    start_prompt = 'Summarize the following conversation.\n\n'
    end_prompt = '\n\nSummary: '

    # Check if "dialogue" is not None
    if example["dialogue"] is not None:
        prompt = [start_prompt + dialogue + end_prompt for dialogue in example["dialogue"]]
        example['input_ids'] = tokenizer(prompt, padding="max_length", truncation=True, return_tensors="pt").input_ids
    else:
        # Handle the case where "dialogue" is None
        example['input_ids'] = [[]]  # Provide an empty list if "dialogue" is None

    example['labels'] = tokenizer(example["summary"], padding="max_length", truncation=True, return_tensors="pt").input_ids
    
    return example

# The dataset actually contains 3 different splits: train, validation, test.
# The tokenize_function code is handling all data across all splits in batches.
tokenized_datasets = dataset.map(tokenize_function, batched=True)
tokenized_datasets = tokenized_datasets.remove_columns(['id', 'dialogue', 'summary'])

                                                                 

In [43]:
tokenized_datasets = tokenized_datasets.filter(lambda example, index: index % 100 == 0, with_indices=True)

                                                                    

In [44]:
print(f"Shapes of the datasets:")
print(f"Training: {tokenized_datasets['train'].shape}")
print(f"Validation: {tokenized_datasets['validation'].shape}")
print(f"Test: {tokenized_datasets['test'].shape}")

print(tokenized_datasets)

Shapes of the datasets:
Training: (31, 3)
Validation: (6, 3)
Test: (5, 3)
DatasetDict({
    train: Dataset({
        features: ['__index_level_0__', 'input_ids', 'labels'],
        num_rows: 31
    })
    test: Dataset({
        features: ['__index_level_0__', 'input_ids', 'labels'],
        num_rows: 5
    })
    validation: Dataset({
        features: ['__index_level_0__', 'input_ids', 'labels'],
        num_rows: 6
    })
})


## Full Fine Tuning

In [45]:
output_dir = f'./dialogue-summary-training-{str(int(time.time()))}'
batch_size = 8
training_args = TrainingArguments(
    output_dir=output_dir,
    learning_rate=1e-5,
    num_train_epochs=1,
    weight_decay=0.01,
    logging_steps=1,
    max_steps=1,
    per_device_train_batch_size=batch_size,  # Set the batch size here
    per_device_eval_batch_size=batch_size,   # Set the batch size for evaluation here
    save_steps=100,  # Adjust as needed
    save_total_limit=1  # Adjust as needed

)

trainer = Trainer(
    model=original_model,
    args=training_args,
    train_dataset=tokenized_datasets['train'],
    eval_dataset=tokenized_datasets['validation']
)

In [None]:
trainer.train()

