# Import Libraries

In [1]:
%pip install --disable-pip-version-check \
    torch==1.13.1 \
    torchdata==0.5.1 --quiet

%pip install \
    transformers==4.27.2 \
    datasets==2.11.0  --quiet

Note: you may need to restart the kernel to use updated packages.
Note: you may need to restart the kernel to use updated packages.


In [2]:
from datasets import load_dataset
from transformers import AutoModelForSeq2SeqLM
from transformers import AutoTokenizer
from transformers import GenerationConfig

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
import pandas as pd

# Load Dataset

In [4]:
df = pd.read_csv('data/news_summary.csv',encoding='cp1252',engine='python')

In [5]:
df.head(5)

Unnamed: 0,author,date,headlines,read_more,text,ctext
0,Chhavi Tyagi,"03 Aug 2017,Thursday",Daman & Diu revokes mandatory Rakshabandhan in...,http://www.hindustantimes.com/india-news/raksh...,The Administration of Union Territory Daman an...,The Daman and Diu administration on Wednesday ...
1,Daisy Mowke,"03 Aug 2017,Thursday",Malaika slams user who trolled her for 'divorc...,http://www.hindustantimes.com/bollywood/malaik...,Malaika Arora slammed an Instagram user who tr...,"From her special numbers to TV?appearances, Bo..."
2,Arshiya Chopra,"03 Aug 2017,Thursday",'Virgin' now corrected to 'Unmarried' in IGIMS...,http://www.hindustantimes.com/patna/bihar-igim...,The Indira Gandhi Institute of Medical Science...,The Indira Gandhi Institute of Medical Science...
3,Sumedha Sehra,"03 Aug 2017,Thursday",Aaj aapne pakad liya: LeT man Dujana before be...,http://indiatoday.intoday.in/story/abu-dujana-...,Lashkar-e-Taiba's Kashmir commander Abu Dujana...,Lashkar-e-Taiba's Kashmir commander Abu Dujana...
4,Aarushi Maheshwari,"03 Aug 2017,Thursday",Hotel staff to get training to spot signs of s...,http://indiatoday.intoday.in/story/sex-traffic...,Hotels in Maharashtra will train their staff t...,Hotels in Mumbai and other Indian cities are t...


In [6]:
dataset = df[['text','ctext']]

In [7]:
dataset = dataset.rename(columns={'ctext':'full','text':'summary'})

example full and summary

In [8]:
example_indices = [40, 200]

dash_line = '-'.join('' for x in range(100))

for i, index in enumerate(example_indices):
    print(dash_line)
    print('Example ', i + 1)
    print(dash_line)
    print('INPUT DIALOGUE:')
    print(dataset.loc[index]['full'])
    print(dash_line)
    print('BASELINE HUMAN SUMMARY:')
    print(dataset.loc[index]['summary'])
    print(dash_line)
    print()

---------------------------------------------------------------------------------------------------
Example  1
---------------------------------------------------------------------------------------------------
INPUT DIALOGUE:
A Republican senator said Tuesday that US President Donald Trump has told him he would go to war to destroy North Korea rather than allow it to develop a long-range nuclear-armed missile.Influential lawmaker Lindsey Graham, a foreign policy hawk, told NBC?s Today Show: ?There is a military option: To destroy North Korea?s programme and North Korea itself.?Last week, North Korean leader Kim Jong-Un boasted that his country could now strike any target in the United States after carrying out its latest intercontinental ballistic missile test.World powers have been trying to stifle Pyongyang?s weapons programme through United Nations-backed sanctions, but have failed to daunt the regime and Washington is growing frustrated.Graham said if diplomacy, and in particular 

#

 # Model Development

In [9]:
model_name='google/flan-t5-base'

model = AutoModelForSeq2SeqLM.from_pretrained(model_name)

In [10]:
tokenizer = AutoTokenizer.from_pretrained(model_name, use_fast=True)

## Without Prompt Engineering

In [11]:
for i, index in enumerate(example_indices):
    dialogue = dataset.loc[index]['full']
    summary = dataset.loc[index]['summary']

    inputs = tokenizer(dialogue, return_tensors='pt')
    output = tokenizer.decode(
        model.generate(
            inputs["input_ids"],
            max_new_tokens=50,
        )[0],
        skip_special_tokens=True
    )

    print(dash_line)
    print('Example ', i + 1)
    print(dash_line)
    print(f'INPUT PROMPT:\n{dialogue}')
    print(dash_line)
    print(f'BASELINE HUMAN SUMMARY:\n{summary}')
    print(dash_line)
    print(f'MODEL GENERATION - WITHOUT PROMPT ENGINEERING:\n{output}\n')

---------------------------------------------------------------------------------------------------
Example  1
---------------------------------------------------------------------------------------------------
INPUT PROMPT:
A Republican senator said Tuesday that US President Donald Trump has told him he would go to war to destroy North Korea rather than allow it to develop a long-range nuclear-armed missile.Influential lawmaker Lindsey Graham, a foreign policy hawk, told NBC?s Today Show: ?There is a military option: To destroy North Korea?s programme and North Korea itself.?Last week, North Korean leader Kim Jong-Un boasted that his country could now strike any target in the United States after carrying out its latest intercontinental ballistic missile test.World powers have been trying to stifle Pyongyang?s weapons programme through United Nations-backed sanctions, but have failed to daunt the regime and Washington is growing frustrated.Graham said if diplomacy, and in particular pr

## Zero shot interference with an instruct prompt

In [12]:
for i, index in enumerate(example_indices):
    dialogue = dataset.loc[index]['full']
    summary = dataset.loc[index]['summary']

    prompt = f"""
Summarize the following conversation.

{dialogue}

Summary:
    """

    # Input constructed prompt instead of the dialogue.
    inputs = tokenizer(prompt, return_tensors='pt')
    output = tokenizer.decode(
        model.generate(
            inputs["input_ids"],
            max_new_tokens=50,
        )[0],
        skip_special_tokens=True
    )

    print(dash_line)
    print('Example ', i + 1)
    print(dash_line)
    print(f'INPUT PROMPT:\n{prompt}')
    print(dash_line)
    print(f'BASELINE HUMAN SUMMARY:\n{summary}')
    print(dash_line)
    print(f'MODEL GENERATION - ZERO SHOT:\n{output}\n')

---------------------------------------------------------------------------------------------------
Example  1
---------------------------------------------------------------------------------------------------
INPUT PROMPT:

Summarize the following conversation.

A Republican senator said Tuesday that US President Donald Trump has told him he would go to war to destroy North Korea rather than allow it to develop a long-range nuclear-armed missile.Influential lawmaker Lindsey Graham, a foreign policy hawk, told NBC?s Today Show: ?There is a military option: To destroy North Korea?s programme and North Korea itself.?Last week, North Korean leader Kim Jong-Un boasted that his country could now strike any target in the United States after carrying out its latest intercontinental ballistic missile test.World powers have been trying to stifle Pyongyang?s weapons programme through United Nations-backed sanctions, but have failed to daunt the regime and Washington is growing frustrated.Graham

## Zero shot with template instruct prompt from FLAN T-5

In [13]:
for i, index in enumerate(example_indices):
    dialogue = dataset.loc[index]['full']
    summary = dataset.loc[index]['summary']

    prompt = f"""
Dialogue:

{dialogue}

What was going on?
"""

    inputs = tokenizer(prompt, return_tensors='pt')
    output = tokenizer.decode(
        model.generate(
            inputs["input_ids"],
            max_new_tokens=50,
        )[0],
        skip_special_tokens=True
    )

    print(dash_line)
    print('Example ', i + 1)
    print(dash_line)
    print(f'INPUT PROMPT:\n{prompt}')
    print(dash_line)
    print(f'BASELINE HUMAN SUMMARY:\n{summary}\n')
    print(dash_line)
    print(f'MODEL GENERATION - ZERO SHOT:\n{output}\n')

---------------------------------------------------------------------------------------------------
Example  1
---------------------------------------------------------------------------------------------------
INPUT PROMPT:

Dialogue:

A Republican senator said Tuesday that US President Donald Trump has told him he would go to war to destroy North Korea rather than allow it to develop a long-range nuclear-armed missile.Influential lawmaker Lindsey Graham, a foreign policy hawk, told NBC?s Today Show: ?There is a military option: To destroy North Korea?s programme and North Korea itself.?Last week, North Korean leader Kim Jong-Un boasted that his country could now strike any target in the United States after carrying out its latest intercontinental ballistic missile test.World powers have been trying to stifle Pyongyang?s weapons programme through United Nations-backed sanctions, but have failed to daunt the regime and Washington is growing frustrated.Graham said if diplomacy, and in p

### One shot interference

In [14]:
def make_prompt(example_indices_full, example_index_to_summarize):
    prompt = ''
    for index in example_indices_full:
        dialogue = dataset.loc[index]['full']
        summary = dataset.loc[index]['summary']

        # The stop sequence '{summary}\n\n\n' is important for FLAN-T5. Other models may have their own preferred stop sequence.
        prompt += f"""
News:

{dialogue}

What was going on?
{summary}


"""

    dialogue = dataset.loc[example_index_to_summarize]['full']

    prompt += f"""
News:

{dialogue}

What was going on?
"""

    return prompt

In [15]:
example_indices_full = [40]
example_index_to_summarize = 200

one_shot_prompt = make_prompt(example_indices_full, example_index_to_summarize)

print(one_shot_prompt)


News:

A Republican senator said Tuesday that US President Donald Trump has told him he would go to war to destroy North Korea rather than allow it to develop a long-range nuclear-armed missile.Influential lawmaker Lindsey Graham, a foreign policy hawk, told NBC?s Today Show: ?There is a military option: To destroy North Korea?s programme and North Korea itself.?Last week, North Korean leader Kim Jong-Un boasted that his country could now strike any target in the United States after carrying out its latest intercontinental ballistic missile test.World powers have been trying to stifle Pyongyang?s weapons programme through United Nations-backed sanctions, but have failed to daunt the regime and Washington is growing frustrated.Graham said if diplomacy, and in particular pressure from the North?s neighbour China, fails to halt the programme then the United States will have no choice but to take devastating military action.?They?ve kicked the can down the road for 20 years. There will be

In [16]:
summary = dataset.loc[example_index_to_summarize]['summary']

inputs = tokenizer(one_shot_prompt, return_tensors='pt')
output = tokenizer.decode(
    model.generate(
        inputs["input_ids"],
        max_new_tokens=50,
    )[0],
    skip_special_tokens=True
)

print(dash_line)
print(f'BASELINE HUMAN SUMMARY:\n{summary}\n')
print(dash_line)
print(f'MODEL GENERATION - ONE SHOT:\n{output}')

Token indices sequence length is longer than the specified maximum sequence length for this model (820 > 512). Running this sequence through the model will result in indexing errors


---------------------------------------------------------------------------------------------------
BASELINE HUMAN SUMMARY:
Congress on Monday informed Rajya Sabha that Raj Babbar has been injured in police action against villagers protesting demolition of their houses in Uttar Pradesh. They added that due to the injuries, he was unable to attend the Parliament proceedings. The house's Deputy Chairman dismissed the issue of Babbar's safety and security saying the state government should deal with this.

---------------------------------------------------------------------------------------------------
MODEL GENERATION - ONE SHOT:
Opposition Congress MP Raj Babbar injured in police action against villagers protesting demolition of their houses in Ambedkar Nagar district of Uttar Pradesh.


## A few shot interference

In [17]:
example_indices_full = [40, 80, 120]
example_index_to_summarize = 200

few_shot_prompt = make_prompt(example_indices_full, example_index_to_summarize)

print(few_shot_prompt)


News:

A Republican senator said Tuesday that US President Donald Trump has told him he would go to war to destroy North Korea rather than allow it to develop a long-range nuclear-armed missile.Influential lawmaker Lindsey Graham, a foreign policy hawk, told NBC?s Today Show: ?There is a military option: To destroy North Korea?s programme and North Korea itself.?Last week, North Korean leader Kim Jong-Un boasted that his country could now strike any target in the United States after carrying out its latest intercontinental ballistic missile test.World powers have been trying to stifle Pyongyang?s weapons programme through United Nations-backed sanctions, but have failed to daunt the regime and Washington is growing frustrated.Graham said if diplomacy, and in particular pressure from the North?s neighbour China, fails to halt the programme then the United States will have no choice but to take devastating military action.?They?ve kicked the can down the road for 20 years. There will be

In [18]:
summary = dataset.loc[example_index_to_summarize]['summary']

inputs = tokenizer(few_shot_prompt, return_tensors='pt')
output = tokenizer.decode(
    model.generate(
        inputs["input_ids"],
        max_new_tokens=50,
    )[0],
    skip_special_tokens=True
)

print(dialogue)
print(dash_line)
print(f'BASELINE HUMAN SUMMARY:\n{summary}\n')
print(dash_line)
print(f'MODEL GENERATION - FEW SHOT:\n{output}')

Opposition Congress said in Rajya Sabha on Monday that its MP Raj Babbar was allegedly injured in police action against villagers protesting demolition of their houses in the Ambedkar Nagar district of Uttar Pradesh.Raising the issue through a notice under rule 267 that seeks setting aside of the business to take up discussion on the issue, Pramod Tiwari (Cong) said force was used against the villagers protesting against demolition.Raj Babbar, he said, was injured and is unable to attend the proceedings in Parliament.Deputy chairman PJ Kurien said if the member has a complaint, he can raise it and give a privilege notice.Tiwari was joined by other Congress members in raising the issue of the safety and security of Babbar, who is sitting on dharna at the protest site.Kurien asked them if the lawmaker was being prevented from attending Parliament. When they replied in negative, he said the notice is not admitted.Finance minister and Leader of the House Arun Jaitley said Babbar cannot be 

## General Configuration Parameter for Interfence

In [19]:
#generation_config = GenerationConfig(max_new_tokens=50)
#generation_config = GenerationConfig(max_new_tokens=10)
#generation_config = GenerationConfig(max_new_tokens=50, do_sample=True, temperature=0.1)
#generation_config = GenerationConfig(max_new_tokens=50, do_sample=True, temperature=0.5)
generation_config = GenerationConfig(max_new_tokens=50, do_sample=True, temperature=1.0)

inputs = tokenizer(few_shot_prompt, return_tensors='pt')
output = tokenizer.decode(
    model.generate(
        inputs["input_ids"],
        generation_config=generation_config,
    )[0],
    skip_special_tokens=True
)
print(dash_line)
print(f'dialogue:\n{dialogue}')
print(dash_line)
print(f'MODEL GENERATION - FEW SHOT:\n{output}')
print(dash_line)
print(f'BASELINE HUMAN SUMMARY:\n{summary}\n')

---------------------------------------------------------------------------------------------------
dialogue:
Opposition Congress said in Rajya Sabha on Monday that its MP Raj Babbar was allegedly injured in police action against villagers protesting demolition of their houses in the Ambedkar Nagar district of Uttar Pradesh.Raising the issue through a notice under rule 267 that seeks setting aside of the business to take up discussion on the issue, Pramod Tiwari (Cong) said force was used against the villagers protesting against demolition.Raj Babbar, he said, was injured and is unable to attend the proceedings in Parliament.Deputy chairman PJ Kurien said if the member has a complaint, he can raise it and give a privilege notice.Tiwari was joined by other Congress members in raising the issue of the safety and security of Babbar, who is sitting on dharna at the protest site.Kurien asked them if the lawmaker was being prevented from attending Parliament. When they replied in negative, h

# Fine Tuning Model

In [20]:
import time
import torch
from transformers import Trainer, TrainingArguments

if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

In [21]:
from datasets import load_dataset
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer, GenerationConfig, TrainingArguments, Trainer
import torch
import time
import evaluate
import pandas as pd
import numpy as np

In [22]:
model_name='google/flan-t5-base'

original_model = AutoModelForSeq2SeqLM.from_pretrained(model_name, torch_dtype=torch.bfloat16)
tokenizer = AutoTokenizer.from_pretrained(model_name)
original_model.to(device)

T5ForConditionalGeneration(
  (shared): Embedding(32128, 768)
  (encoder): T5Stack(
    (embed_tokens): Embedding(32128, 768)
    (block): ModuleList(
      (0): T5Block(
        (layer): ModuleList(
          (0): T5LayerSelfAttention(
            (SelfAttention): T5Attention(
              (q): Linear(in_features=768, out_features=768, bias=False)
              (k): Linear(in_features=768, out_features=768, bias=False)
              (v): Linear(in_features=768, out_features=768, bias=False)
              (o): Linear(in_features=768, out_features=768, bias=False)
              (relative_attention_bias): Embedding(32, 12)
            )
            (layer_norm): T5LayerNorm()
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (1): T5LayerFF(
            (DenseReluDense): T5DenseGatedActDense(
              (wi_0): Linear(in_features=768, out_features=2048, bias=False)
              (wi_1): Linear(in_features=768, out_features=2048, bias=False)
              (wo):

In [23]:
def print_number_of_trainable_model_parameters(model):
    trainable_model_params = 0
    all_model_params = 0
    for _, param in model.named_parameters():
        all_model_params += param.numel()
        if param.requires_grad:
            trainable_model_params += param.numel()
    return f"trainable model parameters: {trainable_model_params}\nall model parameters: {all_model_params}\npercentage of trainable model parameters: {100 * trainable_model_params / all_model_params:.2f}%"

print(print_number_of_trainable_model_parameters(original_model))

trainable model parameters: 247577856
all model parameters: 247577856
percentage of trainable model parameters: 100.00%


In [24]:
def tokenize_function(example):
    start_prompt = 'Summarize the following conversation.\n\n'
    end_prompt = '\n\nSummary: '

    # Check if 'full' column is NaN or empty, and skip those rows
    if pd.notna(example["full"]) and example["full"].strip() != "":
        prompt = [start_prompt + full + end_prompt for full in example["full"]]
        example['input_ids'] = tokenizer(prompt, padding="max_length", truncation=True, return_tensors="pt").input_ids
        example['labels'] = tokenizer(example["summary"], padding="max_length", truncation=True, return_tensors="pt").input_ids
    else:
        # Handle empty or NaN values as needed (e.g., set default values)
        pass

    return example

# The dataset actually contains 3 diff splits: train, validation, test.
# The tokenize_function code is handling all data across all splits in batches.
# Assuming you have a Pandas DataFrame named 'dataset'
tokenized_datasets = dataset.apply(tokenize_function, axis=1)

In [25]:
tokenized_datasets.drop(['full','summary'],axis=1,inplace=True)

In [48]:
train_dataset = tokenized_datasets.loc[:3158]
test_dataset = tokenized_datasets.loc[3160:3600]
val_dataset = tokenized_datasets.loc[3601:4155]

In [71]:
from sklearn.model_selection import train_test_split

# Assuming you have a DataFrame named 'tokenized_datasets'
# First, split your data into training and a combined validation/test set
train_dataset, test_dataset = train_test_split(tokenized_datasets, test_size=0.2, random_state=42)

# Then, split the combined validation/test set into separate validation and test sets
val_dataset, test_dataset = train_test_split(test_dataset, test_size=0.5, random_state=42)

In [72]:
print(f"Shapes of the datasets:")
print(f"Training: {train_dataset.shape}")
print(f"Validation: {test_dataset .shape}")
print(f"Test: {val_dataset.shape}")

Shapes of the datasets:
Training: (3611, 2)
Validation: (452, 2)
Test: (451, 2)


In [73]:
import torch
from torch.utils.data import Dataset

class CustomDataset(Dataset):
    def __init__(self, input_ids, labels):
        self.input_ids = input_ids
        self.labels = labels

    def __len__(self):
        return len(self.input_ids)

    def __getitem__(self, idx):
        input_id = self.input_ids[idx]
        label = self.labels[idx]
        return {
            'input_ids': input_id,
            'labels': label
        }

# Assuming you already have train_dataset and val_dataset as tensors
# Convert them into CustomDataset objects
# Now you have train_data, validation_data, and test_data DataFrames
train_dataset = CustomDataset(train_dataset['input_ids'], train_dataset['labels'])
val_dataset = CustomDataset(val_dataset['input_ids'], val_dataset['labels'])
test_dataset = CustomDataset(test_dataset['input_ids'], test_dataset['labels'])

## Full Fine Tuning

In [76]:
from transformers import TrainingArguments, Trainer, DataCollatorWithPadding

# ... (Previous code for CustomDataset and train_dataset/val_dataset) ...

# Create a data collator with padding

output_dir = f'./news-summary-training-{str(int(time.time()))}'

training_args = TrainingArguments(
    output_dir=output_dir,
    learning_rate=1e-5,
    num_train_epochs=1,
    weight_decay=0.01,
    logging_steps=1,
    max_steps=1
)

trainer = Trainer(
    model=original_model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset
)


In [77]:
trainer.train()

RuntimeError: stack expects each tensor to be equal size, but got [1573, 512] at entry 0 and [1663, 512] at entry 1