<a href="https://colab.research.google.com/github/nowshinJahan17/Text-Summarization/blob/Nowshin_Jahan/Copy_of_gitcommand.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [26]:
# Install datasets
!pip install datasets
!pip install evaluate
!pip install -U sacrebleu
!pip install rouge_score
!pip install huggingface_hub

# Import required libraries
import pandas as pd
import numpy as np
from transformers import pipeline, set_seed
import matplotlib.pyplot as plt
import nltk
from nltk.tokenize import sent_tokenize
nltk.download('punkt')

# Import datasets and transformers
from datasets import load_dataset
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer

# Load the dataset
dataset = load_dataset("cnn_dailymail", "3.0.0")

# Print the dataset and some sample data
print(dataset)

print(f"Freatures in cnn_dailymail :{dataset['train'].column_names}")
print(dataset['train'][0])
print(dataset['validation'][0])
print(dataset['test'][1])



[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


DatasetDict({
    train: Dataset({
        features: ['article', 'highlights', 'id'],
        num_rows: 287113
    })
    validation: Dataset({
        features: ['article', 'highlights', 'id'],
        num_rows: 13368
    })
    test: Dataset({
        features: ['article', 'highlights', 'id'],
        num_rows: 11490
    })
})
Freatures in cnn_dailymail :['article', 'highlights', 'id']
{'article': 'LONDON, England (Reuters) -- Harry Potter star Daniel Radcliffe gains access to a reported £20 million ($41.1 million) fortune as he turns 18 on Monday, but he insists the money won\'t cast a spell on him. Daniel Radcliffe as Harry Potter in "Harry Potter and the Order of the Phoenix" To the disappointment of gossip columnists around the world, the young actor says he has no plans to fritter his cash away on fast cars, drink and celebrity parties. "I don\'t plan to be one of those people who, as soon as they turn 18, suddenly buy themselves a massive sports car collection or something simi

# Prepare text for summarization

In [2]:
sample_text = dataset["train"][0]["article"][:1000]
summaries = {}


# Baseline summarization function

In [3]:
def baseline_summary_three_sent(text):
    return "\n".join(sent_tokenize(text)[:3])


# Generate baseline summary

In [4]:
summaries['baseline'] = baseline_summary_three_sent(sample_text)
summaries['baseline']


'LONDON, England (Reuters) -- Harry Potter star Daniel Radcliffe gains access to a reported £20 million ($41.1 million) fortune as he turns 18 on Monday, but he insists the money won\'t cast a spell on him.\nDaniel Radcliffe as Harry Potter in "Harry Potter and the Order of the Phoenix" To the disappointment of gossip columnists around the world, the young actor says he has no plans to fritter his cash away on fast cars, drink and celebrity parties.\n"I don\'t plan to be one of those people who, as soon as they turn 18, suddenly buy themselves a massive sports car collection or something similar," he told an Australian interviewer earlier this month.'

# Model implementation: GPT2-Medium

In [5]:
from transformers import pipeline, set_seed
set_seed(42)
pipe = pipeline('text-generation', model='gpt2-medium')
gpt2_query = sample_text + "\nTL;DR:\n"
pipe_out = pipe(gpt2_query, max_length=512, clean_up_tokenization_spaces=True)


config.json:   0%|          | 0.00/718 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.52G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/26.0 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


## View Generated Text

In [6]:
pipe_out

[{'generated_text': 'LONDON, England (Reuters) -- Harry Potter star Daniel Radcliffe gains access to a reported £20 million ($41.1 million) fortune as he turns 18 on Monday, but he insists the money won\'t cast a spell on him. Daniel Radcliffe as Harry Potter in "Harry Potter and the Order of the Phoenix" To the disappointment of gossip columnists around the world, the young actor says he has no plans to fritter his cash away on fast cars, drink and celebrity parties. "I don\'t plan to be one of those people who, as soon as they turn 18, suddenly buy themselves a massive sports car collection or something similar," he told an Australian interviewer earlier this month. "I don\'t think I\'ll be particularly extravagant. "The things I like buying are things that cost about 10 pounds -- books and CDs and DVDs." At 18, Radcliffe will be able to gamble in a casino, buy a drink in a pub or see the horror film "Hostel: Part II," currently six places below his number one movie on the UK box off

In [7]:
pipe_out[0]['generated_text'][len(gpt2_query):]


'To be financially secure you need to lose money. You can sell your house with any of the following options. "Gambling for the weekend" - this means buying books, TV shows and CDs for ten shillings each. "Gambling at night" - you can spend £20, buy a drink and watch a horror film or two, then go home to your mum, who is probably still dying in a nursing home. "Gambling on an off weekend" - if you go home on a Sunday night but are only halfway through a movie and will spend the rest of your day with friends, you can buy anything from a bottle of wine to a bottle of wine. "Gambling on a Monday morning" - you can gamble anything at all from cigarettes, alcohol, lottery tickets and even a copy of the Sun on Sunday.\n"Gambling for the weekend" - this means buying books, TV shows and CDs for ten shillings each. "Gambling at night" - you can spend £20, buy a drink and watch a horror film or two, then go home to your mum, who is probably still dying in a nursing home. "Gambling on an off weeke

In [8]:
summaries['gpt2'] = "\n".join(sent_tokenize(pipe_out[0]['generated_text'][len(gpt2_query):]))

# **BART**

In [9]:
pipe = pipeline("summarization", model="facebook/bart-large-cnn")
pipe_out = pipe(sample_text)

config.json:   0%|          | 0.00/1.58k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.63G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/363 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

In [10]:
pipe_out

[{'summary_text': 'Harry Potter star Daniel Radcliffe turns 18 on Monday. He gains access to a reported £20 million ($41.1 million) fortune. Radcliffe says he has no plans to fritter his cash away on fast cars, drink and parties. At 18, Radcliffe will be able to gamble in a casino, buy a drink in a pub or see "Hostel: Part II"'}]

In [11]:
summaries['bart'] = "\n".join(sent_tokenize(pipe_out[0]['summary_text'][len(gpt2_query):]))

# PEGASUS

In [12]:
pipe = pipeline('summarization', model="google/pegasus-cnn_dailymail")
pipe_out = pipe(sample_text)

config.json:   0%|          | 0.00/1.12k [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/2.28G [00:00<?, ?B/s]

Some weights of PegasusForConditionalGeneration were not initialized from the model checkpoint at google/pegasus-cnn_dailymail and are newly initialized: ['model.decoder.embed_positions.weight', 'model.encoder.embed_positions.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


generation_config.json:   0%|          | 0.00/280 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/88.0 [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/1.91M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/65.0 [00:00<?, ?B/s]

In [13]:
pipe_out

[{'summary_text': 'Harry Potter star Daniel Radcliffe gains access to a reported £20 million fortune .<n>The young actor says he has no plans to fritter his cash away .<n>Radcliffe: "I don\'t think I\'ll be particularly extravagant"'}]

In [14]:
summaries["pegasus"] = pipe_out[0]["summary_text"].replace(" .<n>", ".\n").replace("<n>", "\n")


In [15]:
summaries["pegasus"]

'Harry Potter star Daniel Radcliffe gains access to a reported £20 million fortune.\nThe young actor says he has no plans to fritter his cash away.\nRadcliffe: "I don\'t think I\'ll be particularly extravagant"'

# T5

In [16]:
pipe = pipeline('summarization', model="t5-small")
pipe_out = pipe(sample_text)

config.json:   0%|          | 0.00/1.21k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/242M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/2.32k [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.39M [00:00<?, ?B/s]

In [17]:
pipe_out

[{'summary_text': 'the young actor says he has no plans to fritter his cash away on fast cars, drink and celebrity parties . he will be able to gamble in a casino, buy a drink in pub or see horror film "Hostel: Part II"'}]

In [18]:
summaries['t5'] = 'n'.join(sent_tokenize(pipe_out[0]['summary_text']))

## comparing different summaries

In [19]:
print ("GROUND TRUTH")

print (dataset['train'][0]['highlights'])

for model_name in summaries:
  print(model_name.upper())
  print (summaries[model_name])

GROUND TRUTH
Harry Potter star Daniel Radcliffe gets £20M fortune as he turns 18 Monday .
Young actor says he has no plans to fritter his cash away .
Radcliffe's earnings from first five Potter films have been held in trust fund .
BASELINE
LONDON, England (Reuters) -- Harry Potter star Daniel Radcliffe gains access to a reported £20 million ($41.1 million) fortune as he turns 18 on Monday, but he insists the money won't cast a spell on him.
Daniel Radcliffe as Harry Potter in "Harry Potter and the Order of the Phoenix" To the disappointment of gossip columnists around the world, the young actor says he has no plans to fritter his cash away on fast cars, drink and celebrity parties.
"I don't plan to be one of those people who, as soon as they turn 18, suddenly buy themselves a massive sports car collection or something similar," he told an Australian interviewer earlier this month.
GPT2
To be financially secure you need to lose money.
You can sell your house with any of the following op

In [20]:


from evaluate import load

bleu_metric = load("sacrebleu")




Downloading builder script:   0%|          | 0.00/8.15k [00:00<?, ?B/s]

In [27]:
bleu_metric.add_batch(predictions=[summaries['t5']], references=[[dataset['train'][0]['highlights']]])
results = bleu_metric.compute()
results['precision'] = [np.round(p, 2) for p in results['precisions']]
df= pd.DataFrame.from_dict(results, orient = 'index', columns = ['value'])
print(df)

                                                        value
score                                               21.624135
counts                                         [13, 10, 9, 8]
totals                                       [47, 46, 45, 44]
precisions  [27.659574468085108, 21.73913043478261, 20.0, ...
bp                                                        1.0
sys_len                                                    47
ref_len                                                    41
precision                         [27.66, 21.74, 20.0, 18.18]


In [23]:


from evaluate import load


rouge_metric = load("rouge")


Downloading builder script:   0%|          | 0.00/6.27k [00:00<?, ?B/s]

In [24]:
rouge_names = ["rouge1", "rouge2", "rougeL", "rougeLsum"]
reference = dataset['train'][0]['highlights']
records = []
for model_name in summaries:
  rouge_metric.add(prediction= summaries[model_name], reference = reference)
  score = rouge_metric. compute()
  rouge_dict = {rn: score[rn]  for rn in rouge_names}
  print(score)
  records.append(rouge_dict)
  print (len(records))
  print(len(summaries.keys()))
  pd.DataFrame.from_records(records, index= list(summaries.keys())[:len(records)])
  df= pd.DataFrame.from_records(records, index= list(summaries.keys())[:len(records)])
  print(df)





{'rouge1': 0.3354838709677419, 'rouge2': 0.24836601307189543, 'rougeL': 0.29677419354838713, 'rougeLsum': 0.3354838709677419}
1
5
            rouge1    rouge2    rougeL  rougeLsum
baseline  0.335484  0.248366  0.296774   0.335484
{'rouge1': 0.0299625468164794, 'rouge2': 0.0, 'rougeL': 0.02247191011235955, 'rougeLsum': 0.0299625468164794}
2
5
            rouge1    rouge2    rougeL  rougeLsum
baseline  0.335484  0.248366  0.296774   0.335484
gpt2      0.029963  0.000000  0.022472   0.029963
{'rouge1': 0.0, 'rouge2': 0.0, 'rougeL': 0.0, 'rougeLsum': 0.0}
3
5
            rouge1    rouge2    rougeL  rougeLsum
baseline  0.335484  0.248366  0.296774   0.335484
gpt2      0.029963  0.000000  0.022472   0.029963
bart      0.000000  0.000000  0.000000   0.000000
{'rouge1': 0.5066666666666667, 'rouge2': 0.4383561643835617, 'rougeL': 0.5066666666666667, 'rougeLsum': 0.5066666666666667}
4
5
            rouge1    rouge2    rougeL  rougeLsum
baseline  0.335484  0.248366  0.296774   0.335484
gpt2      

Evaluation on the test set of the CNN/DAILYMAIL DATASET


In [None]:
def calculate_metric_on_baseline_test_ds(dataset, metric, column_test='article', column_summary='highlights'):
    summaries = [baseline_summary_three_sent(text) for text in dataset[column_test]]
    metric.add_batch(predictions=summaries, references=dataset[column_summary])
    score = metric.compute()
    return score


In [None]:
test_sampled = dataset['train'].shuffle(seed=42).select(range(1000))
score = calculate_metric_on_baseline_test_ds(
    dataset=test_sampled,
    metric=rouge_metric,
    column_test='article',
    column_summary='highlights'
)

# Process ROUGE scores into a DataFrame
rouge_dict = {rn: score[rn] for rn in rouge_names}
df = pd.DataFrame.from_dict(rouge_dict, orient='index', columns=['baseline']).T
print(df)


In [None]:
 from tqdm import tqdm
import torch

device = 'cuda' if torch.cuda.is_available() else 'cpu'

def generate_batch_sized_chunks(list_of_elements, batch_size):
    for i in range(0, len(list_of_elements), batch_size):
        yield list_of_elements[i:i + batch_size]

def calculate_metric_on_test_ds(dataset, metric, model, tokenizer, batch_size=1, device=device, column_text="article", column_summary="highlights"):
    article_batches = list(generate_batch_sized_chunks(dataset[column_text], batch_size))
    target_batches = list(generate_batch_sized_chunks(dataset[column_summary], batch_size))

    for article_batch, target_batch in tqdm(zip(article_batches, target_batches), total=len(article_batches)):
        inputs = tokenizer(article_batch, max_length=1024, truncation=True, padding="max_length", return_tensors="pt")
        inputs = {key: value.to(device) for key, value in inputs.items()}

        summaries = model.generate(input_ids=inputs['input_ids'].to(device), attention_mask=inputs['attention_mask'].to(device),
                                   length_penalty=0.8, num_beams=8, max_length=128)

        decoded_summaries = [tokenizer.decode(s, skip_special_tokens=True, clean_up_tokenization_spaces=True) for s in summaries]
        decoded_summaries = [d.replace("<n>", " ") for d in decoded_summaries]

        metric.add_batch(predictions=decoded_summaries, references=target_batch)

    score = metric.compute()
    return score


In [None]:
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer
from huggingface_hub import login
login("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")
model_ckpt = "goodle/pegasus-cnn_dailymail"
tokenizer = AutoTokenizer.from_pretrained(model_ckpt)
model_pegasus= AutoModelForSeq2SeqLM.from_pretrained(model_ckpt).to(device)
score= calculate_metric_on_test_ds(test_sampled, rouge_metric, model_pegasus, tokenizer, batch_size)

pd.DataFrame(rouge_dict, index = ['pegasus'])