"https://rubikscode.net/2022/04/25/text-summarization-with-huggingface-transformers/"

In [20]:
!pip install datasets
!pip install transformers
!pip install sentencepiece
!pip install evaluate
!pip install sentence_transformers
!pip install rouge_score
!pip install bert_score

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [47]:
from datasets import load_dataset,get_dataset_split_names,load_from_disk
from transformers import AutoTokenizer
from transformers import DataCollatorForSeq2Seq
import evaluate
import numpy as np
from transformers import pipeline
import os
from transformers import PegasusForConditionalGeneration, PegasusTokenizer
import torch
import pickle
from typing import List
import yaml
import time
from sentence_transformers import SentenceTransformer
from nltk.tokenize import sent_tokenize
from sklearn.metrics.pairwise import cosine_similarity
import nltk
import numpy as np
import pandas as pd

In [None]:
# sent_tokenize needs the Punkt sentence-tokenizer data. NLTK 3.9 and later look up
# the 'punkt_tab' resource instead of 'punkt', so request both.
# NOTE(review): on older NLTK releases the 'punkt_tab' request is expected to just
# report that the package is unknown and carry on — confirm on the target version.
for resource in ('punkt', 'punkt_tab'):
  nltk.download(resource)

In [22]:
# Load every split of CNN/DailyMail (config '3.0.0'); later cells index dataset['validation'].
# The third positional parameter of load_dataset is `data_dir`, not `split`, so the
# former 'validation' argument did not select a split and has been removed.
dataset = load_dataset('cnn_dailymail', '3.0.0')



  0%|          | 0/3 [00:00<?, ?it/s]

In [24]:
def loadModel(model_path):
  """Build a ``SentenceTransformer`` from ``model_path`` and return it.

  ``model_path`` is handed to the ``SentenceTransformer`` constructor unchanged.
  """
  sbert_model = SentenceTransformer(model_path)
  return sbert_model

def getSentences(text):
  """Split ``text`` into sentences with ``sent_tokenize`` and return the result."""
  sentences = sent_tokenize(text)
  return sentences

def getEmbeddings(model, sentences):
  """Encode ``sentences`` with ``model`` and return whatever ``model.encode`` produces."""
  return model.encode(sentences)

def getSimilarityMatrix(generated_embs, reference_embs):
  """Return ``cosine_similarity(generated_embs, reference_embs)``.

  The generated embeddings are passed as the first argument and the reference
  embeddings as the second.
  """
  similarity = cosine_similarity(generated_embs, reference_embs)
  return similarity

def getScore(sim_mat):
  """Reduce a similarity matrix to a single score.

  The maximum is taken down axis 0 (one value per column) and those maxima are
  averaged. As the matrix is built in ``calcSBERTScore``, the generated-sentence
  embeddings are the first argument and the reference-sentence embeddings the
  second, so each column presumably corresponds to one reference sentence.
  """
  # Best match per column: collapse axis 0.
  column_best = np.amax(sim_mat, axis=0)

  # Average of the per-column maxima.
  return column_best.mean()

def calcSBERTScore(model, generated_texts, reference_texts, debug = False):
  """Average sentence-level similarity score over paired texts.

  For every (generated, reference) pair, both texts are split into sentences
  with ``getSentences``, the sentences are embedded with ``getEmbeddings``,
  a pairwise similarity matrix is built with ``getSimilarityMatrix`` and
  reduced to one number by ``getScore``. The per-pair scores are averaged.

  Args:
    model: model object forwarded to ``getEmbeddings``.
    generated_texts: sequence of generated texts.
    reference_texts: sequence of reference texts, aligned one-to-one with
      ``generated_texts``.
    debug: when True, print the shape and content of each similarity matrix.

  Returns:
    The mean of the per-pair scores. ``0.0`` is returned when there are no
    pairs, and a pair in which either text yields no sentences contributes
    a score of 0.

  Raises:
    ValueError: if the two sequences do not have the same length.
  """
  if len(generated_texts) != len(reference_texts):
    raise ValueError(
        "generated_texts and reference_texts must have the same length, got "
        + str(len(generated_texts)) + " and " + str(len(reference_texts)))

  # Nothing to score; avoids dividing by zero below.
  if len(generated_texts) == 0:
    return 0.0

  score_final = 0

  for generated_text, reference_text in zip(generated_texts, reference_texts):

    # get sentences
    generated_sents = getSentences(generated_text)
    reference_sents = getSentences(reference_text)

    # An empty side has nothing to compare; count the pair as 0 rather than
    # failing inside the similarity computation.
    if not generated_sents or not reference_sents:
      continue

    # get embeddings
    generated_embs = getEmbeddings(model, generated_sents)
    reference_embs = getEmbeddings(model, reference_sents)

    # calculate pairwise cosine similarity
    sim_mat = getSimilarityMatrix(generated_embs, reference_embs)

    if debug:
      print("sim mat shape", sim_mat.shape)
      print(sim_mat)

    # get score
    score_final = score_final + getScore(sim_mat)

  return score_final/len(generated_texts)

In [25]:
# `pipeline` is already imported in the notebook's import cell, so it is not re-imported here.
SUMMARIZATION_MODEL = "google/pegasus-xsum"
# NOTE(review): the checkpoint name suggests XSum fine-tuning while the data used below
# is CNN/DailyMail — confirm this pairing is intended.
summarizer = pipeline("summarization", model=SUMMARIZATION_MODEL)

In [41]:
# Number of validation examples to summarise and score.
NUM_SAMPLES = 50

# Slice the split first so only the first NUM_SAMPLES rows are materialised,
# instead of converting the whole column to a list and then slicing it.
validation_head = dataset['validation'][:NUM_SAMPLES]
inputs = list(validation_head['article'])
ground_truths = list(validation_head['highlights'])
predicted_summary = []

In [42]:
# Start from an empty list every time so re-running this cell does not append
# a second batch of summaries to the one produced by a previous run.
predicted_summary = []
for idx, article in enumerate(inputs):
  # Coarse progress indicator: print the index every 20 articles.
  if idx % 20 == 0:
    print(idx)
  # truncation=True makes the pipeline's tokenizer cut the article at the model's
  # maximum input length (in tokens). The earlier `[:512]` slice cut at 512
  # characters, which discards most of the article before tokenisation.
  predicted_summary.append(summarizer(article, truncation=True)[0]['summary_text'])

0


In [49]:
# Put the generated and ground-truth summaries side by side and save them as CSV.
results_dataframe_summarization_CNN = pd.DataFrame({
    'predicted_summary': predicted_summary,
    'groundtruth_summary': ground_truths,
})
results_dataframe_summarization_CNN.to_csv('results_dataframe_summarization_CNN.csv')

In [46]:
def _timed(fn, *args, **kwargs):
  """Call ``fn(*args, **kwargs)`` and return ``(result, elapsed_seconds)``.

  Elapsed time is measured with ``time.perf_counter``, a monotonic clock meant
  for measuring short durations.
  """
  start = time.perf_counter()
  result = fn(*args, **kwargs)
  return result, time.perf_counter() - start


rouge = evaluate.load('rouge')
bertscore = evaluate.load("bertscore")
modelsbert_path = 'paraphrase-MiniLM-L6-v2'

# Measure time for results_rouge
results_rouge, results_rouge_time = _timed(
    rouge.compute, predictions=predicted_summary, references=ground_truths)

# Measure time for results_bertscore
# NOTE(review): the SBERT model below is loaded outside its timed region; if
# bertscore.compute loads its model on first use, the two timings are not
# directly comparable — confirm.
results_bertscore, results_bertscore_time = _timed(
    bertscore.compute, predictions=predicted_summary, references=ground_truths, lang="en")

# Measure time for results_sBERT (model loading is deliberately not timed)
modelsbert = loadModel(modelsbert_path)
results_sBERT, results_sBERT_time = _timed(
    calcSBERTScore, modelsbert, predicted_summary, ground_truths)

# Print the time taken for each prediction
print("Time taken for results_rouge: ", results_rouge_time, " seconds")
print("Time taken for results_bertscore: ", results_bertscore_time, " seconds")
print("Time taken for results_sBERT: ", results_sBERT_time, " seconds")

# Each entry is [metric output, seconds taken to compute it].
results = {
    'results_rouge':[results_rouge,results_rouge_time],
    'results_bertscore':[results_bertscore,results_bertscore_time],
    'results_sBERT':[results_sBERT,results_sBERT_time]
}


Time taken for results_rouge:  0.23982477188110352  seconds
Time taken for results_bertscore:  6.601160526275635  seconds
Time taken for results_sBERT:  0.47241878509521484  seconds


In [None]:
# Serialise the `results` dict (metric outputs and timings) to the file 'temp'.
with open('temp', "wb") as results_file:
    pickle.dump(results, results_file)