In [1]:
import numpy as np
import pandas as pd

import os
# Walk the Kaggle input directory and print the full path of every file found.
input_paths = (
    os.path.join(root, name)
    for root, _, names in os.walk('/kaggle/input')
    for name in names
)
for path in input_paths:
    print(path)

/kaggle/input/newspaper-text-summarization-cnn-dailymail/cnn_dailymail/validation.csv
/kaggle/input/newspaper-text-summarization-cnn-dailymail/cnn_dailymail/train.csv
/kaggle/input/newspaper-text-summarization-cnn-dailymail/cnn_dailymail/test.csv


In [2]:
# Folder holding the three CSV splits; defined once so the location is easy to change.
DATA_DIR = "/kaggle/input/newspaper-text-summarization-cnn-dailymail/cnn_dailymail"

# Load the train / test / validation splits into separate frames.
df_train = pd.read_csv(f"{DATA_DIR}/train.csv")
df_test = pd.read_csv(f"{DATA_DIR}/test.csv")
df_validation = pd.read_csv(f"{DATA_DIR}/validation.csv")

In [3]:
# Peek at the first rows of the training split (columns: id, article, highlights).
df_train.head()

Unnamed: 0,id,article,highlights
0,0001d1afc246a7964130f43ae940af6bc6c57f01,By . Associated Press . PUBLISHED: . 14:11 EST...,"Bishop John Folda, of North Dakota, is taking ..."
1,0002095e55fcbd3a2f366d9bf92a95433dc305ef,(CNN) -- Ralph Mata was an internal affairs li...,Criminal complaint: Cop used his role to help ...
2,00027e965c8264c35cc1bc55556db388da82b07f,A drunk driver who killed a young woman in a h...,"Craig Eccleston-Todd, 27, had drunk at least t..."
3,0002c17436637c4fe1837c935c04de47adb18e9a,(CNN) -- With a breezy sweep of his pen Presid...,Nina dos Santos says Europe must be ready to a...
4,0003ad6ef0c37534f80b55b4235108024b407f0b,Fleetwood are the only team still to have a 10...,Fleetwood top of League One after 2-0 win at S...


In [4]:
# Column dtypes, non-null counts and memory footprint; 'deep' requests deep
# introspection of the object (string) columns for the memory figure.
df_train.info(memory_usage='deep')

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 287113 entries, 0 to 287112
Data columns (total 3 columns):
 #   Column      Non-Null Count   Dtype 
---  ------      --------------   ----- 
 0   id          287113 non-null  object
 1   article     287113 non-null  object
 2   highlights  287113 non-null  object
dtypes: object(3)
memory usage: 1.7 GB


In [5]:
# Reference summary ("highlights") of the first training example.
df_train.iloc[0].highlights

'Bishop John Folda, of North Dakota, is taking time off after being diagnosed .\nHe contracted the infection through contaminated food in Italy .\nChurch members in Fargo, Grand Forks and Jamestown could have been exposed .'

In [6]:
# Full article text of the first training example.
df_train.iloc[0].article

"By . Associated Press . PUBLISHED: . 14:11 EST, 25 October 2013 . | . UPDATED: . 15:36 EST, 25 October 2013 . The bishop of the Fargo Catholic Diocese in North Dakota has exposed potentially hundreds of church members in Fargo, Grand Forks and Jamestown to the hepatitis A virus in late September and early October. The state Health Department has issued an advisory of exposure for anyone who attended five churches and took communion. Bishop John Folda (pictured) of the Fargo Catholic Diocese in North Dakota has exposed potentially hundreds of church members in Fargo, Grand Forks and Jamestown to the hepatitis A . State Immunization Program Manager Molly Howell says the risk is low, but officials feel it's important to alert people to the possible exposure. The diocese announced on Monday that Bishop John Folda is taking time off after being diagnosed with hepatitis A. The diocese says he contracted the infection through contaminated food while attending a conference for newly ordained 

## Extractive Methods

All extractive methods follow these three basic steps:
1. Create an intermediate representation of the text.
2. Score the sentences/phrases based on the chosen representation.
3. Rank and choose sentences to create a summary of the text.

## Data Preprocessing

In [7]:
! pip install textacy

Collecting textacy
  Downloading textacy-0.13.0-py3-none-any.whl (210 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m210.7/210.7 kB[0m [31m8.3 MB/s[0m eta [36m0:00:00[0m
Collecting floret~=0.10.0 (from textacy)
  Obtaining dependency information for floret~=0.10.0 from https://files.pythonhosted.org/packages/16/ee/388a5c76c9292f4bef85d7ef895005bb39a0899f8004e9daceb57b2bb0c9/floret-0.10.5-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata
  Downloading floret-0.10.5-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (3.1 kB)
Collecting jellyfish>=0.8.0 (from textacy)
  Obtaining dependency information for jellyfish>=0.8.0 from https://files.pythonhosted.org/packages/26/87/8d31224804af9dfa7b34657e083b67b24b322c41dd9464b52218c1a33890/jellyfish-1.0.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata
  Downloading jellyfish-1.0.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (2.5 kB)
Collectin

In [8]:
from textacy import preprocessing as tprep
from spacy.lang.en.stop_words import STOP_WORDS
import re
from tqdm.autonotebook import tqdm

# Register `progress_apply` on pandas objects so long applies show a progress bar.
tqdm.pandas()

# Text-cleaning pipeline used for both articles and highlights.
# The callables are listed in the order they are handed to make_pipeline.
process = tprep.make_pipeline(
    # --- replace.* steps ---
    tprep.replace.emails,
    tprep.replace.emojis,
    tprep.replace.urls,
    tprep.replace.phone_numbers,
    tprep.replace.hashtags,
    tprep.replace.currency_symbols,
    # --- flatten line breaks into single spaces ---
    lambda text: text.replace("\n", " "),
    # --- remove.* steps ---
    # NOTE: remove.punctuation is deliberately not applied; presumably the
    # sentence splitting done later needs the punctuation -- confirm before enabling.
    tprep.remove.html_tags,
    tprep.remove.brackets,
    # --- normalize.* steps ---
    tprep.normalize.hyphenated_words,
    tprep.normalize.quotation_marks,
    tprep.normalize.unicode,
    tprep.normalize.bullet_points,
    tprep.normalize.whitespace,
)

  from tqdm.autonotebook import tqdm


In [10]:
def sample_df(df, frac=0.1, random_state=42):
    """Return a reproducible random subset of ``df`` with a fresh 0..n-1 index.

    Args:
        df: DataFrame to sample rows from.
        frac: Fraction of rows to keep (passed to ``DataFrame.sample``).
        random_state: Seed for the sampling; defaults to 42 so existing
            calls keep returning exactly the same rows.

    Returns:
        A new DataFrame containing the sampled rows, re-indexed from 0.
    """
    return df.sample(frac=frac, random_state=random_state).reset_index(drop=True)

def preprocess(df):
    """Clean the ``article`` and ``highlights`` columns with the ``process`` pipeline.

    The input frame is left untouched; a new DataFrame with the two cleaned
    columns is returned, so re-running the cell never cleans already-cleaned
    text or alters a frame that was displayed earlier.

    Args:
        df: DataFrame with string columns ``article`` and ``highlights``.

    Returns:
        A new DataFrame with both columns passed through ``process``.
    """
    # progress_apply is made available by the earlier tqdm.pandas() call.
    return df.assign(
        article=df['article'].progress_apply(process),
        highlights=df['highlights'].progress_apply(process),
    )

# Working subsets: 1% of the training split and 10% of the test split,
# each sampled and then run through the cleaning pipeline.
ds = preprocess(sample_df(df_train, 0.01))
ds_test = preprocess(sample_df(df_test, 0.1))

  0%|          | 0/2871 [00:00<?, ?it/s]

  0%|          | 0/2871 [00:00<?, ?it/s]

  0%|          | 0/1149 [00:00<?, ?it/s]

  0%|          | 0/1149 [00:00<?, ?it/s]

## Summarizing Text Using Topic Representation

The simplest approach would be to identify important sentences based on an aggregate of the TF-IDF values of the words in that sentence. We will apply the TF-IDF vectorization and then aggregate the values to a sentence level. We can generate a score for each sentence as a sum of the TF-IDF values for each word in that sentence. This would mean that a sentence with a high score contains many important words as compared to other sentences in the article

In [11]:
# Take the first sampled example as the running demo: its article and its reference highlights.
article0, highlights0 = ds.loc[0, ['article', 'highlights']]
article0

"By . Mia De Graaf . Britons flocked to beaches across the southern coast yesterday as millions look set to bask in glorious sunshine today. Temperatures soared to 17C in Brighton and Dorset, with people starting their long weekend in deck chairs by the sea. Figures from Asda suggest the unexpected sunshine has also inspired a wave of impromptu barbecues, with sales of sausages and equipment expected to triple those in April. Sun's out: Brighton beach was packed with Britons enjoying the unexpected sunshine to start the long weekend as temperatures hit 17C across the south coast . Although frost is set to hit the south tonight - with temperatures dropping to 1C - Britons stocking up for a barbecue will be in luck tomorrow, with forecasters predicting dry and sunny weather across southern England, southern Wales and the south Midlands. In Weymouth, Dorset, the sun came out in time for the town's annual kite festival, held on the beach. But the good weather has not been enjoyed by all as

In [12]:
# Reference highlights for the same example (used later as the gold standard).
highlights0

'People enjoyed temperatures of 17C at Brighton beach in West Sussex and Weymouth in Dorset . Asda claims it will sell a million sausages over long weekend despite night temperatures dropping to minus 1C . But the good weather has not been enjoyed by all as the north west and Scotland have seen heavy rain .'

In [13]:
from sklearn.feature_extraction.text import TfidfVectorizer
from nltk import tokenize


def tfidf_summary(text, num_summary_sentence):
    """Pick the sentences of ``text`` with the highest summed TF-IDF weight.

    Each sentence is treated as one document for the TF-IDF vectorizer; a
    sentence's score is the sum of the TF-IDF values of its terms.

    Args:
        text: The document to summarize.
        num_summary_sentence: Maximum number of sentences to return.

    Returns:
        A list of at most ``num_summary_sentence`` sentences, in the order
        they appear in ``text``. Empty when ``text`` contains no sentences
        or ``num_summary_sentence`` is not positive.
    """
    sentences = tokenize.sent_tokenize(text)
    if not sentences or num_summary_sentence <= 0:
        return []

    words_tfidf = TfidfVectorizer().fit_transform(sentences)

    # Summing a sparse matrix over axis=1 yields an (n, 1) matrix; flatten it
    # to a plain 1-D array so the ranking and indexing below are unambiguous.
    sentence_scores = np.asarray(words_tfidf.sum(axis=1)).ravel()

    # Indices of the best-scoring sentences, highest score first.
    ranked = np.argsort(sentence_scores)[::-1]
    top_indices = set(ranked[:num_summary_sentence].tolist())

    # Emit the selected sentences in document order rather than score order.
    return [sentence for i, sentence in enumerate(sentences) if i in top_indices]

In [14]:
# Three highest-scoring sentences of the sample article, shown in article order.
tfidf_summary(article0, 3)

['Figures from Asda suggest the unexpected sunshine has also inspired a wave of impromptu barbecues, with sales of sausages and equipment expected to triple those in April.',
 'Although frost is set to hit the south tonight - with temperatures dropping to 1C - Britons stocking up for a barbecue will be in luck tomorrow, with forecasters predicting dry and sunny weather across southern England, southern Wales and the south Midlands.',
 'Activity: The sun came out in time for the opening day of Weymouth Kite Festival, which sees people from all over come to show off creative kites .']

Some information about the temperature made it into the top 3. Not bad for such a simple method, although we are limited to sentences taken verbatim from the article. This method won't work well in all cases, but it is a fast way to get started.

## LSA Algorithm

One of the modern methods used in extractive-based summarization is latent semantic analysis (LSA). LSA is a general-purpose method that is used for topic modeling, document similarity, and other tasks. LSA assumes that words that are close in meaning will occur in the same documents. In the LSA algorithm, we first represent the entire article in the form of a sentence-term matrix.

We will be using the sumy package for this, but gensim also provides `LsiModel`, which can be used as well.

In [15]:
! pip install sumy

Collecting sumy
  Downloading sumy-0.11.0-py2.py3-none-any.whl (97 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m97.3/97.3 kB[0m [31m5.5 MB/s[0m eta [36m0:00:00[0m
Collecting breadability>=0.1.20 (from sumy)
  Downloading breadability-0.1.20.tar.gz (32 kB)
  Preparing metadata (setup.py) ... [?25ldone
Collecting pycountry>=18.2.23 (from sumy)
  Obtaining dependency information for pycountry>=18.2.23 from https://files.pythonhosted.org/packages/48/12/fdbcd29b5a243af2f1c1a83636a21e3837aeaa070c9212ebe657e39ce563/pycountry-23.12.11-py3-none-any.whl.metadata
  Downloading pycountry-23.12.11-py3-none-any.whl.metadata (12 kB)
Collecting chardet (from breadability>=0.1.20->sumy)
  Obtaining dependency information for chardet from https://files.pythonhosted.org/packages/38/6f/f5fbc992a329ee4e0f288c1fe0e2ad9485ed064cac731ed2fe47dcc38cbf/chardet-5.2.0-py3-none-any.whl.metadata
  Downloading chardet-5.2.0-py3-none-any.whl.metadata (3.4 kB)
Downloading pycountry-23.12.11

In [16]:
from sumy.parsers.plaintext import PlaintextParser
from sumy.nlp.tokenizers import Tokenizer
from sumy.nlp.stemmers import Stemmer
from sumy.utils import get_stop_words
from sumy.summarizers.lsa import LsaSummarizer


def lsa_summary(text, num_summary_sentence, language='english'):
    """Summarize ``text`` with sumy's ``LsaSummarizer``.

    Args:
        text: The document to summarize.
        num_summary_sentence: Number of sentences requested from the summarizer.
        language: Language name handed to sumy's stemmer and tokenizer, and
            used to choose the stop-word list.

    Returns:
        The sentences selected by the summarizer, converted to ``str``.
    """
    stemmer = Stemmer(language)
    parser = PlaintextParser.from_string(text, Tokenizer(language))

    summarizer = LsaSummarizer(stemmer)
    # STOP_WORDS is spaCy's English list, so it is only appropriate for
    # English text; any other language uses sumy's own list for that language.
    if language.lower() == 'english':
        summarizer.stop_words = STOP_WORDS
    else:
        summarizer.stop_words = get_stop_words(language)

    return [
        str(sentence)
        for sentence in summarizer(parser.document, num_summary_sentence)
    ]

In [17]:
# Three-sentence LSA summary of the same sample article.
lsa_summary(article0, 3)

['Figures from Asda suggest the unexpected sunshine has also inspired a wave of impromptu barbecues, with sales of sausages and equipment expected to triple those in April.',
 "Sun's out: Brighton beach was packed with Britons enjoying the unexpected sunshine to start the long weekend as temperatures hit 17C across the south coast .",
 'However, the north east enjoyed a bright spell at midday today with sun shining in Harrogate and York ahead of the rainy weekend.']

## Summarizing Text Using an Indicator Representation

Indicator representation methods aim to create the intermediate representation of a sentence by using features of the sentence and its relationship to others in the document rather than using only the words in the sentence. TextRank is one of the most popular examples of an indicator-based method

In [18]:
from sumy.summarizers.text_rank import TextRankSummarizer


def textrank_summary(text, num_summary_sentence, language='english'):
    """Summarize ``text`` with sumy's ``TextRankSummarizer``.

    Args:
        text: The document to summarize.
        num_summary_sentence: Number of sentences requested from the summarizer.
        language: Language name handed to sumy's stemmer and tokenizer, and
            used to choose the stop-word list.

    Returns:
        The sentences selected by the summarizer, converted to ``str``.
    """
    stemmer = Stemmer(language)
    parser = PlaintextParser.from_string(text, Tokenizer(language))

    summarizer = TextRankSummarizer(stemmer)
    # STOP_WORDS is spaCy's English list, so it is only appropriate for
    # English text; any other language uses sumy's own list for that language.
    if language.lower() == 'english':
        summarizer.stop_words = STOP_WORDS
    else:
        summarizer.stop_words = get_stop_words(language)

    return [
        str(sentence)
        for sentence in summarizer(parser.document, num_summary_sentence)
    ]

In [19]:
# Three-sentence TextRank summary of the same sample article.
textrank_summary(article0, 3)

['Although frost is set to hit the south tonight - with temperatures dropping to 1C - Britons stocking up for a barbecue will be in luck tomorrow, with forecasters predicting dry and sunny weather across southern England, southern Wales and the south Midlands.',
 'Three day forecast: Tomorrow sunny spells and warm weather is set to cover southern England, but the heavy rain up north is set to stay .',
 'Though rain poured down in the north west, the north east enjoyed a bright spell at midday today with sun shining in Harrogate ahead of the rainy weekend .']

It looks like this summary focuses on the temperature changes, which makes it the best of the three so far.

## Measuring the Performance of Text Summarization Methods

We have seen methods that produce summaries of some given text. Each summary differs from the other in subtle ways, and we have to rely on our subjective evaluation. This is certainly a challenge in selecting a method that works best for a given use case. Let's have a look at commonly used
accuracy metrics and see how they can be used to empirically select the best method for summarization. We must understand that to automatically evaluate the summary of some given text, there must be a reference summary that it can be compared with. Typically, this is a summary written by a human and is referred to as the gold standard. Every automatically generated summary can be compared with the gold standard to get an accuracy measure. This also gives us the opportunity to easily compare multiple methods and choose the best one. However, we will often run into the issue that a
human-generated summary may not exist for every use case. In such situations, we can choose a proxy measure to be considered as the gold standard. An example in the case of a news article would be the headline. While it is written by a human, it is a poor proxy as it can be quite short and is not an accurate summary but more of a leading statement to draw users. While this may not give us the best results, it is still useful to compare the performance of different summarization methods.

Recall-Oriented Understudy for Gisting Evaluation (ROUGE) is one of the most commonly used methods to measure the accuracy of a summary. There are several types of ROUGE metrics, but the basic idea is simple. It arrives at the measure of accuracy by comparing the number of shared terms between the automatically generated summary and the gold standard. ROUGE-N is a metric that measures the number of common n-grams (ROUGE-1 compares individual words, ROUGE-2 compares bigrams, and so on)

In [20]:
! pip install rouge-score

Collecting rouge-score
  Downloading rouge_score-0.1.2.tar.gz (17 kB)
  Preparing metadata (setup.py) ... [?25ldone
Building wheels for collected packages: rouge-score
  Building wheel for rouge-score (setup.py) ... [?25ldone
[?25h  Created wheel for rouge-score: filename=rouge_score-0.1.2-py3-none-any.whl size=24932 sha256=8778c864e43811dc4698f03d8ff81aa1ba3de353d540c454efdd926ea9cfe2e8
  Stored in directory: /root/.cache/pip/wheels/5f/dd/89/461065a73be61a532ff8599a28e9beef17985c9e9c31e541b4
Successfully built rouge-score
Installing collected packages: rouge-score
Successfully installed rouge-score-0.1.2


In [21]:
import rouge_score as rouge
from rouge_score.rouge_scorer import RougeScorer


def print_rouge_score(rouge_scores):
    """Print one line per ROUGE metric with precision, recall and F-measure.

    Args:
        rouge_scores: Mapping of metric name to an object exposing
            ``precision``, ``recall`` and ``fmeasure`` attributes.
    """
    for metric, score in rouge_scores.items():
        line = (
            f"{metric} "
            f"Precision: {score.precision:.2f} "
            f"Recall: {score.recall:.2f} "
            f"fmeasure: {score.fmeasure:.2f}"
        )
        print(line)

In [22]:
# Score each extractive method's 3-sentence summary of the sample article
# against its reference highlights.
gold_standard = highlights0
scorer = RougeScorer(['rouge1', 'rouge2', 'rougeL'], use_stemmer=True)

# (label, summarizer) pairs, in the order the results are reported.
summarizers = [
    ('TextRank', textrank_summary),
    ('LSA', lsa_summary),
    ('TF-IDF', tfidf_summary),
]

for label, summarize in summarizers:
    # Join with a space so the last word of one sentence can never fuse with
    # the first word of the next when a sentence lacks closing punctuation.
    summary = ' '.join(summarize(article0, 3))
    scores = scorer.score(gold_standard, summary)
    print(f'\n{label} Results: \n')
    print_rouge_score(scores)


TextRank Results: 

rouge1 Precision: 0.23 Recall: 0.42 fmeasure: 0.30
rouge2 Precision: 0.06 Recall: 0.12 fmeasure: 0.08
rougeL Precision: 0.10 Recall: 0.17 fmeasure: 0.12

LSA Results: 

rouge1 Precision: 0.30 Recall: 0.43 fmeasure: 0.36
rouge2 Precision: 0.04 Recall: 0.06 fmeasure: 0.05
rougeL Precision: 0.13 Recall: 0.19 fmeasure: 0.16

TF-IDF Results: 

rouge1 Precision: 0.22 Recall: 0.40 fmeasure: 0.28
rouge2 Precision: 0.02 Recall: 0.04 fmeasure: 0.03
rougeL Precision: 0.09 Recall: 0.17 fmeasure: 0.12


The original ROUGE paper compared how many of the words that appear in the gold standard also appear in the automatically generated summary. So if most of the words present in the gold standard were also present in the generated summary, we would achieve a high score. However, this metric alone does not tell the whole story. Consider that we generate a verbose summary that is long but includes most of the words in the gold standard. This summary
would have a high score, but it would not be a good summary since it doesn’t provide a concise representation. This is why the ROUGE measure has been extended to compare the number of shared words to the total number of words in the generated summary as well. This indicates the precision: the number of words in the generated summary that are actually useful. We can combine these measures to generate the F-score.