In [1]:
import pandas as pd

# Importing the CNN Daily Mail Dataset

CNN-DailyMail Dataset:

https://www.kaggle.com/datasets/gowrishankarp/newspaper-text-summarization-cnn-dailymail

Amazon Reviews Dataset:

https://www.kaggle.com/code/currie32/summarizing-text-with-amazon-reviews/notebook

In [2]:
train_df=pd.read_csv('cnn_train.csv')

In [3]:
train_df

Unnamed: 0,id,article,highlights
0,0001d1afc246a7964130f43ae940af6bc6c57f01,By . Associated Press . PUBLISHED: . 14:11 EST...,"Bishop John Folda, of North Dakota, is taking ..."
1,0002095e55fcbd3a2f366d9bf92a95433dc305ef,(CNN) -- Ralph Mata was an internal affairs li...,Criminal complaint: Cop used his role to help ...
2,00027e965c8264c35cc1bc55556db388da82b07f,A drunk driver who killed a young woman in a h...,"Craig Eccleston-Todd, 27, had drunk at least t..."
3,0002c17436637c4fe1837c935c04de47adb18e9a,(CNN) -- With a breezy sweep of his pen Presid...,Nina dos Santos says Europe must be ready to a...
4,0003ad6ef0c37534f80b55b4235108024b407f0b,Fleetwood are the only team still to have a 10...,Fleetwood top of League One after 2-0 win at S...
...,...,...,...
287108,fffdfb56fdf1a12d364562cc2b9b1d4de7481dee,By . James Rush . Former first daughter Chelse...,Chelsea Clinton said question of running for o...
287109,fffeecb8690b85de8c3faed80adbc7a978f9ae2a,An apologetic Vanilla Ice has given his first ...,"Vanilla Ice, 47 - real name Robert Van Winkle ..."
287110,ffff5231e4c71544bc6c97015cdb16c60e42b3f4,America's most lethal sniper claimed he wished...,America's most lethal sniper made comment in i...
287111,ffff924b14a8d82058b6c1c5368ff1113c1632af,"By . Sara Malm . PUBLISHED: . 12:19 EST, 8 Mar...",A swarm of more than one million has crossed b...


# Extractive Text Summarization using TF-IDF Algorithm

In [4]:
import math

from nltk import sent_tokenize, word_tokenize, PorterStemmer
from nltk.corpus import stopwords
import nltk
nltk.download('stopwords')
nltk.download('punkt')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\nanda\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\nanda\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [5]:
def _create_frequency_table(text_string) -> dict:
    """
    we create a dictionary for the word frequency table.
    For this, we should only use the words that are not part of the stopWords array.

    Removing stop words and making frequency table
    Stemmer - an algorithm to bring words to its root word.
    :rtype: dict
    """
    stopWords = set(stopwords.words("english"))
    words = word_tokenize(text_string)
    ps = PorterStemmer()

    freqTable = dict()
    for word in words:
        word = ps.stem(word)
        if word in stopWords:
            continue
        if word in freqTable:
            freqTable[word] += 1
        else:
            freqTable[word] = 1

    return freqTable

In [6]:
def _create_frequency_matrix(sentences):
    frequency_matrix = {}
    stopWords = set(stopwords.words("english"))
    ps = PorterStemmer()

    for sent in sentences:
        freq_table = {}
        words = word_tokenize(sent)
        for word in words:
            word = word.lower()
            word = ps.stem(word)
            if word in stopWords:
                continue

            if word in freq_table:
                freq_table[word] += 1
            else:
                freq_table[word] = 1

        frequency_matrix[sent[:15]] = freq_table

    return frequency_matrix


In [7]:
def _create_tf_matrix(freq_matrix):
    tf_matrix = {}

    for sent, f_table in freq_matrix.items():
        tf_table = {}

        count_words_in_sentence = len(f_table)
        for word, count in f_table.items():
            tf_table[word] = count / count_words_in_sentence

        tf_matrix[sent] = tf_table

    return tf_matrix

In [8]:
def _create_documents_per_words(freq_matrix):
    word_per_doc_table = {}

    for sent, f_table in freq_matrix.items():
        for word, count in f_table.items():
            if word in word_per_doc_table:
                word_per_doc_table[word] += 1
            else:
                word_per_doc_table[word] = 1

    return word_per_doc_table

In [9]:
def _create_idf_matrix(freq_matrix, count_doc_per_words, total_documents):
    idf_matrix = {}

    for sent, f_table in freq_matrix.items():
        idf_table = {}

        for word in f_table.keys():
            idf_table[word] = math.log10(total_documents / float(count_doc_per_words[word]))

        idf_matrix[sent] = idf_table

    return idf_matrix

In [10]:
def _create_tf_idf_matrix(tf_matrix, idf_matrix):
    tf_idf_matrix = {}

    for (sent1, f_table1), (sent2, f_table2) in zip(tf_matrix.items(), idf_matrix.items()):

        tf_idf_table = {}

        for (word1, value1), (word2, value2) in zip(f_table1.items(),
                                                    f_table2.items()):  # here, keys are the same in both the table
            tf_idf_table[word1] = float(value1 * value2)

        tf_idf_matrix[sent1] = tf_idf_table

    return tf_idf_matrix

In [11]:
def _score_sentences(tf_idf_matrix) -> dict:
    """
    score a sentence by its word's TF
    Basic algorithm: adding the TF frequency of every non-stop word in a sentence divided by total no of words in a sentence.
    :rtype: dict
    """

    sentenceValue = {}

    for sent, f_table in tf_idf_matrix.items():
        total_score_per_sentence = 0

        count_words_in_sentence = len(f_table)
        for word, score in f_table.items():
            total_score_per_sentence += score

        sentenceValue[sent] = total_score_per_sentence / count_words_in_sentence

    return sentenceValue

In [12]:
def _find_average_score(sentenceValue) -> int:
    """
    Find the average score from the sentence value dictionary
    :rtype: int
    """
    sumValues = 0
    for entry in sentenceValue:
        sumValues += sentenceValue[entry]

    # Average value of a sentence from original summary_text
    average = (sumValues / len(sentenceValue))

    return average

In [13]:
def _generate_summary(sentences, sentenceValue, threshold):
    sentence_count = 0
    summary = ''

    for sentence in sentences:
        if sentence[:15] in sentenceValue and sentenceValue[sentence[:15]] >= (threshold):
            summary += " " + sentence
            sentence_count += 1

    return summary

In [14]:
def run_summarization(text):
    """
    :param text: Plain summary_text of long article
    :return: summarized summary_text
    """

    '''
    We already have a sentence tokenizer, so we just need 
    to run the sent_tokenize() method to create the array of sentences.
    '''
    # 1 Sentence Tokenize
    sentences = sent_tokenize(text)
    total_documents = len(sentences)
    #print(sentences)

    # 2 Create the Frequency matrix of the words in each sentence.
    freq_matrix = _create_frequency_matrix(sentences)
    #print(freq_matrix)

    '''
    Term frequency (TF) is how often a word appears in a document, divided by how many words are there in a document.
    '''
    # 3 Calculate TermFrequency and generate a matrix
    tf_matrix = _create_tf_matrix(freq_matrix)
    #print(tf_matrix)

    # 4 creating table for documents per words
    count_doc_per_words = _create_documents_per_words(freq_matrix)
    #print(count_doc_per_words)

    '''
    Inverse document frequency (IDF) is how unique or rare a word is.
    '''
    # 5 Calculate IDF and generate a matrix
    idf_matrix = _create_idf_matrix(freq_matrix, count_doc_per_words, total_documents)
    #print(idf_matrix)

    # 6 Calculate TF-IDF and generate a matrix
    tf_idf_matrix = _create_tf_idf_matrix(tf_matrix, idf_matrix)
    #print(tf_idf_matrix)

    # 7 Important Algorithm: score the sentences
    sentence_scores = _score_sentences(tf_idf_matrix)
    #print(sentence_scores)

    # 8 Find the threshold
    threshold = _find_average_score(sentence_scores)
    #print(threshold)

    # 9 Important Algorithm: Generate the summary
    summary = _generate_summary(sentences, sentence_scores,0.6*threshold)
    return summary

In [15]:
run_summarization(train_df['article'][0])

' Associated Press . PUBLISHED: . 14:11 EST, 25 October 2013 . | . UPDATED: . 15:36 EST, 25 October 2013 . The state Health Department has issued an advisory of exposure for anyone who attended five churches and took communion. The diocese announced on Monday that Bishop John Folda is taking time off after being diagnosed with hepatitis A. Symptoms of hepatitis A include fever, tiredness, loss of appetite, nausea and abdominal discomfort.'

In [16]:
train_df['highlights'][0]

'Bishop John Folda, of North Dakota, is taking time off after being diagnosed .\nHe contracted the infection through contaminated food in Italy .\nChurch members in Fargo, Grand Forks and Jamestown could have been exposed .'

In [17]:
train_df['article'][0]

"By . Associated Press . PUBLISHED: . 14:11 EST, 25 October 2013 . | . UPDATED: . 15:36 EST, 25 October 2013 . The bishop of the Fargo Catholic Diocese in North Dakota has exposed potentially hundreds of church members in Fargo, Grand Forks and Jamestown to the hepatitis A virus in late September and early October. The state Health Department has issued an advisory of exposure for anyone who attended five churches and took communion. Bishop John Folda (pictured) of the Fargo Catholic Diocese in North Dakota has exposed potentially hundreds of church members in Fargo, Grand Forks and Jamestown to the hepatitis A . State Immunization Program Manager Molly Howell says the risk is low, but officials feel it's important to alert people to the possible exposure. The diocese announced on Monday that Bishop John Folda is taking time off after being diagnosed with hepatitis A. The diocese says he contracted the infection through contaminated food while attending a conference for newly ordained 

In [18]:
test_summaries=train_df['highlights'][:100]

In [19]:
test_summaries[0]

'Bishop John Folda, of North Dakota, is taking time off after being diagnosed .\nHe contracted the infection through contaminated food in Italy .\nChurch members in Fargo, Grand Forks and Jamestown could have been exposed .'

In [20]:
predicted_summaries=[]
for i in range(100):
    predicted_summaries.append(run_summarization(train_df['article'][i]))

In [21]:
predicted_summaries[0]

' Associated Press . PUBLISHED: . 14:11 EST, 25 October 2013 . | . UPDATED: . 15:36 EST, 25 October 2013 . The state Health Department has issued an advisory of exposure for anyone who attended five churches and took communion. The diocese announced on Monday that Bishop John Folda is taking time off after being diagnosed with hepatitis A. Symptoms of hepatitis A include fever, tiredness, loss of appetite, nausea and abdominal discomfort.'

# Extractive Text Summarization using TextRank Algorithm

In [22]:
import numpy as np
import pandas as pd
import nltk
import re
from nltk.tokenize import sent_tokenize
from nltk.corpus import stopwords
from gensim.models import Word2Vec
from scipy import spatial
import networkx as nx

In [23]:
def TextRank(sentences):
    #preprocessing the sentences
    sentences_clean=[re.sub(r'[^\w\s]','',sentence.lower()) for sentence in sentences]
    #removing the stopwords
    stop_words = stopwords.words('english')
    #tokenizing the words in each sentence after cleaning.
    sentence_tokens=[[words for words in sentence.split(' ') if words not in stop_words] for sentence in sentences_clean]
    #calculating word embeddings for each word of the text.
    w2v=Word2Vec(sentence_tokens,vector_size=1,min_count=1,epochs=1000)
    #replacing word tokens with their embedding in sentence tokens.
    sentence_embeddings=[[w2v.wv[word][0] for word in words] for words in sentence_tokens]
    #max length of a sentence in the existing text
    max_len=max([len(tokens) for tokens in sentence_tokens])
    #padding sentence embedding using 0's to max_len.
    sentence_embeddings=[np.pad(embedding,(0,max_len-len(embedding)),'constant') for embedding in sentence_embeddings]
    #initializing similarity matrix.
    similarity_matrix = np.zeros([len(sentence_tokens), len(sentence_tokens)])
    #calculating the similarity between every two pairs of sentences.
    for i,row_embedding in enumerate(sentence_embeddings):
        for j,column_embedding in enumerate(sentence_embeddings):
            similarity_matrix[i][j]=1-spatial.distance.cosine(row_embedding,column_embedding)
    #converting the similarity matrix to network
    nx_graph = nx.from_numpy_array(similarity_matrix)
    #applying page rank
    scores = nx.pagerank(nx_graph,max_iter=1000)
    #retrieving the top 4 sentences.
    top_sentence={sentence:scores[index] for index,sentence in enumerate(sentences)}
    top=dict(sorted(top_sentence.items(), key=lambda x: x[1], reverse=True)[:4])
    result=''
    for sent in sentences:
        if sent in top.keys():
            result+=sent
    return result

In [25]:
#Tokenizing the sentences and performing the Text Rank algorithm.
predicted_summaries_textrank=[]
for i in range(100):
    sentences=sent_tokenize(train_df['article'][i])
    predicted_summaries_textrank.append(TextRank(sentences))

In [32]:
predicted_summaries_textrank[10]

'The biggest challenge is that it is impossible to portray the reality of the spherical world on a flat map – a problem that has haunted cartographers for centuries.Photo of a genuine hand drawn world map, it was drawn in 1844 and therefore the countries are named as they were in that period.Almost for the first time, the ability to create an accurate map has been placed in the hands of everyone, and it has transformed the way we view the world.Richard Oswald, secretary to the delegation, annotated it with coloured lines to show where it was thought past treaties established the U.S./Canada border .'

In [33]:
test_summaries[10]

'The distortion is the result of the Mercator map which was created in 1596 to help sailors navigate the world .\nIt gives the right shapes of countries but at the cost of distorting sizes in favour of the wealthy lands to the north .\nFor instance, north America looks larger, or at least as big, as Africa, and Greenland also looks of comparable size .\nIn reality, you can fit north America into Africa and still have space for India, Argentina, Tunisia and some left over .\nMap suggests Scandinavian countries are larger than India, whereas in reality India is three times the size .\nThe biggest challenge for cartographers is that it is impossible to portray reality of spherical world on a flat map .'

# Evaluation

In [None]:
from rouge import Rouge

In [34]:
def calc_rouge_scores(pred_summaries, gold_summaries, 

                                 keys=['rouge1', 'rougeL'], use_stemmer=True):

    #Calculate rouge scores
    scorer = rouge_scorer.RougeScorer(keys, use_stemmer= use_stemmer)
    n = len(pred_summaries)
    scores = [scorer.score(pred_summaries[j], gold_summaries[j]) for 
              j in range(n)] 
    dict_scores={}                                                            
    for key in keys:
        dict_scores.update({key: {}})
    for key in keys:
        precision_list = [scores[j][key][0] for j in range(len(scores))]
        recall_list = [scores[j][key][1] for j in range(len(scores))]
        f1_list = [scores[j][key][2] for j in range(len(scores))]
        precision = np.mean(precision_list)
        recall = np.mean(recall_list)
        f1 = np.mean(f1_list)
        dict_results = {'recall': recall, 'precision': precision, 'f1': f1}
        dict_scores[key] = dict_results
    return dict_scores

In [35]:
from rouge_score import rouge_scorer

# Rouge score for TF-IDF Algorithm for CNN Daily Mail Dataset

In [36]:
dict_scores_word_frequency=calc_rouge_scores(predicted_summaries,test_summaries)

In [37]:
dict_scores_word_frequency

{'rouge1': {'recall': 0.09882362037622135,
  'precision': 0.7592652905987959,
  'f1': 0.16921104279217225},
 'rougeL': {'recall': 0.06490876495936519,
  'precision': 0.5082022047646378,
  'f1': 0.11150274482492267}}

# Rouge score for Text Rank Algorithm for CNN Daily Mail Dataset

In [38]:
dict_scores_text_rank=calc_rouge_scores(predicted_summaries_textrank,test_summaries)

In [39]:
dict_scores_text_rank

{'rouge1': {'recall': 0.2258022443394076,
  'precision': 0.3250931570029875,
  'f1': 0.26019769051179276},
 'rougeL': {'recall': 0.12215636087996319,
  'precision': 0.1804242219771743,
  'f1': 0.1426723748129462}}

# Importing Amazon Reviews Dataset

In [40]:
train_amazon_reviews=pd.read_csv("Reviews.csv")

# Rouge score for TF-IDF Algorithm for Amazon Reviews Dataset

In [41]:
predicted_summaries_amazon=[]
for i in range(100):
    predicted_summaries_amazon.append(run_summarization(train_amazon_reviews['Text'][i]))

In [42]:
dict_scores_amazon_word_frequency=calc_rouge_scores(predicted_summaries_amazon,train_amazon_reviews['Text'][:100])

In [43]:
dict_scores_amazon_word_frequency

{'rouge1': {'recall': 1.0,
  'precision': 0.7740426749493853,
  'f1': 0.830507044288203},
 'rougeL': {'recall': 1.0,
  'precision': 0.7740426749493853,
  'f1': 0.830507044288203}}

# Rouge score for Text Rank Algorithm for Amazon Reviews Dataset

In [45]:
predicted_summaries_amazon_textrank=[]
for i in range(100):
    sentences=sent_tokenize(train_amazon_reviews['Text'][i])
    predicted_summaries_amazon_textrank.append(TextRank(sentences))

In [46]:
dict_scores_amazon_text_rank=calc_rouge_scores(predicted_summaries_amazon_textrank,train_amazon_reviews['Text'][:100])
dict_scores_amazon_text_rank

{'rouge1': {'recall': 1.0,
  'precision': 0.8655913978494624,
  'f1': 0.9080882352941176},
 'rougeL': {'recall': 1.0,
  'precision': 0.8655913978494624,
  'f1': 0.9080882352941176}}

In [47]:
predicted_amazon_lstm_df=pd.read_csv('predicted.csv')
original_amazon_lstm_df=pd.read_csv('original.csv')

In [48]:
predicted_amazon_lstm_df['predicted_summary'][0]

' not for me'

In [49]:
original_amazon_lstm_df['original_summary'][0]

'hour '

In [50]:
predicted_summary_amazon_lstm=[]
original_summary_amazon_lstm=[]
for i in range(predicted_amazon_lstm_df.shape[0]):
    predicted_summary_amazon_lstm.append(predicted_amazon_lstm_df['predicted_summary'][i])
    original_summary_amazon_lstm.append(original_amazon_lstm_df['original_summary'][i])

In [51]:
dict_scores_amazon_lstm=calc_rouge_scores(predicted_summary_amazon_lstm,original_summary_amazon_lstm)

In [52]:
dict_scores_amazon_lstm

{'rouge1': {'recall': 0.26208333333333333,
  'precision': 0.17885714285714285,
  'f1': 0.20119047619047617},
 'rougeL': {'recall': 0.26208333333333333,
  'precision': 0.17885714285714285,
  'f1': 0.20119047619047617}}