# **BASELINE 1: TextRank**

TextRank is an algorithm based on PageRank.
The top-ranked (most important) sentence of each text is extracted using this algorithm.

Main idea: Fetch the most relevant sentences from the text.

In [None]:
#import all the libraries
import pandas as pd
import re
import nltk
nltk.download('punkt') #one time download
import numpy as np
from nltk import sent_tokenize, word_tokenize
from nltk.cluster.util import cosine_distance

# Matches any run of one or more whitespace characters; used by normalize_whitespace.
MULTIPLE_WHITESPACE_PATTERN = re.compile(r"\s+", re.UNICODE)

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


**STEP 1: LOAD THE DATA**

The datasets come from Kaggle, but for ease of use we published them on the web so that they can be loaded on any system even if the Kaggle link changes. The URLs are given in the cell below that reads the files.

```
# This is formatted as code
```



In [None]:
#DATASET
# Both CSV files are published to the web for ease of access.
NEWS_SUMMARY_URL = "https://docs.google.com/spreadsheets/d/e/2PACX-1vRwR2KEtXYWDz9_dMrdQZv7Sa-8Vr0pTiRdedvX2A8CY_vuoHUGoarfaFV179puVPbbmKvYaa5ghgh4/pub?gid=1041903722&single=true&output=csv"
NEWS_SUMMARY_MORE_URL = 'https://docs.google.com/spreadsheets/d/e/2PACX-1vSFbcrx8oO-z-s-n5_HvlE15BggLWMTX85L2J79FpQ2EvzQgxdaSI69xJolaJP6fbYJDhemQxzRuv23/pub?gid=903434167&single=true&output=csv'

# news_summary csv file
summary = pd.read_csv(NEWS_SUMMARY_URL, encoding='iso-8859-1')
# news_summary_more csv file
raw = pd.read_csv(NEWS_SUMMARY_MORE_URL, encoding='iso-8859-1')

Since we have two data files, they are combined. The 'text' and 'headlines' columns of both files are concatenated into a third dataset (`df`), on which the algorithm will run; the headline serves as the reference summary.

In [None]:
#combining the data from both files
pre1 = raw.iloc[:, 0:2].copy()
pre2 = summary.iloc[:, 0:6].copy()

# To increase the intake of possible text values to build a reliable model,
# a new 'text' column is created in which all the columns except 'headlines'
# are concatenated (author, date, read_more, text, ctext - in that order).
# na_rep='' keeps a row usable when one of its columns is missing; without it
# str.cat turns the whole concatenated value into NaN for that row.
pre2['text'] = pre2['author'].str.cat(
    pre2[['date', 'read_more', 'text', 'ctext']], sep=' ', na_rep='')

# new table called df: 'article_text' and 'given_summary' are acquired from
# pre1 and pre2 (pre1 rows first, then pre2 rows, with a fresh 0..n-1 index)
df = pd.DataFrame()
df['article_text'] = pd.concat([pre1['text'], pre2['text']], ignore_index=True)
df['given_summary'] = pd.concat([pre1['headlines'], pre2['headlines']],
                           ignore_index=True)
# filled in later with the sentence chosen by TextRank
df['predicted_summary']=""

**STEP 2: CLEAN THE DATA**

In [None]:
def normalize_whitespace(text):
    """Collapse every whitespace run in ``text`` to a single character.

    Each match of MULTIPLE_WHITESPACE_PATTERN is replaced by whatever
    _replace_whitespace returns for it.
    """
    collapsed = MULTIPLE_WHITESPACE_PATTERN.sub(_replace_whitespace, text)
    return collapsed

def _replace_whitespace(match):
    text = match.group()
    if "\n" in text or "\r" in text:
        return "\n"
    else:
        return " "

def is_blank(string):
    """Return a truthy value when ``string`` is falsy (None, "") or only whitespace."""
    if not string:
        return True
    return string.isspace()

**STEP 3: BUILD A SIMILARITY MATRIX**

This algorithm uses cosine similarity to measure how similar two sentences are (computed as 1 minus the cosine distance between their word-count vectors).
We compute this similarity for every pair of sentences and store the results in a matrix.

In [None]:
def get_symmetric_matrix(matrix):
    """Return ``matrix + matrix.T`` with the diagonal counted only once.

    For a triangular input this mirrors the filled triangle onto the empty one.
    """
    diagonal_only = np.diag(matrix.diagonal())
    mirrored_sum = matrix + matrix.T
    return mirrored_sum - diagonal_only

def core_cosine_similarity(vector1, vector2):
    """Return the cosine similarity of two vectors, i.e. 1 minus their cosine_distance."""
    distance = cosine_distance(vector1, vector2)
    return 1 - distance


class TextRank():
    """Extractive summarizer that ranks the sentences of a text with a PageRank-style iteration.

    Usage: call ``analyze(text)`` once, then ``get_top_sentences(n)`` to read
    the ``n`` highest-ranked sentences.
    """

    def __init__(self):
        self.damping = 0.85  # damping coefficient, usually is .85
        self.min_diff = 1e-5  # convergence threshold
        self.steps = 100  # iteration steps
        self.text_str = None  # raw text given to the last analyze() call
        self.sentences = None  # sentences of text_str, as split by sent_tokenize
        self.pr_vector = None  # one rank score per sentence, set by analyze()

    def _sentence_similarity(self, sent1, sent2, stopwords=None):
        """Return the cosine similarity of two tokenized sentences.

        Both sentences are lower-cased and turned into word-count vectors over
        their joint vocabulary; words contained in ``stopwords`` are not counted.

        Args:
            sent1: list of word tokens of the first sentence.
            sent2: list of word tokens of the second sentence.
            stopwords: optional collection of lower-case words to ignore.

        Returns:
            The result of core_cosine_similarity for the two count vectors, or
            0.0 when either sentence has no countable word (the cosine of a
            zero vector is undefined and would otherwise propagate NaN into
            the whole ranking).
        """
        ignored = set(stopwords) if stopwords else set()

        words1 = [w for w in (tok.lower() for tok in sent1) if w not in ignored]
        words2 = [w for w in (tok.lower() for tok in sent2) if w not in ignored]
        if not words1 or not words2:
            return 0.0

        # word -> vector position; a dict lookup replaces the linear list.index()
        position = {w: i for i, w in enumerate(set(words1) | set(words2))}

        vector1 = [0] * len(position)
        vector2 = [0] * len(position)

        # build the vector for the first sentence
        for w in words1:
            vector1[position[w]] += 1

        # build the vector for the second sentence
        for w in words2:
            vector2[position[w]] += 1

        return core_cosine_similarity(vector1, vector2)

    def _build_similarity_matrix(self, sentences, stopwords=None):
        """Return the column-normalized sentence similarity matrix.

        Args:
            sentences: list of tokenized sentences (each a list of words).
            stopwords: forwarded to _sentence_similarity.

        Returns:
            An (n, n) float array with a zero diagonal in which every column
            with a non-zero sum is scaled to sum to 1; a column whose sum is
            zero stays all zeros.
        """
        count = len(sentences)
        # create an empty similarity matrix
        sm = np.zeros([count, count])

        # The similarity is symmetric, so only the upper triangle is computed;
        # get_symmetric_matrix mirrors it onto the lower triangle.
        for idx1 in range(count):
            for idx2 in range(idx1 + 1, count):
                sm[idx1][idx2] = self._sentence_similarity(sentences[idx1], sentences[idx2], stopwords=stopwords)

        # Get symmetric matrix
        sm = get_symmetric_matrix(sm)

        # Normalize matrix by column. `out` must be pre-filled: with `where`,
        # numpy leaves the skipped positions (zero-sum columns) untouched, so
        # without it they would hold uninitialized memory.
        norm = np.sum(sm, axis=0)
        sm_norm = np.divide(sm, norm, out=np.zeros_like(sm), where=norm != 0)
        return sm_norm

    def _run_page_rank(self, similarity_matrix):
        """Iterate the rank update and return the resulting score vector.

        Every score starts at 1. Each step computes
        ``(1 - damping) + damping * (similarity_matrix @ scores)``. The loop
        ends after ``self.steps`` steps, or earlier once the sum of the scores
        changes by less than ``self.min_diff`` between two steps.
        """
        pr_vector = np.ones(len(similarity_matrix))
        # Iteration
        previous_pr = 0
        for _ in range(self.steps):
            pr_vector = (1 - self.damping) + self.damping * np.matmul(similarity_matrix, pr_vector)
            total = sum(pr_vector)
            if abs(previous_pr - total) < self.min_diff:
                break
            previous_pr = total
        return pr_vector

    def _get_sentence(self, index):
        """Return ``self.sentences[index]``, or "" when the index is out of range."""
        try:
            return self.sentences[index]
        except IndexError:
            return ""

    def get_top_sentences(self, number=5):
        """Return up to ``number`` sentences, highest rank first.

        Each returned sentence is passed through normalize_whitespace. Fewer
        than ``number`` sentences are returned when the text has fewer
        sentences, and an empty list when analyze() has not been called or
        found no sentence.
        """
        if self.pr_vector is None or not self.sentences:
            return []

        # argsort is ascending; reverse it so the best-ranked index comes first
        ranked = list(np.argsort(self.pr_vector))
        ranked.reverse()

        # Slicing (instead of indexing `number` times) cannot run past the end
        wanted = ranked[:max(number, 0)]
        return [normalize_whitespace(self.sentences[idx]) for idx in wanted]

    def analyze(self, text, stop_words=None):
        """Split ``text`` into sentences and rank them.

        Stores the text in ``self.text_str``, its sentences in
        ``self.sentences`` and their scores in ``self.pr_vector``.

        Args:
            text: the document to summarize, as a string.
            stop_words: optional collection of lower-case words ignored when
                comparing sentences.
        """
        self.text_str = text
        self.sentences = sent_tokenize(self.text_str)
        tokenized_sentences = [word_tokenize(sent) for sent in self.sentences]
        similarity_matrix = self._build_similarity_matrix(tokenized_sentences, stop_words)
        self.pr_vector = self._run_page_rank(similarity_matrix)

In [None]:
# Number of articles; the summarization loop below runs once per row.
length = df['article_text'].shape[0]
print(length)

102915


**STEP 4: RUN THE TEXTRANK ALGORITHM**

Now we run the TextRank algorithm on each article and store its most important sentence in a new column of the dataset called "predicted_summary" - this sentence is what TextRank returns as the summary.

In [None]:
# Summarize every article with its single top-ranked sentence.
for i in range(length):
    news_txt = str(df.at[i, 'article_text'])  # convert the article to string
    summary_obj = TextRank()
    summary_obj.analyze(news_txt)
    summary_txt = summary_obj.get_top_sentences(1)
    # get_top_sentences returns a list: store the sentence itself, not the
    # list, otherwise astype(str) later yields "['...']" and the brackets and
    # quotes end up in the ROUGE tokens.
    # df.at writes into df directly; the chained form df[col][i] = ... may
    # write to a temporary copy instead.
    df.at[i, 'predicted_summary'] = summary_txt[0] if summary_txt else ""

**STEP 5: CALCULATE THE ROUGE SCORE**



In [None]:
#ROUGE SCORE
!pip install rouge
!pip install rouge_score
!pip install evaluate

In [None]:
from rouge import Rouge

# Scorer instance
rouge = Rouge()

# Reference and predicted summaries as two parallel lists of plain strings
golden_summaries = list(df['given_summary'].astype(str))
predicted_summaries = list(df['predicted_summary'].astype(str))

# Calculate ROUGE scores (avg=True requests a single averaged result)
scores = rouge.get_scores(predicted_summaries, golden_summaries, avg=True)
print(scores)


{'rouge-1': {'r': 0.27756708945018227, 'p': 0.11796380760258501, 'f': 0.16212791732320814}, 'rouge-2': {'r': 0.08367072823639744, 'p': 0.03126422381280783, 'f': 0.04461420200277103}, 'rouge-l': {'r': 0.241761188306146, 'p': 0.10307887986748608, 'f': 0.14147646896098073}}


In [None]:
# Inspect one reference summary; golden_summaries is a list, so it is indexed, not called.
golden_summaries[1]

NameError: ignored