This implementation of the TextRank algorithm is adapted from https://github.com/prateekjoshi565/textrank_text_summarization/blob/master/TestRank_Text_Summarization.ipynb

In [None]:
import numpy as np
import pandas as pd
import nltk
from nltk.tokenize import sent_tokenize
from nltk.corpus import stopwords
nltk.download('stopwords')# one time execution
stop_words = stopwords.words('arabic')
nltk.download('punkt') # one time execution
import re
from nltk.cluster.util import cosine_distance
from nltk.tokenize import word_tokenize
import math
import networkx as nx
from operator import itemgetter
import os

# Clean sentences of punctuation marks and stopwords

In [None]:
# function to remove stopwords
def remove_stopwords(sen):
    """Drop stop words from a tokenised sentence and re-join what is left.

    Args:
        sen: iterable of word tokens (the callers in this file pass
            ``sentence.split()``).

    Returns:
        A string made of the tokens that are not in the module-level
        ``stop_words`` collection, separated by single spaces.
    """
    kept_words = []
    for word in sen:
        if word in stop_words:
            continue
        kept_words.append(word)
    return " ".join(kept_words)

from string import punctuation

# Extend the ASCII punctuation string with the Arabic comma, semicolon and
# question mark, followed by the digits 1-9 (note that '0' is not included).
punctuation = punctuation + '،؛؟'
punctuation = punctuation + '123456789'
def remove_punctuation(sen):
    """Drop punctuation tokens from a tokenised sentence and re-join the rest.

    ``punctuation`` is a module-level string, so the membership test is a
    substring test: a token is dropped only when the whole token occurs as a
    contiguous substring of ``punctuation``. Punctuation attached to a word
    (e.g. ``"hello,"``) is therefore kept.

    Args:
        sen: iterable of string tokens.

    Returns:
        The surviving tokens joined by single spaces.
    """
    surviving = filter(lambda token: token not in punctuation, sen)
    return " ".join(surviving)

def sentence_cleaner(sen):
    """Strip stop words and punctuation tokens from every sentence.

    Args:
        sen: iterable of sentence strings.

    Returns:
        A list with one cleaned string per input sentence, in input order.
        Each sentence is split on whitespace, filtered with
        ``remove_stopwords``, split again and filtered with
        ``remove_punctuation``.
    """
    # Work on the argument itself; reading a module-level ``sentences`` here
    # would silently clean whatever list happened to be global at call time.
    without_stopwords = [remove_stopwords(sentence.split()) for sentence in sen]
    return [remove_punctuation(sentence.split()) for sentence in without_stopwords]

The next step is to find similarities among the sentences. We will use cosine similarity to find similarity between a pair of sentences.

In [None]:
def sentence_similarity(sent1, sent2):
    """Return the cosine similarity of two sentences' word-count vectors.

    Both sentences are tokenised with ``word_tokenize`` and turned into
    count vectors over their combined vocabulary; the result is
    ``1 - cosine_distance(vector1, vector2)``.

    Args:
        sent1: first sentence (string).
        sent2: second sentence (string).

    Returns:
        The similarity value, or 0 when either sentence has no tokens or the
        computed value is NaN.
    """
    tokens1 = word_tokenize(sent1)
    tokens2 = word_tokenize(sent2)

    # A sentence without tokens has an all-zero vector, for which the cosine
    # is undefined; answer 0 directly instead of dividing by zero first.
    if not tokens1 or not tokens2:
        return 0

    # Map every distinct word to one column so lookups are O(1) and no word
    # gets a duplicate (always-zero) column.
    column_of = {}
    for word in tokens1 + tokens2:
        column_of.setdefault(word, len(column_of))

    vector1 = [0] * len(column_of)
    vector2 = [0] * len(column_of)
    for word in tokens1:
        vector1[column_of[word]] += 1
    for word in tokens2:
        vector2[column_of[word]] += 1

    similarity = 1 - cosine_distance(vector1, vector2)
    # Kept as a safety net for any remaining degenerate input.
    if math.isnan(similarity):
        return 0
    return similarity

## Similarity matrix

In [None]:
def get_sim_matrix(clean_sentences):
    """Build the pairwise sentence-similarity matrix.

    Args:
        clean_sentences: sequence of cleaned sentence strings.

    Returns:
        A square NumPy array whose entry ``[i][j]`` is
        ``sentence_similarity(clean_sentences[i], clean_sentences[j])``.
        Every pair is evaluated, including each sentence with itself.
    """
    count = len(clean_sentences)
    sim_mat = np.zeros([count, count])
    for row, left in enumerate(clean_sentences):
        for col, right in enumerate(clean_sentences):
            sim_mat[row, col] = sentence_similarity(left, right)
    return sim_mat

In [None]:

def get_ranked_sentences(sim_mat, sentences=None):
    """Rank sentences by the PageRank score of the similarity graph.

    Args:
        sim_mat: square similarity matrix; row/column ``i`` corresponds to
            sentence ``i``.
        sentences: the sentences the matrix was built from, in the same
            order. When omitted, the module-level ``sentences`` variable is
            used, which is what this function always did before the
            parameter existed.

    Returns:
        A list of ``(score, sentence)`` tuples sorted in descending order.

    Raises:
        NameError: if ``sentences`` is omitted and no module-level
            ``sentences`` is defined.
    """
    if sentences is None:
        # Backward-compatible fallback for callers that pass only the matrix.
        sentences = globals().get("sentences")
        if sentences is None:
            raise NameError("name 'sentences' is not defined")

    nx_graph = nx.from_numpy_array(sim_mat)
    scores = nx.pagerank(nx_graph)
    ranked_sentences = sorted(
        ((scores[i], s) for i, s in enumerate(sentences)), reverse=True
    )
    return ranked_sentences

In [None]:
def summary_genarator(sentences, percentage):
    """Return the top-ranked sentences that make up the summary.

    Args:
        sentences: list of the article's sentences.
        percentage: summary size as a percentage of ``len(sentences)``.

    Returns:
        A list of sentences taken from the front of the ranking. Its length
        is ``floor(percentage * len(sentences) / 100)``, or 1 when that floor
        is 0, and never more than the number of ranked sentences.
    """
    clean_sentences = sentence_cleaner(sentences)
    sim_mat = get_sim_matrix(clean_sentences)
    # NOTE(review): only the matrix is handed over here, so which sentence
    # list the scores are paired with is decided inside get_ranked_sentences;
    # confirm it is this same ``sentences`` list.
    ranked_sentences = get_ranked_sentences(sim_mat)

    # Generate summary: a computed size of 0 is bumped to one sentence.
    sn = math.floor(percentage * len(sentences) / 100) or 1

    # Slicing caps the size at the number of ranked sentences instead of
    # raising IndexError; max(..., 0) keeps a negative size an empty summary.
    return [entry[1] for entry in ranked_sentences[:max(sn, 0)]]

### A different cosine similarity function is used here than at the beginning of the code, because the first one both takes and returns 2D arrays.

# Testing


In [None]:

def get_summ_accuracy(gen_summ, correct_summ):
    """Score a generated summary against one reference summary.

    For every reference sentence the highest ``sentence_similarity`` to any
    generated sentence is taken (0 when nothing scores above 0 or when
    ``gen_summ`` is empty); the result is the mean of those best scores.

    Args:
        gen_summ: list of generated summary sentences.
        correct_summ: list of reference summary sentences.

    Returns:
        The mean best-match similarity, or 0.0 when ``correct_summ`` is
        empty (there is nothing to match against).
    """
    # Guard the division below: an empty reference has no scores to average.
    if len(correct_summ) == 0:
        return 0.0

    best_scores = []
    for reference in correct_summ:
        candidates = [
            sentence_similarity(reference, generated) for generated in gen_summ
        ]
        # The leading 0 is the floor every best score starts from.
        best_scores.append(max([0] + candidates))

    return sum(best_scores) / len(best_scores)

### Check the generated summaries against the human-generated summaries

In [None]:
# assign percentage for summary size
# NOTE(review): this value is not read anywhere in this cell; the loop below
# iterates over summary_percentages instead.
percentage = 10

# path to folder containing test articles
folder_articles = "EASC/EASC-UTF-8"

# summary sizes (percent of the article's sentences) evaluated per article
summary_percentages = range(10, 41, 10)

# one column per reference summary (A-E) plus their maximum, per summary size
column_names = [label + ' ' + str(size)
                for size in summary_percentages
                for label in ('A', 'B', 'C', 'D', 'E', 'Max')]

data = {column: [] for column in column_names}

# Create DataFrame
df = pd.DataFrame(data)


def _read_sentences(path):
    """Read a UTF-8 text file and return its non-empty '.'-separated pieces.

    Newlines and the byte-order mark character are removed before splitting;
    undecodable bytes are ignored.
    """
    with open(path, 'r', encoding="utf-8", errors='ignore') as handle:
        text = handle.read()
    text = text.replace("\n", "").replace("\ufeff", "")
    # Build a new list rather than removing while iterating, and compare by
    # value (not identity) so every empty piece is dropped.
    return [piece for piece in text.split(".") if piece != ""]


# one list of accuracies per article, in column_names order
rows = []

#loop over all articles
for x in range(1, 154):
    print("------------Testing article #"+str(x)+"------------")
    #list variable that stores all percentage accuracies for each of the 5 summaries + max value to be added
    #directly to dataframe
    list_accuracies = []

    #find path of folder containing article for each topic
    article_path = folder_articles+'/Articles/Topic'+str(x)

    #get path to folder where the human generated summaries of each topic are
    summary_path = folder_articles + '/MTurk/Topic'+str(x)

    for y in summary_percentages:
        print("------------Testing "+str(y)+"% summaries accuracy------------")

        #use os.walk to get the file path
        for file in os.walk(article_path, topdown=True):

            #retrieve path of article
            sentences_path = article_path+'/'+file[2][0]

            #read the article; this must stay the module-level name
            #`sentences` because helper functions in this file may read it
            sentences = _read_sentences(sentences_path)

            #generate summary
            gen_summ = summary_genarator(sentences, y)

            ##FINDING HUMAN GENERATED SUMMARIES OF EACH ARTICLE

            #accuracies for this summary size only, so that the maximum is
            #not contaminated by scores from smaller sizes
            current_accuracies = []

            #loop over all files in folder
            for files in os.walk(summary_path, topdown=True):
                #sort the names so the column order does not depend on the
                #order in which the file system lists them
                list_correct_summ_paths = sorted(files[2])

                #loop over all correct summaries
                for l in list_correct_summ_paths:

                    #path to each summary
                    s_path = summary_path+'/'+l

                    #read the reference summary with the same cleaning as the article
                    correct_summ = _read_sentences(s_path)

                    #get the similarity percentage between generated summary and actual accuracy
                    current_accuracies.append(get_summ_accuracy(gen_summ, correct_summ))

            #get maximum of percentages to represent similarity of generated summary to actual summary
            maximum = max(current_accuracies)

            #add the scores and their maximum to the list of accuracies to be added to the data frame
            list_accuracies.extend(current_accuracies)
            list_accuracies.append(maximum)

    #a row must fill every column exactly; fail loudly otherwise
    if len(list_accuracies) != len(column_names):
        raise ValueError(
            "Topic" + str(x) + " produced " + str(len(list_accuracies))
            + " accuracies, expected " + str(len(column_names)))

    #Add a new row to the dataframe (DataFrame.append no longer exists in
    #pandas 2.x, so the frame is rebuilt from the collected rows)
    rows.append(list_accuracies)
    df = pd.DataFrame(rows, columns=column_names)