In [45]:
import nltk
from nltk.corpus import stopwords
from nltk.cluster.util import cosine_distance
from nltk import sent_tokenize
import numpy as np
import networkx as nx

In [46]:
def read_article(file_name):
    """Read the first line of a text file and split it into tokenized sentences.

    Only the first line of the file is used, so the article is expected to be
    stored on a single line. That line is split on ". " into sentence
    fragments and every fragment is split on single spaces into words.

    The fragment after the final ". " is always discarded, matching the
    original behaviour of this helper.

    Args:
        file_name: Path of the text file to read.

    Returns:
        A list of sentences, each a list of word strings. An empty file
        yields an empty list.
    """
    # The context manager closes the handle even if reading fails.
    with open(file_name, "r") as file:
        first_line = file.readline()

    # An empty file has no first line; previously this raised IndexError.
    if not first_line:
        return []

    sentences = []
    for sentence in first_line.split(". "):
        # Echo each raw fragment (kept from the original for visibility).
        print(sentence)
        # The former sentence.replace("[^a-zA-Z]", " ") call was dropped:
        # str.replace matches its argument literally, not as a regex, so it
        # never altered real text.
        sentences.append(sentence.split(" "))

    # Drop the trailing fragment that follows the last ". " separator.
    sentences.pop()

    return sentences

In [47]:
def sentence_similarity(sent1, sent2, stopwords=None):
    """Return the cosine similarity of two tokenized sentences.

    Both sentences are lower-cased, turned into word-count vectors over their
    combined vocabulary (stop words are not counted), and compared with
    ``1 - cosine_distance``.

    Args:
        sent1: First sentence as a list of word strings.
        sent2: Second sentence as a list of word strings.
        stopwords: Optional collection of lower-case words to ignore.

    Returns:
        The similarity score. ``0.0`` is returned when either sentence has no
        countable words, because the cosine of a zero vector is undefined and
        would otherwise surface as NaN.
    """
    # A set gives constant-time membership tests for the stop-word filter.
    ignored = set(stopwords) if stopwords is not None else set()

    sent1 = [w.lower() for w in sent1]
    sent2 = [w.lower() for w in sent2]

    # Assign every distinct word a vector position once, so counting below is
    # a dict lookup instead of a repeated linear list.index scan.
    position = {}
    for w in sent1 + sent2:
        position.setdefault(w, len(position))

    vector1 = [0] * len(position)
    vector2 = [0] * len(position)

    # build the vector for the first sentence
    for w in sent1:
        if w not in ignored:
            vector1[position[w]] += 1

    # build the vector for the second sentence
    for w in sent2:
        if w not in ignored:
            vector2[position[w]] += 1

    # A zero vector would make cosine_distance divide by zero; treat such
    # sentences as sharing nothing with anything.
    if not any(vector1) or not any(vector2):
        return 0.0

    return 1 - cosine_distance(vector1, vector2)

In [48]:
def build_similarity_matrix(sentences, stop_words):
    """Build the pairwise sentence-similarity matrix.

    Args:
        sentences: List of tokenized sentences (each a list of words).
        stop_words: Words forwarded to ``sentence_similarity`` to be ignored.

    Returns:
        A square NumPy array with one row and column per sentence. Entry
        ``[i][j]`` holds the similarity of sentence ``i`` to sentence ``j``;
        the diagonal is left at zero.
    """
    count = len(sentences)
    matrix = np.zeros((count, count))

    for row, left in enumerate(sentences):
        for col, right in enumerate(sentences):
            # A sentence is never compared with itself.
            if row != col:
                matrix[row][col] = sentence_similarity(left, right, stop_words)

    return matrix

In [49]:
from sumeval.metrics.rouge import RougeCalculator
from sumeval.metrics.bleu import BLEUCalculator

def eval_rouges(refrence_summary,model_summary):
    """Score a generated summary against one reference with ROUGE and BLEU.

    Each score is printed as it is computed and all four are returned.

    Args:
        refrence_summary: The reference (gold) summary text.
        model_summary: The generated summary text to evaluate.

    Returns:
        A tuple ``(rouge_1, rouge_2, rouge_l, bleu_score)``.
    """
    rouge = RougeCalculator(stopwords=True, lang="en")
    # Every metric receives the references as a list, so the four calls are
    # consistent with each other.
    references = [refrence_summary]

    rouge_1 = rouge.rouge_n(
                summary=model_summary,
                references=references,
                n=1)
    print("rogue1 "+str(rouge_1))

    rouge_2 = rouge.rouge_n(
                summary=model_summary,
                references=references,
                n=2)
    print("rogue2 "+str(rouge_2))

    rouge_l = rouge.rouge_l(
                summary=model_summary,
                references=references)
    print("roguel "+str(rouge_l))

    # ROUGE-BE is intentionally not computed here: it needs spaCy.

    bleu = BLEUCalculator()
    bleu_score = bleu.bleu(summary=model_summary,
                           references=references)

    print("blue_score "+str(bleu_score))

    return rouge_1, rouge_2,rouge_l,bleu_score

In [58]:
def generate_summary(filename, top_n=2):
    """Produce an extractive summary of an article file.

    The article is read with ``read_article``, a sentence-similarity graph is
    built from it, the sentences are ranked with PageRank, and the
    highest-ranked ones are joined into the summary. The summary is printed
    and returned.

    Args:
        filename: Path of the article text file.
        top_n: Maximum number of sentences to include. Values larger than the
            number of available sentences are clamped; values below one give
            an empty summary.

    Returns:
        The summary text, with the selected sentences joined by ". ".
    """
    # Fetch the stop-word corpus only when it is missing, instead of
    # contacting the download server on every call.
    try:
        stop_words = stopwords.words('english')
    except LookupError:
        nltk.download("stopwords")
        stop_words = stopwords.words('english')

    # Step 1 - Read the text and split it into sentences
    sentences = read_article(filename)

    summarize_text = []
    if sentences:
        # Step 2 - Generate the similarity matrix across sentences
        similarity_matrix = build_similarity_matrix(sentences, stop_words)

        # Step 3 - Rank sentences in the similarity matrix
        similarity_graph = nx.from_numpy_array(similarity_matrix)
        scores = nx.pagerank(similarity_graph)

        # Step 4 - Sort by rank and pick the top sentences
        ranked_sentence = sorted(((scores[i], s) for i, s in enumerate(sentences)), reverse=True)
        # Slicing clamps top_n to the number of ranked sentences, so asking
        # for more sentences than exist no longer raises IndexError.
        for _, words in ranked_sentence[:max(top_n, 0)]:
            summarize_text.append(" ".join(words))

    # Step 5 - Output the summarized text. The returned value is the same
    # string that is printed, so sentence boundaries keep their space.
    sum_text = ". ".join(summarize_text)
    print("Summarize Text: \n", sum_text)
    return sum_text


In [59]:
# Summarize the sample article using its two top-ranked sentences.
# NOTE: `sum` shadows the Python builtin; the name is kept because the
# evaluation cell further down reads it.
sum = generate_summary("cnn1.txt", top_n=2)
print("-----", sum, sep="\n")


usain bolt rounded off the world championships sunday by claiming his third gold in moscow as he anchored jamaica to victory in the men is 4x100m relay
the fastest man in the world charged clear of united states rival justin gatlin as the jamaican quartet of nesta carter, kemar bailey cole, nickel ashmeade and bolt won in 37.36 seconds
the u.s finished second in 37.56 seconds with canada taking the bronze after britain were disqualified for a faulty handover
the 26 year old bolt has now collected eight gold medals at world championships, equaling the record held by american trio carl lewis, michael johnson and allyson felix, not to mention the small matter of six olympic titles
the relay triumph followed individual successes in the 100 and 200 meters in the russian capital
i am proud of myself and i will continue to work to dominate for as long as possible, bolt said, having previously expressed his intention to carry on until the 2016 rio olympics
victory was never seriously in doubt 

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\santo\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [60]:
# Reference summary that the generated summary is scored against.
ref_sum = 'usain bolt wins third gold of world championship anchors jamaica to 4x100m relay victory eighth gold at the championships for bolt jamaica double up in women is 4x100m relay'
eval_rouges(refrence_summary=ref_sum, model_summary=sum)

rogue1 0.5555555555555556
rogue2 0.11538461538461539
roguel 0.37037037037037035
blue_score 1.7593613167407098


(0.5555555555555556,
 0.11538461538461539,
 0.37037037037037035,
 1.7593613167407098)