In [48]:
from sklearn.feature_extraction.text import TfidfVectorizer
import nltk
from nltk.corpus import stopwords
import string

nltk.download('stopwords')
nltk.download('wordnet')

[nltk_data] Downloading package stopwords to /home/miboj/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /home/miboj/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [49]:
import os
from pathlib import Path
from os import listdir

# load doc into memory
def load_doc(filename):
    # open the file as read only
    file = open(filename, encoding='utf-8')
    # read all text
    text = file.read()
    # close the file
    file.close()
    return text

# load all stories in a directory
def load_stories(directory):
    for name in listdir(directory):
        filename = directory + '/' + name
        # load document
        doc = load_doc(filename)

# split a document into news story and highlights
def split_story(doc):
    # find first highlight
    index = doc.find('@highlight')
    # split into story and highlights
    story, highlights = doc[:index], doc[index:].split('@highlight')
    # strip extra white space around each highlight
    highlights = [h.strip() for h in highlights if len(h) > 0]
    return story, highlights

# load all stories in a directory
def load_stories(directory):
    all_stories = list()
    for name in listdir(directory):
        filename = directory + '/' + name
        # load document
        doc = load_doc(filename)
        # split into story and highlights
        story, highlights = split_story(doc)
        # store
        all_stories.append({'story':story, 'highlights':highlights})
    return all_stories

# load stories
p = Path(os.getcwd()).parents[0]
cnn_path = str(p) + r'/data/external/cnn/stories'
stories = load_stories(cnn_path)
print('Loaded Stories %d' % len(stories))

Loaded Stories 92579


In [67]:
HANDICAP = 1

def remove_punctuation_marks(text) :
    punctuation_marks = dict((ord(punctuation_mark), None) for punctuation_mark in string.punctuation)
    return text.translate(punctuation_marks)

def get_lemmatized_tokens(text) :
    normalized_tokens = nltk.word_tokenize(remove_punctuation_marks(text.lower()))
    return [nltk.stem.WordNetLemmatizer().lemmatize(normalized_token) for normalized_token in normalized_tokens]

def get_average(values) :
    greater_than_zero_count = total = 0
    for value in values :
        if value != 0 :
            greater_than_zero_count += 1
            total += value 
    return total / greater_than_zero_count

def get_threshold(tfidf_results) :
    i = total = 0
    while i < (tfidf_results.shape[0]) :
        total += get_average(tfidf_results[i, :].toarray()[0])
        i += 1
    return total / tfidf_results.shape[0]

def get_summary(documents, tfidf_results) :
    summary = ""
    i = 0
    while i < (tfidf_results.shape[0]) :
        if (get_average(tfidf_results[i, :].toarray()[0])) >= get_threshold(tfidf_results) * HANDICAP :
                summary += ' ' + documents[i]
        i += 1
    return summary

In [68]:
documents = nltk.sent_tokenize(stories[1]['story'])
tfidf_results = TfidfVectorizer(tokenizer = get_lemmatized_tokens, 
                                stop_words = stopwords.words('english')).fit_transform(documents)



In [69]:
summary = get_summary(documents, tfidf_results)
print(summary)

 Maybe they are simply referring to the color. But you're unlikely to think that they have just had a mini ecosystem installed. But green roofs are not just aesthetic. The industry is not faring so well in other parts of the world. In North America, green roofs have taken even longer to catch on. But this was up 80 percent from the previous year, and the market continues to grow. "There are mandates over there because of the storm water they retain," she continued, "Which is a huge drain on their resources, as it is on ours. What we are trying to do is champion the policies behind storm water." Storm water is a growing problem in cities. "This will encourage uptake." Despite this lack of support, the green roof industry is growing. Cities such as London and Sheffield are now asking for them as part of planning applications." "This is a much better approach than driven by the center." In an industry that varies from project to project, this flexibility is a valuable asset. But then, it'

In [70]:
from rouge import Rouge
rouge = Rouge()
scores = rouge.get_scores(summary, ''.join(stories[1]['highlights']))
#scores1 = rouge.get_scores(''.join(stories[1]['highlights']), ''.join(stories[1]['highlights']))
print(scores)
print()
#print(scores1)

[{'rouge-1': {'f': 0.1439393911113981, 'p': 0.0867579908675799, 'r': 0.4222222222222222}, 'rouge-2': {'f': 0.015267172777810662, 'p': 0.009174311926605505, 'r': 0.045454545454545456}, 'rouge-l': {'f': 0.12087911744958348, 'p': 0.07746478873239436, 'r': 0.275}}]

