In [77]:
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_extraction.text import TfidfVectorizer
import nltk
from nltk.corpus import stopwords
import string
import pandas as pd
import heapq

nltk.download('stopwords')
nltk.download('wordnet')

[nltk_data] Downloading package stopwords to /home/miboj/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /home/miboj/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [2]:
import os
from pathlib import Path
from os import listdir

# load doc into memory
def load_doc(filename):
    # open the file as read only
    file = open(filename, encoding='utf-8')
    # read all text
    text = file.read()
    # close the file
    file.close()
    return text

# load all stories in a directory
def load_stories(directory):
    for name in listdir(directory):
        filename = directory + '/' + name
        # load document
        doc = load_doc(filename)

# split a document into news story and highlights
def split_story(doc):
    # find first highlight
    index = doc.find('@highlight')
    # split into story and highlights
    story, highlights = doc[:index], doc[index:].split('@highlight')
    # strip extra white space around each highlight
    highlights = [h.strip() for h in highlights if len(h) > 0]
    return story, highlights

# load all stories in a directory
def load_stories(directory):
    all_stories = list()
    for name in listdir(directory):
        filename = directory + '/' + name
        # load document
        doc = load_doc(filename)
        # split into story and highlights
        story, highlights = split_story(doc)
        # store
        all_stories.append({'story':story, 'highlights':highlights})
    return all_stories

# load stories
p = Path(os.getcwd()).parents[0]
cnn_path = str(p) + r'/data/external/cnn/stories'
stories = load_stories(cnn_path)
print('Loaded Stories %d' % len(stories))

Loaded Stories 92579


In [82]:
HANDICAP = 0.1

def remove_punctuation_marks(text) :
    punctuation_marks = dict((ord(punctuation_mark), None) for punctuation_mark in string.punctuation)
    return text.translate(punctuation_marks)

def get_lemmatized_tokens(text) :
    normalized_tokens = nltk.word_tokenize(remove_punctuation_marks(text.lower()))
    return [nltk.stem.WordNetLemmatizer().lemmatize(normalized_token) for normalized_token in normalized_tokens]

def calculate_sentence_scores(sentence_tokens, tfIdf):
    sentence_scores = {}
    for sent in sentence_tokens:
        for word in nltk.word_tokenize(sent.lower()):
            if word in tfIdf.keys():
                if sent not in sentence_scores.keys():
                    sentence_scores[sent] = tfIdf[word]
                else:
                    sentence_scores[sent] += tfIdf[word]                    
    return sentence_scores

def get_summary(summary_max_length, sentence_scores):
    summary_sentences = heapq.nlargest(summary_max_length, sentence_scores, key=sentence_scores.get)
    summary = ' '.join(summary_sentences)
    return summary


In [95]:
corpus = []
for i in stories:
    corpus.append(i['story'])
print(len(corpus))


92579


In [96]:
stpwrds = stopwords.words('english') + list(string.punctuation) + ['—', '“', '”', "'", "’"]

vectorizer = TfidfVectorizer(tokenizer = get_lemmatized_tokens, stop_words = stpwrds)
tfIdf = vectorizer.fit_transform(corpus)



In [97]:
df = pd.DataFrame(tfIdf[0].T.todense(), index=vectorizer.get_feature_names(), columns=["TF-IDF"])
df = df.sort_values('TF-IDF', ascending=False)
df.head()

Unnamed: 0,TF-IDF
shokalskiy,0.350399
ice,0.349511
polar,0.242372
icebreaker,0.24185
ship,0.222538


In [102]:
# Document to summarized
document = nltk.sent_tokenize(stories[1]['story'])

tfIdf_dict = df.to_dict()
sentence_scores = calculate_sentence_scores(document, tfIdf_dict['TF-IDF'])

summary = get_summary(3, sentence_scores)
print(summary)

Majora Carter, who set up Sustainable South Bronx to help lift the area out of poverty by creating green-collar jobs, is frustrated by this difference between the European and American industries. Dusty Gedge, co-founder of Livingroof.org, a UK Web site promoting the green roof industry, believes it is the government's responsibility to help the industry grow. Do green roofs really help the environment -- or are they an expensive indulgence?


In [103]:
from rouge import Rouge
rouge = Rouge()
scores = rouge.get_scores(summary, ''.join(stories[1]['highlights']))
#scores1 = rouge.get_scores(''.join(stories[1]['highlights']), ''.join(stories[1]['highlights']))
print(scores)
print()
#print(scores1)

[{'rouge-1': {'f': 0.25862068490636153, 'p': 0.2112676056338028, 'r': 0.3333333333333333}, 'rouge-2': {'f': 0.03508771455832628, 'p': 0.02857142857142857, 'r': 0.045454545454545456}, 'rouge-l': {'f': 0.16161615680032665, 'p': 0.13559322033898305, 'r': 0.2}}]

