In [1]:
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
import nltk
from nltk.corpus import stopwords
import string
import pandas as pd
import heapq

nltk.download('stopwords')
nltk.download('wordnet')

[nltk_data] Downloading package stopwords to /home/miboj/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /home/miboj/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [2]:
import os
from pathlib import Path
from os import listdir

# load doc into memory
def load_doc(filename):
    # open the file as read only
    file = open(filename, encoding='utf-8')
    # read all text
    text = file.read()
    # close the file
    file.close()
    return text

# load all stories in a directory
def load_stories(directory):
    for name in listdir(directory):
        filename = directory + '/' + name
        # load document
        doc = load_doc(filename)

# split a document into news story and highlights
def split_story(doc):
    # find first highlight
    index = doc.find('@highlight')
    # split into story and highlights
    story, highlights = doc[:index], doc[index:].split('@highlight')
    # strip extra white space around each highlight
    highlights = [h.strip() for h in highlights if len(h) > 0]
    return story, highlights

# load all stories in a directory
def load_stories(directory):
    all_stories = list()
    for name in listdir(directory):
        filename = directory + '/' + name
        # load document
        doc = load_doc(filename)
        # split into story and highlights
        story, highlights = split_story(doc)
        # store
        all_stories.append({'story':story, 'highlights':highlights})
    return all_stories

# load stories
p = Path(os.getcwd()).parents[0]
cnn_path = str(p) + r'/data/external/cnn/stories'
stories = load_stories(cnn_path)
print('Loaded Stories %d' % len(stories))

Loaded Stories 92579


In [3]:
def remove_punctuation_marks(text) :
    punctuation_marks = dict((ord(punctuation_mark), None) for punctuation_mark in string.punctuation)
    return text.translate(punctuation_marks)

def get_tokens(text) :
    normalized_tokens = nltk.word_tokenize(remove_punctuation_marks(text.lower()))
    # Lemmatized
    #return [nltk.stem.WordNetLemmatizer().lemmatize(normalized_token) for normalized_token in normalized_tokens]
    # Stemmed
    return [nltk.stem.PorterStemmer().stem(normalized_token) for normalized_token in normalized_tokens]

def calculate_sentence_scores(sentence_tokens, tfIdf):
    sentence_scores = {}
    for sent in sentence_tokens:
        #for word in nltk.word_tokenize(sent.lower()):
        for word in get_tokens(sent):    
            if word in tfIdf.keys():
                if sent not in sentence_scores.keys():
                    sentence_scores[sent] = tfIdf[word]/len(sent)
                else:
                    sentence_scores[sent] += tfIdf[word]/len(sent)                    
    return sentence_scores

def get_summary(summary_max_length, sentence_scores):
    summary_sentences = heapq.nlargest(summary_max_length, sentence_scores, key=sentence_scores.get)
    summary = ' '.join(summary_sentences)
    return summary


In [4]:
# Train test split
X = [i['story'] for i in stories]
y = [i['highlights'] for i in stories]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state=10)

In [5]:
print(len(X_train))

74063


In [6]:
stpwrds = stopwords.words('english') + list(string.punctuation) + ['—', '“', '”', "'", "’"]

vectorizer = TfidfVectorizer(tokenizer = get_tokens, stop_words = stpwrds)
tfIdf = vectorizer.fit_transform(X_train)



In [7]:
df = pd.DataFrame(tfIdf[0].T.todense(), index=vectorizer.get_feature_names(), columns=["TF-IDF"])
df = df.sort_values('TF-IDF', ascending=False)
df.head(20)

Unnamed: 0,TF-IDF
dhabi,0.327725
abu,0.299619
newsgath,0.293579
middl,0.250523
east,0.250408
maddox,0.21015
cnn,0.209359
prism,0.155704
region,0.142388
stan,0.127491


In [8]:
# Testing cell
a = get_tokens('playing')
b = nltk.stem.WordNetLemmatizer().lemmatize('playing')
wnl = nltk.stem.WordNetLemmatizer()
print(a)
print(b)
porter = nltk.stem.PorterStemmer()
lancaster = nltk.stem.LancasterStemmer()
print(porter.stem('playing'))
print(lancaster.stem('playing'))
print(wnl.lemmatize('playing', pos='v'))

['play']
playing
play
play
play


In [9]:
# Document to summarized
document = nltk.sent_tokenize(stories[1]['story'])
#document = stories[1]['story']

tfIdf_dict = df.to_dict()
sentence_scores = calculate_sentence_scores(document, tfIdf_dict['TF-IDF'])

summary = get_summary(3, sentence_scores)
print(summary)
#print(document)

LONDON, England (CNN) -- If your neighbor mentions their green roof you might think they have a moss problem. "In Switzerland, green roofs are federal law, but again this is interpreted at a cantonal and city level," he told CNN. "In Germany they are down to $20 per square meter, which is way cheaper than a regular roof here," she told CNN.


In [11]:
from rouge import Rouge
rouge = Rouge()
scores = rouge.get_scores(summary, ''.join(stories[1]['highlights']))
#scores1 = rouge.get_scores(''.join(stories[1]['highlights']), ''.join(stories[1]['highlights']))
print(scores)
print()
#print(scores1)

[{'rouge-1': {'f': 0.1308411166215392, 'p': 0.11290322580645161, 'r': 0.15555555555555556}, 'rouge-2': {'f': 0.019047614178686056, 'p': 0.01639344262295082, 'r': 0.022727272727272728}, 'rouge-l': {'f': 0.10869564725897944, 'p': 0.09615384615384616, 'r': 0.125}}]

