In [14]:
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
import nltk
from nltk.corpus import stopwords
import string
import pandas as pd
import heapq

nltk.download('stopwords')
nltk.download('wordnet')

[nltk_data] Downloading package stopwords to /home/miboj/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /home/miboj/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [17]:
import os
from pathlib import Path
from os import listdir

# load doc into memory
def load_doc(filename):
    # open the file as read only
    file = open(filename, encoding='utf-8')
    # read all text
    text = file.read()
    # close the file
    file.close()
    return text

# load all stories in a directory
def load_stories(directory):
    for name in listdir(directory):
        filename = directory + '/' + name
        # load document
        doc = load_doc(filename)

# split a document into news story and highlights
def split_story(doc):
    # find first highlight
    index = doc.find('@highlight')
    # split into story and highlights
    story, highlights = doc[:index], doc[index:].split('@highlight')
    # strip extra white space around each highlight
    highlights = [h.strip() for h in highlights if len(h) > 0]
    return story, highlights

# load all stories in a directory
def load_stories(directory):
    all_stories = list()
    for name in listdir(directory):
        filename = directory + '/' + name
        # load document
        doc = load_doc(filename)
        # split into story and highlights
        story, highlights = split_story(doc)
        # store
        all_stories.append({'story':story, 'highlights':highlights})
    return all_stories

# load stories
p = Path(os.getcwd()).parents[0]
cnn_path = str(p) + r'/data/external/cnn/stories'
stories = load_stories(cnn_path)
print('Loaded Stories %d' % len(stories))

Loaded Stories 92579


In [18]:
def remove_punctuation_marks(text) :
    punctuation_marks = dict((ord(punctuation_mark), None) for punctuation_mark in string.punctuation)
    return text.translate(punctuation_marks)

def get_tokens(text) :
    normalized_tokens = nltk.word_tokenize(remove_punctuation_marks(text.lower()))
    # Lemmatized
    #return [nltk.stem.WordNetLemmatizer().lemmatize(normalized_token) for normalized_token in normalized_tokens]
    # Stemmed
    return [nltk.stem.PorterStemmer().stem(normalized_token) for normalized_token in normalized_tokens]

def calculate_sentence_scores(sentence_tokens, tfIdf):
    sentence_scores = {}
    for sent in sentence_tokens:
        #for word in nltk.word_tokenize(sent.lower()):
        for word in get_tokens(sent):    
            if word in tfIdf.keys():
                if sent not in sentence_scores.keys():
                    sentence_scores[sent] = tfIdf[word]
                else:
                    sentence_scores[sent] += tfIdf[word]                    
    return sentence_scores

def get_summary(summary_max_length, sentence_scores):
    summary_sentences = heapq.nlargest(summary_max_length, sentence_scores, key=sentence_scores.get)
    summary = ' '.join(summary_sentences)
    return summary


In [15]:
# Train test split
X = [i['story'] for i in stories]
y = [i['highlights'] for i in stories]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state=10)

In [16]:
print(len(X_train))

74063


In [5]:
stpwrds = stopwords.words('english') + list(string.punctuation) + ['—', '“', '”', "'", "’"]

vectorizer = TfidfVectorizer(tokenizer = get_tokens, stop_words = stpwrds)
tfIdf = vectorizer.fit_transform(X_train)



In [7]:
df = pd.DataFrame(tfIdf[0].T.todense(), index=vectorizer.get_feature_names(), columns=["TF-IDF"])
df = df.sort_values('TF-IDF', ascending=False)
df.head(20)

Unnamed: 0,TF-IDF
shokalskiy,0.360352
ice,0.353056
icebreak,0.246709
akademik,0.216211
ship,0.214875
polar,0.209014
guard,0.205191
coast,0.203746
xue,0.196144
australian,0.168889


In [None]:
# Testing cell
a = get_lemmatized_tokens('playing')
b = nltk.stem.WordNetLemmatizer().lemmatize('playing')
wnl = nltk.stem.WordNetLemmatizer()
print(a)
print(b)
porter = nltk.stem.PorterStemmer()
lancaster = nltk.stem.LancasterStemmer()
print(porter.stem('playing'))
print(lancaster.stem('playing'))
print(wnl.lemmatize('playing', pos='v'))

In [8]:
# Document to summarized
document = nltk.sent_tokenize(stories[1]['story'])
#document = stories[1]['story']

tfIdf_dict = df.to_dict()
sentence_scores = calculate_sentence_scores(document, tfIdf_dict['TF-IDF'])

summary = get_summary(3, sentence_scores)
print(summary)
#print(document)

Majora Carter, who set up Sustainable South Bronx to help lift the area out of poverty by creating green-collar jobs, is frustrated by this difference between the European and American industries. Dusty Gedge, co-founder of Livingroof.org, a UK Web site promoting the green roof industry, believes it is the government's responsibility to help the industry grow. Do green roofs really help the environment -- or are they an expensive indulgence?


In [None]:
from rouge import Rouge
rouge = Rouge()
scores = rouge.get_scores(summary, ''.join(stories[1]['highlights']))
#scores1 = rouge.get_scores(''.join(stories[1]['highlights']), ''.join(stories[1]['highlights']))
print(scores)
print()
#print(scores1)