In [1]:
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
import nltk
from nltk.corpus import stopwords
import string
import pandas as pd
import heapq
import ast
import json

nltk.download('stopwords')
nltk.download('wordnet')

[nltk_data] Downloading package stopwords to /home/miboj/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /home/miboj/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [2]:
filename = r'/home/miboj/NLP/document-summarizer/data/processed/articles.json'
file = open(filename, encoding='ascii', errors='ignore')
text = file.read()
file.close()

d = ast.literal_eval(text)

In [3]:
with open(filename) as json_file:
    data = json.load(json_file)

filename = r'/home/miboj/NLP/document-summarizer/data/processed/articles.json'
file = open(filename, encoding='ascii', errors='ignore')
text = file.read()
file.close()

json_content = ast.literal_eval(text)
samples = json_content[0:10]

In [4]:
tokens_list = []
for i in d:
    for sen in i['content']:
        tokens_list.append(sen)

In [5]:
import time
start_time = time.time()
sentences = []
word_count = 0
stpwrds = stopwords.words('english') + list(string.punctuation) + ['—', '“', '”', "'", "’"]
for e, i in enumerate(tokens_list):
    words = []
    a = nltk.word_tokenize(i)
    for word in a:
        if word not in stpwrds:
            words.append(word)
            word_count += 1
    sentences.append(words)
print("--- %s seconds ---" % (time.time() - start_time))

--- 32.63631463050842 seconds ---


In [6]:
for text in tokens_list:
    print(text)
    break

 Lt. Gen. David Allvin was confirmed by the Senate to be the Air Forces next vice chief of staff in a late-night vote Wednesday.


In [7]:
def remove_punctuation_marks(text):
    punctuation_marks = dict((ord(punctuation_mark), None) for punctuation_mark in string.punctuation)
    return text.translate(punctuation_marks)

def remove_punctuation_marks2(sentences):
    punctutaion_marks = None
    for text in sentences:
        if punctutaion_marks == None:
             punctuation_marks = dict((ord(punctuation_mark), None) for punctuation_mark in string.punctuation)
        else:
            punctuation_marks += dict((ord(punctuation_mark), None) for punctuation_mark in string.punctuation)
    return text.translate(punctuation_marks)

def get_tokens(sentences) :
    normalized_tokens = nltk.word_tokenize(remove_punctuation_marks(text.lower()))
    # Lemmatized
    #return [nltk.stem.WordNetLemmatizer().lemmatize(normalized_token) for normalized_token in normalized_tokens]
    # Stemmed
    return [nltk.stem.PorterStemmer().stem(normalized_token) for normalized_token in normalized_tokens]

def calculate_sentence_scores(sentence_tokens, tfIdf):
    sentence_scores = {}
    for sent in sentence_tokens:
        #for word in nltk.word_tokenize(sent.lower()):
        for word in get_tokens(sent):    
            if word in tfIdf.keys():
                if sent not in sentence_scores.keys():
                    sentence_scores[sent] = tfIdf[word]/len(sent)
                else:
                    sentence_scores[sent] += tfIdf[word]/len(sent)                    
    return sentence_scores

def get_summary(summary_max_length, sentence_scores):
    summary_sentences = heapq.nlargest(summary_max_length, sentence_scores, key=sentence_scores.get)
    summary = ' '.join(summary_sentences)
    return summary


In [8]:
stpwrds = stopwords.words('english') + list(string.punctuation) + ['—', '“', '”', "'", "’"]
vectorizer = TfidfVectorizer(tokenizer = get_tokens, stop_words = stpwrds)
tfIdf = vectorizer.fit_transform(tokens_list)



In [9]:
df = pd.DataFrame(tfIdf[0].T.todense(), index=vectorizer.get_feature_names(), columns=["TF-IDF"])
df = df.sort_values('TF-IDF', ascending=False)
df.head(20)

Unnamed: 0,TF-IDF
air,0.25
allvin,0.25
chief,0.25
confirm,0.25
david,0.25
forc,0.25
gen,0.25
latenight,0.25
lt,0.25
next,0.25


In [10]:
# Testing cell
a = get_tokens('playing')
b = nltk.stem.WordNetLemmatizer().lemmatize('playing')
wnl = nltk.stem.WordNetLemmatizer()
print(a)
print(b)
porter = nltk.stem.PorterStemmer()
lancaster = nltk.stem.LancasterStemmer()
print(porter.stem('playing'))
print(lancaster.stem('playing'))
print(wnl.lemmatize('playing', pos='v'))

['lt', 'gen', 'david', 'allvin', 'wa', 'confirm', 'by', 'the', 'senat', 'to', 'be', 'the', 'air', 'forc', 'next', 'vice', 'chief', 'of', 'staff', 'in', 'a', 'latenight', 'vote', 'wednesday']
playing
play
play
play


In [16]:
def remove_empty_string(input_string):
    for e, i in enumerate(input_string):
        try:
            if i[-1] == ' ' and input_string[e+1][-1] == ' ':
                input_string[e] = i.rstrip()
        except IndexError:
            continue
    joined_string = ''.join(input_string)
    for e, i in enumerate(joined_string):
        if i == ' ' and joined_string[e+1] == ' ':
            del i
    sentences = nltk.sent_tokenize(joined_string)
    return sentences


In [12]:
# Document to summarized
#document = nltk.sent_tokenize(stories[1]['story'])
raw_string = [" ROME — Defying reports that their planned partnership is ", "doomed to fail", ", France’s Naval Group and ", "Italy’s Fincantieri", " have announced a joint venture to build and export naval vessels. ", " The two ", "state-controlled shipyards", " said they were forming a 50-50 joint venture after months of talks to integrate their activities. The move comes as Europe’s fractured shipbuilding industry faces stiffer global competition. ", " The firms said in a statement that the deal would allow them to “jointly prepare winning offers for binational programs and export market,” as well as create joint supply chains, research and testing. ", " Naval Group and Fincantieri first announced talks on cooperation last year after the latter negotiated a controlling share in French shipyard STX. But the deal was reportedly losing momentum due to resistance from French industry and a political row between France and Italy over migrants. ", " The new deal falls short of the 10 percent share swap predicted by French Economy and Finance Minister Bruno Le Maire earlier this year, and far short of the total integration envisaged by Fincantieri CEO Giuseppe Bono. ", " The statement called the joint venture the “first steps” toward the creation of an alliance that would create “a more efficient and competitive European shipbuilding industry.”", " Naval Group CEO Hervé Guillou, speaking at the Euronaval trade expo in Paris on Oct. 24, said the alliance is based on “two countries sharing a veritable naval ambition.”", " The joint venture is necessary because the “context of the global market has changed drastically,” he added, specifically mentioning new market entrants Russia, China, Singapore, Ukraine, India and Turkey.", "Sign up for the Early Bird Brief, the defense industry's most comprehensive news and information, straight to your inbox.", "By giving us your email, you are opting in to the Early Bird Brief.", " When asked about an initial product to be tackled under the alliance, Guillou acknowledged: “The answer is simple: there is nothing yet.”", " However, the firms said they are working toward a deal to build four logistics support ships for the French Navy, which will be based on an Italian design. ", "Competition flares up for the follow-on portion of a deal previously won by the French shipbuilder.", " The firms also plan to jointly bid next year on work for midlife upgrades for Horizon frigates, which were built by France and Italy and are in service with both navies. The work would include providing a common combat management system. ", " The statement was cautious about future acceleration toward integration. “A Government-to-Government Agreement would be needed to ensure the protection of sovereign assets, a fluid collaboration between the French and Italian teams and encourage further coherence of the National assistance programs, which provide a framework and support export sales,” the statement said.", " But the firms were optimistic the deal would be “a great opportunity for both groups and their eco-systems, by enhancing their ability to better serve the Italian and French navies, to capture new export contracts, to increase research funding and, ultimately, improve the competitiveness of both French and Italian naval sectors.”", " ", "Sebastian Sprenger", " in Paris contributed to this report."]
document = remove_empty_string(raw_string)

#document = stories[1]['story']

tfIdf_dict = df.to_dict()
def get_sentence_scores(document, tfIdf_dict):
    sentence_scores = calculate_sentence_scores(document, tfIdf_dict['TF-IDF'])
    return sentence_scores
#summary = get_summary(3, sentence_scores)
#print(summary)
#print(document)

In [13]:
summary_samples = []
summary_len_list = []
for i in samples:
    sentences = remove_empty_string(i['content'])
    scores = get_sentence_scores(sentences, tfIdf_dict)
    summary_len = int(len(sentences)*0.3) 
    summary = get_summary(summary_len, scores)
    summary_samples.append(summary)
    summary_len_list.append(summary_len)

Out of index
Out of index
Out of index
Out of index
Out of index
Out of index
Out of index
Out of index
Out of index
Out of index
Out of index
Out of index


In [14]:
sorted_summaries = []
for e, i in enumerate(summary_samples):
    a = nltk.sent_tokenize(i)
    o = samples[e]['content']
    b = remove_empty_string(o)
    #print(a)
    #print(b)
    res = [sort for x in b for sort in a if sort == x]
    sorted_summaries.append(res)

Out of index
Out of index
Out of index
Out of index
Out of index
Out of index
Out of index
Out of index
Out of index
Out of index
Out of index
Out of index
Out of index


In [17]:
import io
import sys

for e, i in enumerate(sorted_summaries):
    print(e, ": ")
    print("len original: ", len(remove_empty_string(samples[e]['content'])))
    print("Summary len: ", summary_len_list[e])
    summary = ""
    for sen in i:
        summary += sen
        summary += " "
    print(summary)

0 : 
len original:  17
Summary len:  5
Allvins nomination to become vice chief and receive his fourth star was approved unanimously. In a Thursday release, Chief of Staff Gen. Charles CQ Brown applauded Allvins confirmation. Wilson has served in his role since July 2016 and is the longest-serving vice chief in Air Force history. His past commands include the 97th Air Mobility Wing at Altus Air Force Base in Oklahoma from 2007 to 2009. He has traveled to the Middle East to cover Air Force operations against the Islamic State. 
1 : 
len original:  18
Summary len:  5
The type will replace the older NAMC YS-11EB (right) in the role. The aircraft made its maiden flight in early 2018, though the variant had been in development since at least 2015. Japan is also seeking to recapitalize its standoff jamming capability. The EC-1 is based on the older Kawasaki C-1 that Japan is slowly replacing with the C-2. Fiscal 2019 received no funding for the effort. 
2 : 
len original:  14
Summary len:  4
