In [None]:
# import packages

import re
import string
import spacy
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from geopy import geocoders

# spaCy ships its own stop-word dictionary (words that carry little meaning for analysis), used here for removal.
from spacy.lang.en.stop_words import STOP_WORDS

# Word cloud generator for visualizing word frequency
from wordcloud import WordCloud, STOPWORDS, ImageColorGenerator

# For later sentence scoring
from heapq import nlargest


In [None]:
# Load the spaCy English pipeline (tokenizer, tagger, parser).
# The 'en' shortcut link was removed in spaCy v3; the full package name
# loads under both v2 and v3.
nlp = spacy.load('en_core_web_sm')

# Initialize stopwords variable with spaCy's stop-word set (imported above)
stopwords = STOP_WORDS

# Load in text/whole documents and create Doc nlp object.
# The context manager closes the file handle as soon as the text is read;
# the explicit encoding makes the read independent of the platform default
# (assumes wiki.txt is UTF-8 encoded -- TODO confirm).
with open("wiki.txt", encoding="utf-8") as article_file: # full text -- enter path to txt HERE
    input_str = article_file.read()
article_1 = input_str[:42689] # segment first full article for analysis
doc = nlp(article_1) # create doc object for article

In [None]:
# Show the opening of the article under a bold heading, then a divider line
print()
print("\033[1m", 'Original Article: ', "\033[0m", '%s' % (article_1[:250]), sep="") # adjust article display length accordingly
print("", "____________________________________", "", sep="\n")


In [None]:
# Empty containers that the later cells fill in
word_frequencies2 = dict()      # word text -> raw occurrence count
word_frequencies_dist = dict()  # word text -> count divided by the maximum count
entities = dict()               # entity text -> entity label
sentence_scores = dict()        # sentence span -> accumulated word weight

In [None]:
# text preprocessing
# Each step must build on the previous result `s`; restarting from
# `article_1` would discard the lowercasing and the number removal.
s = article_1.lower() # convert string to lowercase for simplicity
s = re.sub(r' \d+', '', s) # remove numbers (and the space before them) from string
s = s.translate(str.maketrans("", "", string.punctuation)) # remove special characters from string

# tokenize - split string into words, keeping spaCy Token objects because
# later cells read `t.text`
tokens = [
    t for t in nlp(s)
    if len(t.text) > 2            # remove short words
    and not t.is_space            # drop whitespace-only tokens such as runs of newlines
    and t.text not in stopwords   # compare the token's text: a Token object never equals a stop-word string
]

In [None]:
# Show the filtered tokens, followed by a divider
print("\033[1m" + 'Tokens:' + "\033[0m", tokens)
print("", "____________________________________", "", sep="\n")

# Show the noun phrases spaCy recognised in the article
print(
    "\033[1m" + "Noun phrases:" + "\033[0m",
    [phrase.text for phrase in doc.noun_chunks],
)

# Show the lemma of every token tagged as a verb, followed by a divider
print(
    "\033[1m" + "Verbs:" + "\033[0m",
    [tok.lemma_ for tok in doc if tok.pos_ == "VERB"],
)
print("", "____________________________________", "", sep="\n")


In [None]:
# Find named entities, phrases and concepts.
# Maps each entity's text to its label; if the same text occurs more than
# once, the label seen last is the one kept.
for entity in doc.ents:
    entities[entity.text] = entity.label_

# Location-specific entity locations dictionary (entity text -> geocoding result)
entity_locations = dict()

# Only entities labelled 'LOC' or 'GPE' are sent to the geocoder
location_names = [name for name, label in entities.items() if label in ('LOC', 'GPE')]

# Create the geocoder client once instead of once per location, and only
# when there is at least one location to look up.
if location_names:
    geolocator = geocoders.GoogleV3(api_key='') # enter API key HERE
    for name in location_names:
        entity_locations[name] = geolocator.geocode(name, timeout=100)


In [None]:
# Show both entity dictionaries under bold headings, followed by a divider
for heading, mapping in (
    ("Entities: ", entities),
    ("Location-specific Entities: ", entity_locations),
):
    print("\033[1m" + heading + "\033[0m", mapping)
print("", "____________________________________", "", sep="\n")

In [None]:
# Count how often each token text occurs, skipping stop words and
# texts of two characters or fewer
for t in tokens:
    token_text = t.text
    if token_text in stopwords or len(token_text) <= 2:
        continue
    word_frequencies2[token_text] = word_frequencies2.get(token_text, 0) + 1


In [None]:
# Show the word counts, most frequent first
print(
    "\033[1m" + "Word Frequency:" + "\033[0m",
    sorted(word_frequencies2.items(), key=lambda pair: pair[1], reverse=True),
)

In [None]:
# Find Maximum Word Frequency.
# `default=0` keeps max() from raising ValueError when the frequency table
# is empty (e.g. every token was filtered out).
maximum_frequency = max(word_frequencies2.values(), default=0)

# Weight each word by its count relative to the most frequent word.
# With an empty table the loop body never runs, so the zero default is
# never used as a divisor.
for word, count in word_frequencies2.items():
    word_frequencies_dist[word] = count / maximum_frequency

In [None]:
# Show the weighted frequencies, highest weight first, followed by a divider
print(
    "\033[1m" + "Weighted Word Frequency Distribution:" + "\033[0m",
    sorted(word_frequencies_dist.items(), key=lambda pair: pair[1], reverse=True),
)
print("", "____________________________________", "", sep="\n")


In [None]:
# Sentence scoring.
# A sentence's score is the running total of the weighted frequencies of
# its words (looked up by lowercased text). Sentences whose text splits
# into 30 or more space-separated pieces are never scored, and a sentence
# with no weighted words gets no entry at all.

# Sentence spans of the article
sentence_list = list(doc.sents)

for sent in sentence_list:
    # Guard clause: long sentences are skipped entirely
    if len(sent.text.split(' ')) >= 30:
        continue
    for word in sent:
        lowered = word.text.lower()
        if lowered in word_frequencies_dist:
            # Accumulate one word at a time, starting from 0 for a new sentence
            sentence_scores[sent] = sentence_scores.get(sent, 0) + word_frequencies_dist[lowered]

In [None]:
# Show the score recorded for each sentence, followed by a divider
print("\033[1m" + "Sentence Scores:" + "\033[0m", sentence_scores)
print("", "____________________________________", "", sep="\n")

In [None]:
# Pick the 7 sentences with the largest scores, highest score first
summarized_sentences = nlargest(
    7,
    sentence_scores,
    key=lambda sentence: sentence_scores.get(sentence),
)

# Convert the selected spaCy spans to plain strings
final_sentences = [span.text for span in summarized_sentences]

# Join the selected sentences with single spaces to form the summary
summary = ' '.join(final_sentences)

In [None]:
# Show the generated summary under a bold heading, followed by a divider
print("\033[1m" + "Spacy Summary:" + "\033[0m", summary)
print("____________________________________", "", sep="\n")


In [None]:
# Render a word cloud of the raw article text as a visual view of word frequency
wordcloud = WordCloud().generate(article_1)

# Draw on the current axes explicitly and hide the axis decorations
axes = plt.gca()
axes.imshow(wordcloud, interpolation='bilinear')
axes.axis("off")
plt.show()