In [52]:
import spacy
from spacy.lang.en.stop_words import STOP_WORDS
from string import punctuation
from heapq import nlargest
import urllib.request as request
import bs4 as BeautifulSoup

In [53]:
fetched_data = request.urlopen('https://en.wikipedia.org')

article_read = fetched_data.read()

In [54]:
article_parsed = BeautifulSoup.BeautifulSoup(article_read,'html.parser')
paragraphs = article_parsed.find_all('p')

In [55]:
article_content = ''
for p in paragraphs:
    article_content += p.text

In [56]:
stopwords = list(STOP_WORDS)
nlp = spacy.load('en_core_web_sm')
doc = nlp(article_content)
tokens = [token.text for token in doc]
punctuation = punctuation + '\n'
punctuation
word_frequencies = {}
for word in doc:
    if word.text.lower() not in stopwords:
        if word.text.lower() not in punctuation:
            if word.text not in word_frequencies.keys():
                word_frequencies[word.text] = 1
            else:
                word_frequencies[word.text] += 1

In [58]:
max_frequency = max(word_frequencies.values())
max_frequency

10

In [59]:
for word in word_frequencies.keys():
    word_frequencies[word] = word_frequencies[word]/max_frequency

print(word_frequencies)

{'Planet': 1.0, 'Apes': 1.0, 'American': 0.1, 'science': 0.1, 'fiction': 0.1, 'media': 0.2, 'franchise': 0.1, 'consisting': 0.1, 'films': 0.2, 'books': 0.1, 'television': 0.2, 'series': 0.3, 'comics': 0.1, 'world': 0.1, 'humans': 0.1, 'intelligent': 0.1, 'apes': 0.1, 'clash': 0.1, 'control': 0.1, 'Based': 0.1, 'French': 0.1, 'author': 0.1, 'Pierre': 0.1, 'Boulle': 0.1, '1963': 0.1, 'novel': 0.1, 'La': 0.1, 'Planète': 0.1, 'des': 0.1, 'singes': 0.1, '1968': 0.1, 'film': 0.4, 'adaptation': 0.1, 'critical': 0.2, 'commercial': 0.1, 'hit': 0.1, '1970': 0.1, '1973': 0.1, 'sequels': 0.1, 'followed': 0.2, 'original': 0.2, 'Beneath': 0.1, 'Escape': 0.1, 'Conquest': 0.1, 'Battle': 0.1, 'approach': 0.1, 'acclaim': 0.1, 'commercially': 0.1, 'successful': 0.1, 'spawning': 0.1, '1974': 0.1, '1975': 0.1, 'Tim': 0.1, 'Burton': 0.1, 'released': 0.1, '2001': 0.1, 'reboot': 0.1, 'began': 0.2, '2011': 0.1, 'Rise': 0.1, 'Dawn': 0.1, '2014': 0.1, 'War': 0.1, '2017': 0.1, 'grossed': 0.1, '2': 0.1, '\xa0': 0.

In [60]:
sentence_tokens = [sent for sent in doc.sents]
print(sentence_tokens)

[Planet of the Apes is an American science-fiction media franchise consisting of films, books, television series, comics, and other media about a world in which humans and intelligent apes clash for control., Based on French author Pierre Boulle's 1963 novel La Planète des singes, its 1968 film adaptation, Planet of the Apes, was a critical and commercial hit., From 1970 to 1973 four sequels followed the original film: Beneath the Planet of the Apes, Escape from the Planet of the Apes, Conquest of the Planet of the Apes and Battle for the Planet of the Apes., They did not approach the critical acclaim of the original, but were commercially successful, spawning two television series in 1974 and 1975., Tim Burton's Planet of the Apes film was released in 2001., A reboot film series began in 2011 with Rise of the Planet of the Apes, which was followed by Dawn of the Planet of the Apes in 2014 and War for the Planet of the Apes in 2017., The films have grossed over $2 billion worldwide., (

In [63]:
sentence_scores = {}
for sent in sentence_tokens:
    for word in sent:
        if word.text.lower() in word_frequencies.keys():
            if sent not in sentence_scores.keys():
                sentence_scores[sent] = word_frequencies[word.text.lower()]
            else:
                sentence_scores[sent] += word_frequencies[word.text.lower()]
sentence_scores

{Planet of the Apes is an American science-fiction media franchise consisting of films, books, television series, comics, and other media about a world in which humans and intelligent apes clash for control.: 2.4000000000000004,
 Based on French author Pierre Boulle's 1963 novel La Planète des singes, its 1968 film adaptation, Planet of the Apes, was a critical and commercial hit.: 1.6000000000000003,
 From 1970 to 1973 four sequels followed the original film: Beneath the Planet of the Apes, Escape from the Planet of the Apes, Conquest of the Planet of the Apes and Battle for the Planet of the Apes.: 1.5000000000000004,
 They did not approach the critical acclaim of the original, but were commercially successful, spawning two television series in 1974 and 1975.: 1.6000000000000003,
 Tim Burton's Planet of the Apes film was released in 2001.: 0.7,
 A reboot film series began in 2011 with Rise of the Planet of the Apes, which was followed by Dawn of the Planet of the Apes in 2014 and War

In [64]:
select_length = int(len(sentence_tokens)*0.3)
select_length

5

In [70]:
summary = nlargest(select_length, sentence_scores, key = sentence_scores.get)
summary

[Planet of the Apes is an American science-fiction media franchise consisting of films, books, television series, comics, and other media about a world in which humans and intelligent apes clash for control.,
 A reboot film series began in 2011 with Rise of the Planet of the Apes, which was followed by Dawn of the Planet of the Apes in 2014 and War for the Planet of the Apes in 2017.,
 Based on French author Pierre Boulle's 1963 novel La Planète des singes, its 1968 film adaptation, Planet of the Apes, was a critical and commercial hit.,
 They did not approach the critical acclaim of the original, but were commercially successful, spawning two television series in 1974 and 1975.,
 From 1970 to 1973 four sequels followed the original film: Beneath the Planet of the Apes, Escape from the Planet of the Apes, Conquest of the Planet of the Apes and Battle for the Planet of the Apes.]

In [71]:
final_summary = [word.text for word in summary]
summary = ' '.join(final_summary)
print(summary)

Planet of the Apes is an American science-fiction media franchise consisting of films, books, television series, comics, and other media about a world in which humans and intelligent apes clash for control. A reboot film series began in 2011 with Rise of the Planet of the Apes, which was followed by Dawn of the Planet of the Apes in 2014 and War for the Planet of the Apes in 2017. Based on French author Pierre Boulle's 1963 novel La Planète des singes, its 1968 film adaptation, Planet of the Apes, was a critical and commercial hit. They did not approach the critical acclaim of the original, but were commercially successful, spawning two television series in 1974 and 1975. From 1970 to 1973 four sequels followed the original film: Beneath the Planet of the Apes, Escape from the Planet of the Apes, Conquest of the Planet of the Apes and Battle for the Planet of the Apes.
