In [0]:
%pip install bs4
%pip install lxml
%pip install nltk
%pip install textblob

In [0]:
import urllib.request as ur
from bs4 import BeautifulSoup

## STEP 1: Read data from HTML and parse it into a clean string

In [0]:
#The summary will be extracted from the article on this HTML page
articleURL = "https://www.washingtonpost.com/news/the-switch/wp/2016/10/18/the-pentagons-massive-new-telescope-is-designed-to-track-space-junk-and-watch-out-for-killer-asteroids/"

In [0]:
#Download the page; the raw HTML wraps the article in a tree-like structure of extra tags
#The with-block closes the HTTP connection as soon as the body has been read
with ur.urlopen(articleURL) as response:
    page = response.read().decode('utf8','ignore')  #bytes that are not valid UTF-8 are dropped
soup = BeautifulSoup(page,"lxml")
soup

In [0]:
#We only want the base text, which sits inside the page's <article> element
soup.find('article')

In [0]:
#.text strips the tags from the <article> element and leaves only its plain text
soup.find('article').text

In [0]:
#Collect the text of every <article> element found by find_all and glue the pieces into one string with " "
text = ' '.join(tag.text for tag in soup.find_all('article'))
text

In [0]:
#Reduce the article text to plain ASCII: every non-ASCII character becomes a single blank.
#Characters are checked one by one rather than encoding with errors='replace' and then
#replacing "?", because that would also wipe out the genuine question marks in the article.
''.join(ch if ord(ch) < 128 else ' ' for ch in text)

In [0]:
#All above steps encapsulated- to read and parse data from HTMl text
import urllib.request as ur
from bs4 import BeautifulSoup
def getTextWaPo(url):
    """Download an article page and return its body as plain ASCII text.

    The text of every ``<article>`` element on the page is joined with single
    spaces. Each non-ASCII character is replaced by one space; ordinary ASCII
    punctuation, including real question marks, is left untouched so that
    sentence boundaries survive for later tokenization.

    Args:
        url: Address of the article page to fetch.

    Returns:
        The ASCII-only article text, or an empty string when the page has no
        ``<article>`` element.

    Raises:
        urllib.error.URLError: If the page cannot be retrieved.
    """
    # The context manager closes the HTTP response even if reading fails.
    with ur.urlopen(url) as response:
        # Drop undecodable bytes instead of aborting on a malformed page.
        page = response.read().decode('utf8', 'ignore')
    soup = BeautifulSoup(page, "lxml")
    text = ' '.join(tag.text for tag in soup.find_all('article'))
    # Blank out non-ASCII characters one at a time. Encoding with
    # errors='replace' and then replacing "?" would also erase the question
    # marks that genuinely belong to the article.
    return ''.join(ch if ord(ch) < 128 else ' ' for ch in text)

In [0]:
#Call the helper on the article page to get its cleaned-up text
articleURL= "https://www.washingtonpost.com/news/the-switch/wp/2016/10/18/the-pentagons-massive-new-telescope-is-designed-to-track-space-junk-and-watch-out-for-killer-asteroids/"
text = getTextWaPo(articleURL)
text

## STEP 2: Extract summary

In [0]:
import nltk
from nltk.tokenize import sent_tokenize,word_tokenize
from nltk.corpus import stopwords
from string import punctuation

In [0]:
#Split the text into sentences
#A sentence normally ends at a full stop. The full stop has to be followed by a space, otherwise the two sentences are treated as a single one
nltk.download('punkt')
nltk.download('punkt_tab')  #recent NLTK releases load the sentence model from 'punkt_tab' instead of 'punkt'
sents = sent_tokenize(text)
sents

In [0]:
#Split the lower-cased text into its individual words/tokens
word_sent = word_tokenize(text.lower())
word_sent

In [0]:
#Build the set of tokens to ignore: all English stop words plus every punctuation character
nltk.download('stopwords')
_stopwords = set(stopwords.words('english')) | set(punctuation)
_stopwords

In [0]:
#Keep only the tokens of the text that are neither stop words nor punctuation marks
word_sent = [token for token in word_sent if token not in _stopwords]
word_sent

In [0]:
#Use the built-in FreqDist class to count how many times each word occurs in the text
#The higher the frequency, the more important the word
from nltk.probability import FreqDist
freq = FreqDist(word_sent)
freq

In [0]:
#heapq.nlargest(n, iterable, key=...) returns the n largest elements of an iterable such as a list or tuple.
#Iterating over freq yields its words, and key=freq.get ranks each word by its frequency (the value of its key:value pair).

from heapq import nlargest
nlargest(10, freq, key=freq.get)
#Check whether these 10 most frequent words match the central theme of the article, 'Space asteroid attack'

In [0]:
#With the word importance known, score every sentence:
#  word importance             = frequency of the word in the whole text
#  sentence significance score = sum of the importance of the words in the sentence

from collections import defaultdict
ranking = defaultdict(int)

for idx, sentence in enumerate(sents):
    for token in word_tokenize(sentence.lower()):
        if token not in freq:
            continue  #stop words and punctuation were never counted
        ranking[idx] = ranking[idx] + freq[token]

ranking
#{index of sentence : sentence significance score}

In [0]:
#Indices of the 4 most important sentences, i.e. those with the highest significance score
sents_idx = nlargest(4, ranking, key=ranking.get)
sents_idx

In [0]:
#Look up the sentences for the top indices; sorting the indices keeps the sentences in article order
summary_1=[sents[j] for j in sorted(sents_idx)]
summary_1

In [0]:
#Join the most important sentences to form the summary, separated by single spaces
#(without a separator the end of one sentence runs straight into the start of the next)
summary = ' '.join(summary_1)

summary

In [0]:
def summarize(text, n):
    """Build an extractive summary from the ``n`` highest-scoring sentences.

    A word's importance is the number of times it occurs in ``text`` once
    English stop words and punctuation marks are discarded. A sentence's score
    is the sum of the importance of its words. The ``n`` best sentences are
    returned in the order they have in ``text``, separated by single spaces.

    Args:
        text: The document to summarize.
        n: Number of sentences to keep.

    Returns:
        The selected sentences joined into one string.

    Raises:
        AssertionError: If ``n`` exceeds the number of sentences in ``text``.
    """
    sents = sent_tokenize(text)
    assert n <= len(sents), "n cannot exceed the number of sentences in the text"

    _stopwords = set(stopwords.words('english') + list(punctuation))
    freq = FreqDist(
        word for word in word_tokenize(text.lower()) if word not in _stopwords
    )

    # Score every sentence, including those without any counted word
    # (score 0), so that asking for n sentences always yields n sentences.
    scores = [
        sum(freq[w] for w in word_tokenize(sent.lower()) if w in freq)
        for sent in sents
    ]

    top_idx = nlargest(n, range(len(sents)), key=scores.__getitem__)
    # Restore article order, and put a space between sentences so that the
    # end of one does not run straight into the start of the next.
    return ' '.join(sents[j] for j in sorted(top_idx))

In [0]:
#Call summarize to condense the article text into its 4 most important sentences
summarize(text,4)