## Web scraping using BeautifulSoup (BS)

In [None]:
import requests
from bs4 import BeautifulSoup

In [None]:
# Address of the Washington Post article that the following cells download and parse.
articleURL = "https://www.washingtonpost.com/news/the-switch/wp/2016/10/18/the-pentagons-massive-new-telescope-is-designed-to-track-space-junk-and-watch-out-for-killer-asteroids/"

In [None]:
import requests

# Make a single GET request for the article; `page` is the response object
# that the parsing cells below read from.
page = requests.get(articleURL)
#print(type(page))
#page.text # actual response text

# `result` stays available under its original name, but it now refers to the
# same response instead of triggering a second download of the same URL.
result = page
# Once a response has come back, its properties can be inspected.
print(result.headers)
print(result.status_code)
print(result.text)  # actual response text
print(result.encoding)


# Display the downloaded HTML inside the Jupyter notebook.
# `display` and `HTML` are imported from the public IPython.display module
# (the IPython.core.display import path is deprecated).
from IPython.display import display, HTML
# HTML() expects the markup as a string, so pass the response text rather than
# the response object itself; display() then renders the resulting HTML object.
display(HTML(page.text))

In [None]:
# BeautifulSoup converts the HTML string into a structured object that can be
# queried later. "lxml" names the parser to use; 'html.parser' is another option.
soup = BeautifulSoup(page.text, "lxml")
# print(soup)  # uncomment to print the HTML document structure

In [None]:
#soup.find('article') #any text corresponding to article is enclosed in 'article' tag
#this prints text along with tag
#finds only first article element

In [None]:
#soup.find('article').text# use text attribute to get the content of the tag

In [None]:
# find_all gives a list of every element with the 'article' tag
# (find, by contrast, returns only the first one).
soup.find_all('article')

In [None]:
# Take the text of each <article> element and combine all of them into one
# string; join() concatenates the pieces using a single space as separator.
text = ' '.join(article.text for article in soup.find_all('article'))


In [None]:
# Remove special characters: replace every non-ASCII character with a space.
# The result is assigned back to `text` so the tokenizing cells below work on
# the cleaned string, and genuine "?" characters are kept (encoding to ASCII
# with "?" placeholders and then replacing every "?" also erased real
# question marks that end sentences).
text = ''.join(ch if ord(ch) < 128 else ' ' for ch in text)
text

## Tokenize

In [None]:
from nltk.tokenize import sent_tokenize,word_tokenize
# Import only the corpus reader that is used below instead of a wildcard
# import, which hides where `stopwords` comes from.
from nltk.corpus import stopwords
from string import punctuation

In [None]:
# Split the article text into a list of sentences.
sents = sent_tokenize(text)
sents

In [None]:
# Lower-case the whole text and split it into word tokens.
word_sent = word_tokenize(text.lower())
word_sent

In [None]:
# Tokens to ignore: the English stopword list plus every punctuation character.
_stopwords = set(stopwords.words('english')).union(punctuation)
_stopwords

In [None]:
# Keep only the tokens that are neither stopwords nor punctuation, in their original order.
word_sent=[word for word in word_sent if word not in _stopwords]

In [None]:
word_sent  # list of the words in the article that remain after filtering

## Create a dict to store the frequency of each word in our vocabulary

In [None]:
from nltk.probability import FreqDist
# Count the occurrences of each remaining word: the dict stores words (that
# are not stopwords) as keys and their frequencies as values. Its keys are
# effectively the vocabulary of our corpus.
freq = FreqDist(word_sent)
freq

In [None]:
from heapq import nlargest
# The 10 keys (words) with the largest values; freq.get supplies each word's
# frequency as the comparison key.
nlargest(10, freq, key=freq.get)

## Compute significance score of each sentence
### Authors of an article tend to repeat the words that are significant to its theme.
### If a sentence contains many such words, its significance score is high.

In [None]:


# defaultdict must be imported before it is used in this cell; without this
# import a top-to-bottom run fails with a NameError.
from collections import defaultdict

# ranking stores the score of each sentence: sentence index -> score.
# defaultdict(int) does not raise an error when a missing key is accessed;
# it simply adds that key with the value 0.
ranking = defaultdict(int)

for i, sent in enumerate(sents):  # i iterates over the indexes of the sentences
    for w in word_tokenize(sent.lower()):  # for each word in the sentence
        if w in freq:  # only words present in our vocabulary count
            # sentence score = sum of the corpus frequencies of the words
            # that appear in the sentence
            ranking[i] += freq[w]

ranking

In [None]:
# Indexes of the 4 sentences with the highest scores.
sents_idx = nlargest(4, ranking, key=ranking.get)
sents_idx

In [None]:
# Sort the selected indexes so the sentences appear in their original order
# and the summary makes logical sense.
[sents[j] for j in sorted(sents_idx)]

In [11]:
import requests
from bs4 import BeautifulSoup
from collections import defaultdict
from heapq import nlargest
from nltk.tokenize import sent_tokenize,word_tokenize
from nltk.corpus import stopwords
from string import punctuation
from nltk.probability import FreqDist
def getTextWaPo(url):
    """Download a web page and return the ASCII text of its articles.

    Intended for Washington Post article pages, whose body text is enclosed
    in ``<article>`` elements.

    Args:
        url: Address of the page to download.

    Returns:
        The text of every ``<article>`` element on the page, joined by single
        spaces, with each non-ASCII character replaced by a space.
    """
    page = requests.get(url)
    soup = BeautifulSoup(page.text, "lxml")
    text = ' '.join(article.text for article in soup.find_all('article'))
    # Blank out non-ASCII characters directly. Encoding them to "?" and then
    # replacing every "?" also erased the genuine question marks that end
    # sentences in the article.
    return ''.join(ch if ord(ch) < 128 else ' ' for ch in text)


def summarize(text, n):
    """Return the ``n`` highest-scoring sentences of ``text`` in original order.

    Words that are neither English stopwords nor punctuation are counted over
    the whole text; a sentence's score is the sum of those counts for the
    words it contains.

    Args:
        text: The text to summarize.
        n: Number of sentences to include in the summary.

    Returns:
        A list with the selected sentences, ordered as they appear in ``text``.

    Raises:
        AssertionError: If ``text`` contains fewer than ``n`` sentences.
    """
    sents = sent_tokenize(text)

    # Check that the text has the required number of sentences.
    assert n <= len(sents), "text has fewer sentences than requested"

    ignored = set(stopwords.words('english') + list(punctuation))
    vocabulary = [word for word in word_tokenize(text.lower()) if word not in ignored]
    freq = FreqDist(vocabulary)

    # Score every sentence, so that a sentence made up only of stopwords or
    # punctuation is still a candidate with score 0. Leaving such sentences
    # out of the ranking could return fewer than n sentences.
    ranking = {
        i: sum(freq[w] for w in word_tokenize(sent.lower()) if w in freq)
        for i, sent in enumerate(sents)
    }

    sents_idx = nlargest(n, ranking, key=ranking.get)
    # Restore document order so the summary reads logically.
    return [sents[j] for j in sorted(sents_idx)]

# Example run: download the article text and produce a 3-sentence summary.
articleURL = "https://www.washingtonpost.com/news/the-switch/wp/2016/10/18/the-pentagons-massive-new-telescope-is-designed-to-track-space-junk-and-watch-out-for-killer-asteroids/"
text = getTextWaPo(articleURL)
summarize(text,3)
