# Most Frequent Words in Section of a Webpage

* Console application that displays the most common words used in a portion of a webpage
* The parameters for configuring the function are:
    * The number of words to return (default: 10)
    * Words to exclude from the search
* Page to crawl is https://en.wikipedia.org/wiki/Microsoft
* Get the most common words found, and the number of times each is used, in the section of the page titled "History"
    * That is, the page content from the element with id="History" up to, but not including, the element with id="Corporate_affairs"

In [2]:
# Console application that displays the most common words used in a portion of a webpage
# The parameters for configuring the get_most_common_word_counts function are:
#    number_words = 10 is the number of words to return (default: 10)
#    stop_words lists the words to exclude from the search (default: empty)
# Page to crawl is https://en.wikipedia.org/wiki/Microsoft
# Get the most common words found, and the number of times each is used, in the section of the page titled "History"
#    That is, the page content from the element with id="History" up to, but not including, the element with id="Corporate_affairs"

import re
from collections import Counter
from html.parser import HTMLParser
from urllib.request import urlopen

class MyHTMLParser(HTMLParser):
    """Collect the text found between two marker elements of an HTML page.

    Capturing is switched on by a start tag whose ``id`` attribute equals
    ``start_id`` and switched off by a start tag whose ``id`` equals
    ``end_id``. Every text chunk seen while capturing is appended to
    ``self.data``.

    Args:
        start_id: ``id`` of the element that opens the captured region.
        end_id: ``id`` of the element that closes the region; text that
            follows this element's start tag is not captured.
    """

    def __init__(self, start_id='History', end_id='Corporate_affairs'):
        super().__init__()
        self.start_id = start_id
        self.end_id = end_id
        # True while the parser is between the start and end markers.
        self.in_the_zone = False
        # Captured text chunks, in document order.
        self.data = []

    def handle_starttag(self, tag, attrs):
        """Toggle capturing when one of the marker ids is encountered."""
        for name, value in attrs:
            if name != 'id':
                continue
            if value == self.start_id:
                self.in_the_zone = True
            if value == self.end_id:
                self.in_the_zone = False

    def handle_data(self, data):
        """Store ``data`` when it lies inside the captured region."""
        if self.in_the_zone:
            self.data.append(data)

def get_most_common_word_counts(number_words=10, stop_words=(),
                                url='https://en.wikipedia.org/wiki/Microsoft'):
    """Return the most frequent words in the captured section of a web page.

    The page at ``url`` is downloaded and fed through ``MyHTMLParser``,
    which keeps only the text of the section it is configured to capture.
    That text is split into lowercase words (any run of non-word
    characters or underscores acts as a separator) and the words are
    counted.

    Args:
        number_words: Maximum number of (word, count) pairs to return.
        stop_words: Iterable of words to leave out of the count. Matching
            is case-insensitive because the page text is lowercased.
        url: Address of the page to download.

    Returns:
        A list of ``(word, count)`` tuples sorted by descending count.
        Words with equal counts keep the order of their first appearance
        in the text.
    """
    # The context manager closes the HTTP response even if decoding fails.
    with urlopen(url) as response:
        html = response.read().decode('utf8')

    parser = MyHTMLParser()
    parser.feed(html)
    parser.close()

    text = ' '.join(parser.data)
    # Raw string: '\W' in a normal literal is an invalid escape sequence.
    text = re.sub(r'[\W_]+', ' ', text).lower()

    # Lowercase the stop words so they can match the lowercased text, and
    # use a set so each membership test is O(1).
    excluded = {word.lower() for word in stop_words}

    # Counter tallies in a single pass instead of one list.count() per word.
    counts = Counter(word for word in text.split() if word not in excluded)

    # most_common() with no argument is a stable sort by descending count;
    # slicing afterwards keeps the original [:number_words] semantics.
    return counts.most_common()[:number_words]

# Demo runs: the default top ten, then the top five, then the top ten with
# two stop words removed. end='\n\n' leaves a blank line between results.
print(get_most_common_word_counts(), end='\n\n')
print(get_most_common_word_counts(5), end='\n\n')
print(get_most_common_word_counts(stop_words=['microsoft', 'with']))

[('the', 215), ('microsoft', 135), ('in', 113), ('of', 92), ('and', 92), ('a', 88), ('to', 84), ('on', 61), ('windows', 60), ('for', 53)]

[('the', 215), ('microsoft', 135), ('in', 113), ('of', 92), ('and', 92)]

[('the', 215), ('in', 113), ('of', 92), ('and', 92), ('a', 88), ('to', 84), ('on', 61), ('windows', 60), ('for', 53), ('as', 35)]
