## Webpage word count
### With an example using country pages on Wikipedia
An application that extracts the content of a webpage and displays the most frequently occurring words on that page.

In [72]:
import operator
import urllib.request, urllib.error
import urllib.parse
from urllib.request import urlopen

import nltk
from bs4 import BeautifulSoup
from nltk import word_tokenize
from nltk.corpus import stopwords
# fetch the NLTK resources used for tokenizing and stop-word removal
nltk.download('punkt')
# NOTE: recent NLTK releases load the tokenizer data from 'punkt_tab' instead of
# 'punkt'; downloading both keeps word_tokenize working on old and new versions
nltk.download('punkt_tab')
nltk.download('stopwords')
# kept as a set so membership tests during token filtering are fast
stop_words = set(stopwords.words('english'))

[nltk_data] Downloading package punkt to
[nltk_data]     /Users/maciejtarsa/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/maciejtarsa/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [73]:
# set word limit
# i.e. the minimum number of times a word has to occur for it to be displayed
# (top_words keeps a word when its count is >= word_limit)
# for countries on Wikipedia it is set to 20
word_limit = 20

In [74]:
# a helper function to vectorize the tokens
# i.e. count the occurrences of each token in the text
# takes a list of tokens as input
# returns a dictionary containing features as keys and weights as values
def vectorize(tokens):
    """Count how many times each token appears.

    Takes an iterable of tokens and returns a dictionary whose keys are the
    distinct tokens and whose values are their occurrence counts. Keys are
    inserted in order of first appearance.
    """
    counts = {}
    for item in tokens:
        # unseen tokens start from 0, then the current occurrence is added
        counts[item] = counts.get(item, 0) + 1
    return counts

In [75]:
# a helper function to check that the URL is reachable
# implementation from:
# https://stackoverflow.com/questions/1726402/in-python-how-do-i-use-urllib-to-see-if-a-website-is-404-or-200
def check_URL(url):
    """Check that a URL can be opened.

    Tries to open `url` with urllib.request.urlopen. Failures are reported by
    printing 'HTTPError: <code>' or 'URLError: <reason>' instead of raising.
    Exceptions other than HTTPError and URLError are not caught.

    Returns True when the URL was opened successfully, False otherwise.
    """
    try:
        # the context manager closes the connection once the check is done,
        # so the probe does not leave an open response behind
        with urllib.request.urlopen(url):
            pass
    except urllib.error.HTTPError as e:
        # Return code error (e.g. 404, 501, ...)
        print('HTTPError: {}'.format(e.code))
        return False
    except urllib.error.URLError as e:
        # Not an HTTP-specific error (e.g. connection refused)
        print('URLError: {}'.format(e.reason))
        return False
    # opened without error (e.g. 200)
    return True

In [76]:
# a function that counts the occurrences of words on a webpage
# takes a url as input
# returns a dictionary of words and their occurrence counts
def top_words(url, min_count=None):
    """Count the most frequent words in the paragraphs of a webpage.

    Parameters:
        url: address of the page to analyse.
        min_count: minimum number of occurrences a word needs to be kept.
            When None (the default), the module-level `word_limit` is used.

    Returns:
        A dictionary mapping each kept word to its count, ordered by count
        in descending order, or None when check_URL reports that the URL
        could not be opened (check_URL prints the reason).

    Counting is case-sensitive; stop words are matched case-insensitively.
    """
    if min_count is None:
        min_count = word_limit

    # check the URL first; stop here if it cannot be opened
    if not check_URL(url):
        return None

    # download the page, closing the connection as soon as it has been read
    with urlopen(url) as page:
        source = page.read()

    # make a soup
    soup = BeautifulSoup(source, 'lxml')
    # extract the plain text content from paragraphs; joining with a space
    # stops the last word of one paragraph fusing with the first word of the
    # next (e.g. "end.Start"), which would then be dropped as non-alphabetic
    text = ' '.join(paragraph.text for paragraph in soup.find_all('p'))

    # tokenize the words
    tokens = word_tokenize(text, language="english")
    # remove stopwords and anything that is not purely alphabetic
    # (punctuation, numbers, mixed tokens)
    tokens = [
        word for word in tokens
        if word.isalpha() and word.lower() not in stop_words
    ]
    # vectorize tokens - count occurrences of each word
    vectors = vectorize(tokens)
    # only keep words that occur at least min_count times
    vectors_reduced = {
        word: count for word, count in vectors.items() if count >= min_count
    }
    # sort the result in descending order of count
    return dict(
        sorted(vectors_reduced.items(), key=operator.itemgetter(1), reverse=True)
    )

### Example - country pages on Wikipedia

#### Option 1
The user specifies the country (or any other Wikipedia page name)

In [78]:
# ask the user for input
user_input = input('Please input country: ')
# trim stray leading/trailing whitespace, then convert spaces to underscores
user_input_cleaned = user_input.strip().replace(' ', '_')
# set up a full path for URL; the page name is percent-encoded so that
# non-ASCII or reserved characters (e.g. "Côte d'Ivoire") still form a valid URL
url = 'https://en.wikipedia.org/wiki/' + urllib.parse.quote(user_input_cleaned)
# run the function to extract the top words
top_words(url)

Please input country: United Kingdomm
HTTPError: 404


#### Option 2
Hardcode the full URL - copy and paste from a website

In [79]:
# set up a full path for URL (copied and pasted as-is from the website)
url = 'https://en.wikipedia.org/wiki/United_Kingdom'
# run the function to extract top words; as the last expression in the cell,
# the returned dictionary is displayed as the cell output
top_words(url)

{'UK': 177,
 'per': 128,
 'British': 114,
 'cent': 114,
 'United': 113,
 'Kingdom': 104,
 'Ireland': 101,
 'England': 91,
 'Britain': 77,
 'Wales': 70,
 'Scotland': 68,
 'Northern': 64,
 'world': 64,
 'population': 49,
 'Great': 39,
 'million': 34,
 'first': 34,
 'century': 34,
 'London': 32,
 'people': 29,
 'Scottish': 27,
 'Welsh': 27,
 'English': 26,
 'also': 26,
 'government': 26,
 'around': 26,
 'Europe': 25,
 'include': 25,
 'country': 24,
 'countries': 24,
 'Union': 24,
 'number': 24,
 'largest': 23,
 'including': 22,
 'European': 21,
 'Irish': 20,
 'international': 20,
 'one': 20,
 'national': 20}