Step-by-step walkthrough of the methods used to extract the counts of the most frequently occurring words in the Wikipedia entry for the United Kingdom.

In [1]:
# address of the Wikipedia article whose text is analysed below
url = "https://en.wikipedia.org/wiki/United_Kingdom"

In [2]:
from urllib.request import urlopen
# download the raw bytes of the page at `url`
# - the `with` block closes the HTTP response as soon as the body is read,
#   instead of leaving the connection open until garbage collection
# - the timeout (seconds) stops the cell from hanging forever if the
#   server never answers
with urlopen(url, timeout=30) as response:
    source = response.read()

In [3]:
from bs4 import BeautifulSoup
# parse the downloaded page into a BeautifulSoup tree,
# selecting the 'lxml' parser backend
soup = BeautifulSoup(source, features='lxml')

In [4]:
# collect the text of every <p> element, then glue the pieces together
# back-to-back (no separator is inserted between paragraphs)
paragraph_texts = [paragraph.text for paragraph in soup.find_all('p')]
text = ''.join(paragraph_texts)

In [5]:
import nltk
# split the article text into tokens with NLTK's word tokenizer (English)
# NOTE(review): the only NLTK resource this notebook downloads is
# 'stopwords'; the tokenizer presumably needs its own data package to be
# installed already - confirm on a fresh environment
tokens = nltk.word_tokenize(text, language='english')

In [6]:
# make sure the NLTK stopword corpus is available, then load the English
# list into a set so the membership test below is a fast lookup
from nltk.corpus import stopwords
nltk.download('stopwords')
english_stopwords = stopwords.words('english')
stop_words = set(english_stopwords)
# keep only the tokens whose lower-cased form is not a stopword
# (surviving tokens keep their original casing)
tokens = [token for token in tokens if token.lower() not in stop_words]

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/maciejtarsa/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [7]:
# keep only purely alphabetic tokens: this removes punctuation, and also
# any token that contains digits, hyphens or apostrophes
tokens = list(filter(str.isalpha, tokens))

In [8]:
# vectorize tokens - count the occurrences of each distinct word
# Counter tallies every token in a single pass, replacing the manual
# try/except bookkeeping; converting back to a plain dict keeps `vectors`
# the same type as before, with words ordered by first appearance.
# Counting is case-sensitive: 'First' and 'first' are separate keys.
from collections import Counter
vectors = dict(Counter(tokens))

In [9]:
# only keep the words that occur at least MIN_COUNT times
# (the comparison is inclusive: a word seen exactly MIN_COUNT times is kept)
MIN_COUNT = 20
vectors_reduced = {word: count for word, count in vectors.items() if count >= MIN_COUNT}

In [10]:
import operator
# order the (word, count) pairs from the highest count to the lowest,
# then rebuild a dict from them; dicts preserve insertion order, so
# iterating `vectors_sorted` yields the most frequent words first
by_count = operator.itemgetter(1)
ranked_pairs = sorted(vectors_reduced.items(), key=by_count, reverse=True)
vectors_sorted = dict(ranked_pairs)

In [11]:
# print the final word -> count mapping held in `vectors_sorted`
print(vectors_sorted)

{'UK': 177, 'per': 128, 'British': 114, 'cent': 114, 'United': 113, 'Kingdom': 104, 'Ireland': 101, 'England': 91, 'Britain': 77, 'Wales': 70, 'Scotland': 68, 'Northern': 64, 'world': 64, 'population': 49, 'Great': 39, 'million': 34, 'first': 34, 'century': 34, 'London': 32, 'people': 29, 'Scottish': 27, 'Welsh': 27, 'English': 26, 'also': 26, 'government': 26, 'around': 26, 'Europe': 25, 'include': 25, 'country': 24, 'countries': 24, 'Union': 24, 'number': 24, 'largest': 23, 'including': 22, 'European': 21, 'Irish': 20, 'international': 20, 'one': 20, 'national': 20}
