In [1]:
import pandas as pd
import matplotlib.pyplot as plt
from bs4 import BeautifulSoup
import requests
# My aim here is to calculate the distribution of words in the Game of Thrones wiki
# I will use BeautifulSoup to scrape the site and all of its urls
# I will then use pandas DataFrames to store all of the data
# To plot and analyze, I use the matplotlib library

# Seed page of the wiki; every collected link is discovered starting from here.
START_URL = "https://gameofthrones.fandom.com/wiki/Game_of_Thrones"
# Only links that stay inside the wiki's article namespace are kept.
WIKI_PREFIX = "https://gameofthrones.fandom.com/wiki/"
# Seconds to wait for the server before giving up; without an explicit
# timeout a stalled connection would hang the notebook indefinitely.
REQUEST_TIMEOUT = 10
# Set to True to also follow every first-level link once. This issues one
# request per collected URL, so it is slow and is disabled by default.
CRAWL_SECOND_LEVEL = False


def collect_wiki_links(page_url):
    """Return the set of wiki URLs linked from ``page_url``.

    The page is downloaded and parsed with BeautifulSoup. The ``href`` of every
    ``<a>`` tag that carries a ``data-tracking`` attribute is kept when it
    starts with ``WIKI_PREFIX``.

    Args:
        page_url: Absolute URL of the page to scan.

    Returns:
        A set of absolute URLs (possibly empty).

    Raises:
        requests.RequestException: If the download fails, times out, or the
            server answers with an HTTP error status.
    """
    response = requests.get(page_url, timeout=REQUEST_TIMEOUT)
    response.raise_for_status()
    soup = BeautifulSoup(response.text, 'html.parser')

    links = set()
    for a in soup.find_all('a', attrs={"data-tracking": True}):
        href = a.get('href')  # Extract URL
        if href and href.startswith(WIKI_PREFIX):  # Filter relevant wiki pages
            links.add(href)
    return links


# A set (not a list) so each link is stored once; it starts with the seed page itself.
urls = {START_URL}
urls.update(collect_wiki_links(START_URL))

urls2 = urls.copy()

if CRAWL_SECOND_LEVEL:
    for url in urls:
        try:
            urls2.update(collect_wiki_links(url))
        except requests.RequestException as error:
            # One unreachable page should not abort the whole crawl.
            print(f"Skipping {url}: {error}")

# Sorted so the list has the same order on every run (set iteration order is arbitrary).
urls = sorted(urls2)

Now we will actually grab all of the words from each URL. This is achieved by, for each URL, reading the text of every <p> tag and using a dictionary to keep track of word frequency. We again use Beautiful Soup to parse the pages, and use string operations (lower-casing and splitting on whitespace) to clean up our data.

In [2]:
from collections import defaultdict

# Maps each lower-cased, whitespace-separated token to the number of times it
# occurs in the <p> tags of the collected pages.
word_dictionary = defaultdict(int)

for url in urls:
    try:
        # Explicit timeout: without one a stalled connection blocks forever.
        response = requests.get(url, timeout=10)
        # Treat HTTP error statuses as failures so error pages are not counted as content.
        response.raise_for_status()
    except requests.RequestException as error:
        # One bad page should not abort the whole crawl; report it and move on.
        print(f"Skipping {url}: {error}")
        continue

    soup = BeautifulSoup(response.text, 'html.parser')
    for paragraph in soup.find_all("p"):  # every paragraph on the page
        for word in paragraph.text.lower().split():  # lower-case, split on whitespace
            word_dictionary[word] += 1  # increment

We now have our dictionary and would like it sorted, in descending order, by frequency.

In [None]:
# Rebuild the dictionary with its entries ordered from most to least frequent.
# Dicts preserve insertion order, so iterating it afterwards follows that order.
word_dictionary = dict(sorted(word_dictionary.items(), key=lambda item: item[1], reverse=True))
# Show only the 20 most frequent entries; printing the whole dictionary
# floods the cell output and buries the interesting part.
print(dict(list(word_dictionary.items())[:20]))

Perhaps unsurprisingly, "the" is the most common word on the site. Our third most common word is '·' (a middle dot), which is not a word at all, so I will drop it, along with some other unwanted characters.

In [None]:
# Tokens that are not real words; extend this list to drop more of them.
UNWANTED_TOKENS = ['·']

for token in UNWANTED_TOKENS:
    # pop() with a default instead of `del`: re-running this cell (or a crawl
    # in which the token never appeared) would otherwise raise KeyError.
    word_dictionary.pop(token, None)

# Show only the 20 leading entries rather than dumping the whole dictionary.
print(dict(list(word_dictionary.items())[:20]))

In [None]:
# How many of the leading dictionary entries to draw (raise or lower as needed)
n = 5

# Take the first n (key, value) pairs and split them into two parallel lists
leading_items = list(word_dictionary.items())[:n]
keys = [word for word, _ in leading_items]
values = [count for _, count in leading_items]

# Draw the bar chart on the current axes
ax = plt.gca()
ax.bar(keys, values)
ax.set_xlabel("Keys")
ax.set_ylabel("Values")
ax.set_title(f"First {n} Elements of Dictionary")
plt.show()