In [1]:
# Practica 8 - BS4 + JSON: scrape the continent population table from Wikipedia and save it as JSON
# María José Medina Hernández

In [2]:
# Imports
import requests
import json
import re
from bs4 import BeautifulSoup

In [3]:
# Download the HTML
url = "https://en.wikipedia.org/wiki/World_population"
# Without a timeout, requests waits forever if the server never answers.
response = requests.get(url, timeout=30)
# Stop here on a 4xx/5xx answer instead of parsing an error page later on.
response.raise_for_status()

In [4]:
# Build the parse tree from the raw response bytes with Python's built-in parser
soup = BeautifulSoup(markup=response.content, features="html.parser")

In [5]:
# Matches a parenthesised or bracketed note such as "(2020)" or "[note 1]".
# Raw string: "\(" and "\[" are not valid escapes in a normal string literal.
_NOTE_RE = re.compile(r"[\(\[].*?[\)\]]")

# Highest cell index read from a row is 4, so a usable row needs 5 <td> cells.
_EXPECTED_CELLS = 5


def _strip_notes(text):
    """Return ``text`` with every parenthesised/bracketed note removed."""
    return _NOTE_RE.sub("", text)


def _parse_entry(cell_text):
    """Split a "<population> – <name>" cell into a ``(name, population)`` tuple.

    Returns ``None`` when the cell contains no en-dash separator.
    """
    cell_text = cell_text.strip()
    # Non-breaking spaces are dropped before extracting the name.
    segments = cell_text.replace("\xa0", "").split("–")
    if len(segments) < 2:
        return None
    # Only the text between the first and second en-dash is used, and of that
    # only the part before the first ";" (cells may list several entries).
    name = _strip_notes(segments[1].strip().split(";")[0])
    # The population is the first space-delimited token, cut at any "[".
    population = _strip_notes(cell_text.split(" ")[0].split("[")[0])
    return name, population


def find_data(soup):
    """
    Finds data of the continents and parses it to a dictionary.

    Each table row with at least five <td> cells becomes one entry, keyed by
    the text of its first cell. Rows with fewer cells (for example header
    rows) are skipped. The "most populous country" key is left out when the
    country cell contains "N/A" or has no en-dash separator; the
    "most populous city" key is left out when the city cell has no en-dash
    separator.

    Params
    -------
    soup : bs4.BeautifulSoup
        Parsed html document.

    Return
    -------
    data : dic
        The dictionary with the data of each continent.

    Raises
    -------
    ValueError
        If the document has no table with class "wikitable sortable".
    """
    table = soup.find("table", class_="wikitable sortable")
    if table is None:
        raise ValueError('no table with class "wikitable sortable" found in the document')

    data = {}
    for tr in table.find_all("tr"):
        tds = tr.find_all("td")
        # Skip rows that cannot be indexed up to tds[4].
        if len(tds) < _EXPECTED_CELLS:
            continue

        record = {
            "density": _strip_notes(tds[1].get_text().strip()),
            "population": _strip_notes(tds[2].get_text().strip()),
        }

        # If there are countries in the continent (eg Antarctica is empty)
        country_text = tds[3].get_text()
        country = None if "N/A" in country_text else _parse_entry(country_text)
        if country is not None:
            record["most populous country"] = {
                "country": country[0],
                "population": country[1],
            }

        city = _parse_entry(tds[4].get_text())
        if city is not None:
            record["most populous city"] = {
                "city": city[0],
                "population": city[1],
            }

        data[tds[0].get_text().strip()] = record
    return data

In [7]:
# Extract the per-continent records from the parsed page.
data = find_data(soup)

In [8]:
def print_json(dic, file_path):
    """
    Serializes a dictionary as JSON and writes it to a file.

    The file is opened in write mode, so any existing content is replaced.

    Params
    -------
    dic : dict
        The JSON-serializable object to write.
    file_path : str
        Path of the file to write to.
    """
    with open(file_path, mode="w") as handle:
        json.dump(dic, fp=handle)

In [9]:
# Write the scraped records next to the notebook
file_path = "world-population.json"
print_json(dic=data, file_path=file_path)

In [10]:
# Inspection only: show the type of the parsed document object.
type(soup)

bs4.BeautifulSoup