Collecting data of international borders from Wikipedia

In [1]:
import requests
from bs4 import BeautifulSoup
import csv
import codecs

In [2]:
WIKI_URL = "https://en.wikipedia.org/wiki/List_of_countries_and_territories_by_land_and_maritime_borders"

Getting content of table with list of actual international borders

In [3]:
website_url = requests.get(WIKI_URL).text
soup = BeautifulSoup(website_url, 'lxml')
data_table = soup.find('table', {'class':'wikitable sortable'})
rows = data_table.findAll('tr')[3:]

In [4]:
countries = []
for i in rows:
    row = []
    country = i.td.a.string
    if country == None:
        continue
    row.append(country)
    if i.findAll("td")[-1].small != None:
        i.findAll("td")[-1].small.decompose()
    for j in i.findAll("td")[-1].findAll("a"):
        neig = j.string
        if neig != None and neig[0] != '[':
            row.append(neig)
    countries.append(row)

Saving data as CSV file

In [5]:
with codecs.open("../data/borders.csv", "w", "utf-8-sig") as file:
    writer = csv.writer(file)
    writer.writerows(countries)

Downloading COVID-19 data

In [3]:
COVID_URL = "https://opendata.ecdc.europa.eu/covid19/casedistribution/csv"

In [4]:
r = requests.get(COVID_URL, allow_redirects=True)
open("../data/covid_data.csv", "wb").write(r.content)

913907

Loading data

In [32]:
import pandas as pd
import datetime

In [17]:
cases = pd.read_csv("../data/covid_data.csv")

In [18]:
cases.shape

(15698, 11)

In [39]:
cases['countriesAndTerritories'].unique().shape

(209,)

In [19]:
type(cases)

pandas.core.frame.DataFrame

In [30]:
with open("../data/borders.csv", mode="r", encoding="utf-8-sig") as file:
    borders = {}
    for line in file:
        neighbours = line.replace('\n','').replace(' ', '_').upper().split(',')
        borders[neighbours[0]] = neighbours[1:]
    file.close()

Creating mapping between names of countries in COVID dataset and borders dataset

In [48]:
for country in countries_from_covid_dataset:
    if country.upper() not in borders.keys():
        print(country)

Bonaire, Saint Eustatius and Saba
Brunei_Darussalam
Cases_on_an_international_conveyance_Japan
Congo
Cote_dIvoire
Czechia
Eswatini
Falkland_Islands_(Malvinas)
Gambia
Guinea_Bissau
Holy_See
Sao_Tome_and_Principe
Timor_Leste
United_Republic_of_Tanzania
United_States_of_America


In [56]:
mapping_countries_names = {
    'Bonaire, Saint Eustatius and Saba' : 'Netherlands',
    'Brunei_Darussalam' : 'Brunei',
    #'Cases_on_an_international_conveyance_Japan'  this is Diamond Princess,
    'Congo' : 'Republic_of_the_Congo',
    'Cote_dIvoire' : "Côte_d'Ivoire",
    'Czechia' : 'Czech_Republic',
    'Eswatini' : 'Eswatini_(Swaziland)' ,
    'Falkland_Islands_(Malvinas)' : 'Falkland_Islands',
    'Gambia' : 'The_Gambia',
    'Guinea_Bissau' : 'Guinea-Bissau',
    'Holy_See' : 'Vatican_City',
    'Sao_Tome_and_Principe' : 'São_Tomé_and_Príncipe' ,
    'Timor_Leste' : 'East_Timor',
    'United_Republic_of_Tanzania' : 'Tanzania' ,
    'United_States_of_America' : 'United_States',
}

In [57]:
for country in countries_from_covid_dataset:
    if country == 'Cases_on_an_international_conveyance_Japan':
        continue
    if country.upper() not in borders.keys():
        country = mapping_countries_names[country]
        if country.upper() not in borders.keys():
            print(country)