# Human Development Index

In [19]:
import requests
import urllib.request
import time
from bs4 import BeautifulSoup
import numpy as np
import pandas as pd
import regex as re
import matplotlib.pyplot as plt
from urllib.request import urlopen
import pycountry

In [20]:
# Download the Wikipedia HDI ranking page and parse it into a BeautifulSoup tree.
url = 'https://en.m.wikipedia.org/wiki/List_of_countries_by_Human_Development_Index'
# The context manager closes the HTTP response as soon as parsing is finished.
# BeautifulSoup reads the whole body while the block is still open, so `soup`
# remains fully usable afterwards.
with urlopen(url) as html:
    soup = BeautifulSoup(html, 'html.parser')

In [21]:
tables = soup.find_all('body')

In [22]:
# The page is expected to contain two tables carrying the "wikitable" CSS class.
# Collect the parsed markup of each one into a list and report how many were found.
tabl = soup.find_all("table", class_="wikitable")
print("Number of tables on site: ", len(tabl))

Number of tables on site:  2


In [23]:
# Work on the first wikitable: its first <tr> is treated as the header row and
# every following <tr> as a data row.
table1 = tabl[0]

body = table1.find_all("tr")
head, body_rows = body[0], body[1:]

# Header labels are the text of each <th>, minus any trailing newlines.
headings = [cell.text.rstrip("\n") for cell in head.find_all("th")]
print(headings)

['Rank', 'Nation', 'HDI']


In [24]:
# Turn every table row into a list of cleaned cell strings (one inner list per <tr>).
# For each <td>, .text drops the markup; the substitution then deletes
# non-breaking spaces (\xa0), newlines and commas from what is left.
all_rows = [
    [re.sub("(\xa0)|(\n)|,", "", cell.text) for cell in table_row.find_all("td")]
    for table_row in body_rows
]

In [25]:
# Number of parsed rows that hold more than one cell.
count = sum(1 for cells in all_rows if len(cells) > 1)
print(count)

189


In [26]:
# Keep only the parsed rows that hold more than one cell.
country_data = [cells for cells in all_rows if len(cells) > 1]

In [27]:
country_data[0:6]

[['1', '', '0.957', ' 0.20%'],
 ['2', ' (7)', '0.955', ' 0.65%'],
 ['2', '', '0.955', ' 0.16%'],
 ['4', ' (7)', '0.949', ' 0.54%'],
 ['4', ' (4)', '0.949', ' 0.62%'],
 ['6', ' (3)', '0.947', ' 0.24%']]

In [28]:
df = pd.DataFrame(country_data)

In [118]:
df.head()

Unnamed: 0,0,1,2,3
0,1,,0.957,0.20%
1,2,(7),0.955,0.65%
2,2,,0.955,0.16%
3,4,(7),0.949,0.54%
4,4,(4),0.949,0.62%


In [119]:
# Join the scraped country names onto the HDI rows by position (index-on-index).
# NOTE(review): `country` is only created further down this notebook
# (`country = pd.DataFrame(nation)`), so on a fresh kernel run top-to-bottom this
# cell raises NameError — the name-scraping cells need to run before this one.
data_frame = pd.merge(country,df,left_index = True, right_index = True)

In [120]:
data_frame.head(3)

Unnamed: 0,0_x,0_y,1,2,3
0,Norway,1,,0.957,0.20%
1,Republic of Ireland,2,(7),0.955,0.65%
2,Switzerland,2,,0.955,0.16%


In [121]:
del data_frame['0_y']

In [122]:
data_frame.columns = ['Country', 'HDI Change (5 yrs)', 'HDI (2020 Report)', 'Avg. annual growth (%)']

In [123]:
data_frame

Unnamed: 0,Country,HDI Change (5 yrs),HDI (2020 Report),Avg. annual growth (%)
0,Norway,,0.957,0.20%
1,Republic of Ireland,(7),0.955,0.65%
2,Switzerland,,0.955,0.16%
3,Hong Kong,(7),0.949,0.54%
4,Iceland,(4),0.949,0.62%
...,...,...,...,...
182,Sierra Leone,(2),0.452,1.40%
183,Mali,,0.434,0.69%
184,Burundi,(5),0.433,0.58%
185,South Sudan,(3),0.433,0.61%


In [124]:
data_frame.isnull().sum()

Country                   0
HDI Change (5 yrs)        0
HDI (2020 Report)         0
Avg. annual growth (%)    0
dtype: int64

In [125]:
# Gather the target of every anchor on the page, leaving out bare "#" links.
project_href = [
    anchor['href']
    for anchor in soup.find_all('a', href=True)
    if anchor['href'] != "#"
]

# Only links 85..275 are used; keep the last path segment of each of them.
nations = [link.split('/')[-1] for link in project_href[85:276]]
print(nations)

['#cite_note-2020_components-2', '#cite_note-2020_trends-18', 'Norway', 'Republic_of_Ireland', 'Switzerland', 'Hong_Kong', 'Iceland', 'Germany', 'Sweden', 'Australia', 'Netherlands', 'Denmark', 'Finland', 'Singapore', 'United_Kingdom', 'Belgium', 'New_Zealand', 'Canada', 'United_States', 'Austria', 'Israel', 'Japan', 'Liechtenstein', 'Slovenia', 'Luxembourg', 'South_Korea', 'Spain', 'France', 'Czech_Republic', 'Malta', 'Estonia', 'Italy', 'United_Arab_Emirates', 'Greece', 'Cyprus', 'Lithuania', 'Poland', 'Andorra', 'Latvia', 'Portugal', 'Slovakia', 'Hungary', 'Saudi_Arabia', 'Bahrain', 'Chile', 'Croatia', 'Qatar', 'Argentina', 'Brunei', 'Montenegro', 'Romania', 'Palau', 'Kazakhstan', 'Russia', 'Belarus', 'Turkey', 'Uruguay', 'Bulgaria', 'Panama', 'The_Bahamas', 'Barbados', 'Oman', 'Georgia_(country)', 'Costa_Rica', 'Malaysia', 'Kuwait', 'Serbia', 'Mauritius', 'Seychelles', 'Trinidad_and_Tobago', 'Albania', 'Cuba', 'Iran', 'Sri_Lanka', 'Bosnia_and_Herzegovina', 'Grenada', 'Mexico', 'Sai

In [126]:
stp = [i for i,j in enumerate(nations) if j == "S%C3%A3o_Tom%C3%A9_and_Pr%C3%ADncipe"]

In [127]:
stp

[138]

In [128]:
def text_cleaner(x):
    """Return the entries of ``x`` that contain no '#', with underscores turned into spaces.

    Parameters
    ----------
    x : iterable of str
        Raw strings to clean.

    Returns
    -------
    list of str
        The kept entries in their original order; any entry containing '#'
        is left out.
    """
    return [entry.replace('_', ' ') for entry in x if '#' not in entry]

In [129]:
nation = text_cleaner(nations)

In [130]:
#verify all data from column nation had been collected
len(nation)

187

In [131]:
country = pd.DataFrame(nation)

In [132]:
country.head(2)

Unnamed: 0,0
0,Norway
1,Republic of Ireland


In [133]:
# Preview the frame with column 0 labelled "Country".
# NOTE(review): rename() returns a new DataFrame and the result is not assigned,
# so `country` itself keeps its original column label 0.
country.rename(columns = {0:"Country"})

Unnamed: 0,Country
0,Norway
1,Republic of Ireland
2,Switzerland
3,Hong Kong
4,Iceland
...,...
182,Sierra Leone
183,Mali
184,Burundi
185,South Sudan


In [134]:
# Replace the percent-encoded link name with a plain-ASCII spelling.
# The row is selected by its value rather than by a hard-coded position, and the
# write goes through a single .loc call: chained indexing (frame[col][row] = ...)
# may assign into a temporary copy and leave data_frame untouched.
encoded_name = "S%C3%A3o Tom%C3%A9 and Pr%C3%ADncipe"
data_frame.loc[data_frame['Country'] == encoded_name, 'Country'] = "Sao Tome and Principe"

In [135]:
data_frame['Country'][134]

'Sao Tome and Principe'

In [136]:
countries_negative_hdi_growth = ['Lebanon', 'Jordan', 'Libya', 'Venezuela', 'East Timor', 'Syria', 'Yemen']

In [137]:
data_frame.head()

Unnamed: 0,Country,HDI Change (5 yrs),HDI (2020 Report),Avg. annual growth (%)
0,Norway,,0.957,0.20%
1,Republic of Ireland,(7),0.955,0.65%
2,Switzerland,,0.955,0.16%
3,Hong Kong,(7),0.949,0.54%
4,Iceland,(4),0.949,0.62%


In [138]:
data_frame.isnull().sum()

Country                   0
HDI Change (5 yrs)        0
HDI (2020 Report)         0
Avg. annual growth (%)    0
dtype: int64

In [139]:
data_frame.dtypes

Country                   object
HDI Change (5 yrs)        object
HDI (2020 Report)         object
Avg. annual growth (%)    object
dtype: object

In [140]:
# Drop the final character of every growth entry (e.g. the trailing "%").
data_frame['Avg. annual growth (%)'] = data_frame['Avg. annual growth (%)'].str[:-1]

In [141]:
# Reduce entries such as " (7)" to the number they contain.
# The whole first run of digits is captured, so a multi-digit change such as
# " (12)" stays "12" (picking the single character at position 2 would give "1"),
# and strings shorter than three characters can no longer raise IndexError.
# Entries without any digit become the empty string; re-running the cell on
# already-cleaned values leaves them unchanged.
data_frame['HDI Change (5 yrs)'] = (
    data_frame['HDI Change (5 yrs)']
    .str.extract(r'(\d+)', expand=False)
    .fillna('')
)

In [142]:
# Replace the marker 'NA[a' (presumably what remains of a footnoted "NA" entry
# once its last character has been dropped — confirm against the page) with zero.
# The placeholder is the string '0' rather than the integer 0: the column is later
# converted with .str.strip().astype(float), and the .str accessor turns
# non-string elements into NaN, which would silently discard the zero.
n = ['0' if value == 'NA[a' else value for value in data_frame['Avg. annual growth (%)']]
print(pd.Series(n))

0       0.20
1       0.65
2       0.16
3       0.54
4       0.62
       ...  
182     1.40
183     0.69
184     0.58
185     0.61
186     0.84
Length: 187, dtype: object


In [143]:
data_frame['Avg. annual growth (%)'] = n

In [144]:
data_frame['HDI (2020 Report)'] = data_frame['HDI (2020 Report)'].str.strip().astype(float)

In [145]:
data_frame['Avg. annual growth (%)'] = data_frame['Avg. annual growth (%)'].str.strip().astype(float)

In [146]:
data_frame.dtypes

Country                    object
HDI Change (5 yrs)         object
HDI (2020 Report)         float64
Avg. annual growth (%)    float64
dtype: object

In [147]:
data_frame.head()

Unnamed: 0,Country,HDI Change (5 yrs),HDI (2020 Report),Avg. annual growth (%)
0,Norway,,0.957,0.2
1,Republic of Ireland,7.0,0.955,0.65
2,Switzerland,,0.955,0.16
3,Hong Kong,7.0,0.949,0.54
4,Iceland,4.0,0.949,0.62


In [148]:
# Give the countries listed below a negative growth figure; every other row keeps
# its value. `x` holds one entry per row of data_frame, in row order.
neg_growth_countries = ['Lebanon', 'Jordan', 'Libya', 'Venezuela', 'East Timor', 'Syria', 'Yemen']
# -abs(...) instead of -1 * ... keeps the cell idempotent: running it again after
# the column has been overwritten with `x` does not flip the values back.
# Columns are addressed by name so the logic does not depend on column positions.
x = [
    -abs(growth) if name in neg_growth_countries else growth
    for name, growth in zip(data_frame['Country'], data_frame['Avg. annual growth (%)'])
]

print(len(x))
        

187


In [149]:
data_frame['Avg. annual growth (%)'] = x

In [150]:
data_frame.columns

Index(['Country', 'HDI Change (5 yrs)', 'HDI (2020 Report)',
       'Avg. annual growth (%)'],
      dtype='object')

In [151]:
# A call to annual_growth(data_frame['Country'], data_frame) used to sit here.
# It was removed because no function of that name is defined in this notebook,
# so the cell could only raise NameError. The sign adjustment of the growth
# column is performed inline by the cells that build `x` and assign it back.

In [152]:
data_frame

Unnamed: 0,Country,HDI Change (5 yrs),HDI (2020 Report),Avg. annual growth (%)
0,Norway,,0.957,0.20
1,Republic of Ireland,7,0.955,0.65
2,Switzerland,,0.955,0.16
3,Hong Kong,7,0.949,0.54
4,Iceland,4,0.949,0.62
...,...,...,...,...
182,Sierra Leone,2,0.452,1.40
183,Mali,,0.434,0.69
184,Burundi,5,0.433,0.58
185,South Sudan,3,0.433,0.61


In [88]:
# The UN geoscheme (country -> subregion/region) is needed as well: download the
# Wikipedia page and parse it into a BeautifulSoup tree.
url_ungs = "https://en.wikipedia.org/wiki/List_of_countries_by_United_Nations_geoscheme"
# The context manager closes the HTTP response once parsing is finished; the
# parser reads the whole body inside the block, so `soup1` stays usable afterwards.
with urlopen(url_ungs) as html1:
    soup1 = BeautifulSoup(html1, 'html.parser')

In [89]:
geoschemes = soup1.find_all('body')

In [90]:
# Collect every table with the "wikitable" class on the geoscheme page.
tables1 = soup1.find_all("table", attrs={"class": "wikitable"})
# Report the size of tables1 itself; `tabl` holds the tables of the HDI page
# scraped earlier and says nothing about this page.
print("Number of tables on site: ",len(tables1))

Number of tables on site:  2


In [91]:
# First wikitable of the geoscheme page: row 0 is taken as the header row,
# the remaining rows as data.
table_ungs = tables1[0]

body = table_ungs.find_all("tr")
head = body[0]
body_rows = body[1:]

# Column titles: the text of each <th> with trailing newlines removed.
headings = [th.text.rstrip("\n") for th in head.find_all("th")]
print(headings)

['Country or Area', 'Sub-Subregion', 'Subregion', 'Region', 'UNSD M49 Codes']


In [92]:
# Build one list of cleaned cell strings per geoscheme table row.
all_rows = []
for table_row in body_rows:
    cleaned_cells = []
    for cell in table_row.find_all("td"):
        # .text drops the markup; the substitution deletes \xa0, newlines and commas.
        cleaned_cells.append(re.sub("(\xa0)|(\n)|,", "", cell.text))
    all_rows.append(cleaned_cells)

In [94]:
type(all_rows)

list

In [95]:
for i in all_rows:
    print(i)

[' Algeria', '', 'Northern Africa', 'Africa', '012 < 015 < 002 < 001']
[' Egypt', '', 'Northern Africa', 'Africa', '818 < 015 < 002 < 001']
[' Libya', '', 'Northern Africa', 'Africa', '434 < 015 < 002 < 001']
[' Morocco', '', 'Northern Africa', 'Africa', '504 < 015 < 002 < 001']
[' Sudan', '', 'Northern Africa', 'Africa', '729 < 015 < 002 < 001']
[' Tunisia', '', 'Northern Africa', 'Africa', '788 < 015 < 002 < 001']
[' Western Sahara', '', 'Northern Africa', 'Africa', '732 < 015 < 002 < 001']
[' British Indian Ocean Territory', 'Eastern Africa', 'Sub-Saharan Africa', 'Africa', '086 < 014 < 202 < 002 < 001']
[' Burundi', 'Eastern Africa', 'Sub-Saharan Africa', 'Africa', '108 < 014 < 202 < 002 < 001']
[' Comoros', 'Eastern Africa', 'Sub-Saharan Africa', 'Africa', '174 < 014 < 202 < 002 < 001']
[' Djibouti', 'Eastern Africa', 'Sub-Saharan Africa', 'Africa', '262 < 014 < 202 < 002 < 001']
[' Eritrea', 'Eastern Africa', 'Sub-Saharan Africa', 'Africa', '232 < 014 < 202 < 002 < 001']
[' Ethio

In [221]:
df1 = pd.DataFrame(all_rows)

In [222]:
df1

Unnamed: 0,0,1,2,3,4
0,Algeria,,Northern Africa,Africa,012 < 015 < 002 < 001
1,Egypt,,Northern Africa,Africa,818 < 015 < 002 < 001
2,Libya,,Northern Africa,Africa,434 < 015 < 002 < 001
3,Morocco,,Northern Africa,Africa,504 < 015 < 002 < 001
4,Sudan,,Northern Africa,Africa,729 < 015 < 002 < 001
...,...,...,...,...,...
244,Samoa,,Polynesia,Oceania,882 < 061 < 009 < 001
245,Tokelau,,Polynesia,Oceania,772 < 061 < 009 < 001
246,Tonga,,Polynesia,Oceania,776 < 061 < 009 < 001
247,Tuvalu,,Polynesia,Oceania,798 < 061 < 009 < 001


In [223]:
headings

['Country or Area', 'Sub-Subregion', 'Subregion', 'Region', 'UNSD M49 Codes']

In [224]:
# Label the positional columns 0..4 of df1 with the scraped header names
# ("Country or Area" shortened to "Country"); the frame is modified in place.
geoscheme_columns = ['Country', 'Sub-Subregion', 'Subregion', 'Region', 'UNSD M49 Codes']
df1.rename(columns=dict(enumerate(geoscheme_columns)), inplace=True)

In [225]:
data_frame.columns

Index(['Country', 'HDI Change (5 yrs)', 'HDI (2020 Report)',
       'Avg. annual growth (%)'],
      dtype='object')

In [289]:
# Strip surrounding whitespace from the country names and store the result back.
# The stripping call returns a new Series, so without the assignment the names
# keep their leading space (' Algeria') and cannot match the country names in
# data_frame when the subregion lookup is built and mapped.
df1['Country'] = df1['Country'].str.strip()
df1['Country']

0                Algeria
1                  Egypt
2                  Libya
3                Morocco
4                  Sudan
             ...        
244                Samoa
245              Tokelau
246                Tonga
247               Tuvalu
248    Wallis and Futuna
Name: Country, Length: 249, dtype: object

In [290]:
# Lookup table: country name -> subregion (a later duplicate name overrides an earlier one).
country_subregion = {
    name: subregion
    for name, subregion in zip(df1['Country'], df1['Subregion'])
}

In [291]:
country_subregion

{'Algeria': 'Northern Africa',
 'Egypt': 'Northern Africa',
 'Libya': 'Northern Africa',
 'Morocco': 'Northern Africa',
 'Sudan': 'Northern Africa',
 'Tunisia': 'Northern Africa',
 'Western Sahara': 'Northern Africa',
 'British Indian Ocean Territory': 'Sub-Saharan Africa',
 'Burundi': 'Sub-Saharan Africa',
 'Comoros': 'Sub-Saharan Africa',
 'Djibouti': 'Sub-Saharan Africa',
 'Eritrea': 'Sub-Saharan Africa',
 'Ethiopia': 'Sub-Saharan Africa',
 'French Southern and Antarctic Lands': 'Sub-Saharan Africa',
 'Kenya': 'Sub-Saharan Africa',
 'Madagascar': 'Sub-Saharan Africa',
 'Malawi': 'Sub-Saharan Africa',
 'Mauritius': 'Sub-Saharan Africa',
 'Mayotte': 'Sub-Saharan Africa',
 'Mozambique': 'Sub-Saharan Africa',
 'Réunion': 'Sub-Saharan Africa',
 'Rwanda': 'Sub-Saharan Africa',
 'Seychelles': 'Sub-Saharan Africa',
 'Somalia': 'Sub-Saharan Africa',
 'South Sudan': 'Sub-Saharan Africa',
 'Uganda': 'Sub-Saharan Africa',
 'Tanzania': 'Sub-Saharan Africa',
 'Zambia': 'Sub-Saharan Africa',
 'Zim

In [292]:
data_frame['Sub-Region'] = data_frame['Country'].map(country_subregion)

In [293]:
data_frame['Sub-Region'].isnull().sum()

18

In [294]:
df1.dtypes

Country      object
Subregion    object
dtype: object

In [295]:
data_frame.dtypes

Country                    object
HDI Change (5 yrs)         object
HDI (2020 Report)         float64
Avg. annual growth (%)    float64
Sub-Region                 object
dtype: object

In [323]:
df = pd.merge(df1,data_frame, on = 'Country', how = 'right')

In [330]:
df[df['Subregion'].isna()]

Unnamed: 0,Country,Subregion,HDI Change (5 yrs),HDI (2020 Report),Avg. annual growth (%),Sub-Region
1,Republic of Ireland,,7.0,0.955,0.65,
3,Hong Kong,,7.0,0.949,0.54,
23,South Korea,,1.0,0.916,0.33,
25,France,,1.0,0.901,0.28,
26,Czech Republic,,1.0,0.9,0.38,
57,The Bahamas,,3.0,0.814,0.12,
60,Georgia (country),,7.0,0.812,0.87,
114,State of Palestine,,6.0,0.708,0.38,
125,Cape Verde,,4.0,0.665,0.57,
134,Sao Tome and Principe,,1.0,0.625,1.21,


In [None]:
# Subregion -> list of member countries. Intended as a lookup for filling in the
# rows whose subregion is still missing (e.g. through .map); only
# "Northern Africa" has been populated so far.
# Key spellings are corrected ("Northern", "Zealand") and each country is listed once.
global_regions = {
    "Northern Africa": ['Algeria', 'Egypt', 'Morocco', 'Tunisia', 'Libya', 'Sudan', 'Western Sahara'],
    "Sub-Saharan Africa": [],
    "Eastern Africa": [],
    "Middle Africa": [],
    "Southern Africa": [],
    "Western Africa": [],
    "Caribbean": [],
    "Central America": [],
    "South America": [],
    "North America": [],
    "Central Asia": [],
    "Eastern Asia": [],
    "South-eastern Asia": [],
    "Southern Asia": [],
    "Western Asia": [],
    "Eastern Europe (Including Northern Asia)": [],
    "Northern Europe": [],
    "Southern Europe": [],
    "Western Europe": [],
    "Australia & New Zealand": [],
    "Melanesia": [],
    "Micronesia": [],
    "Polynesia": [],
}

In [322]:
# Mean HDI per subregion, highest first.
(
    df.groupby('Subregion')['HDI (2020 Report)']
    .mean()
    .sort_values(ascending=False)
)

Subregion
Australia and New Zealand                          0.937500
Western Europe                                     0.933429
North America                                      0.927500
Northern Europe                                    0.922333
Southern Europe                                    0.851000
Eastern Europe                                     0.823778
Eastern Asia                                       0.805667
Western Asia                                       0.775313
Latin America and the Caribbean                    0.763167
South-eastern Asia                                 0.745556
Latin America and the Caribbean / North America    0.735900
Central Asia                                       0.725000
Micronesia                                         0.720000
Polynesia                                          0.720000
Northern Africa                                    0.685833
Southern Asia                                      0.656222
Melanesia                     

In [81]:
# Subregion -> list of member countries; only "Northern Africa" is populated so far.
# Key spellings are corrected ("Northern", "Zealand") and each country is listed once.
global_regions = {
    "Northern Africa": ['Algeria', 'Egypt', 'Morocco', 'Tunisia', 'Libya', 'Sudan', 'Western Sahara'],
    "Sub-Saharan Africa": [],
    "Eastern Africa": [],
    "Middle Africa": [],
    "Southern Africa": [],
    "Western Africa": [],
    "Caribbean": [],
    "Central America": [],
    "South America": [],
    "North America": [],
    "Central Asia": [],
    "Eastern Asia": [],
    "South-eastern Asia": [],
    "Southern Asia": [],
    "Western Asia": [],
    "Eastern Europe (Including Northern Asia)": [],
    "Northern Europe": [],
    "Southern Europe": [],
    "Western Europe": [],
    "Australia & New Zealand": [],
    "Melanesia": [],
    "Micronesia": [],
    "Polynesia": [],
}

## United Nations geoscheme

Statistical regions as defined by the UNSD. Antarctica is not shown.
The United Nations geoscheme is a system which divides the 249 countries and territories of the world into 6 regional and 22 subregional groups. It was devised by the United Nations Statistics Division (UNSD) based on the M49 coding classification.

In [427]:
# Collect one {'title', 'link', 'url'} record per <td class="title"> cell in `soup`.
# NOTE(review): the td.title / span.comhead selectors look like they were written
# for a different site than the page parsed into `soup` — confirm they match
# anything here.
results_list = []

all_td = soup.find_all('td', {'class':'title'})
for element in all_td:
    # Fields found for this cell.
    result = {}

    # Title text and target of the cell's first link, when there is one.
    a_href = element.find('a')
    if a_href:
        result['title'] = a_href.text
        result['link'] = a_href['href']

    # Domain label: the <span class="comhead"> text without its first and last character.
    span = element.find('span', {'class':'comhead'})
    if span:
        result['url'] = span.text.strip()[1:-1]

    # Only keep records for which all three fields were found.
    if len(result) == 3:
        results_list.append(result)

# Preview the first record whenever at least one was collected. A plain truthiness
# test (rather than `len(...) > 1`) also covers the case of exactly one record.
if results_list:
    print(results_list[0])