# Human Development Index

In [602]:
import requests
import urllib.request
import time
from bs4 import BeautifulSoup
import numpy as np
import pandas as pd
import regex as re
import matplotlib.pyplot as plt
from urllib.request import urlopen
url = 'https://en.m.wikipedia.org/wiki/List_of_countries_by_Human_Development_Index'
# Fetch the page and parse it while the HTTP response is still open.
# The `with` block closes the response afterwards instead of leaving the
# connection open for the lifetime of the kernel.
# `html` stays bound to the (now closed) response object.
with urlopen(url) as html:
    soup = BeautifulSoup(html, 'html.parser')

In [603]:
# Collect every <body> element of the parsed page.
# NOTE(review): despite its name, `tables` holds <body> matches, and no other
# cell in the visible notebook reads it — confirm whether it is still needed.
tables = soup.find_all('body')

In [604]:
# The page is expected to carry two tables with the CSS class "wikitable".
# Gather every such <table> element into a list and report how many were found.
tabl = soup.find_all("table", class_="wikitable")
print("Number of tables on site: ", len(tabl))

Number of tables on site:  2


In [605]:
# Work with the first wikitable: its first <tr> is treated as the header row,
# every following <tr> as a data row.
table1 = tabl[0]
body = table1.find_all("tr")
head, body_rows = body[0], body[1:]

# Header labels: the text of each <th>, with trailing newlines removed.
headings = [th.text.rstrip("\n") for th in head.find_all("th")]
print(headings)

['Rank', 'Nation', 'HDI']


In [606]:
# Build a list of lists: one inner list of cleaned cell strings per table row.
# `.text` drops the HTML tags; the regex then deletes every "\xa0"
# (non-breaking space), every newline and every comma from the cell text.
all_rows = [
    [re.sub("(\xa0)|(\n)|,", "", cell.text) for cell in tr.find_all("td")]
    for tr in body_rows
]

In [607]:
# Number of rows that produced more than one <td> entry.
count = sum(len(cells) > 1 for cells in all_rows)
print(count)

189


In [608]:
# Keep only the rows that hold more than one cell.
country_data = [cells for cells in all_rows if len(cells) > 1]

In [648]:
country_data[0:6]

[['1', '', '0.957', ' 0.20%'],
 ['2', ' (7)', '0.955', ' 0.65%'],
 ['2', '', '0.955', ' 0.16%'],
 ['4', ' (7)', '0.949', ' 0.54%'],
 ['4', ' (4)', '0.949', ' 0.62%'],
 ['6', ' (3)', '0.947', ' 0.24%']]

In [610]:
# Tabulate the filtered rows. No column names are passed, so the frame gets
# pandas' default integer column labels.
df = pd.DataFrame(country_data)

In [611]:
df.head()

Unnamed: 0,0,1,2,3
0,1,,0.957,0.20%
1,2,(7),0.955,0.65%
2,2,,0.955,0.16%
3,4,(7),0.949,0.54%
4,4,(4),0.949,0.62%


In [612]:
# Join the country-name frame with the scraped values row by row (index to index).
# NOTE(review): `country` is only created further down in this notebook
# (`country = pd.DataFrame(nation)`), so this cell raises NameError on a fresh
# top-to-bottom run — the name-scraping cells need to run before this one.
data_frame = pd.merge(country,df,left_index = True, right_index = True)

In [613]:
data_frame.head(3)

Unnamed: 0,0_x,0_y,1,2,3
0,Norway,1,,0.957,0.20%
1,Republic of Ireland,2,(7),0.955,0.65%
2,Switzerland,2,,0.955,0.16%


In [614]:
# Drop the '0_y' column from the merged frame in place.
# NOTE: not re-runnable — a second execution raises KeyError because the
# column is already gone.
del data_frame['0_y']

In [615]:
# Give the four remaining columns descriptive labels.
data_frame.columns = [
    'Country',
    'HDI Change (5 yrs)',
    'HDI (2020 Report)',
    'Avg. annual growth (%)',
]

In [616]:
data_frame

Unnamed: 0,Country,HDI Change (5 yrs),HDI (2020 Report),Avg. annual growth (%)
0,Norway,,0.957,0.20%
1,Republic of Ireland,(7),0.955,0.65%
2,Switzerland,,0.955,0.16%
3,Hong Kong,(7),0.949,0.54%
4,Iceland,(4),0.949,0.62%
...,...,...,...,...
184,Burundi,(5),0.433,0.58%
185,South Sudan,(3),0.433,0.61%
186,Chad,(1),0.398,0.84%
187,Central African Republic,(1),0.397,0.94%


In [617]:
data_frame.isnull().sum()

Country                   0
HDI Change (5 yrs)        0
HDI (2020 Report)         0
Avg. annual growth (%)    0
dtype: int64

In [618]:
# Every anchor href on the page, except anchors whose href is just "#".
project_href = [a['href'] for a in soup.find_all('a', href=True) if a['href'] != "#"]

# Keep the last path segment of each href in positions 85..275.
# NOTE(review): 85 and 276 are hard-coded positions in the page's link list;
# any change to the page layout shifts them — verify before re-scraping.
nations = [href.split('/')[-1] for href in project_href[85:276]]
print(nations)

['Norway', 'Republic_of_Ireland', 'Switzerland', 'Hong_Kong', 'Iceland', 'Germany', 'Sweden', 'Australia', 'Netherlands', 'Denmark', 'Finland', 'Singapore', 'United_Kingdom', 'Belgium', 'New_Zealand', 'Canada', 'United_States', 'Austria', 'Israel', 'Japan', 'Liechtenstein', 'Slovenia', 'Luxembourg', 'South_Korea', 'Spain', 'France', 'Czech_Republic', 'Malta', 'Estonia', 'Italy', 'United_Arab_Emirates', 'Greece', 'Cyprus', 'Lithuania', 'Poland', 'Andorra', 'Latvia', 'Portugal', 'Slovakia', 'Hungary', 'Saudi_Arabia', 'Bahrain', 'Chile', 'Croatia', 'Qatar', 'Argentina', 'Brunei', 'Montenegro', 'Romania', 'Palau', 'Kazakhstan', 'Russia', 'Belarus', 'Turkey', 'Uruguay', 'Bulgaria', 'Panama', 'The_Bahamas', 'Barbados', 'Oman', 'Georgia_(country)', 'Costa_Rica', 'Malaysia', 'Kuwait', 'Serbia', 'Mauritius', 'Seychelles', 'Trinidad_and_Tobago', 'Albania', 'Cuba', 'Iran', 'Sri_Lanka', 'Bosnia_and_Herzegovina', 'Grenada', 'Mexico', 'Saint_Kitts_and_Nevis', 'Ukraine', 'Antigua_and_Barbuda', 'Peru'

In [619]:
# Positions in `nations` whose entry equals the percent-encoded slug below.
stp = [pos for pos, slug in enumerate(nations) if slug == "S%C3%A3o_Tom%C3%A9_and_Pr%C3%ADncipe"]

In [620]:
stp

[136]

In [621]:
def text_cleaner(x):
    """Return the entries of ``x`` without '#', with underscores turned into spaces.

    Parameters
    ----------
    x : iterable of str
        Strings to clean.

    Returns
    -------
    list of str
        Entries containing '#' are dropped; every '_' in the remaining
        entries is replaced by a single space. Input order is kept.
    """
    return [entry.replace('_', ' ') for entry in x if '#' not in entry]

In [622]:
# Clean the scraped slugs: text_cleaner drops entries containing '#' and
# replaces underscores with spaces.
nation = text_cleaner(nations)

In [623]:
#verify all data from column nation had been collected
len(nation)

189

In [624]:
# One-column frame of the cleaned names (default column label 0).
# NOTE(review): the earlier `pd.merge(country, df, ...)` cell already reads
# `country`, so in top-to-bottom order this definition arrives too late.
country = pd.DataFrame(nation)

In [625]:
country.head(2)

Unnamed: 0,0
0,Norway
1,Republic of Ireland


In [626]:
# Displays a renamed copy only: `rename` returns a new frame and the result
# is not assigned, so `country` itself keeps its column label 0.
country.rename(columns = {0:"Country"})

Unnamed: 0,Country
0,Norway
1,Republic of Ireland
2,Switzerland
3,Hong Kong
4,Iceland
...,...
184,Burundi
185,South Sudan
186,Chad
187,Central African Republic


In [627]:
# Overwrite the country name stored at row label 134 (presumably the row that
# holds the percent-encoded "São Tomé and Príncipe" slug — verify the label).
# A single `.loc` call replaces the chained `data_frame['Country'][134] = ...`
# assignment, which may write to a temporary copy (SettingWithCopyWarning)
# and has no effect at all under pandas Copy-on-Write.
data_frame.loc[134, 'Country'] = "Sao Tome and Principe"

In [628]:
# Show the country name stored at row label 134.
data_frame.loc[134, 'Country']

'Sao Tome and Principe'

In [629]:
# Countries whose average annual HDI growth is to be treated as negative.
# NOTE(review): the same seven names are listed again later as
# `neg_growth_countries`; this variable is not read anywhere else in the
# visible notebook.
countries_negative_hdi_growth = ['Lebanon', 'Jordan', 'Libya', 'Venezuela', 'East Timor', 'Syria', 'Yemen']

In [630]:
data_frame.head()

Unnamed: 0,Country,HDI Change (5 yrs),HDI (2020 Report),Avg. annual growth (%)
0,Norway,,0.957,0.20%
1,Republic of Ireland,(7),0.955,0.65%
2,Switzerland,,0.955,0.16%
3,Hong Kong,(7),0.949,0.54%
4,Iceland,(4),0.949,0.62%


In [649]:
data_frame.isnull().sum()

Country                   0
HDI Change (5 yrs)        0
HDI (2020 Report)         0
Avg. annual growth (%)    1
dtype: int64

In [650]:
data_frame.dtypes

Country                    object
HDI Change (5 yrs)         object
HDI (2020 Report)         float64
Avg. annual growth (%)    float64
dtype: object

In [633]:
# Drop the last character of every growth string (the trailing '%' on numeric rows).
# NOTE: not idempotent — every re-run removes one more character.
data_frame['Avg. annual growth (%)'] = data_frame['Avg. annual growth (%)'].str[:-1]

In [634]:
# Reduce each rank-change cell such as ' (7)' to the text inside the parentheses.
# Stripping whitespace and the surrounding parentheses keeps multi-digit
# values intact (' (12)' -> '12'), where the former `x[2]` lookup kept a single
# character only. Empty cells are returned unchanged, and already-cleaned values
# pass through untouched, so the cell can be re-run safely.
data_frame['HDI Change (5 yrs)'] = data_frame['HDI Change (5 yrs)'].apply(
    lambda x: x.strip().strip('()') if len(x) > 0 else x
)

In [635]:
# Copy the growth column into a list, substituting the integer 0 for the
# literal entry 'NA[a'; every other value is kept as-is.
# NOTE(review): the placeholder is an int while the remaining entries are
# strings — confirm this mix is intended for the later float conversion.
n = [0 if value == 'NA[a' else value for value in data_frame['Avg. annual growth (%)']]
print(pd.Series(n))

0       0.20
1       0.65
2       0.16
3       0.54
4       0.62
       ...  
184     0.58
185     0.61
186     0.84
187     0.94
188     1.95
Length: 189, dtype: object


In [636]:
# Write the list built above (with the 'NA[a' entry replaced) back into the frame.
data_frame['Avg. annual growth (%)'] = n

In [637]:
# Trim surrounding whitespace from the HDI strings, then convert the column to float.
hdi_col = 'HDI (2020 Report)'
data_frame[hdi_col] = data_frame[hdi_col].str.strip().astype(float)

In [638]:
# Trim surrounding whitespace from the growth strings, then convert to float.
# NOTE(review): an earlier cell stores the integer 0 for the 'NA[a' entry;
# `.str.strip()` yields NaN for non-string elements, which would explain the
# single null later reported for this column — confirm NaN (not 0) is wanted.
data_frame['Avg. annual growth (%)'] = data_frame['Avg. annual growth (%)'].str.strip().astype(float)

In [639]:
data_frame.dtypes

Country                    object
HDI Change (5 yrs)         object
HDI (2020 Report)         float64
Avg. annual growth (%)    float64
dtype: object

In [640]:
data_frame.head()

Unnamed: 0,Country,HDI Change (5 yrs),HDI (2020 Report),Avg. annual growth (%)
0,Norway,,0.957,0.2
1,Republic of Ireland,7.0,0.955,0.65
2,Switzerland,,0.955,0.16
3,Hong Kong,7.0,0.949,0.54
4,Iceland,4.0,0.949,0.62


In [667]:
# Build `x`: the growth column with a negative sign for the listed countries.
# Columns are addressed by name instead of by position (iloc 0 / 3), and the
# whole column is handled at once instead of row by row.
# `-abs()` (rather than `-1 * value`) keeps the result stable if this cell is
# re-run after `x` has already been written back into the frame.
neg_growth_countries = ['Lebanon', 'Jordan', 'Libya', 'Venezuela', 'East Timor', 'Syria', 'Yemen']
growth = data_frame['Avg. annual growth (%)']
declined = data_frame['Country'].isin(neg_growth_countries)
# Series.where keeps values where the condition holds and substitutes elsewhere.
x = growth.where(~declined, -growth.abs()).tolist()

print(len(x))
        

189


In [668]:
# Replace the growth column with the sign-adjusted values collected in `x`.
data_frame['Avg. annual growth (%)'] = x

In [644]:
data_frame.columns

Index(['Country', 'HDI Change (5 yrs)', 'HDI (2020 Report)',
       'Avg. annual growth (%)'],
      dtype='object')

In [645]:
# NOTE(review): no definition of `annual_growth` exists in the visible notebook
# (only a commented-out `#def annual_growth(x,y):` line), so this call raises
# NameError on a fresh kernel — restore the definition or remove this cell.
annual_growth(data_frame['Country'],data_frame)

[]

In [646]:
data_frame

Unnamed: 0,Country,HDI Change (5 yrs),HDI (2020 Report),Avg. annual growth (%)
0,Norway,,0.957,0.20
1,Republic of Ireland,7,0.955,0.65
2,Switzerland,,0.955,0.16
3,Hong Kong,7,0.949,0.54
4,Iceland,4,0.949,0.62
...,...,...,...,...
184,Burundi,5,0.433,0.58
185,South Sudan,3,0.433,0.61
186,Chad,1,0.398,0.84
187,Central African Republic,1,0.397,0.94


In [None]:
# Region name -> list of member countries, to be filled in later.
# Every region starts with an empty list (the former [''] placeholder under
# "Northern Africa" would have counted as a one-element list), and the
# misspelled keys "Nortehrn" / "Zealend" are corrected.
global_regions = {
    "Northern Africa": [],
    "Sub-Saharan Africa": [],
    "Eastern Africa": [],
    "Middle Africa": [],
    "Southern Africa": [],
    "Western Africa": [],
    "Caribbean": [],
    "Central America": [],
    "South America": [],
    "North America": [],
    "Central Asia": [],
    "Eastern Asia": [],
    "South-eastern Asia": [],
    "Southern Asia": [],
    "Western Asia": [],
    "Eastern Europe (Including Northern Asia)": [],
    "Northern Europe": [],
    "Southern Europe": [],
    "Western Europe": [],
    "Australia & New Zealand": [],
    "Melanesia": [],
    "Micronesia": [],
    "Polynesia": [],
}

## United Nations geoscheme

Statistical regions as defined by the UNSD. Antarctica is not shown.
The United Nations geoscheme is a system which divides the 249 countries and territories of the world into 6 regional and 22 subregional groups. It was devised by the United Nations Statistics Division (UNSD) based on the M49 coding classification.

In [427]:
# Collect a {'title', 'link', 'url'} dict for every <td class="title"> cell
# that provides all three pieces.
# NOTE(review): `soup` in the visible notebook is parsed from a Wikipedia
# page; whether that page contains td.title / span.comhead elements is not
# shown — confirm this cell targets the intended page.
results_list = []

# Get all the <td class="title"... elements
all_td = soup.find_all('td', {'class':'title'})
for element in all_td:
    # start a dictionary to store this item's data
    result = {}

    # get the title and full link/url; anchors without an href are skipped
    # instead of raising KeyError
    a_href = element.find('a')
    if a_href and a_href.has_attr('href'):
        result['title'] = a_href.text   # element text
        result['link'] = a_href['href'] # href link

    # get the url domain: span text without its first and last character
    span = element.find('span', {'class':'comhead'})
    if span:
        result['url'] = span.text.strip()[1:-1]

    # only store "full" rows of data
    if len(result) == 3:
        results_list.append(result)

# Show the first stored row whenever at least one exists (the former `> 1`
# check printed nothing when exactly one row had been collected).
if results_list:
    print(results_list[0])