## Web Scraping

In [36]:
import re
from urllib.parse import urljoin

import bs4
import numpy as np
import pandas as pd
import requests
from bs4 import BeautifulSoup
from IPython.display import HTML, display
from lxml import html

from nose.tools import assert_equal, assert_is_instance
from numpy.testing import assert_array_equal
from pandas.util.testing import assert_frame_equal

In [37]:
# WHO country index page; the scraping functions in this notebook use it as
# the default value of their `url` parameter.
who = 'http://www.who.int/countries/en/'

### 1. Function: get_country_url

In [206]:
def get_country_url(country, url=who):
    '''
    Finds the url link of the input country on the WHO website.

    Parameters
    ----------
    country: A string. Name of the country, compared verbatim against the
        <span> text of each list entry on the index page.
    url: A string. Default: 'http://www.who.int/countries/en/'

    Returns
    -------
    A string. Url link of the country.

    Raises
    ------
    requests.HTTPError: If the index page answers with an HTTP error status.
    ValueError: If no list entry on the index page matches `country`.
    '''

    response = requests.get(url)
    # Fail loudly on 4xx/5xx instead of trying to parse an error page.
    response.raise_for_status()
    soup = BeautifulSoup(response.content, 'lxml')

    # Scan every <li> instead of a fixed index window so that a shift in the
    # surrounding page markup cannot cause an IndexError or a missed entry.
    for item in soup.find_all('li'):
        label, anchor = item.span, item.a
        if label is None or anchor is None:
            # Not a "name + link" entry (e.g. plain navigation item).
            continue
        if label.text == country and anchor.get('href'):
            # urljoin resolves the href against `url`, so this works for any
            # base url rather than relying on a fixed-length prefix slice.
            return urljoin(url, anchor['href'])

    raise ValueError('Country {!r} not found on {}'.format(country, url))

In [209]:
# Spot-check get_country_url against the expected link for three countries.
t1_url = get_country_url('Panama')
assert_equal(t1_url, 'http://www.who.int/countries/pan/en')

t2_url = get_country_url('United Kingdom')
assert_equal(t2_url, 'http://www.who.int/countries/gbr/en')

# Name containing parentheses, to confirm the match is done on the full text.
t3_url = get_country_url('Micronesia (Federated States of)')
assert_equal(t3_url, 'http://www.who.int/countries/fsm/en')

### 2. Function: get_country_stats

In [333]:
def get_country_stats(country, url=who):
    '''
    Finds the statistical data of the input country on the country's website.

    Parameters
    ----------
    country: A string. Name of the country.
    url: A string. Default: 'http://www.who.int/countries/en/'
        Index page on which the country's own link is looked up.

    Returns
    -------
    A 2d numpy array of strings with one row per table row: column 0 holds
    the <th> text (label) and column 1 the stripped <td> text (value).

    Raises
    ------
    requests.HTTPError: If the country page answers with an HTTP error status.
    '''

    # Forward the caller's `url`; it used to be ignored in favour of `who`.
    country_url = get_country_url(country, url=url)
    response = requests.get(country_url)
    response.raise_for_status()
    soup = BeautifulSoup(response.content, 'lxml')

    # NOTE(review): the final <tr> is skipped, exactly as before; presumably
    # it is not a data row -- confirm against the page markup.
    rows = soup.find_all('tr')[:-1]

    labels = [row.th.text for row in rows]
    values = [row.td.text.strip() for row in rows]
    return np.column_stack((labels, values))

In [334]:
# Fetch the table for France and print it as "label: value" lines, with the
# label padded to 80 characters. t1_stats is reused by the assertion cell below.
t1_stats = get_country_stats('France')
for col, num in t1_stats:
    print('{0:80s}: {1:s}'.format(col, num))

Total population (2015)                                                         : 64,395,000
Gross national income per capita (PPP international $, 2013)                    : 37
Life expectancy at birth m/f (years, 2015)                                      : 79/85
Probability of dying under five (per 1 000 live births, 0)                      : not available
Probability of dying between 15 and 60 years m/f (per 1 000 population, 2015)   : 104/51
Total expenditure on health per capita (Intl $, 2014)                           : 4,508
Total expenditure on health as % of GDP (2014)                                  : 11.5


In [335]:
# Expected [label, value] tables; get_country_stats must reproduce each one
# exactly. t1_stats comes from the cell above that fetched France.
france = [['Total population (2015)', '64,395,000'],
          ['Gross national income per capita (PPP international $, 2013)', '37'],
          ['Life expectancy at birth m/f (years, 2015)', '79/85'],
          ['Probability of dying under five (per 1 000 live births, 0)', 'not available'],
          ['Probability of dying between 15 and 60 years m/f (per 1 000 population, 2015)', '104/51'],
          ['Total expenditure on health per capita (Intl $, 2014)', '4,508'],
          ['Total expenditure on health as % of GDP (2014)', '11.5']]
assert_array_equal(t1_stats, france)

germany = [['Total population (2015)', '80,688,000'],
           ['Gross national income per capita (PPP international $, 2013)', '44'],
           ['Life expectancy at birth m/f (years, 2015)', '79/83'],
           ['Probability of dying under five (per 1 000 live births, 0)', 'not available'],
           ['Probability of dying between 15 and 60 years m/f (per 1 000 population, 2015)', '87/47'],
           ['Total expenditure on health per capita (Intl $, 2014)', '5,182'],
           ['Total expenditure on health as % of GDP (2014)', '11.3']]
t2_stats = get_country_stats('Germany')
assert_array_equal(t2_stats, germany)

# Andorra has several 'not available' entries, including labels with year 0.
andorra = [['Total population (2015)', '70,000'],
           ['Gross national income per capita (PPP international $, 0)', 'not available'],
           ['Life expectancy at birth m/f (years, 0)', 'not available'],
           ['Probability of dying under five (per 1 000 live births, 0)', 'not available'],
           ['Probability of dying between 15 and 60 years m/f (per 1 000 population, 0)', 'not available'],
           ['Total expenditure on health per capita (Intl $, 2014)', '4,273'],
           ['Total expenditure on health as % of GDP (2014)', '8.1']]
t3_stats = get_country_stats('Andorra')
assert_array_equal(t3_stats, andorra)


### 3. Function: get_all_countries

In [325]:
def get_all_countries(url=who):
    '''
    Collects the names of the 194 member states listed on the WHO webpage.

    Parameters
    ----------
    url: A string. Default: 'http://www.who.int/countries/en/'

    Returns
    -------
    A list of 194 strings: the <span> text of the <li> elements found at
    positions 13 through 206 of the page.
    '''

    page = requests.get(url).content
    list_items = BeautifulSoup(page, 'lxml').find_all('li')
    # Only the <li> elements at indices 13..206 are read; anything before or
    # after that window is ignored.
    return [list_items[position].span.text for position in range(13, 207)]

In [326]:
# Download the list of member-state names; later cells slice country_names.
country_names = get_all_countries()

In [328]:
# Re-fetch the names and compare them, in order, with the expected list.
# Entries are kept verbatim (note the trailing space in 'Serbia ').
country_names = get_all_countries()
answer = ['Afghanistan', 'Albania', 'Algeria', 'Andorra', 'Angola', 'Antigua and Barbuda', 'Argentina', 'Armenia', 
          'Australia', 'Austria', 'Azerbaijan', 'Bahamas', 'Bahrain', 'Bangladesh', 'Barbados', 'Belarus', 'Belgium', 
          'Belize', 'Benin', 'Bhutan', 'Bolivia (Plurinational State of)', 'Bosnia and Herzegovina', 'Botswana', 'Brazil', 
          'Brunei Darussalam', 'Bulgaria', 'Burkina Faso', 'Burundi', 'Cabo Verde', 'Cambodia', 'Cameroon', 'Canada', 
          'Central African Republic', 'Chad', 'Chile', 'China', 'Colombia', 'Comoros', 'Congo', 'Cook Islands', 'Costa Rica', 
          "Côte d'Ivoire", 'Croatia', 'Cuba', 'Cyprus', 'Czechia', "Democratic People's Republic of Korea", 
          'Democratic Republic of the Congo', 'Denmark', 'Djibouti', 'Dominica', 'Dominican Republic', 'Ecuador', 'Egypt', 
          'El Salvador', 'Equatorial Guinea', 'Eritrea', 'Estonia', 'Ethiopia', 'Fiji', 'Finland', 'France', 'Gabon', 'Gambia', 
          'Georgia', 'Germany', 'Ghana', 'Greece', 'Grenada', 'Guatemala', 'Guinea', 'Guinea-Bissau', 'Guyana', 'Haiti', 
          'Honduras', 'Hungary', 'Iceland', 'India', 'Indonesia', 'Iran (Islamic Republic of)', 'Iraq', 'Ireland', 'Israel', 
          'Italy', 'Jamaica', 'Japan', 'Jordan', 'Kazakhstan', 'Kenya', 'Kiribati', 'Kuwait', 'Kyrgyzstan', 
          "Lao People's Democratic Republic", 'Latvia', 'Lebanon', 'Lesotho', 'Liberia', 'Libya', 'Lithuania', 'Luxembourg', 
          'Madagascar', 'Malawi', 'Malaysia', 'Maldives', 'Mali', 'Malta', 'Marshall Islands', 'Mauritania', 'Mauritius', 
          'Mexico', 'Micronesia (Federated States of)', 'Monaco', 'Mongolia', 'Montenegro', 'Morocco', 'Mozambique', 'Myanmar', 
          'Namibia', 'Nauru', 'Nepal', 'Netherlands', 'New Zealand', 'Nicaragua', 'Niger', 'Nigeria', 'Niue', 'Norway', 'Oman', 
          'Pakistan', 'Palau', 'Panama', 'Papua New Guinea', 'Paraguay', 'Peru', 'Philippines', 'Poland', 'Portugal', 'Qatar', 
          'Republic of Korea', 'Republic of Moldova', 'Romania', 'Russian Federation', 'Rwanda', 'Saint Kitts and Nevis', 
          'Saint Lucia', 'Saint Vincent and the Grenadines', 'Samoa', 'San Marino', 'Sao Tome and Principe', 'Saudi Arabia', 
          'Senegal', 'Serbia ', 'Seychelles', 'Sierra Leone', 'Singapore', 'Slovakia', 'Slovenia', 'Solomon Islands', 'Somalia', 
          'South Africa', 'South Sudan', 'Spain', 'Sri Lanka', 'Sudan', 'Suriname', 'Swaziland', 'Sweden', 'Switzerland', 
          'Syrian Arab Republic', 'Tajikistan', 'Thailand', 'The former Yugoslav Republic of Macedonia', 'Timor-Leste', 'Togo', 
          'Tonga', 'Trinidad and Tobago', 'Tunisia', 'Turkey', 'Turkmenistan', 'Tuvalu', 'Uganda', 'Ukraine', 
          'United Arab Emirates', 'United Kingdom', 'United Republic of Tanzania', 'United States of America', 'Uruguay', 
          'Uzbekistan', 'Vanuatu', 'Venezuela (Bolivarian Republic of)', 'Viet Nam', 'Yemen', 'Zambia', 'Zimbabwe']
assert_array_equal(answer, country_names)


### 4. Function: get_combined_dataframe

In [336]:
# Column labels for the combined dataframe: the seven row labels of a country
# table with the trailing year removed. Default `cols` of get_combined_dataframe.
cols = ['Total population',
        'Gross national income per capita (PPP international $)',
        'Life expectancy at birth m/f (years)',
        'Probability of dying under five (per 1 000 live births)',
        'Probability of dying between 15 and 60 years m/f (per 1 000 population)',
        'Total expenditure on health per capita (Intl $)',
        'Total expenditure on health as % of GDP']

In [400]:
def get_combined_dataframe(countries, cols=cols):
    '''
    Combines data for each country as a dataframe using specified column names as columns and country names as index.

    Parameters
    ----------
    countries: An iterable of strings. Names of the countries.
    cols: A list of string. Default: the list defined above this cell.
        Must have one entry per row of a country's statistics table.

    Returns
    -------
    A pandas DataFrame object with one row per country (indexed by country
    name, index unnamed) and the table values as strings.
    '''

    # Materialise once so a one-shot iterable can feed both the scraping loop
    # and the index.
    names = list(countries)

    # Reuse get_country_stats instead of repeating its scraping code here.
    # Column 1 of its result holds the values; column 0 holds the row labels,
    # which `cols` replaces.
    records = [get_country_stats(name)[:, 1].tolist() for name in names]

    # Passing the index directly avoids the temporary 'countries' column,
    # which would clobber a real column of that name.
    return pd.DataFrame(records, columns=cols, index=names)

In [401]:
# Build the combined dataframe for three countries.
countries1 = ['China', 'Egypt', 'United States of America']
df1 = get_combined_dataframe(countries1)

In [403]:
# Rebuild df1 and compare it with the expected frame: values are strings,
# columns are `cols`, and the index is the list of country names.
countries1 = ['China', 'Egypt', 'United States of America']
df1 = get_combined_dataframe(countries1)
assert_is_instance(df1, pd.DataFrame)
a1 = pd.DataFrame ([['1,400,000,000', '11', '75/78', 'not available', '98/71', '731','5.5'],
                    ['91,508,000', '10', '69/73', 'not available', '196/119', '594','5.6'],
                    ['321,774,000', '53', '77/82', 'not available', '128/77', '9,403','17.1']], 
                   columns=cols, index=countries1)
assert_frame_equal(df1, a1)

# Second check on ten consecutive names sliced from country_names, which was
# fetched in an earlier cell.
countries2 = country_names[100:110]
df2 = get_combined_dataframe(countries2)
a2 = pd.DataFrame([['24,235,000', '1', '64/67', 'not available', '245/196', '44', '3.0'],
                   ['17,215,000', '750', '57/60', 'not available', '398/330', '93','11.4'],
                   ['30,331,000', '22', '73/77', 'not available', '167/79', '1,040','4.2'],
                   ['364,000', '9', '77/80', 'not available', '79/43', '1,996', '13.7'],
                   ['17,600,000', '1', '58/58', 'not available', '266/267', '108','6.9'],
                   ['419,000', '28', '80/84', 'not available', '70/37', '3,072', '9.8'],
                   ['53,000', '4', 'not available', 'not available', 'not available','680', '17.1'],
                   ['4,068,000', '2', '62/65', 'not available', '227/182', '148', '3.8'],
                   ['1,273,000', '17', '71/78', 'not available', '190/99', '896', '4.8'],
                   ['127,017,000', '16', '74/80', 'not available', '161/82', '1,122','6.3']],
                 columns=cols, index=countries2)
assert_frame_equal(df2, a2)