# CS109 Project: Exploration
Work by Phillip Huang

# Possible errors

## File does not exist:
(1) Make sure you manually downloaded the 'data' folder from google drive

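(2) All datasets containing '100k' were originally referred to as 'death100k\*', but the files in the actual dataset were named 'deathper100k\*', so the `read_csv` statements were changed to match. If a file is still not found, check that the file name in the corresponding `read_csv` call matches the name of the file on disk.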

In [5]:
# import packages
import numpy as np
import pandas as pd
import matplotlib
import matplotlib.pyplot as plt

# Load Data

In [6]:
# --- NCD data ---
# risk table: one leading row is skipped before the header
risk = pd.read_csv('data/ncd/risk.csv', skiprows=1)
# deaths per 100k, one table per cause of death;
# two leading rows are skipped before the header of each file
deaths_100k = {
    'all': pd.read_csv('data/ncd/death100k_all.csv', skiprows=2),
    'cancer': pd.read_csv('data/ncd/death100k_cancer.csv', skiprows=2),
    'cardio': pd.read_csv('data/ncd/death100k_cardio.csv', skiprows=2),
    'diabetes': pd.read_csv('data/ncd/death100k_diabetes.csv', skiprows=2),
    'resp': pd.read_csv('data/ncd/death100k_resp.csv', skiprows=2),
}

# --- food data (crops and livestock/fish supply) ---
crops = pd.read_csv('data/food/FoodSupply_Crops_E_All_Data.csv')
meat = pd.read_csv('data/food/FoodSupply_LivestockFish_E_All_Data.csv')

# --- healthcare data (hospital beds and physicians) ---
# four leading rows are skipped before the header of each file
beds = pd.read_csv('data/bed/API_SH.MED.BEDS.ZS_DS2_en_csv_v2.csv', skiprows=4)
doctors = pd.read_csv('data/phys/API_SH.MED.PHYS.ZS_DS2_en_csv_v2.csv', skiprows=4)

We are not using the 'norm' version of the food data because it is redundant and large.

Not using total death and death under 70 for NCD data because not normalized to per capita.

# NCD Risk

In [7]:
risk.head()

Unnamed: 0,Country,2012,2000
0,Afghanistan,31,33
1,Albania,19,24
2,Algeria,22,23
3,Angola,24,24
4,Argentina,17,20


In [8]:
# use the country name as the row index of the risk table
risk = risk.set_index(keys='Country')
risk.head()

Unnamed: 0_level_0,2012,2000
Country,Unnamed: 1_level_1,Unnamed: 2_level_1
Afghanistan,31,33
Albania,19,24
Algeria,22,23
Angola,24,24
Argentina,17,20


In [9]:
# grab country names
risk.index.unique()

array(['Afghanistan', 'Albania', 'Algeria', 'Angola', 'Argentina',
       'Armenia', 'Australia', 'Austria', 'Azerbaijan', 'Bahamas',
       'Bahrain', 'Bangladesh', 'Barbados', 'Belarus', 'Belgium', 'Belize',
       'Benin', 'Bhutan', 'Bolivia (Plurinational State of)',
       'Bosnia and Herzegovina', 'Botswana', 'Brazil', 'Brunei Darussalam',
       'Bulgaria', 'Burkina Faso', 'Burundi', "Cote d'Ivoire",
       'Cabo Verde', 'Cambodia', 'Cameroon', 'Canada',
       'Central African Republic', 'Chad', 'Chile', 'China', 'Colombia',
       'Comoros', 'Congo', 'Costa Rica', 'Croatia', 'Cuba', 'Cyprus',
       'Czech Republic', "Democratic People's Republic of Korea",
       'Democratic Republic of the Congo', 'Denmark', 'Djibouti',
       'Dominican Republic', 'Ecuador', 'Egypt', 'El Salvador',
       'Equatorial Guinea', 'Eritrea', 'Estonia', 'Ethiopia', 'Fiji',
       'Finland', 'France', 'Gabon', 'Gambia', 'Georgia', 'Germany',
       'Ghana', 'Greece', 'Guatemala', 'Guinea', 'Guinea

In [10]:
deaths_100k['all'].head()

Unnamed: 0,Country,Year,Both sexes,Female,Male
0,Afghanistan,2012,846.3,829.4,869.2
1,Afghanistan,2000,905.5,876.9,937.0
2,Albania,2012,671.6,625.1,714.2
3,Albania,2000,840.0,723.0,978.8
4,Algeria,2012,710.4,645.2,779.8


In [11]:
# print the (rows, columns) shape of every deaths-per-100k table;
# print() with a single argument behaves the same on Python 2 and Python 3
for df in deaths_100k.values():
    print(df.shape)

(344, 5)
(344, 5)
(344, 5)
(344, 5)
(344, 5)


We assume the data are consistent across all of the deaths-per-100k tables (they all have the same shape).

In [12]:
# see if country names are consistent (same names, same order) between
# the risk table and the deaths-per-100k table.
# np.array_equal returns False when the two arrays differ in length,
# where an elementwise == comparison would fail instead.
np.array_equal(risk.index.unique(), deaths_100k['all']['Country'].unique())

True

The gender data in the deaths per 100k data is redundant since we don't know differences in other factors by gender. Let's simplify to have the data represent both genders combined.

In [13]:
# gather only combined-gender data: one row per country, one column per year.
# Keyword arguments are used because newer pandas releases no longer accept
# index/columns/values positionally.
deaths_100k['all'].pivot(index='Country', columns='Year', values='Both sexes').head()

Year,2000,2012
Country,Unnamed: 1_level_1,Unnamed: 2_level_1
Afghanistan,905.5,846.3
Albania,840.0,671.6
Algeria,737.5,710.4
Angola,739.6,768.4
Argentina,509.4,467.3


In [14]:
# complete for all 100k data: replace every table with its
# Country x Year pivot of the combined-gender ('Both sexes') column.
# The items are snapshotted with list() because the dict is reassigned
# inside the loop; keyword arguments keep pivot() working on newer pandas.
for key, value in list(deaths_100k.items()):
    deaths_100k[key] = value.pivot(index='Country', columns='Year', values='Both sexes')

In [15]:
deaths_100k['cancer'].head()

Year,2000,2012
Country,Unnamed: 1_level_1,Unnamed: 2_level_1
Afghanistan,122.9,123.6
Albania,137.7,123.1
Algeria,75.1,80.6
Angola,74.8,89.6
Argentina,136.8,131.5


# Food: Crops and Livestock

In [16]:
crops.head()

Unnamed: 0,Country Code,Country,Item Code,Item,Element Code,Element,Unit,Y1961,Y1961F,Y1962,...,Y2009,Y2009F,Y2010,Y2010F,Y2011,Y2011F,Y2012,Y2012F,Y2013,Y2013F
0,2,Afghanistan,2617,Apples and products,641,Food supply quantity (tonnes),tonnes,14345.0,S,14345.0,...,60666.0,S,65219.0,S,56185.0,S,43222.0,S,67678.0,S
1,2,Afghanistan,2617,Apples and products,646,Food supply quantity (g/capita/day),g/capita/day,4.39,Fc,4.3,...,6.0,Fc,6.29,Fc,5.29,Fc,3.97,Fc,6.07,Fc
2,2,Afghanistan,2617,Apples and products,645,Food supply quantity (kg/capita/yr),Kg,1.6,Fc,1.57,...,2.19,Fc,2.3,Fc,1.93,Fc,1.45,Fc,2.22,Fc
3,2,Afghanistan,2617,Apples and products,664,Food supply (kcal/capita/day),kcal/capita/day,2.0,Fc,2.0,...,3.0,Fc,3.0,Fc,3.0,Fc,2.0,Fc,3.0,Fc
4,2,Afghanistan,2617,Apples and products,674,Protein supply quantity (g/capita/day),g/capita/day,0.01,Fc,0.01,...,0.02,Fc,0.02,Fc,0.02,Fc,0.01,Fc,0.02,Fc


In [17]:
# exploring countries
crops['Country'].unique()

array(['Afghanistan', 'Albania', 'Algeria', 'Angola',
       'Antigua and Barbuda', 'Argentina', 'Armenia', 'Australia',
       'Austria', 'Azerbaijan', 'Bahamas', 'Bangladesh', 'Barbados',
       'Belarus', 'Belgium', 'Belgium-Luxembourg', 'Belize', 'Benin',
       'Bermuda', 'Bolivia (Plurinational State of)',
       'Bosnia and Herzegovina', 'Botswana', 'Brazil', 'Brunei Darussalam',
       'Bulgaria', 'Burkina Faso', 'Cabo Verde', 'Cambodia', 'Cameroon',
       'Canada', 'Central African Republic', 'Chad', 'Chile', 'China',
       'China, Hong Kong SAR', 'China, Macao SAR', 'China, mainland',
       'China, Taiwan Province of', 'Colombia', 'Congo', 'Costa Rica',
       "C\xf4te d'Ivoire", 'Croatia', 'Cuba', 'Cyprus', 'Czech Republic',
       'Czechoslovakia', "Democratic People's Republic of Korea",
       'Denmark', 'Djibouti', 'Dominica', 'Dominican Republic', 'Ecuador',
       'Egypt', 'El Salvador', 'Estonia', 'Ethiopia', 'Ethiopia PDR',
       'Fiji', 'Finland', 'France', 'Fre

It appears that the rows at the bottom are not countries but aggregate categories (starting with 'World'). We should drop them.

In [18]:
# first non-country index
crops[crops['Country'] == 'World'].index[0]

77135

In [19]:
# contains only countries: everything from the first 'World' row onwards is
# an aggregate category, so cut the table there instead of hard-coding the
# row number.
# NOTE: the index label is used as a slice position, which relies on crops
# still having the default 0..n-1 index it got from read_csv.
world_rows = crops.index[crops['Country'] == 'World']
if len(world_rows):
    # nothing to cut if the aggregate rows were already removed (cell re-run)
    crops = crops[:world_rows[0]]

Now, we can drop the columns that are useless to us: the codes and the flags for each year.

In [20]:
# identifying columns plus one value column per year, 1961 through 2013;
# this leaves out the code columns and the per-year flag columns
id_cols = ['Country', 'Item', 'Element', 'Unit']
year_cols = ['Y' + str(year) for year in range(1961, 2014)]
cols_tokeep = id_cols + year_cols
crops = pd.DataFrame(crops[cols_tokeep])
crops.head()

Unnamed: 0,Country,Item,Element,Unit,Y1961,Y1962,Y1963,Y1964,Y1965,Y1966,...,Y2004,Y2005,Y2006,Y2007,Y2008,Y2009,Y2010,Y2011,Y2012,Y2013
0,Afghanistan,Apples and products,Food supply quantity (tonnes),tonnes,14345.0,14345.0,14345.0,17480.0,19380.0,21660.0,...,17589.0,17770.0,46695.0,45262.0,48503.0,60666.0,65219.0,56185.0,43222.0,67678.0
1,Afghanistan,Apples and products,Food supply quantity (g/capita/day),g/capita/day,4.39,4.3,4.21,5.02,5.44,5.94,...,2.01,1.96,4.99,4.71,4.92,6.0,6.29,5.29,3.97,6.07
2,Afghanistan,Apples and products,Food supply quantity (kg/capita/yr),Kg,1.6,1.57,1.54,1.83,1.98,2.17,...,0.73,0.71,1.82,1.72,1.79,2.19,2.3,1.93,1.45,2.22
3,Afghanistan,Apples and products,Food supply (kcal/capita/day),kcal/capita/day,2.0,2.0,2.0,2.0,3.0,3.0,...,1.0,1.0,2.0,2.0,2.0,3.0,3.0,3.0,2.0,3.0
4,Afghanistan,Apples and products,Protein supply quantity (g/capita/day),g/capita/day,0.01,0.01,0.01,0.01,0.02,0.02,...,0.01,0.01,0.01,0.01,0.01,0.02,0.02,0.02,0.01,0.02


Let's see what the different elements are. We should use only one, for consistency and simplicity.

In [21]:
crops['Element'].unique()

array(['Food supply quantity (tonnes)',
       'Food supply quantity (g/capita/day)',
       'Food supply quantity (kg/capita/yr)',
       'Food supply (kcal/capita/day)',
       'Protein supply quantity (g/capita/day)',
       'Fat supply quantity (g/capita/day)'], dtype=object)

In [22]:
# keep only the rows measured as food supply in g/capita/day
is_g_per_capita_day = crops['Element'] == 'Food supply quantity (g/capita/day)'
crops = crops[is_g_per_capita_day]
crops.head()

Unnamed: 0,Country,Item,Element,Unit,Y1961,Y1962,Y1963,Y1964,Y1965,Y1966,...,Y2004,Y2005,Y2006,Y2007,Y2008,Y2009,Y2010,Y2011,Y2012,Y2013
1,Afghanistan,Apples and products,Food supply quantity (g/capita/day),g/capita/day,4.39,4.3,4.21,5.02,5.44,5.94,...,2.01,1.96,4.99,4.71,4.92,6.0,6.29,5.29,3.97,6.07
7,Afghanistan,Bananas,Food supply quantity (g/capita/day),g/capita/day,0.0,0.0,0.0,0.0,0.0,0.0,...,0.69,0.09,1.16,0.82,1.18,3.71,2.06,2.61,4.39,7.39
13,Afghanistan,Barley and products,Food supply quantity (g/capita/day),g/capita/day,72.39,70.91,69.4,68.3,66.77,65.06,...,21.07,4.7,4.72,4.97,6.25,5.48,5.81,6.75,7.15,8.0
19,Afghanistan,Beer,Food supply quantity (g/capita/day),g/capita/day,0.0,0.0,0.0,0.0,0.0,0.0,...,0.53,0.27,0.33,0.21,0.25,0.3,0.28,0.25,0.25,0.25
24,Afghanistan,"Beverages, Alcoholic",Food supply quantity (g/capita/day),g/capita/day,0.0,0.0,0.0,0.0,0.0,0.01,...,0.01,0.02,0.02,0.01,0.07,0.01,0.0,0.0,0.0,0.0


Now let's eliminate a few more pointless columns.

In [23]:
# a single element is selected, so these two columns are no longer needed
for dropped_col in ('Element', 'Unit'):
    del crops[dropped_col]
# renumber the rows 0..n-1 after the filtering above
crops.reset_index(drop=True, inplace=True)
crops.head()

Unnamed: 0,Country,Item,Y1961,Y1962,Y1963,Y1964,Y1965,Y1966,Y1967,Y1968,...,Y2004,Y2005,Y2006,Y2007,Y2008,Y2009,Y2010,Y2011,Y2012,Y2013
0,Afghanistan,Apples and products,4.39,4.3,4.21,5.02,5.44,5.94,6.41,6.63,...,2.01,1.96,4.99,4.71,4.92,6.0,6.29,5.29,3.97,6.07
1,Afghanistan,Bananas,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.69,0.09,1.16,0.82,1.18,3.71,2.06,2.61,4.39,7.39
2,Afghanistan,Barley and products,72.39,70.91,69.4,68.3,66.77,65.06,60.18,59.51,...,21.07,4.7,4.72,4.97,6.25,5.48,5.81,6.75,7.15,8.0
3,Afghanistan,Beer,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.53,0.27,0.33,0.21,0.25,0.3,0.28,0.25,0.25,0.25
4,Afghanistan,"Beverages, Alcoholic",0.0,0.0,0.0,0.0,0.0,0.01,0.01,0.0,...,0.01,0.02,0.02,0.01,0.07,0.01,0.0,0.0,0.0,0.0


Perfect! Now let's do this for livestock.

In [24]:
meat.head()

Unnamed: 0,Country Code,Country,Item Code,Item,Element Code,Element,Unit,Y1961,Y1961F,Y1962,...,Y2009,Y2009F,Y2010,Y2010F,Y2011,Y2011F,Y2012,Y2012F,Y2013,Y2013F
0,2,Afghanistan,2731,Bovine Meat,641,Food supply quantity (tonnes),tonnes,43000.0,S,45800.0,...,136279.0,S,133397.0,S,140660.0,S,153438.0,S,140087.0,S
1,2,Afghanistan,2731,Bovine Meat,646,Food supply quantity (g/capita/day),g/capita/day,13.16,Fc,13.73,...,13.48,Fc,12.87,Fc,13.24,Fc,14.09,Fc,12.56,Fc
2,2,Afghanistan,2731,Bovine Meat,645,Food supply quantity (kg/capita/yr),Kg,4.8,Fc,5.01,...,4.92,Fc,4.7,Fc,4.83,Fc,5.14,Fc,4.59,Fc
3,2,Afghanistan,2731,Bovine Meat,664,Food supply (kcal/capita/day),kcal/capita/day,28.0,Fc,30.0,...,29.0,Fc,28.0,Fc,28.0,Fc,30.0,Fc,27.0,Fc
4,2,Afghanistan,2731,Bovine Meat,674,Protein supply quantity (g/capita/day),g/capita/day,1.99,Fc,2.07,...,2.03,Fc,1.94,Fc,2.0,Fc,2.13,Fc,1.89,Fc


Livestock resembles crops, so we should do the same process.

In [25]:
# first non-country index
meat[meat['Country'] == 'World'].index[0]

35085

In [26]:
# contains only countries: everything from the first 'World' row onwards is
# an aggregate category, so cut the table there instead of hard-coding the
# row number.
# NOTE: the index label is used as a slice position, which relies on meat
# still having the default 0..n-1 index it got from read_csv.
world_rows = meat.index[meat['Country'] == 'World']
if len(world_rows):
    # nothing to cut if the aggregate rows were already removed (cell re-run)
    meat = meat[:world_rows[0]]

In [27]:
# same column selection as for crops (cols_tokeep is reused)
meat_subset = meat[cols_tokeep]
meat = pd.DataFrame(meat_subset)
meat.head()

Unnamed: 0,Country,Item,Element,Unit,Y1961,Y1962,Y1963,Y1964,Y1965,Y1966,...,Y2004,Y2005,Y2006,Y2007,Y2008,Y2009,Y2010,Y2011,Y2012,Y2013
0,Afghanistan,Bovine Meat,Food supply quantity (tonnes),tonnes,43000.0,45800.0,47250.0,48000.0,48700.0,68000.0,...,150974.0,144742.0,121169.0,140515.0,134301.0,136279.0,133397.0,140660.0,153438.0,140087.0
1,Afghanistan,Bovine Meat,Food supply quantity (g/capita/day),g/capita/day,13.16,13.73,13.86,13.77,13.66,18.65,...,17.22,15.95,12.95,14.61,13.61,13.48,12.87,13.24,14.09,12.56
2,Afghanistan,Bovine Meat,Food supply quantity (kg/capita/yr),Kg,4.8,5.01,5.06,5.03,4.99,6.81,...,6.29,5.82,4.73,5.33,4.97,4.92,4.7,4.83,5.14,4.59
3,Afghanistan,Bovine Meat,Food supply (kcal/capita/day),kcal/capita/day,28.0,30.0,30.0,30.0,30.0,40.0,...,37.0,34.0,27.0,31.0,29.0,29.0,28.0,28.0,30.0,27.0
4,Afghanistan,Bovine Meat,Protein supply quantity (g/capita/day),g/capita/day,1.99,2.07,2.09,2.08,2.06,2.82,...,2.59,2.39,1.94,2.19,2.05,2.03,1.94,2.0,2.13,1.89


In [28]:
# keep only the rows measured as food supply in g/capita/day
is_g_per_capita_day = meat['Element'] == 'Food supply quantity (g/capita/day)'
meat = meat[is_g_per_capita_day]
meat.head()

Unnamed: 0,Country,Item,Element,Unit,Y1961,Y1962,Y1963,Y1964,Y1965,Y1966,...,Y2004,Y2005,Y2006,Y2007,Y2008,Y2009,Y2010,Y2011,Y2012,Y2013
1,Afghanistan,Bovine Meat,Food supply quantity (g/capita/day),g/capita/day,13.16,13.73,13.86,13.77,13.66,18.65,...,17.22,15.95,12.95,14.61,13.61,13.48,12.87,13.24,14.09,12.56
7,Afghanistan,"Butter, Ghee",Food supply quantity (g/capita/day),g/capita/day,3.4,3.34,3.65,3.66,3.9,3.96,...,3.86,3.64,3.35,3.5,3.33,3.29,3.26,3.14,3.22,3.22
13,Afghanistan,Cheese,Food supply quantity (g/capita/day),g/capita/day,4.31,4.24,4.51,4.65,4.8,4.35,...,2.26,2.18,1.98,2.35,2.28,2.18,2.14,2.23,2.17,2.15
19,Afghanistan,Cream,Food supply quantity (g/capita/day),g/capita/day,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.03,0.12,0.28,0.33,1.66
25,Afghanistan,Eggs,Food supply quantity (g/capita/day),g/capita/day,2.57,2.78,2.84,2.9,3.03,3.05,...,1.74,1.95,1.86,2.53,2.88,2.72,2.27,3.0,2.37,3.56


In [29]:
# a single element is selected, so these two columns are no longer needed
for dropped_col in ('Element', 'Unit'):
    del meat[dropped_col]
# renumber the rows 0..n-1 after the filtering above
meat.reset_index(drop=True, inplace=True)
meat.head()

Unnamed: 0,Country,Item,Y1961,Y1962,Y1963,Y1964,Y1965,Y1966,Y1967,Y1968,...,Y2004,Y2005,Y2006,Y2007,Y2008,Y2009,Y2010,Y2011,Y2012,Y2013
0,Afghanistan,Bovine Meat,13.16,13.73,13.86,13.77,13.66,18.65,17.42,18.59,...,17.22,15.95,12.95,14.61,13.61,13.48,12.87,13.24,14.09,12.56
1,Afghanistan,"Butter, Ghee",3.4,3.34,3.65,3.66,3.9,3.96,4.22,4.3,...,3.86,3.64,3.35,3.5,3.33,3.29,3.26,3.14,3.22,3.22
2,Afghanistan,Cheese,4.31,4.24,4.51,4.65,4.8,4.35,4.59,4.57,...,2.26,2.18,1.98,2.35,2.28,2.18,2.14,2.23,2.17,2.15
3,Afghanistan,Cream,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.03,0.12,0.28,0.33,1.66
4,Afghanistan,Eggs,2.57,2.78,2.84,2.9,3.03,3.05,3.4,3.05,...,1.74,1.95,1.86,2.53,2.88,2.72,2.27,3.0,2.37,3.56


Let's check: are the countries still the same?

In [30]:
# same countries, in the same order, in both food tables?
# np.array_equal returns False when the two arrays differ in length,
# where an elementwise == comparison would fail instead.
np.array_equal(crops['Country'].unique(), meat['Country'].unique())

True

# Healthcare: Hospital Beds & Physicians

In [31]:
beds.head()

Unnamed: 0,Country Name,Country Code,Indicator Name,Indicator Code,1960,1961,1962,1963,1964,1965,...,2008,2009,2010,2011,2012,2013,2014,2015,2016,Unnamed: 61
0,Aruba,ABW,"Hospital beds (per 1,000 people)",SH.MED.BEDS.ZS,,,,,,,...,,,,,,,,,,
1,Andorra,AND,"Hospital beds (per 1,000 people)",SH.MED.BEDS.ZS,,,,,,,...,,2.5,,,,,,,,
2,Afghanistan,AFG,"Hospital beds (per 1,000 people)",SH.MED.BEDS.ZS,0.170627,,,,,,...,0.42,0.4,0.4,,0.5,,,,,
3,Angola,AGO,"Hospital beds (per 1,000 people)",SH.MED.BEDS.ZS,2.061462,,,,,,...,,,,,,,,,,
4,Albania,ALB,"Hospital beds (per 1,000 people)",SH.MED.BEDS.ZS,5.102676,,,,,,...,,2.8,,2.43,2.6,,,,,


In [32]:
beds['Country Name'].unique()

array(['Aruba', 'Andorra', 'Afghanistan', 'Angola', 'Albania',
       'Arab World', 'United Arab Emirates', 'Argentina', 'Armenia',
       'American Samoa', 'Antigua and Barbuda', 'Australia', 'Austria',
       'Azerbaijan', 'Burundi', 'Belgium', 'Benin', 'Burkina Faso',
       'Bangladesh', 'Bulgaria', 'Bahrain', 'Bahamas, The',
       'Bosnia and Herzegovina', 'Belarus', 'Belize', 'Bermuda', 'Bolivia',
       'Brazil', 'Barbados', 'Brunei Darussalam', 'Bhutan', 'Botswana',
       'Central African Republic', 'Canada',
       'Central Europe and the Baltics', 'Switzerland', 'Channel Islands',
       'Chile', 'China', "Cote d'Ivoire", 'Cameroon', 'Congo, Rep.',
       'Colombia', 'Comoros', 'Cabo Verde', 'Costa Rica',
       'Caribbean small states', 'Cuba', 'Curacao', 'Cayman Islands',
       'Cyprus', 'Czech Republic', 'Germany', 'Djibouti', 'Dominica',
       'Denmark', 'Dominican Republic', 'Algeria',
       'East Asia & Pacific (excluding high income)',
       'Early-demographic di

The country names look different, but they all seem to be countries. Let's worry about them later.

First, let's drop some of the useless columns.

In [33]:
# drop the code/indicator columns and the trailing 'Unnamed: 61' column,
# then use the country name as the row index
for unused_col in ('Country Code', 'Indicator Name', 'Indicator Code', 'Unnamed: 61'):
    del beds[unused_col]
beds = beds.set_index('Country Name')

beds.head()

Unnamed: 0_level_0,1960,1961,1962,1963,1964,1965,1966,1967,1968,1969,...,2007,2008,2009,2010,2011,2012,2013,2014,2015,2016
Country Name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Aruba,,,,,,,,,,,...,,,,,,,,,,
Andorra,,,,,,,,,,,...,2.6,,2.5,,,,,,,
Afghanistan,0.170627,,,,,,,,,,...,0.42,0.42,0.4,0.4,,0.5,,,,
Angola,2.061462,,,,,,,,,,...,,,,,,,,,,
Albania,5.102676,,,,,,,,,,...,2.9,,2.8,,2.43,2.6,,,,


Looks good. Let's now do it for the physicians per 1000 people.

In [34]:
doctors.head()

Unnamed: 0,Country Name,Country Code,Indicator Name,Indicator Code,1960,1961,1962,1963,1964,1965,...,2008,2009,2010,2011,2012,2013,2014,2015,2016,Unnamed: 61
0,Aruba,ABW,"Physicians (per 1,000 people)",SH.MED.PHYS.ZS,,,,,,,...,,,,,,,,,,
1,Andorra,AND,"Physicians (per 1,000 people)",SH.MED.PHYS.ZS,,,,,,,...,,3.912,4.0,,,,,,,
2,Afghanistan,AFG,"Physicians (per 1,000 people)",SH.MED.PHYS.ZS,0.034844,,,,,0.063428,...,0.145,0.175,0.194,0.234,0.225,0.266,,,,
3,Angola,AGO,"Physicians (per 1,000 people)",SH.MED.PHYS.ZS,0.067068,,,,,0.076062,...,,0.166,,,,,,,,
4,Albania,ALB,"Physicians (per 1,000 people)",SH.MED.PHYS.ZS,0.276291,,,,,0.481283,...,,1.144,1.132,1.113,1.145,1.145,,,,


In [35]:
# drop the code/indicator columns and the trailing 'Unnamed: 61' column,
# then use the country name as the row index
for unused_col in ('Country Code', 'Indicator Name', 'Indicator Code', 'Unnamed: 61'):
    del doctors[unused_col]
doctors = doctors.set_index('Country Name')

doctors.head()

Unnamed: 0_level_0,1960,1961,1962,1963,1964,1965,1966,1967,1968,1969,...,2007,2008,2009,2010,2011,2012,2013,2014,2015,2016
Country Name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Aruba,,,,,,,,,,,...,,,,,,,,,,
Andorra,,,,,,,,,,,...,3.716,,3.912,4.0,,,,,,
Afghanistan,0.034844,,,,,0.063428,,,,,...,0.146,0.145,0.175,0.194,0.234,0.225,0.266,,,
Angola,0.067068,,,,,0.076062,,,,,...,,,0.166,,,,,,,
Albania,0.276291,,,,,0.481283,,,,,...,1.146,,1.144,1.132,1.113,1.145,1.145,,,


Finally, check once more that the country names are consistent.

In [36]:
# same country names, in the same order, in both healthcare tables?
# np.array_equal returns False when the two indexes differ in length,
# where an elementwise == comparison would raise instead.
np.array_equal(beds.index, doctors.index)

True

# Country Merging (CHRIS START HERE)

If stuff doesn't appear in master data, ignore it. I can probably try to implement changes to the actual indexes myself tomorrow morning, so focus on just identifying the ones that are different.

## Master data: make everything resemble this:

In [37]:
# country names taken from the index of the risk table; the other
# tables' country names are to be made to match these
master_list = risk.index.values

## Variation 1:

In [38]:
# distinct country names as they are spelled in the crops table
variation_1 = crops['Country'].unique()

## Variation 2

In [43]:
# country names as they are spelled in the index of the beds table
variation_2 = beds.index.values

In [45]:
# plain-list copies of the three collections of country names
mlist, var1, var2 = map(list, (master_list, variation_1, variation_2))

In [82]:
# switch to sets so that set operations (difference etc.) can be used
mlist, var1, var2 = set(mlist), set(var1), set(var2)

For documentation on these set operations, see: https://docs.python.org/2/library/sets.html

In [52]:
# number of differences
len(mlist.symmetric_difference(var1))

46

In [None]:
# print countries contained in only one of the lists
# for master and variation 1
# (print() call form so the cell runs on both Python 2 and Python 3)
print(mlist.symmetric_difference(var1))

In [53]:
# number of differences
len(mlist.symmetric_difference(var2))

132

In [54]:
# print countries contained in only one of the lists
# for master and variation 2
# (print() call form so the cell runs on both Python 2 and Python 3)
print(mlist.symmetric_difference(var2))

set(['United States of America', 'OECD members', 'Middle East & North Africa (IDA & IBRD countries)', 'Channel Islands', 'East Asia & Pacific (IDA & IBRD countries)', 'IDA total', 'St. Lucia', 'Dominica', 'Tanzania', 'Early-demographic dividend', 'Isle of Man', 'Monaco', 'Latin America & Caribbean', 'Yemen, Rep.', 'Upper middle income', 'Hong Kong SAR, China', 'Slovakia', 'Vanuatu', 'Nauru', 'Antigua and Barbuda', 'Liechtenstein', 'Micronesia, Fed. Sts.', 'North America', 'St. Vincent and the Grenadines', 'Middle income', 'Kyrgyz Republic', 'United States', 'Andorra', 'Gibraltar', 'Tuvalu', 'IDA only', 'Sub-Saharan Africa', "Lao People's Democratic Republic", 'Post-demographic dividend', 'Moldova', 'United Republic of Tanzania', 'Kosovo', 'Republic of Korea', 'Palau', 'United Kingdom of Great Britain and Northern Ireland', 'St. Martin (French part)', 'East Asia & Pacific (excluding high income)', 'Not classified', 'IBRD only', "Democratic People's Republic of Korea", 'United Kingdom', 

Ok, this might be doable manually, but it will be very painful (especially for variation 2 since there are over 100 discrepancies). Unfortunately, it doesn't look like it's going to be possible to map names to their variations with 100 percent accuracy. Going to try my best. Process will be something like the following:

(1) Remove all region names in variations 1 and 2 from consideration (this will prevent some inaccurate mappings), as they will probably be too general to be useful, and we can calculate them later by aggregating the data for the respective regions

(2) Try to match up countries by substring

(3) Creating a dictionary mapping each name in variations 1 and 2 (but not in master) to the most similar country by name in master

In [56]:
# names that occur in variation 1 / variation 2 but not in the master list
var1_not_master, var2_not_master = (
    variant.difference(mlist) for variant in (var1, var2)
)

In [57]:
var1_not_master

{'Antigua and Barbuda',
 'Belgium-Luxembourg',
 'Bermuda',
 'China, Hong Kong SAR',
 'China, Macao SAR',
 'China, Taiwan Province of',
 'China, mainland',
 'Czechoslovakia',
 "C\xf4te d'Ivoire",
 'Dominica',
 'Ethiopia PDR',
 'French Polynesia',
 'Grenada',
 'Kiribati',
 'Netherlands Antilles (former)',
 'New Caledonia',
 'Saint Kitts and Nevis',
 'Saint Lucia',
 'Saint Vincent and the Grenadines',
 'Samoa',
 'Sao Tome and Principe',
 'Serbia and Montenegro',
 'Sudan (former)',
 'The former Yugoslav Republic of Macedonia',
 'USSR',
 'United Kingdom',
 'Vanuatu',
 'Yugoslav SFR'}

In [72]:
var2_not_master

{'American Samoa',
 'Andorra',
 'Antigua and Barbuda',
 'Arab World',
 'Aruba',
 'Bahamas, The',
 'Bermuda',
 'Bolivia',
 'British Virgin Islands',
 'Caribbean small states',
 'Cayman Islands',
 'Central Europe and the Baltics',
 'Channel Islands',
 'Congo, Dem. Rep.',
 'Congo, Rep.',
 'Curacao',
 'Dominica',
 'Early-demographic dividend',
 'East Asia & Pacific',
 'East Asia & Pacific (IDA & IBRD countries)',
 'East Asia & Pacific (excluding high income)',
 'Egypt, Arab Rep.',
 'Euro area',
 'Europe & Central Asia',
 'Europe & Central Asia (IDA & IBRD countries)',
 'Europe & Central Asia (excluding high income)',
 'European Union',
 'Faroe Islands',
 'Fragile and conflict affected situations',
 'French Polynesia',
 'Gambia, The',
 'Gibraltar',
 'Greenland',
 'Grenada',
 'Guam',
 'Heavily indebted poor countries (HIPC)',
 'High income',
 'Hong Kong SAR, China',
 'IBRD only',
 'IDA & IBRD total',
 'IDA blend',
 'IDA only',
 'IDA total',
 'Iran, Islamic Rep.',
 'Isle of Man',
 'Kiribati',

In [73]:
# Names in variation 2 that are treated as regions/aggregates rather than
# countries and are therefore removed from consideration (sorted by name).
var2_exclude = {
    'Arab World',
    'Caribbean small states',
    'Central Europe and the Baltics',
    'Early-demographic dividend',
    'East Asia & Pacific',
    'East Asia & Pacific (IDA & IBRD countries)',
    'East Asia & Pacific (excluding high income)',
    'Euro area',
    'Europe & Central Asia',
    'Europe & Central Asia (IDA & IBRD countries)',
    'Europe & Central Asia (excluding high income)',
    'European Union',
    'Fragile and conflict affected situations',
    'Heavily indebted poor countries (HIPC)',
    'High income',
    'IBRD only',
    'IDA & IBRD total',
    'IDA blend',
    'IDA only',
    'IDA total',
    'Late-demographic dividend',
    'Latin America & Caribbean',
    'Latin America & Caribbean (excluding high income)',
    'Latin America & the Caribbean (IDA & IBRD countries)',
    'Least developed countries: UN classification',
    'Low & middle income',
    'Low income',
    'Lower middle income',
    'Middle East & North Africa',
    'Middle East & North Africa (IDA & IBRD countries)',
    'Middle East & North Africa (excluding high income)',
    'Middle income',
    'North America',
    'Not classified',
    'OECD members',
    'Other small states',
    'Pacific island small states',
    'Post-demographic dividend',
    'Pre-demographic dividend',
    'Small states',
    'South Asia',
    'South Asia (IDA & IBRD)',
    'Sub-Saharan Africa',
    'Sub-Saharan Africa (IDA & IBRD countries)',
    'Sub-Saharan Africa (excluding high income)',
    'Upper middle income',
    'West Bank and Gaza',
    'World',
}

Actually, it may be feasible to do this manually once the regions are removed, so I am going to try that.

In [76]:
# remove the region/aggregate names from var2 (in place)
var2.difference_update(var2_exclude)

In [83]:
var1 - mlist

{'Antigua and Barbuda',
 'Belgium-Luxembourg',
 'Bermuda',
 'China, Hong Kong SAR',
 'China, Macao SAR',
 'China, Taiwan Province of',
 'China, mainland',
 'Czechoslovakia',
 "C\xf4te d'Ivoire",
 'Dominica',
 'Ethiopia PDR',
 'French Polynesia',
 'Grenada',
 'Kiribati',
 'Netherlands Antilles (former)',
 'New Caledonia',
 'Saint Kitts and Nevis',
 'Saint Lucia',
 'Saint Vincent and the Grenadines',
 'Samoa',
 'Sao Tome and Principe',
 'Serbia and Montenegro',
 'Sudan (former)',
 'The former Yugoslav Republic of Macedonia',
 'USSR',
 'United Kingdom',
 'Vanuatu',
 'Yugoslav SFR'}

In [84]:
var2 - mlist

{'American Samoa',
 'Andorra',
 'Antigua and Barbuda',
 'Aruba',
 'Bahamas, The',
 'Bermuda',
 'Bolivia',
 'British Virgin Islands',
 'Cayman Islands',
 'Channel Islands',
 'Congo, Dem. Rep.',
 'Congo, Rep.',
 'Curacao',
 'Dominica',
 'Egypt, Arab Rep.',
 'Faroe Islands',
 'French Polynesia',
 'Gambia, The',
 'Gibraltar',
 'Greenland',
 'Grenada',
 'Guam',
 'High income',
 'Hong Kong SAR, China',
 'Iran, Islamic Rep.',
 'Isle of Man',
 'Kiribati',
 'Korea, Dem. People\xe2\x80\x99s Rep.',
 'Korea, Rep.',
 'Kosovo',
 'Kyrgyz Republic',
 'Lao PDR',
 'Liechtenstein',
 'Macao SAR, China',
 'Macedonia, FYR',
 'Marshall Islands',
 'Micronesia, Fed. Sts.',
 'Moldova',
 'Monaco',
 'Nauru',
 'New Caledonia',
 'Northern Mariana Islands',
 'Palau',
 'Puerto Rico',
 'Samoa',
 'San Marino',
 'Sao Tome and Principe',
 'Seychelles',
 'Sint Maarten (Dutch part)',
 'Slovak Republic',
 'St. Kitts and Nevis',
 'St. Lucia',
 'St. Martin (French part)',
 'St. Vincent and the Grenadines',
 'Tanzania',
 'Tong

In [79]:
# back to lists, which are easier to search through
mlist, var1, var2 = list(mlist), list(var1), list(var2)

In [155]:
'''
Lookup for variation 1: maps each name found in variation 1 but not in
the master list to its master-list spelling. A value of None means the
name has no master-list equivalent.
'''
# names without a master-list equivalent (value None)
var1_lookup = dict.fromkeys([
    'Antigua and Barbuda',
    'Bermuda',
    'China, Hong Kong SAR',
    'China, Macao SAR',
    'China, Taiwan Province of',
    'Czechoslovakia',
    'Dominica',
    'French Polynesia',
    'Grenada',
    'Kiribati',
    'Netherlands Antilles (former)',
    'New Caledonia',
    'Saint Kitts and Nevis',
    'Saint Lucia',
    'Saint Vincent and the Grenadines',
    'Samoa',
    'Sao Tome and Principe',
    # separate countries in master
    'Serbia and Montenegro',
    # now Sudan and South Sudan
    'Sudan (former)',
    'USSR',
    'Vanuatu',
    'Yugoslav SFR',
])
# names that do have a master-list equivalent
var1_lookup.update({
    'Belgium-Luxembourg': 'Luxembourg',
    'China, mainland': 'China',
    "C\xf4te d'Ivoire": "Cote d'Ivoire",
    'Ethiopia PDR': 'Ethiopia',
    # differs from master only in the capitalization of "republic"
    'The former Yugoslav Republic of Macedonia': 'The former Yugoslav republic of Macedonia',
    'United Kingdom': 'United Kingdom of Great Britain and Northern Ireland',
})

In [228]:
'''
Lookup for variation 2: maps each name found in variation 2 but not in
the master list to its master-list spelling. A value of None means the
name has no master-list equivalent.
'''
# names without a master-list equivalent (value None)
var2_lookup = dict.fromkeys([
    'American Samoa',
    'Andorra',
    'Antigua and Barbuda',
    'Aruba',
    'Bermuda',
    'British Virgin Islands',
    'Cayman Islands',
    'Channel Islands',
    'Curacao',
    'Dominica',
    'Faroe Islands',
    'French Polynesia',
    'Gibraltar',
    'Greenland',
    'Grenada',
    'Guam',
    'Hong Kong SAR, China',
    'Isle of Man',
    'Kiribati',
    'Kosovo',
    'Liechtenstein',
    'Macao SAR, China',
    'Marshall Islands',
    'Micronesia, Fed. Sts.',
    'Monaco',
    'Nauru',
    'New Caledonia',
    'Northern Mariana Islands',
    'Palau',
    'Puerto Rico',
    'Samoa',
    'San Marino',
    'Sao Tome and Principe',
    'Seychelles',
    'Sint Maarten (Dutch part)',
    'St. Kitts and Nevis',
    'St. Lucia',
    'St. Martin (French part)',
    'St. Vincent and the Grenadines',
    'Tonga',
    'Turks and Caicos Islands',
    'Tuvalu',
    'Vanuatu',
    'Virgin Islands (U.S.)',
])
# names that do have a master-list equivalent
var2_lookup.update({
    'Bahamas, The': 'Bahamas',
    'Bolivia': 'Bolivia (Plurinational State of)',
    'Congo, Dem. Rep.': 'Democratic Republic of the Congo',
    'Congo, Rep.': 'Congo',
    'Egypt, Arab Rep.': 'Egypt',
    'Gambia, The': 'Gambia',
    'Iran, Islamic Rep.': 'Iran (Islamic Republic of)',
    # North Korea
    'Korea, Dem. People\xe2\x80\x99s Rep.': "Democratic People's Republic of Korea",
    # South Korea
    'Korea, Rep.': 'Republic of Korea',
    'Kyrgyz Republic': 'Kyrgyzstan',
    'Lao PDR': "Lao People's Democratic Republic",
    'Macedonia, FYR': 'The former Yugoslav republic of Macedonia',
    'Moldova': 'Republic of Moldova',
    'Slovak Republic': 'Slovakia',
    'Tanzania': 'United Republic of Tanzania',
    'United Kingdom': 'United Kingdom of Great Britain and Northern Ireland',
    'United States': 'United States of America',
    'Venezuela, RB': 'Venezuela (Bolivarian Republic of)',
    'Vietnam': 'Viet Nam',
    'Yemen, Rep.': 'Yemen',
})

In [229]:
# how I searched for countries in master: case-insensitive substring match.
# Replace 'name_substring' with the (lowercase) text to look for.
[country for country in mlist if country.lower().find('name_substring') != -1]

[]

In [231]:
'''
Combined lookup built from var1_lookup and var2_lookup.
A name that is neither in master nor a key of this
dictionary was not considered a country.
'''
# start from the variation 1 entries, then add the variation 2 entries
# (for a key present in both, the variation 2 value is kept)
var_1_2_lookup = dict(var1_lookup)
var_1_2_lookup.update(var2_lookup)

In [233]:
var_1_2_lookup

{'American Samoa': None,
 'Andorra': None,
 'Antigua and Barbuda': None,
 'Aruba': None,
 'Bahamas, The': 'Bahamas',
 'Belgium-Luxembourg': 'Luxembourg',
 'Bermuda': None,
 'Bolivia': 'Bolivia (Plurinational State of)',
 'British Virgin Islands': None,
 'Cayman Islands': None,
 'Channel Islands': None,
 'China, Hong Kong SAR': None,
 'China, Macao SAR': None,
 'China, Taiwan Province of': None,
 'China, mainland': 'China',
 'Congo, Dem. Rep.': 'Democratic Republic of the Congo',
 'Congo, Rep.': 'Congo',
 'Curacao': None,
 'Czechoslovakia': None,
 "C\xf4te d'Ivoire": "Cote d'Ivoire",
 'Dominica': None,
 'Egypt, Arab Rep.': 'Egypt',
 'Ethiopia PDR': 'Ethiopia',
 'Faroe Islands': None,
 'French Polynesia': None,
 'Gambia, The': 'Gambia',
 'Gibraltar': None,
 'Greenland': None,
 'Grenada': None,
 'Guam': None,
 'Hong Kong SAR, China': None,
 'Iran, Islamic Rep.': 'Iran (Islamic Republic of)',
 'Isle of Man': None,
 'Kiribati': None,
 'Korea, Dem. People\xe2\x80\x99s Rep.': "Democratic Peop