Importing pandas and numpy

In [3]:
import pandas as pd
import numpy as np

Import first data frame - child mortality vs health expenditure

In [4]:
cmdf = pd.read_csv ("child-mortality-vs-health-expenditure.csv")

Import second data frame - life expectancy vs health expenditure

In [5]:
ledf = pd.read_csv ("life-expectancy-vs-healthcare-expenditure.csv")

Import third data frame - share of out of pocket health expenditure vs GDP per capita

In [6]:
gdpdf = pd.read_csv ("share-of-out-of-pocket-expenditure-vs-gdp-per-capita.csv")

check outputs

In [7]:
cmdf.head()

Unnamed: 0,Entity,Code,Year,CM,HE,Population,Continent
0,Australia,AUS,1900,,,3693392,
1,Austria,AUT,1900,,,5979177,
2,Belgium,BEL,1900,,,6717693,
3,Canada,CAN,1900,,,5411600,
4,Denmark,DNK,1900,,,2565597,


In [8]:
ledf.head()

Unnamed: 0,Entity,Code,Year,LE,HE,Population,Continent
0,Australia,AUS,1900,,,3693392,
1,Austria,AUT,1900,,,5979177,
2,Belgium,BEL,1900,,,6717693,
3,Canada,CAN,1900,,,5411600,
4,Denmark,DNK,1900,,,2565597,


In [9]:
gdpdf.head()

Unnamed: 0,Entity,Code,Year,OE,GDP,Population,Continent
0,Australia,AUS,1900,,,3693392,
1,Austria,AUT,1900,,,5979177,
2,Belgium,BEL,1900,,,6717693,
3,Canada,CAN,1900,,,5411600,
4,Denmark,DNK,1900,,,2565597,


Remove country code and continent columns entirely from all three data frames (continent column is almost enitrely empty. Country code is redundant, since we already have country name)

In [10]:
del cmdf["Code"]
del cmdf["Continent"]
del ledf["Code"]
del ledf["Continent"]
del gdpdf["Code"]
del gdpdf["Continent"]

Remove rows with empty values

In [11]:
cmdf_c = cmdf.dropna(subset = ['Population'], inplace = True)
cmdf_c = cmdf.dropna(subset = ['HE'], inplace = True)
cmdf_c = cmdf.dropna(subset = ['CM'], inplace = True)

ledf_c = ledf.dropna(subset = ['Population'], inplace = True)
ledf_c = ledf.dropna(subset = ['HE'], inplace = True)
ledf_c = ledf.dropna(subset = ['LE'], inplace = True)

gdpdf_c = gdpdf.dropna(subset = ['Population'], inplace = True)
gdpdf_c = gdpdf.dropna(subset = ['OE'], inplace = True)
gdpdf_c = gdpdf.dropna(subset = ['GDP'], inplace = True)

Remove any duplicates

In [12]:
cmdf.drop_duplicates(inplace = True)
ledf.drop_duplicates(inplace = True)
gdpdf.drop_duplicates(inplace = True)

check outputs

In [13]:
cmdf.head()

Unnamed: 0,Entity,Year,CM,HE,Population
2600,Australia,2000,0.62,2155.341553,18991434
2601,Austria,2000,0.55,2708.029541,8069276
2602,Belgium,2000,0.58,2226.144287,10282046
2603,Canada,2000,0.62,2429.605713,30588379
2604,Denmark,2000,0.57,2327.757813,5341192


In [14]:
ledf.head()

Unnamed: 0,Entity,Year,LE,HE,Population
2600,Australia,2000,79.234146,2155.341553,18991434
2601,Austria,2000,78.126831,2708.029541,8069276
2602,Belgium,2000,77.721954,2226.144287,10282046
2603,Canada,2000,79.136589,2429.605713,30588379
2604,Denmark,2000,76.592682,2327.757813,5341192


In [15]:
gdpdf.head()

Unnamed: 0,Entity,Year,OE,GDP,Population
2600,Australia,2000,20.997139,38217.44922,18991434
2601,Austria,2000,17.82222,46469.85938,8069276
2602,Belgium,2000,20.227228,43024.14063,10282046
2603,Canada,2000,16.605209,37194.39453,30588379
2604,Denmark,2000,15.356432,48907.01172,5341192


For cm and le, remove duplicate colums and merge - resulting data frame should include entity, year, mortality, HE, population, and LE

In [16]:
difcol_1 = ledf.columns.difference(cmdf.columns)
dif_1 = ledf[difcol_1]
merge1 = pd.merge(cmdf, dif_1, left_index = True, right_index = True, how = 'inner')

Check output

In [17]:
merge1.head()

Unnamed: 0,Entity,Year,CM,HE,Population,LE
2600,Australia,2000,0.62,2155.341553,18991434,79.234146
2601,Austria,2000,0.55,2708.029541,8069276,78.126831
2602,Belgium,2000,0.58,2226.144287,10282046,77.721954
2603,Canada,2000,0.62,2429.605713,30588379,79.136589
2604,Denmark,2000,0.57,2327.757813,5341192,76.592682


For gdp and merge1, remove duplicate colums and merge - will produce final data fram that includes entity, year, mortality, HE, population, LE, OE, and GDP

In [18]:
difcol_2 = gdpdf.columns.difference(merge1.columns)
dif_2 = gdpdf[difcol_2]
df = pd.merge(merge1, dif_2, left_index = True, right_index = True, how = 'inner')

check output

In [19]:
df.head()

Unnamed: 0,Entity,Year,CM,HE,Population,LE,GDP,OE
2600,Australia,2000,0.62,2155.341553,18991434,79.234146,38217.44922,20.997139
2601,Austria,2000,0.55,2708.029541,8069276,78.126831,46469.85938,17.82222
2602,Belgium,2000,0.58,2226.144287,10282046,77.721954,43024.14063,20.227228
2603,Canada,2000,0.62,2429.605713,30588379,79.136589,37194.39453,16.605209
2604,Denmark,2000,0.57,2327.757813,5341192,76.592682,48907.01172,15.356432


Child mortality is listed as per 1,000 live births - the below code will convert it to a percentage

In [20]:
df["CM_%"] = ((df["CM"]*100)/1000)

HE is listed per capita - the following will convert it to a % of GDP

In [21]:
df["HE_%"] = ((df["HE"]/df["GDP"])*100)

OE is listed as % of current health expenditure - the following will convert to % of GDP
OE * HE = OED (straight dollar number)
OED % GDP = OE_%

In [22]:
df["OE_%"] = ((df["OE"]*df["HE"])/df["GDP"])

In [23]:
df.head()

Unnamed: 0,Entity,Year,CM,HE,Population,LE,GDP,OE,CM_%,HE_%,OE_%
2600,Australia,2000,0.62,2155.341553,18991434,79.234146,38217.44922,20.997139,0.062,5.639679,1.184171
2601,Austria,2000,0.55,2708.029541,8069276,78.126831,46469.85938,17.82222,0.055,5.827497,1.038589
2602,Belgium,2000,0.58,2226.144287,10282046,77.721954,43024.14063,20.227228,0.058,5.174175,1.046592
2603,Canada,2000,0.62,2429.605713,30588379,79.136589,37194.39453,16.605209,0.062,6.532182,1.084683
2604,Denmark,2000,0.57,2327.757813,5341192,76.592682,48907.01172,15.356432,0.057,4.759558,0.730898


Round all columns to 3 decimal places

In [24]:
df.round(3)

Unnamed: 0,Entity,Year,CM,HE,Population,LE,GDP,OE,CM_%,HE_%,OE_%
2600,Australia,2000,0.62,2155.342,18991434,79.234,38217.449,20.997,0.062,5.640,1.184
2601,Austria,2000,0.55,2708.030,8069276,78.127,46469.859,17.822,0.055,5.827,1.039
2602,Belgium,2000,0.58,2226.144,10282046,77.722,43024.141,20.227,0.058,5.174,1.047
2603,Canada,2000,0.62,2429.606,30588379,79.137,37194.395,16.605,0.062,6.532,1.085
2604,Denmark,2000,0.57,2327.758,5341192,76.593,48907.012,15.356,0.057,4.760,0.731
...,...,...,...,...,...,...,...,...,...,...,...
3115,Sweden,2019,0.27,6222.659,10036391,83.110,52850.570,13.876,0.027,11.774,1.634
3116,Switzerland,2019,0.40,8532.496,8591361,83.905,70944.352,25.293,0.040,12.027,3.042
3117,United Kingdom,2019,0.43,5087.381,67530161,81.205,47368.586,17.074,0.043,10.740,1.834
3118,United States,2019,0.64,10921.013,329064917,78.788,62630.875,11.312,0.064,17.437,1.972


Rename "Entity" column to "Country"

In [25]:
df.rename(columns = {"Entity":"Country"})

Unnamed: 0,Country,Year,CM,HE,Population,LE,GDP,OE,CM_%,HE_%,OE_%
2600,Australia,2000,0.62,2155.341553,18991434,79.234146,38217.44922,20.997139,0.062,5.639679,1.184171
2601,Austria,2000,0.55,2708.029541,8069276,78.126831,46469.85938,17.822220,0.055,5.827497,1.038589
2602,Belgium,2000,0.58,2226.144287,10282046,77.721954,43024.14063,20.227228,0.058,5.174175,1.046592
2603,Canada,2000,0.62,2429.605713,30588379,79.136589,37194.39453,16.605209,0.062,6.532182,1.084683
2604,Denmark,2000,0.57,2327.757813,5341192,76.592682,48907.01172,15.356432,0.057,4.759558,0.730898
...,...,...,...,...,...,...,...,...,...,...,...
3115,Sweden,2019,0.27,6222.659180,10036391,83.109756,52850.57031,13.876382,0.027,11.774063,1.633814
3116,Switzerland,2019,0.40,8532.496094,8591361,83.904877,70944.35156,25.293182,0.040,12.027027,3.042018
3117,United Kingdom,2019,0.43,5087.381348,67530161,81.204880,47368.58594,17.073608,0.043,10.739990,1.833704
3118,United States,2019,0.64,10921.012700,329064917,78.787804,62630.87500,11.311578,0.064,17.437107,1.972412


output as final CSV

In [26]:
df.to_csv('final_health_expenditure_data.csv')