In [1]:
#Import dependencies 

import pandas as pd
import numpy as np
from sqlalchemy import create_engine

# Store CSV into DataFrame

In [2]:
# Relative path to the health-systems CSV file
csv = "Data/health_systems.csv"

# Read the CSV into a DataFrame and display it
world_bank_df = pd.read_csv(filepath_or_buffer=csv)
world_bank_df

Unnamed: 0,Country_Region,Province_State,World_Bank_Name,Health_exp_pct_GDP_2016,Health_exp_public_pct_2016,Health_exp_out_of_pocket_pct_2016,Health_exp_per_capita_USD_2016,per_capita_exp_PPP_2016,External_health_exp_pct_2016,Physicians_per_1000_2009-18,Nurse_midwife_per_1000_2009-18,Specialist_surgical_per_1000_2008-18,Completeness_of_birth_reg_2009-18,Completeness_of_death_reg_2008-16
0,Afghanistan,,Afghanistan,10.2,5.1,77.4,57.2,162.8,17.5,0.3,0.3,0.0,42.3,
1,Albania,,Albania,6.7,41.4,58.0,271.5,759.7,0.7,1.2,3.6,11.6,98.4,53.0
2,Algeria,,Algeria,6.6,67.7,30.9,260.4,998.2,0.0,1.8,2.2,12.1,100.0,
3,Andorra,,Andorra,10.4,49.1,41.7,3834.7,4978.7,,3.3,4.0,83.1,100.0,80.0
4,Angola,,Angola,2.9,44.1,35.2,95.2,185.8,3.6,0.2,1.3,,25.0,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
205,US,Virgin Islands,Virgin Islands (U.S.),,,,,,,,,,,
206,,,West Bank and Gaza,,,,,,,,,,96.0,
207,,,"Yemen, Rep.",5.6,10.2,81.0,72.0,144.5,7.8,0.3,0.7,0.8,30.7,
208,Zambia,,Zambia,4.5,38.3,12.1,56.5,175.2,42.5,0.1,0.9,1.5,11.3,


# Select columns needed

In [3]:
# Keep only the identifying columns and the three 2016 expenditure columns.
# .copy() gives an independent frame so later edits never touch world_bank_df.
column_wb_df = world_bank_df.loc[
    :,
    [
        "Country_Region",
        "Province_State",
        "World_Bank_Name",
        "Health_exp_pct_GDP_2016",
        "Health_exp_public_pct_2016",
        "Health_exp_out_of_pocket_pct_2016",
    ],
].copy()

column_wb_df

Unnamed: 0,Country_Region,Province_State,World_Bank_Name,Health_exp_pct_GDP_2016,Health_exp_public_pct_2016,Health_exp_out_of_pocket_pct_2016
0,Afghanistan,,Afghanistan,10.2,5.1,77.4
1,Albania,,Albania,6.7,41.4,58.0
2,Algeria,,Algeria,6.6,67.7,30.9
3,Andorra,,Andorra,10.4,49.1,41.7
4,Angola,,Angola,2.9,44.1,35.2
...,...,...,...,...,...,...
205,US,Virgin Islands,Virgin Islands (U.S.),,,
206,,,West Bank and Gaza,,,
207,,,"Yemen, Rep.",5.6,10.2,81.0
208,Zambia,,Zambia,4.5,38.3,12.1


# Clean Data

In [4]:
# Replace the raw CSV headers with shorter, descriptive column labels
rename_wb_df = column_wb_df.rename(
    mapper={
        "Country_Region": "Country",
        "Province_State": "Province",
        "World_Bank_Name": "World_Bank",
        "Health_exp_pct_GDP_2016": "Current_Health_Expenditure",
        "Health_exp_public_pct_2016": "Domestic_Government_Health_Expenditure",
        "Health_exp_out_of_pocket_pct_2016": "Out-of-Pocket_Expenditure",
    },
    axis="columns",
)

In [5]:
# Count the non-missing entries per column; a column whose count is lower
# than the others contains missing values
rename_wb_df.notna().sum()

Country                                   187
Province                                   14
World_Bank                                210
Current_Health_Expenditure                186
Domestic_Government_Health_Expenditure    186
Out-of-Pocket_Expenditure                 186
dtype: int64

In [6]:
# Identify rows with a missing value in the "Country" column.
# isna() states the intent directly instead of relying on the
# NaN-is-not-equal-to-itself behaviour of query('Country != Country').
rename_wb_df[rename_wb_df["Country"].isna()]

Unnamed: 0,Country,Province,World_Bank,Current_Health_Expenditure,Domestic_Government_Health_Expenditure,Out-of-Pocket_Expenditure
97,,,Kiribati,11.9,79.6,0.1
98,,,"Korea, Dem. People's Rep.",,,
100,,,Kosovo,,,
106,,,Lesotho,8.1,63.8,18.9
113,,,Malawi,9.8,28.0,11.4
118,,,Marshall Islands,23.3,52.6,9.0
122,,,"Micronesia, Fed. Sts.",12.6,27.8,2.6
129,,,Myanmar,5.1,20.1,74.0
139,,,Northern Mariana Islands,,,
143,,,Palau,11.7,59.8,14.5


In [7]:
# Discard every row whose "Country" entry is missing
drop_na_country_df = rename_wb_df.dropna(axis=0, how="any", subset=["Country"])

drop_na_country_df

Unnamed: 0,Country,Province,World_Bank,Current_Health_Expenditure,Domestic_Government_Health_Expenditure,Out-of-Pocket_Expenditure
0,Afghanistan,,Afghanistan,10.2,5.1,77.4
1,Albania,,Albania,6.7,41.4,58.0
2,Algeria,,Algeria,6.6,67.7,30.9
3,Andorra,,Andorra,10.4,49.1,41.7
4,Angola,,Angola,2.9,44.1,35.2
...,...,...,...,...,...,...
203,Venezuela,,"Venezuela, RB",3.2,24.1,40.0
204,Vietnam,,Vietnam,5.7,47.4,44.6
205,US,Virgin Islands,Virgin Islands (U.S.),,,
208,Zambia,,Zambia,4.5,38.3,12.1


In [8]:
# Identify the rows that carry a value in the "Province" column.
# notna() selects them explicitly instead of relying on the
# NaN-is-not-equal-to-itself behaviour of query("Province == Province").
provinces = drop_na_country_df[drop_na_country_df["Province"].notna()]
prov_list = provinces["Province"]

provinces

Unnamed: 0,Country,Province,World_Bank,Current_Health_Expenditure,Domestic_Government_Health_Expenditure,Out-of-Pocket_Expenditure
32,United Kingdom,Cayman Islands,Cayman Islands,,,
35,United Kingdom,Channel Islands,Channel Islands,,,
38,China,Hong Kong,"Hong Kong SAR, China",,,
39,China,Macau,"Macao SAR, China",,,
62,Denmark,Faroe Islands,Faroe Islands,,,
66,France,French Polynesia,French Polynesia,,,
73,Denmark,Greenland,Greenland,,,
75,US,Guam,Guam,,,
89,United Kingdom,Isle of Man,Isle of Man,,,
133,France,New Caledonia,New Caledonia,,,


In [9]:
# Drop province-level rows: keep only rows whose "Province" is missing.
# Testing the column directly avoids depending on a list built in another
# cell, so this cell gives the same result regardless of that cell's state.
drop_provs_df = drop_na_country_df[drop_na_country_df["Province"].isna()]
drop_provs_df

Unnamed: 0,Country,Province,World_Bank,Current_Health_Expenditure,Domestic_Government_Health_Expenditure,Out-of-Pocket_Expenditure
0,Afghanistan,,Afghanistan,10.2,5.1,77.4
1,Albania,,Albania,6.7,41.4,58.0
2,Algeria,,Algeria,6.6,67.7,30.9
3,Andorra,,Andorra,10.4,49.1,41.7
4,Angola,,Angola,2.9,44.1,35.2
...,...,...,...,...,...,...
201,Uzbekistan,,Uzbekistan,6.3,46.1,52.2
203,Venezuela,,"Venezuela, RB",3.2,24.1,40.0
204,Vietnam,,Vietnam,5.7,47.4,44.6
208,Zambia,,Zambia,4.5,38.3,12.1


In [10]:
# Remove the "Province" column from the dataframe.
# drop_provs_df is reassigned to itself here, so on a second run of this cell
# the column is already gone; errors="ignore" keeps the re-run from raising
# KeyError.
drop_provs_df = drop_provs_df.drop(columns=["Province"], errors="ignore")
drop_provs_df

Unnamed: 0,Country,World_Bank,Current_Health_Expenditure,Domestic_Government_Health_Expenditure,Out-of-Pocket_Expenditure
0,Afghanistan,Afghanistan,10.2,5.1,77.4
1,Albania,Albania,6.7,41.4,58.0
2,Algeria,Algeria,6.6,67.7,30.9
3,Andorra,Andorra,10.4,49.1,41.7
4,Angola,Angola,2.9,44.1,35.2
...,...,...,...,...,...
201,Uzbekistan,Uzbekistan,6.3,46.1,52.2
203,Venezuela,"Venezuela, RB",3.2,24.1,40.0
204,Vietnam,Vietnam,5.7,47.4,44.6
208,Zambia,Zambia,4.5,38.3,12.1


In [11]:
# Check, column by column, whether any missing values are still present
# (True means the column still contains at least one missing value)
drop_provs_df.isnull().any(axis=0)

Country                                   False
World_Bank                                False
Current_Health_Expenditure                 True
Domestic_Government_Health_Expenditure     True
Out-of-Pocket_Expenditure                  True
dtype: bool

In [21]:
# Locate the rows that still have a missing value in ANY remaining column.
# Inspecting only "Current_Health_Expenditure" would overlook a row whose
# other expenditure figures are blank, so every column is checked.
missing_values = drop_provs_df[drop_provs_df.isna().any(axis=1)]
missing_list = missing_values["Country"]

missing_values

Unnamed: 0,Country,World_Bank,Current_Health_Expenditure,Domestic_Government_Health_Expenditure,Out-of-Pocket_Expenditure
108,Libya,Libya,,,
109,Liechtenstein,Liechtenstein,,,
169,Somalia,Somalia,,,
182,Syria,Syrian Arab Republic,,,


In [23]:
# Remove every row that still contains a missing value in any column.
# dropna() works on the rows themselves, so it does not depend on a list of
# country names prepared in another cell and it covers all columns rather
# than a single one.
final_df = drop_provs_df.dropna(axis=0, how="any")
final_df

Unnamed: 0,Country,World_Bank,Current_Health_Expenditure,Domestic_Government_Health_Expenditure,Out-of-Pocket_Expenditure
0,Afghanistan,Afghanistan,10.2,5.1,77.4
1,Albania,Albania,6.7,41.4,58.0
2,Algeria,Algeria,6.6,67.7,30.9
3,Andorra,Andorra,10.4,49.1,41.7
4,Angola,Angola,2.9,44.1,35.2
...,...,...,...,...,...
201,Uzbekistan,Uzbekistan,6.3,46.1,52.2
203,Venezuela,"Venezuela, RB",3.2,24.1,40.0
204,Vietnam,Vietnam,5.7,47.4,44.6
208,Zambia,Zambia,4.5,38.3,12.1


In [24]:
# Confirm that no missing values remain: every column should report the
# same number of non-missing entries
final_df.notna().sum()

Country                                   169
World_Bank                                169
Current_Health_Expenditure                169
Domestic_Government_Health_Expenditure    169
Out-of-Pocket_Expenditure                 169
dtype: int64