#### Getting data from the World Bank to build a dataframe

In [31]:
# imports
import numpy as np
import pandas as pd
import matplotlib.pyplot
import seaborn as sns

from functools import reduce

#Custom API set up for WorldBank
import wbgapi as wb

-----
## Read in and Aggregate Agriculture/Rural Development Data

In [None]:
def world_bank_api(name, api_filepath, indicator_name):
    '''
    Reads in a dataframe from World Bank
    Renames columns
    Selects 2017 column
    prepares to be merged with other dataframes
    '''
    name = wb.data.DataFrame(api_filepath)
    # name.drop(columns = ['Unnamed: 66'], inplace = True)
    
    #selecting useful columns
    name = name[['YR2017']].copy()
    
    # rename to specify 
    name.rename(columns = {'YR2017': indicator_name}, inplace = True)
    
    return name

In [None]:
rural_pop_17 = world_bank_api('rural_pop_17', 'SP.RUR.TOTL.ZS','rural_pop_percent_2017')

In [None]:
rural_pop_17.head()

Unnamed: 0_level_0,rural_pop_percent_2017
economy,Unnamed: 1_level_1
ABW,56.707
AFE,64.667627
AFG,74.75
AFW,53.980666
AGO,35.161


In [None]:
names = ['rural_pop_percent_17', 'food_production_index_17', 'ag_land_area_17', 'arable_land_percent_2017',
         'net_migration_2017','hiv_prevalence_2017','mat_mortality_ratio_2017','under5_mortality_ratio_2017',
        'tubercul_incidence_2017','elec_access_2017','ren_energy_percent_2017','ffuel_energy_percent_2017',
        'co2_emissions_2017','pop_air_pollution_2017','foreign_dir_inv_2017','atm_access_2017','legal_rights_index_2017',
        'adol_fertility_rate_2017','fem_labor_part_rate_2017','male_labor_part_rate_2017','fertility_rate_2017',
        'dpt_immuniz_rate_2017','undernourished_rate_2017','cell_subscriptions_per100','internet_per_mil_2017',
        'military_exp_2017','women_seats_percent_2017','male_bus_start_2017','female_bus_start_2017','patent_apps_2017',
        'sci_articles_2017']

api_keys = ['SP.RUR.TOTL.ZS', 'AG.PRD.FOOD.XD', 'AG.LND.AGRI.ZS', 'AG.LND.ARBL.ZS','SM.POP.NETM',
            'SH.DYN.AIDS.ZS','SH.STA.MMRT','SH.DYN.MORT','SH.TBS.INCD','SP.RUR.TOTL.ZS','EG.FEC.RNEW.ZS',
            'EG.USE.COMM.FO.ZS','EN.ATM.CO2E.PC','EN.ATM.PM25.MC.M3','BX.KLT.DINV.CD.WD','FB.ATM.TOTL.P5','IC.LGL.CRED.XQ',
            'SP.ADO.TFRT','SL.TLF.CACT.FE.ZS','SL.TLF.CACT.MA.ZS','SP.DYN.TFRT.IN','SH.IMM.IDPT','SN.ITK.DEFC.ZS','IT.CEL.SETS.P2',
            'IT.CEL.SETS.P2','MS.MIL.XPND.GD.ZS','SG.GEN.PARL.ZS','IC.REG.DURS.MA','IC.REG.DURS.FE','IP.PAT.RESD',
            'IP.JRN.ARTC.SC']           

In [None]:
def merged_table(names_list, key_list):
    '''
    This function iterates through the various World Bank Indicators
    pulls in csv data and filters for the 2017 column
    and merges the sub-data frames into one merged copy
    '''
    data_frames = []
    
    # Creating data frames for each indicator
    # appends the name of each new data frame to a list to be used in the merge
    
    for name in names_list:
        for api in key_list:
            df_name = world_bank_api(name, api, name) # calling the function that was created previously
            data_frames.append(df_name)
            time.sleep(1)
    # the reduce function code was adapted from everestial007 response on stack overflow (https://stackoverflow.com/questions/44327999/python-pandas-merge-multiple-dataframes)
    # merging the dataframes
    merged_df = reduce(lambda left,right: pd.merge(left,right,
                                                   left_index = True,
                                                   right_index = True,
                                                   suffixes=(None, '_right'),
                                                   how='inner',
                                                  ),
                                                   data_frames)
    
    # remove any duplicate columns
    merged_df = merged_df[[col for col in merged_df if "right" not in col]]
    
    return merged_df

In [8]:
merged_df = merged_table(names, api_keys)

KeyboardInterrupt: 

In [None]:
# merged_table the end will need add the country names to the table (merge on economy/country code)

In [32]:
# read csv version
def world_bank_csv(name, filepath, indicator_name):
    '''
    Reads in csv file from World Bank
    Renames columns
    Selects 2017 column
    prepares to be merged with other dataframes
    '''
    name = pd.read_csv('../datasets/' + filepath, skiprows = 4)
    name.drop(columns = ['Unnamed: 66'], inplace = True)
    
    #selecting useful columns
    name = name[['Country Name','Country Code','2017']].copy()
    
    # rename to specify 
    name.rename(columns = {'2017': indicator_name}, inplace = True)
    
    return name

In [33]:
rural_pop_17 = world_bank_csv('rural_pop_17', 'API_SP.RUR.TOTL.ZS_DS2_en_csv_v2_4261416.csv','rural_pop_percent_2017')

In [34]:
rural_pop_17.head()

Unnamed: 0,Country Name,Country Code,rural_pop_percent_2017
0,Aruba,ABW,56.707
1,Africa Eastern and Southern,AFE,64.667627
2,Afghanistan,AFG,74.75
3,Africa Western and Central,AFW,53.980666
4,Angola,AGO,35.161


In [35]:
names = ['rural_pop_percent_17', 'food_production_index_17', 'ag_land_area_17', 'arable_land_percent_2017',
         'net_migration_2017','hiv_prevalence_2017','mat_mortality_ratio_2017','under5_mortality_ratio_2017',
        'tubercul_incidence_2017','elec_access_2017','ren_energy_percent_2017','ffuel_energy_percent_2017',
        'co2_emissions_2017','pop_air_pollution_2017','foreign_dir_inv_2017','atm_access_2017','legal_rights_index_2017',
        'adol_fertility_rate_2017','fem_labor_part_rate_2017','male_labor_part_rate_2017','fertility_rate_2017',
        'dpt_immuniz_rate_2017','undernourished_rate_2017','cell_subscriptions_per100','internet_per_mil_2017',
        'military_exp_2017','women_seats_percent_2017','male_bus_start_2017','female_bus_start_2017','patent_apps_2017',
        'sci_articles_2017','pop_density_2017']

file_paths = ['API_SP.RUR.TOTL.ZS_DS2_en_csv_v2_4261416.csv', 'API_AG.PRD.FOOD.XD_DS2_en_csv_v2_4254742.csv', 'API_AG.LND.AGRI.ZS_DS2_en_csv_v2_4254639.csv',
            'API_AG.LND.ARBL.ZS_DS2_en_csv_v2_4252676.csv','API_SM.POP.NETM_DS2_en_csv_v2_4250793.csv','API_SH.DYN.AIDS.ZS_DS2_en_csv_v2_4250791.csv',
            'API_SH.STA.MMRT_DS2_en_csv_v2_4252399.csv','API_SH.DYN.MORT_DS2_en_csv_v2_4252415.csv',
            'API_SH.TBS.INCD_DS2_en_csv_v2_4250622.csv','API_SP.RUR.TOTL.ZS_DS2_en_csv_v2_4261416.csv','API_EG.FEC.RNEW.ZS_DS2_en_csv_v2_4251598.csv',
            'API_EG.USE.COMM.FO.ZS_DS2_en_csv_v2_4250919.csv','API_EN.ATM.CO2E.PC_DS2_en_csv_v2_4251354.csv','API_EN.ATM.PM25.MC.M3_DS2_en_csv_v2_4251710.csv',
            'API_BX.KLT.DINV.CD.WD_DS2_en_csv_v2_4250821.csv','API_FB.ATM.TOTL.P5_DS2_en_csv_v2_4260830.csv','API_IC.LGL.CRED.XQ_DS2_en_csv_v2_4261026.csv',
            'API_SP.ADO.TFRT_DS2_en_csv_v2_4252410.csv','API_SL.TLF.CACT.FE.ZS_DS2_en_csv_v2_4250833.csv','API_SL.TLF.CACT.MA.ZS_DS2_en_csv_v2_4251123.csv',
            'API_SP.DYN.TFRT.IN_DS2_en_csv_v2_4252390.csv','API_SH.IMM.IDPT_DS2_en_csv_v2_4261936.csv','API_SN.ITK.DEFC.ZS_DS2_en_csv_v2_4251163.csv',
            'API_IT.CEL.SETS.P2_DS2_en_csv_v2_4251970.csv','API_IT.NET.SECR.P6_DS2_en_csv_v2_4251938.csv','API_MS.MIL.XPND.GD.ZS_DS2_en_csv_v2_4349031.csv',
            'API_SG.GEN.PARL.ZS_DS2_en_csv_v2_4353272.csv','API_IC.REG.DURS.MA_DS2_en_csv_v2_4353601.csv','API_IC.REG.DURS.FE_DS2_en_csv_v2_4353597.csv',
            'API_IP.PAT.RESD_DS2_en_csv_v2_4353337.csv','API_IP.JRN.ARTC.SC_DS2_en_csv_v2_4353653.csv','API_EN.POP.DNST_DS2_en_csv_v2_4353230.csv']           

In [36]:
def merged_table(names_list, path_list):
    '''
    This function iterates through the various World Bank Indicators
    pulls in csv data and filters for the 2017 column
    and merges the sub-data frames into one merged copy
    '''
    data_frames = []
    
    # Creating data frames for each indicator
    # appends the name of each new data frame to a list to be used in the merge
    
    for name in names_list:
        for path in path_list:
            df_name = world_bank_csv(name, path, name) # calling the function that was created previously
            data_frames.append(df_name)
            time.sleep(1)
    # the reduce function code was adapted from everestial007 response on stack overflow (https://stackoverflow.com/questions/44327999/python-pandas-merge-multiple-dataframes)
    # merging the dataframes
    merged_df = reduce(lambda left,right: pd.merge(left,right,
                                                   left_index = True,
                                                   right_index = True,
                                                   suffixes=(None, '_right'),
                                                   how='inner',
                                                  ),
                                                   data_frames)
    
    # remove any duplicate columns
    merged_df = merged_df[[col for col in merged_df if "right" not in col]]
    
    return merged_df

In [37]:
merged_df = merged_table(names, file_paths)

In [None]:
merged_df.head()

Unnamed: 0,Country Name,Country Code,rural_pop_percent_17,food_production_index_17,ag_land_area_17,arable_land_percent_2017,net_migration_2017,hiv_prevalence_2017,mat_mortality_ratio_2017,under5_mortality_ratio_2017,...,undernourished_rate_2017,cell_subscriptions_per100,internet_per_mil_2017,military_exp_2017,women_seats_percent_2017,male_bus_start_2017,female_bus_start_2017,patent_apps_2017,sci_articles_2017,pop_density_2017
0,Aruba,ABW,56.707,56.707,56.707,56.707,56.707,56.707,56.707,56.707,...,56.707,56.707,56.707,56.707,56.707,56.707,56.707,56.707,56.707,56.707
1,Africa Eastern and Southern,AFE,64.667627,64.667627,64.667627,64.667627,64.667627,64.667627,64.667627,64.667627,...,64.667627,64.667627,64.667627,64.667627,64.667627,64.667627,64.667627,64.667627,64.667627,64.667627
2,Afghanistan,AFG,74.75,74.75,74.75,74.75,74.75,74.75,74.75,74.75,...,74.75,74.75,74.75,74.75,74.75,74.75,74.75,74.75,74.75,74.75
3,Africa Western and Central,AFW,53.980666,53.980666,53.980666,53.980666,53.980666,53.980666,53.980666,53.980666,...,53.980666,53.980666,53.980666,53.980666,53.980666,53.980666,53.980666,53.980666,53.980666,53.980666
4,Angola,AGO,35.161,35.161,35.161,35.161,35.161,35.161,35.161,35.161,...,35.161,35.161,35.161,35.161,35.161,35.161,35.161,35.161,35.161,35.161


In [None]:
merged_df.to_csv('../datasets/merged_df.csv', index=False) # saved combined data frame to csv
# ready for further cleaning