In [365]:
import pandas as pd
import numpy as np
from pathlib import Path
from datetime import date

In [366]:
# The query string filters to cause 'Malaria' and sex code 'BTSX' (presumably
# "both sexes" - confirm against the API docs), selects six columns and asks
# for CSV output.
ghe_malaria_url = 'https://frontdoor-l4uikgap6gz3m.azurefd.net/DEX_CMS/GHE_FULL?$filter=DIM_GHECAUSE_TITLE%20eq%20%27Malaria%27%20and%20DIM_SEX_CODE%20eq%20%27BTSX%27&$select=DIM_GHECAUSE_TITLE,DIM_YEAR_CODE,DIM_COUNTRY_CODE,DIM_AGEGROUP_CODE,DIM_SEX_CODE,VAL_DEATHS_COUNT_NUMERIC&$format=csv'
df = pd.read_csv(ghe_malaria_url)

In [367]:
# Dimensions (rows, columns) of the downloaded table.
df.shape

(73200, 6)

Saving a copy of the data locally in case the API changes or breaks

In [368]:
# NOTE: this rebinds `date` from the imported datetime.date class to today's
# date; the final export cell reads the same name for its file prefix.
date = date.today()

# Make sure the output folder exists, then write the raw download into it.
data_dir = Path("data/")
data_dir.mkdir(parents=True, exist_ok=True)
df.to_csv(f'data/{date}_ghe_malaria_deaths.csv', index=False)

Removing the rows for All Ages as we are only interested in age breakdowns.

In [369]:
# Drop the 'ALLAges' rows so that only the individual age bands remain.
# .copy() yields an independent frame, so adding a column to it later does not
# trigger pandas' SettingWithCopyWarning.
df = df.loc[df['DIM_AGEGROUP_CODE'] != 'ALLAges'].copy()

In [370]:
# Dimensions (rows, columns) once the 'ALLAges' rows have been removed.
df.shape

(69540, 6)

Current age groups

In [371]:
# List each distinct age-group code (the first occurrence of every value is kept).
df['DIM_AGEGROUP_CODE'].drop_duplicates()

30     YEARS70-74
31     YEARS75-79
32     YEARS80-84
33    YEARS85PLUS
34       YEARS0-1
35       YEARS1-4
36       YEARS5-9
37     YEARS10-14
38     YEARS15-19
39     YEARS20-24
40     YEARS25-29
41     YEARS30-34
42     YEARS35-39
43     YEARS40-44
44     YEARS45-49
45     YEARS50-54
46     YEARS55-59
47     YEARS60-64
48     YEARS65-69
Name: DIM_AGEGROUP_CODE, dtype: object

Years with data

In [372]:
# List each distinct year present in the data (first occurrence of every value).
df['DIM_YEAR_CODE'].drop_duplicates()

30       2000
3596     2001
7200     2002
10935    2003
14582    2004
18268    2005
22013    2006
25612    2007
29201    2008
32933    2009
36591    2010
40129    2011
43872    2012
47464    2013
51147    2014
55083    2015
58743    2016
62403    2017
66063    2018
69723    2019
Name: DIM_YEAR_CODE, dtype: int64

Reclassifying age groups

In [373]:

# Broad age band -> the source age-group codes that belong to it.
age_bands = {
    'Under_5s': ['YEARS0-1', 'YEARS1-4'],
    'Years5-14': ['YEARS5-9', 'YEARS10-14'],
    'Years15-49': ['YEARS15-19', 'YEARS20-24', 'YEARS25-29', 'YEARS30-34',
                   'YEARS35-39', 'YEARS40-44', 'YEARS45-49'],
    'Years50-69': ['YEARS50-54', 'YEARS55-59', 'YEARS60-64', 'YEARS65-69'],
    'Years70plus': ['YEARS70-74', 'YEARS75-79', 'YEARS80-84', 'YEARS85PLUS'],
}

# One boolean mask per band, in the same order as the band labels.
conditions = [df['DIM_AGEGROUP_CODE'].isin(codes) for codes in age_bands.values()]
values = list(age_bands)

# np.select falls back to the integer 0 when no condition matches. Recent NumPy
# releases refuse to combine that with string choices, and older ones silently
# turn it into the string '0'. An explicit string default keeps the column
# textual and makes any unmapped age-group code easy to spot.
# assign() builds a new frame instead of writing into a filtered slice, which
# avoids SettingWithCopyWarning and keeps the cell safe to re-run.
df = df.assign(
    new_age_group=np.select(conditions, values, default='Unclassified')
)


Check there aren't any age groups which have been missed

In [374]:
# Distinct reclassified labels; anything other than the five bands means a
# source age-group code was not matched.
df['new_age_group'].drop_duplicates()

30    Years70plus
34       Under_5s
36      Years5-14
38     Years15-49
45     Years50-69
Name: new_age_group, dtype: object

Create the output table for individual countries

In [375]:
# Display the frame, which now carries the new_age_group column.
df

Unnamed: 0,DIM_GHECAUSE_TITLE,DIM_YEAR_CODE,DIM_COUNTRY_CODE,DIM_AGEGROUP_CODE,DIM_SEX_CODE,VAL_DEATHS_COUNT_NUMERIC,new_age_group
30,Malaria,2000,AGO,YEARS70-74,BTSX,218.29,Years70plus
31,Malaria,2000,AGO,YEARS75-79,BTSX,28.78,Years70plus
32,Malaria,2000,AGO,YEARS80-84,BTSX,2.88,Years70plus
33,Malaria,2000,AGO,YEARS85PLUS,BTSX,0.09,Years70plus
34,Malaria,2000,ALB,YEARS0-1,BTSX,0.00,Under_5s
...,...,...,...,...,...,...,...
73195,Malaria,2019,ZWE,YEARS65-69,BTSX,32.05,Years50-69
73196,Malaria,2019,ZWE,YEARS70-74,BTSX,27.15,Years70plus
73197,Malaria,2019,ZWE,YEARS75-79,BTSX,7.26,Years70plus
73198,Malaria,2019,ZWE,YEARS80-84,BTSX,2.95,Years70plus


In [377]:
# Total deaths per year, country and broad age band.
dfwg = (
    df.groupby(['DIM_YEAR_CODE', 'DIM_COUNTRY_CODE', 'new_age_group'])[['VAL_DEATHS_COUNT_NUMERIC']]
    .sum()
    .reset_index()
)

In [384]:
# Rename the country-code column to 'entity'.
dfwg = dfwg.rename(columns={'DIM_COUNTRY_CODE': 'entity'})
# Whole-number death counts, rounded from the summed estimates.
dfwg['value'] = dfwg['VAL_DEATHS_COUNT_NUMERIC'].round(0).astype(int)



In [382]:
# Display the aggregated table with the rounded 'value' column.
dfwg

Unnamed: 0,DIM_YEAR_CODE,entity,new_age_group,VAL_DEATHS_COUNT_NUMERIC,value
0,2000,AFG,Under_5s,265.00,265
1,2000,AFG,Years15-49,347.90,348
2,2000,AFG,Years5-14,29.07,29
3,2000,AFG,Years50-69,280.20,280
4,2000,AFG,Years70plus,48.82,49
...,...,...,...,...,...
18295,2019,ZWE,Under_5s,720.00,720
18296,2019,ZWE,Years15-49,337.40,337
18297,2019,ZWE,Years5-14,747.81,748
18298,2019,ZWE,Years50-69,160.11,160


In [385]:
# Reshape to one row per (entity, year) with one column per age band.
country_values = dfwg[['DIM_YEAR_CODE', 'entity', 'new_age_group', 'value']]
dfw = country_values.pivot_table(
    index=['entity', 'DIM_YEAR_CODE'],
    columns='new_age_group',
    values='value',
).reset_index()
# Clear the leftover 'new_age_group' label on the column axis.
dfw.columns.name = None


In [386]:
# Show the 2019 rows only.
dfw.loc[dfw['DIM_YEAR_CODE'].eq(2019)]

Unnamed: 0,entity,DIM_YEAR_CODE,Under_5s,Years15-49,Years5-14,Years50-69,Years70plus
19,AFG,2019,40,117,3,20,7
39,AGO,2019,6459,2184,2234,2113,617
59,ALB,2019,0,0,0,0,0
79,ARE,2019,0,0,0,0,0
99,ARG,2019,0,0,0,0,0
...,...,...,...,...,...,...,...
3579,WSM,2019,0,0,0,0,0
3599,YEM,2019,510,882,17,217,93
3619,ZAF,2019,18,44,7,12,2
3639,ZMB,2019,3143,1663,1459,1200,417


Create a dataframe for the world which is the sum of all the country values

In [328]:
# Sum deaths over all countries for every year and broad age band.
dfg = (
    df.groupby(['DIM_YEAR_CODE', 'new_age_group'])[['VAL_DEATHS_COUNT_NUMERIC']]
    .sum()
    .reset_index()
)

In [329]:
# Label every aggregated row as belonging to the 'World' entity.
dfg = dfg.assign(entity='World')

In [330]:
# Reshape the world totals to one row per year with one column per age band.
dfp = dfg.pivot_table(
    index=['entity', 'DIM_YEAR_CODE'],
    columns='new_age_group',
    values='VAL_DEATHS_COUNT_NUMERIC',
)

In [331]:
# Turn the (entity, year) index back into columns and clear the name left on
# the column axis by the pivot.
dfp = dfp.reset_index().rename_axis(None, axis=1)

Combine the country and the world tables together

In [332]:
# Stack the world rows under the country rows and renumber the index.
# DataFrame.append was deprecated in pandas 1.4 and removed in 2.0; pd.concat
# is the supported way to do the same thing.
dfw = pd.concat([dfw, dfp], ignore_index=True)


In [333]:
# Age-band columns, youngest to oldest.
cols = ['Under_5s', 'Years5-14', 'Years15-49', 'Years50-69', 'Years70plus']

# Round every age-band column to whole numbers and store them as integers.
rounded_counts = dfw[cols].round(0)
dfw[cols] = rounded_counts.astype(int)

dfw

Unnamed: 0,entity,DIM_YEAR_CODE,Under_5s,Years15-49,Years5-14,Years50-69,Years70plus
0,AFG,2000,132,50,14,70,12
1,AFG,2001,132,49,14,69,14
2,AFG,2002,158,92,14,44,8
3,AFG,2003,104,66,14,30,6
4,AFG,2004,42,28,7,13,3
...,...,...,...,...,...,...,...
3675,World,2015,316983,51527,34631,43029,12908
3676,World,2016,294643,50778,34582,42617,15745
3677,World,2017,281428,50774,34127,42688,15922
3678,World,2018,274908,49284,33456,41905,15977


In [334]:
# The standardisation step below keys on a column called 'Country'.
dfw = dfw.rename(columns={'entity': 'Country'})


Standardising country names

In [335]:
# Write the distinct entity codes to disk for standardisation.
unique_countries = dfw['Country'].drop_duplicates()
unique_countries.to_csv('data/countries_to_standardise.csv', index=False)

In [337]:
# Load the standardised-name lookup. This file is not written by this notebook;
# presumably it comes from an external standardisation step - TODO confirm.
# It is expected to hold a 'Country' column plus one other column, so that
# squeeze() collapses it to a Series and to_dict() gives code -> name.
standardised_names = pd.read_csv('data/countries_to_standardise_country_standardized.csv')
country_standardise = standardised_names.set_index('Country').squeeze().to_dict()


In [338]:
# Look up the standardised name for every code; an unknown code raises KeyError.
dfw['entity'] = [country_standardise[code] for code in dfw['Country']]
# The raw code column is no longer needed once 'entity' is filled in.
dfw = dfw.drop(columns='Country')

In [339]:
# Display the table with the standardised 'entity' column in place of 'Country'.
dfw

Unnamed: 0,DIM_YEAR_CODE,Under_5s,Years15-49,Years5-14,Years50-69,Years70plus,entity
0,2000,132,50,14,70,12,Afghanistan
1,2001,132,49,14,69,14,Afghanistan
2,2002,158,92,14,44,8,Afghanistan
3,2003,104,66,14,30,6,Afghanistan
4,2004,42,28,7,13,3,Afghanistan
...,...,...,...,...,...,...,...
3675,2015,316983,51527,34631,43029,12908,World
3676,2016,294643,50778,34582,42617,15745,World
3677,2017,281428,50774,34127,42688,15922,World
3678,2018,274908,49284,33456,41905,15977,World


In [340]:
# Final column order: identifiers first, then age bands from youngest to oldest.
output_columns = ['entity', 'DIM_YEAR_CODE', 'Under_5s', 'Years5-14', 'Years15-49', 'Years50-69', 'Years70plus']
dfw = dfw[output_columns]

# Export with the year column renamed; `date` is the run date set when the raw
# copy was saved.
export_frame = dfw.rename(columns={'DIM_YEAR_CODE': 'year'})
export_frame.to_csv(f'data/{date}_who_malaria_deaths_by_age.csv', index=False)