In [313]:
import pandas as pd
import numpy as np
from pathlib import Path
from datetime import date

In [314]:
df = pd.read_csv('https://frontdoor-l4uikgap6gz3m.azurefd.net/DEX_CMS/GHE_FULL?$filter=DIM_GHECAUSE_TITLE%20eq%20%27Malaria%27%20and%20DIM_SEX_CODE%20eq%20%27BTSX%27&$select=DIM_GHECAUSE_TITLE,DIM_YEAR_CODE,DIM_COUNTRY_CODE,DIM_AGEGROUP_CODE,DIM_SEX_CODE,VAL_DEATHS_COUNT_NUMERIC&$format=csv')

In [315]:
df.shape

(73200, 6)

Saving a copy of the data locally in case the API changes or breaks

In [316]:
date = date.today()
Path("data/").mkdir(parents=True, exist_ok=True)
df.to_csv(f'data/{date}_ghe_malaria_deaths.csv', index = False)

In [317]:
cols = ['DIM_COUNTRY_CODE', 'DIM_YEAR_CODE', 'DIM_AGEGROUP_CODE',
       'DIM_SEX_CODE','DIM_GHECAUSE_TITLE', 'VAL_DEATHS_COUNT_NUMERIC']
df = df[cols]

df = df[df['DIM_AGEGROUP_CODE'] != 'ALLAges']

In [318]:
df.shape

(69540, 6)

In [319]:
df['DIM_AGEGROUP_CODE'].drop_duplicates()

30     YEARS70-74
31     YEARS75-79
32     YEARS80-84
33    YEARS85PLUS
34       YEARS0-1
35       YEARS1-4
36       YEARS5-9
37     YEARS10-14
38     YEARS15-19
39     YEARS20-24
40     YEARS25-29
41     YEARS30-34
42     YEARS35-39
43     YEARS40-44
44     YEARS45-49
45     YEARS50-54
46     YEARS55-59
47     YEARS60-64
48     YEARS65-69
Name: DIM_AGEGROUP_CODE, dtype: object

In [320]:
df['DIM_YEAR_CODE'].drop_duplicates()

30       2000
3596     2001
7200     2002
10935    2003
14582    2004
18268    2005
22013    2006
25612    2007
29201    2008
32933    2009
36591    2010
40129    2011
43872    2012
47464    2013
51147    2014
55083    2015
58743    2016
62403    2017
66063    2018
69723    2019
Name: DIM_YEAR_CODE, dtype: int64

In [321]:

conditions = [
    (df['DIM_AGEGROUP_CODE'].isin(['YEARS0-1','YEARS1-4'])),
    (df['DIM_AGEGROUP_CODE'].isin(['YEARS5-9','YEARS10-14'])),
    (df['DIM_AGEGROUP_CODE'].isin(['YEARS15-19','YEARS20-24','YEARS25-29','YEARS30-34','YEARS35-39','YEARS40-44','YEARS45-49'])),
    (df['DIM_AGEGROUP_CODE'].isin(['YEARS50-54','YEARS55-59','YEARS60-64','YEARS65-69'])),
    (df['DIM_AGEGROUP_CODE'].isin(['YEARS70-74','YEARS75-79','YEARS80-84','YEARS85PLUS'])),
    ]

values = ['Under_5s', 'Years5-14', 'Years15-49', 'Years50-69', 'Years70plus']

df['new_age_group'] = np.select(conditions, values)


Check there aren't any age groups which have been missed

In [322]:
df.new_age_group.drop_duplicates()

30    Years70plus
34       Under_5s
36      Years5-14
38     Years15-49
45     Years50-69
Name: new_age_group, dtype: object

Create the output table for individual countries

In [323]:
df

Unnamed: 0,DIM_COUNTRY_CODE,DIM_YEAR_CODE,DIM_AGEGROUP_CODE,DIM_SEX_CODE,DIM_GHECAUSE_TITLE,VAL_DEATHS_COUNT_NUMERIC,new_age_group
30,AGO,2000,YEARS70-74,BTSX,Malaria,218.29,Years70plus
31,AGO,2000,YEARS75-79,BTSX,Malaria,28.78,Years70plus
32,AGO,2000,YEARS80-84,BTSX,Malaria,2.88,Years70plus
33,AGO,2000,YEARS85PLUS,BTSX,Malaria,0.09,Years70plus
34,ALB,2000,YEARS0-1,BTSX,Malaria,0.00,Under_5s
...,...,...,...,...,...,...,...
73195,ZWE,2019,YEARS65-69,BTSX,Malaria,32.05,Years50-69
73196,ZWE,2019,YEARS70-74,BTSX,Malaria,27.15,Years70plus
73197,ZWE,2019,YEARS75-79,BTSX,Malaria,7.26,Years70plus
73198,ZWE,2019,YEARS80-84,BTSX,Malaria,2.95,Years70plus


In [324]:
df.rename(columns = {'DIM_COUNTRY_CODE':'entity'}, inplace = True)
df['value'] = round(df['VAL_DEATHS_COUNT_NUMERIC'], 0).astype(int)



In [325]:
df

Unnamed: 0,entity,DIM_YEAR_CODE,DIM_AGEGROUP_CODE,DIM_SEX_CODE,DIM_GHECAUSE_TITLE,VAL_DEATHS_COUNT_NUMERIC,new_age_group,value
30,AGO,2000,YEARS70-74,BTSX,Malaria,218.29,Years70plus,218
31,AGO,2000,YEARS75-79,BTSX,Malaria,28.78,Years70plus,29
32,AGO,2000,YEARS80-84,BTSX,Malaria,2.88,Years70plus,3
33,AGO,2000,YEARS85PLUS,BTSX,Malaria,0.09,Years70plus,0
34,ALB,2000,YEARS0-1,BTSX,Malaria,0.00,Under_5s,0
...,...,...,...,...,...,...,...,...
73195,ZWE,2019,YEARS65-69,BTSX,Malaria,32.05,Years50-69,32
73196,ZWE,2019,YEARS70-74,BTSX,Malaria,27.15,Years70plus,27
73197,ZWE,2019,YEARS75-79,BTSX,Malaria,7.26,Years70plus,7
73198,ZWE,2019,YEARS80-84,BTSX,Malaria,2.95,Years70plus,3


In [326]:
dfw = pd.pivot_table(df[['DIM_YEAR_CODE', 'entity', 'new_age_group', 'value']], values = 'value', index = [ 'entity','DIM_YEAR_CODE'], columns='new_age_group')
dfw = dfw.reset_index()
dfw.columns.name = None


In [327]:
dfw

Unnamed: 0,entity,DIM_YEAR_CODE,Under_5s,Years15-49,Years5-14,Years50-69,Years70plus
0,AFG,2000,132.5,49.714286,14.5,70.00,12.00
1,AFG,2001,131.5,49.428571,14.0,69.00,14.50
2,AFG,2002,157.5,92.285714,14.5,44.50,7.75
3,AFG,2003,103.5,66.428571,14.0,29.75,6.00
4,AFG,2004,41.5,28.000000,7.0,12.75,2.75
...,...,...,...,...,...,...,...
3655,ZWE,2015,564.5,69.428571,431.5,49.50,11.25
3656,ZWE,2016,348.5,49.857143,349.5,37.00,9.50
3657,ZWE,2017,761.5,72.285714,527.5,58.25,14.50
3658,ZWE,2018,274.5,39.857143,319.0,33.00,7.75


Create a dataframe for the world which is the sum of all the country values

In [328]:
dfg = df[['DIM_YEAR_CODE', 'new_age_group', 'VAL_DEATHS_COUNT_NUMERIC']].groupby(by = ['DIM_YEAR_CODE', 'new_age_group']).sum().reset_index()

In [329]:
dfg['entity'] = 'World'
dfg['VAL_DEATHS_COUNT_NUMERIC'] = round(dfg['VAL_DEATHS_COUNT_NUMERIC'], 0).astype(int)

In [330]:
dfp = pd.pivot_table(dfg, values = 'VAL_DEATHS_COUNT_NUMERIC', index = [ 'entity','DIM_YEAR_CODE'], columns='new_age_group')

In [331]:
dfp = dfp.reset_index()
dfp.columns.name = None

Combine the country and the world tables together

In [332]:
dfw = dfw.append(dfp, ignore_index=True)


In [333]:
cols = ['Under_5s', 'Years5-14', 'Years15-49', 'Years50-69', 'Years70plus']
dfw[cols] = dfw[cols].round(0).astype(int)

dfw

Unnamed: 0,entity,DIM_YEAR_CODE,Under_5s,Years15-49,Years5-14,Years50-69,Years70plus
0,AFG,2000,132,50,14,70,12
1,AFG,2001,132,49,14,69,14
2,AFG,2002,158,92,14,44,8
3,AFG,2003,104,66,14,30,6
4,AFG,2004,42,28,7,13,3
...,...,...,...,...,...,...,...
3675,World,2015,316983,51527,34631,43029,12908
3676,World,2016,294643,50778,34582,42617,15745
3677,World,2017,281428,50774,34127,42688,15922
3678,World,2018,274908,49284,33456,41905,15977


In [334]:
dfw.rename(columns = {'entity':'Country'}, inplace=True)


In [335]:
dfw['Country'].drop_duplicates().to_csv('data/countries_to_standardise.csv', index = False)

In [337]:
country_standardise = pd.read_csv('data/countries_to_standardise_country_standardized.csv').set_index('Country').squeeze().to_dict()


In [338]:
dfw['entity'] = dfw["Country"].apply(
        lambda x: country_standardise[x]
    )
dfw.drop('Country', axis = 1, inplace = True)

In [339]:
dfw

Unnamed: 0,DIM_YEAR_CODE,Under_5s,Years15-49,Years5-14,Years50-69,Years70plus,entity
0,2000,132,50,14,70,12,Afghanistan
1,2001,132,49,14,69,14,Afghanistan
2,2002,158,92,14,44,8,Afghanistan
3,2003,104,66,14,30,6,Afghanistan
4,2004,42,28,7,13,3,Afghanistan
...,...,...,...,...,...,...,...
3675,2015,316983,51527,34631,43029,12908,World
3676,2016,294643,50778,34582,42617,15745,World
3677,2017,281428,50774,34127,42688,15922,World
3678,2018,274908,49284,33456,41905,15977,World


In [340]:
dfw = dfw[['entity', 'DIM_YEAR_CODE', 'Under_5s', 'Years5-14', 'Years15-49', 'Years50-69', 'Years70plus']]
dfw.rename(columns={'DIM_YEAR_CODE':'year'}).to_csv(f'data/{date}_who_malaria_deaths_by_age.csv', index = False)