In [324]:
import pandas as pd
import datetime
import requests
import numpy as np
from polio_utils import download_polio_data, extract_wild_cases, extract_vd_cases, owid_population, standardise_countries, extract_historical_wild_cases, get_who_data_and_regions

In [325]:
who_data = pd.read_csv('data/who_extranet_cases.csv')
who_data['entity'] = standardise_countries(who_data['country_territory_region'])
current_year = datetime.datetime.now().year 

In [326]:
who_ext_wild = who_data[['year','entity', 'wild_poliovirus_cases', 'non_polio_afp_rate','percent_adequate_stool_collection']]
who_ext_wild = who_ext_wild[(who_ext_wild.year > 2000) & (who_ext_wild.year < current_year)]
who_ext_wild = who_ext_wild[~who_ext_wild.entity.str.contains("_REGION")]
#who_ext_wild['total_polio'] = who_ext_wild['wild_poliovirus_cases'] + who_ext_wild['c_vdpv_cases']
who_ext_wild.rename(columns= {'wild_poliovirus_cases':'wild_polio_cases' }, inplace=True)
who_ext_wild

Unnamed: 0,year,entity,wild_polio_cases,non_polio_afp_rate,percent_adequate_stool_collection
172,2001,Afghanistan,11.0,1.70,73.0
173,2001,Albania,0.0,1.11,92.0
174,2001,Algeria,1.0,1.23,98.0
175,2001,Andorra,,,
176,2001,Angola,1.0,2.40,66.0
...,...,...,...,...,...
3824,2021,Vietnam,0.0,1.15,97.0
3825,2021,Palestine,0.0,0.99,100.0
3826,2021,Yemen,0.0,7.50,88.0
3827,2021,Zambia,0.0,3.95,36.0


In [327]:
who_ext_vdpv = who_data[['year','entity', 'c_vdpv_cases']]
who_ext_vdpv = who_ext_vdpv[(who_ext_vdpv.year > 2000) & (who_ext_vdpv.year < 2016)]
who_ext_vdpv = who_ext_vdpv[~who_ext_vdpv.entity.str.contains("_REGION")]
#who_ext_vdpv['total_polio'] = who_ext_vdpv['wild_poliovirus_cases'] + who_ext_vdpv['c_vdpv_cases']
who_ext_vdpv.rename(columns= {'c_vdpv_cases':"total_cVDPV"}, inplace=True)
who_ext_vdpv

Unnamed: 0,year,entity,total_cVDPV
172,2001,Afghanistan,0.0
173,2001,Albania,0.0
174,2001,Algeria,0.0
175,2001,Andorra,
176,2001,Angola,0.0
...,...,...,...
2774,2015,Vietnam,0.0
2775,2015,Palestine,0.0
2776,2015,Yemen,0.0
2777,2015,Zambia,0.0


Data from 1980-2019 from WHO - download from http://www.who.int/entity/immunization/monitoring_surveillance/data/incidence_series.xls?ua=1

In [328]:
who_melt, regions = get_who_data_and_regions()

In [329]:
who_melt = who_melt[who_melt.year < 2001]
who_melt.sort_values(['entity','year'])

Unnamed: 0,entity,year,total_polio
7566,Afghanistan,1980,880.0
7372,Afghanistan,1981,837.0
7178,Afghanistan,1982,1390.0
6984,Afghanistan,1983,1991.0
6790,Afghanistan,1984,552.0
...,...,...,...
4655,Zimbabwe,1996,1.0
4461,Zimbabwe,1997,3.0
4267,Zimbabwe,1998,17.0
4073,Zimbabwe,1999,2.0


Get wild type data

In [330]:
res = download_polio_data(url_stub='https://polioeradication.org/wp-content/uploads/2022/03/weekly-polio-analyses-WPV-')
date_today = datetime.date.today().strftime("%Y-%m-%d")
fp = f"data/polio_wild_cases_{date_today}.pdf"

with open(fp, 'wb') as f:
    f.write(res.content)

wt_df = extract_wild_cases(file_path = fp)
wt_df['entity'] = standardise_countries(wt_df['entity'])


res = requests.get('https://polioeradication.org/wp-content/uploads/2017/01/WPV_2011-2016_03JAN17.pdf')

fp = f"data/polio_historical_wild_cases_{date_today}.pdf"

with open(fp, 'wb') as f:
    f.write(res.content)
wth_df = extract_historical_wild_cases(file_path = fp)
wth_df['entity'] = standardise_countries(wth_df['entity'])

wt_df = wt_df.append(wth_df)
wt_df.year = wt_df.year.astype(int)
wt_df.wild_polio_cases = wt_df.wild_polio_cases.astype(int)


Get vaccine derived cases

In [331]:
res = download_polio_data(url_stub='http://polioeradication.org/wp-content/uploads/2022/03/weekly-polio-analyses-cVDPV-')
date = datetime.date.today().strftime("%Y-%m-%d")
fp = f"data/polio_vaccine_derived_cases_{date}.pdf"

with open(fp, 'wb') as f:
    f.write(res.content)

vd_df = extract_vd_cases()
vd_df['entity'] = standardise_countries(vd_df['entity'])
vd_df.year = vd_df.year.astype(int)
vd_df.total_cVDPV = vd_df.total_cVDPV.astype(int)

vd_df = vd_df.groupby(['entity', 'year']).sum()
vd_df = vd_df.reset_index()
vd_df

Unnamed: 0,entity,year,cVDPV1,cVDPV2,cVDPV3,total_cVDPV
0,Afghanistan,2016,0,0,0,0
1,Afghanistan,2017,0,0,0,0
2,Afghanistan,2018,0,0,0,0
3,Afghanistan,2019,0,0,0,0
4,Afghanistan,2020,0,308,0,308
...,...,...,...,...,...,...
283,Zambia,2017,0,0,0,0
284,Zambia,2018,0,0,0,0
285,Zambia,2019,0,2,0,2
286,Zambia,2020,0,0,0,0


In [332]:

polio_df = vd_df.merge(who_ext_vdpv, on=['entity','year', 'total_cVDPV'], how='outer').merge(who_ext_wild, on=['entity','year'], how='outer').merge(who_melt, on=['entity','year'], how='outer')
#polio_df = polio_df.groupby(['year','entity'],as_index=False).first()
polio_df.sort_values(by=['entity','year'])


Unnamed: 0,entity,year,cVDPV1,cVDPV2,cVDPV3,total_cVDPV,wild_polio_cases,non_polio_afp_rate,percent_adequate_stool_collection,total_polio
7495,Afghanistan,1980,,,,,,,,880.0
7301,Afghanistan,1981,,,,,,,,837.0
7107,Afghanistan,1982,,,,,,,,1390.0
6913,Afghanistan,1983,,,,,,,,1991.0
6719,Afghanistan,1984,,,,,,,,552.0
...,...,...,...,...,...,...,...,...,...,...
3114,Zimbabwe,2017,,,,,0.0,3.33,89.0,
3239,Zimbabwe,2018,,,,,0.0,3.74,78.0,
3364,Zimbabwe,2019,,,,,0.0,3.03,90.0,
3489,Zimbabwe,2020,,,,,0.0,2.86,70.0,


In [333]:
polio_df[polio_df[['year','entity']].duplicated(keep=False)].sort_values(['year','entity'])

Unnamed: 0,entity,year,cVDPV1,cVDPV2,cVDPV3,total_cVDPV,wild_polio_cases,non_polio_afp_rate,percent_adequate_stool_collection,total_polio


In [334]:
df = polio_df.set_index(['entity','year'])
mux = pd.MultiIndex.from_product([df.index.levels[0], df.index.levels[1]],names=['entity','year'])
df = df.reindex(mux, fill_value=np.nan).reset_index()
df.sort_values(['entity','year'])

Unnamed: 0,entity,year,cVDPV1,cVDPV2,cVDPV3,total_cVDPV,wild_polio_cases,non_polio_afp_rate,percent_adequate_stool_collection,total_polio
0,Afghanistan,1980,,,,,,,,880.0
1,Afghanistan,1981,,,,,,,,837.0
2,Afghanistan,1982,,,,,,,,1390.0
3,Afghanistan,1983,,,,,,,,1991.0
4,Afghanistan,1984,,,,,,,,552.0
...,...,...,...,...,...,...,...,...,...,...
8437,Zimbabwe,2017,,,,,0.0,3.33,89.0,
8438,Zimbabwe,2018,,,,,0.0,3.74,78.0,
8439,Zimbabwe,2019,,,,,0.0,3.03,90.0,
8440,Zimbabwe,2020,,,,,0.0,2.86,70.0,


In [343]:
df['total_polio'][df.year > 2000] = df[['wild_polio_cases','total_cVDPV']][df.year > 2000].sum(axis=1, min_count=1)
df[df.entity == 'Chad']


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['total_polio'][df.year > 2000] = df[['wild_polio_cases','total_cVDPV']][df.year > 2000].sum(axis=1, min_count=1)


Unnamed: 0,entity,year,cVDPV1,cVDPV2,cVDPV3,total_cVDPV,wild_polio_cases,non_polio_afp_rate,percent_adequate_stool_collection,total_polio
1386,Chad,1980,,,,,,,,
1387,Chad,1981,,,,,,,,
1388,Chad,1982,,,,,,,,
1389,Chad,1983,,,,,,,,24.0
1390,Chad,1984,,,,,,,,9.0
1391,Chad,1985,,,,,,,,45.0
1392,Chad,1986,,,,,,,,9.0
1393,Chad,1987,,,,,,,,
1394,Chad,1988,,,,,,,,
1395,Chad,1989,,,,,,,,


Add the regional totals

In [336]:
regional_total = regions.merge(df,on = 'entity').groupby(['WHO_REGION', 'year'])[['cVDPV1', 'cVDPV2', 'cVDPV3','total_cVDPV','wild_polio_cases','total_polio']].sum(min_count=1).reset_index()
regional_total['WHO_REGION'].replace(['AFR', 'AMR', 'SEAR', 'EUR', 'EMR', 'WPR'], ['Africa', 'Americas', 'South-East Asia', 'Europe', 'Eastern Mediterranean', 'Western Pacific'], inplace = True)
regional_total.rename(columns = {'WHO_REGION':'entity'}, inplace = True)
regional_total

Unnamed: 0,entity,year,cVDPV1,cVDPV2,cVDPV3,total_cVDPV,wild_polio_cases,total_polio
0,Africa,1980,,,,,,5126.0
1,Africa,1981,,,,,,4191.0
2,Africa,1982,,,,,,3897.0
3,Africa,1983,,,,,,3066.0
4,Africa,1984,,,,,,2968.0
...,...,...,...,...,...,...,...,...
247,Western Pacific,2017,0.0,0.0,0.0,0.0,0.0,0.0
248,Western Pacific,2018,26.0,0.0,0.0,26.0,0.0,26.0
249,Western Pacific,2019,5.0,13.0,0.0,18.0,0.0,18.0
250,Western Pacific,2020,1.0,1.0,0.0,2.0,0.0,2.0


Adding a global total 

In [337]:
global_entities = regions
global_entities['WHO_REGION'] = 'World'
global_total = global_entities.merge(df,on = 'entity').groupby(['WHO_REGION', 'year']).sum(min_count=1)[['cVDPV1', 'cVDPV2', 'cVDPV3','total_cVDPV','wild_polio_cases','total_polio']].reset_index()
global_total.rename(columns = {'WHO_REGION':'entity'}, inplace = True)

In [338]:
df.drop(df[df['entity'] == 'World'].index, inplace=True)
total_df = pd.concat([df,regional_total,  global_total])
total_df['total_polio'] = total_df['total_polio'].fillna(0)
total_df[total_df.entity == 'World']

Unnamed: 0,entity,year,cVDPV1,cVDPV2,cVDPV3,total_cVDPV,wild_polio_cases,non_polio_afp_rate,percent_adequate_stool_collection,total_polio
0,World,1980,,,,,,,,52630.0
1,World,1981,,,,,,,,65737.0
2,World,1982,,,,,,,,51628.0
3,World,1983,,,,,,,,39973.0
4,World,1984,,,,,,,,35084.0
5,World,1985,,,,,,,,38483.0
6,World,1986,,,,,,,,32846.0
7,World,1987,,,,,,,,39683.0
8,World,1988,,,,,,,,34617.0
9,World,1989,,,,,,,,26104.0


Add per million variables

In [339]:
population = owid_population()

pop_df = pd.DataFrame(pd.merge(left = population, right = total_df, how="right"))
per_mil_df = pop_df[['entity', 'year']]
per_mil_df[['wild_polio_cases_per_million','cVDPV1_per_million','cVDPV2_per_million','cVDPV3_per_million','total_cVDPV_per_million','total_polio_per_million']] = pop_df[['wild_polio_cases','cVDPV1','cVDPV2','cVDPV3','total_cVDPV','total_polio']].div(pop_df.population, axis=0).mul(1000000).round(3)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self[k1] = value[k2]


For wild polio we can fill NA for 2011 with 0, for vaccine derived it is 2016 onwards. To ensure that countries with 0 cases show up in the grapher as such.

In [340]:
final_df = pop_df.merge(per_mil_df).drop(columns = 'population')
#final_df['wild_polio_cases'][(final_df.year >=2011) & (final_df['wild_polio_cases'].isna())] = 0
#final_df['wild_polio_cases_per_million'][(final_df.year >=2011) & (final_df['wild_polio_cases_per_million'].isna())] = 0

final_df['total_cVDPV'][(final_df.year >=2016) & (final_df['total_cVDPV'].isna())] = 0
final_df['total_cVDPV_per_million'][(final_df.year >=2016) & (final_df['total_cVDPV_per_million'].isna())] = 0

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  final_df['total_cVDPV'][(final_df.year >=2016) & (final_df['total_cVDPV'].isna())] = 0
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  final_df['total_cVDPV_per_million'][(final_df.year >=2016) & (final_df['total_cVDPV_per_million'].isna())] = 0


In [341]:
final_df.sort_values(['entity','year'])

Unnamed: 0,entity,year,cVDPV1,cVDPV2,cVDPV3,total_cVDPV,wild_polio_cases,non_polio_afp_rate,percent_adequate_stool_collection,total_polio,wild_polio_cases_per_million,cVDPV1_per_million,cVDPV2_per_million,cVDPV3_per_million,total_cVDPV_per_million,total_polio_per_million
0,Afghanistan,1980,,,,,,,,880.0,,,,,,65.886
1,Afghanistan,1981,,,,,,,,837.0,,,,,,63.545
2,Afghanistan,1982,,,,,,,,1390.0,,,,,,107.898
3,Afghanistan,1983,,,,,,,,1991.0,,,,,,158.801
4,Afghanistan,1984,,,,,,,,552.0,,,,,,45.230
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8437,Zimbabwe,2017,,,,0.0,0.0,3.33,89.0,0.0,0.0,,,,0.0,0.000
8438,Zimbabwe,2018,,,,0.0,0.0,3.74,78.0,0.0,0.0,,,,0.0,0.000
8439,Zimbabwe,2019,,,,0.0,0.0,3.03,90.0,0.0,0.0,,,,0.0,0.000
8440,Zimbabwe,2020,,,,0.0,0.0,2.86,70.0,0.0,0.0,,,,0.0,0.000


In [342]:
final_df.to_csv(f'data/polio_cases_to_upload_{date_today}.csv', index=False)