In [200]:
import pandas as pd
import datetime
import requests
import numpy as np
from polio_utils import download_polio_data, extract_wild_cases, extract_vd_cases, owid_population, standardise_countries, extract_historical_wild_cases, get_who_data_and_regions

In [189]:
who_data = pd.read_csv('data/who_extranet_cases.csv')
who_data['entity'] = standardise_countries(who_data['country_territory_region'])


In [190]:
who_ext = who_data[['year','entity', 'wild_poliovirus_cases','c_vdpv_cases']]
who_ext = who_ext[(who_ext.year > 2000) & (who_data.year < 2016)]
who_ext = who_ext[~who_ext.entity.str.contains("_REGION")]
who_ext['total_polio'] = who_ext['wild_poliovirus_cases'] + who_ext['c_vdpv_cases']
who_ext.rename(columns= {'c_vdpv_cases':"total_cVDPV",'wild_poliovirus_cases':'wild_polio_cases' }, inplace=True)
who_ext

Unnamed: 0,year,entity,wild_polio_cases,total_cVDPV,total_polio
172,2001,Afghanistan,11.0,0.0,11.0
173,2001,Albania,0.0,0.0,0.0
174,2001,Algeria,1.0,0.0,1.0
175,2001,Andorra,,,
176,2001,Angola,1.0,0.0,1.0
...,...,...,...,...,...
2774,2015,Vietnam,0.0,0.0,0.0
2775,2015,Palestine,0.0,0.0,0.0
2776,2015,Yemen,0.0,0.0,0.0
2777,2015,Zambia,0.0,0.0,0.0


In [191]:
afp_stool_data = who_data[['year','entity', 'non_polio_afp_rate','percent_adequate_stool_collection']]
current_year = datetime.datetime.now().year 
afp_stool_data = afp_stool_data[(who_ext.year > 2000) & (afp_stool_data.year < current_year)]
afp_stool_data = afp_stool_data[~afp_stool_data.entity.str.contains("_REGION")]
afp_stool_data

Unnamed: 0,year,entity,non_polio_afp_rate,percent_adequate_stool_collection
172,2001,Afghanistan,1.70,73.0
173,2001,Albania,1.11,92.0
174,2001,Algeria,1.23,98.0
175,2001,Andorra,,
176,2001,Angola,2.40,66.0
...,...,...,...,...
2774,2015,Vietnam,1.55,96.0
2775,2015,Palestine,2.32,92.0
2776,2015,Yemen,4.38,91.0
2777,2015,Zambia,3.84,86.0


Data from 1980-2019 from WHO - download from http://www.who.int/entity/immunization/monitoring_surveillance/data/incidence_series.xls?ua=1

In [192]:
who_melt, regions = get_who_data_and_regions()

In [193]:
who_melt = who_melt[who_melt.year < 2001]

Combining WHO datasets

In [194]:
who_both = who_ext.merge(who_melt, on=['year','entity', 'total_polio'], how='outer').sort_values(['year','entity'])
who_both

Unnamed: 0,year,entity,wild_polio_cases,total_cVDPV,total_polio
6457,1980,Afghanistan,,,880.0
6458,1980,Albania,,,1.0
6459,1980,Algeria,,,116.0
6460,1980,Andorra,,,
6461,1980,Angola,,,32.0
...,...,...,...,...,...
2571,2015,Venezuela,0.0,0.0,0.0
2572,2015,Vietnam,0.0,0.0,0.0
2574,2015,Yemen,0.0,0.0,0.0
2575,2015,Zambia,0.0,0.0,0.0


Get wild type data

In [134]:
res = download_polio_data(url_stub='https://polioeradication.org/wp-content/uploads/2022/03/weekly-polio-analyses-WPV-')
date_today = datetime.date.today().strftime("%Y-%m-%d")
fp = f"data/polio_wild_cases_{date_today}.pdf"

with open(fp, 'wb') as f:
    f.write(res.content)

wt_df = extract_wild_cases(file_path = fp)
wt_df['entity'] = standardise_countries(wt_df['entity'])


res = requests.get('https://polioeradication.org/wp-content/uploads/2017/01/WPV_2011-2016_03JAN17.pdf')

fp = f"data/polio_historical_wild_cases_{date_today}.pdf"

with open(fp, 'wb') as f:
    f.write(res.content)
wth_df = extract_historical_wild_cases(file_path = fp)
wth_df['entity'] = standardise_countries(wth_df['entity'])

wt_df = wt_df.append(wth_df)
wt_df.year = wt_df.year.astype(int)
wt_df.wild_polio_cases = wt_df.wild_polio_cases.astype(int)


Get vaccine derived cases

In [195]:
res = download_polio_data(url_stub='http://polioeradication.org/wp-content/uploads/2022/03/weekly-polio-analyses-cVDPV-')
date = datetime.date.today().strftime("%Y-%m-%d")
fp = f"data/polio_vaccine_derived_cases_{date}.pdf"

with open(fp, 'wb') as f:
    f.write(res.content)

vd_df = extract_vd_cases()
vd_df['entity'] = standardise_countries(vd_df['entity'])
vd_df.year = vd_df.year.astype(int)
vd_df.total_cVDPV = vd_df.total_cVDPV.astype(int)

vd_df = vd_df.groupby(['entity', 'year']).sum()
vd_df = vd_df.reset_index()
vd_df

Unnamed: 0,entity,year,cVDPV1,cVDPV2,cVDPV3,total_cVDPV
0,Afghanistan,2016,0,0,0,0
1,Afghanistan,2017,0,0,0,0
2,Afghanistan,2018,0,0,0,0
3,Afghanistan,2019,0,0,0,0
4,Afghanistan,2020,0,308,0,308
...,...,...,...,...,...,...
283,Zambia,2017,0,0,0,0
284,Zambia,2018,0,0,0,0
285,Zambia,2019,0,2,0,2
286,Zambia,2020,0,0,0,0


In [196]:

polio_df = vd_df.merge(who_both, on=['entity','year', 'total_cVDPV'], how='outer').merge(afp_stool_data, on=['entity','year'], how='outer')
polio_df = polio_df.groupby(['year','entity'],as_index=False).first()



In [197]:
polio_df[polio_df[['year','entity']].duplicated(keep=False)].sort_values(['year','entity'])

Unnamed: 0,year,entity,cVDPV1,cVDPV2,cVDPV3,total_cVDPV,wild_polio_cases,total_polio,non_polio_afp_rate,percent_adequate_stool_collection


In [198]:
df = polio_df.set_index(['entity','year'])
mux = pd.MultiIndex.from_product([df.index.levels[0], df.index.levels[1]],names=['entity','year'])
df = df.reindex(mux, fill_value=np.nan).reset_index()

Calculating total polio for 2020 and 2021 as these aren't in the WHO file.

In [199]:
#df['total_polio'][df.year >= 2016 ] = df['wild_polio_cases'][df.year >= 2016].fillna(0) + df['total_cVDPV'][df.year >= 2016].fillna(0)
df[df.entity == 'Afghanistan']

Unnamed: 0,entity,year,cVDPV1,cVDPV2,cVDPV3,total_cVDPV,wild_polio_cases,total_polio,non_polio_afp_rate,percent_adequate_stool_collection
0,Afghanistan,1980,,,,,,880.0,,
1,Afghanistan,1981,,,,,,837.0,,
2,Afghanistan,1982,,,,,,1390.0,,
3,Afghanistan,1983,,,,,,1991.0,,
4,Afghanistan,1984,,,,,,552.0,,
5,Afghanistan,1985,,,,,,1981.0,,
6,Afghanistan,1986,,,,,,1843.0,,
7,Afghanistan,1987,,,,,,628.0,,
8,Afghanistan,1988,,,,,,307.0,,
9,Afghanistan,1989,,,,,,55.0,,


Add the regional totals

In [None]:
regional_total = regions.merge(df,on = 'entity').groupby(['WHO_REGION', 'year']).sum(min_count=1).reset_index()
regional_total['WHO_REGION'].replace(['AFR', 'AMR', 'SEAR', 'EUR', 'EMR', 'WPR'], ['Africa', 'Americas', 'South-East Asia', 'Europe', 'Eastern Mediterranean', 'Western Pacific'], inplace = True)
regional_total.rename(columns = {'WHO_REGION':'entity'}, inplace = True)
regional_total

Unnamed: 0,entity,year,wild_polio_cases,cVDPV1,cVDPV2,cVDPV3,total_cVDPV,total_polio
0,Africa,1980,,,,,,5126.0
1,Africa,1981,,,,,,4191.0
2,Africa,1982,,,,,,3897.0
3,Africa,1983,,,,,,3066.0
4,Africa,1984,,,,,,2968.0
...,...,...,...,...,...,...,...,...
247,Western Pacific,2017,,,,,,0.0
248,Western Pacific,2018,,26.0,0.0,0.0,26.0,26.0
249,Western Pacific,2019,,5.0,13.0,0.0,18.0,18.0
250,Western Pacific,2020,,1.0,1.0,0.0,2.0,2.0


Adding a global total 

In [None]:
global_entities = regions
global_entities['WHO_REGION'] = 'World'
global_total = global_entities.merge(df,on = 'entity').groupby(['WHO_REGION', 'year']).sum(min_count=1).reset_index()
global_total.rename(columns = {'WHO_REGION':'entity'}, inplace = True)

In [None]:
df.drop(df[df['entity'] == 'World'].index, inplace=True)
total_df = pd.concat([regional_total, df, global_total])
total_df['total_polio'] = total_df['total_polio'].fillna(0)


Add per million variables

In [None]:
population = owid_population()

pop_df = pd.DataFrame(pd.merge(left = population, right = total_df, how="right"))
per_mil_df = pop_df[['entity', 'year']]
per_mil_df[['wild_polio_cases_per_million','cVDPV1_per_million','cVDPV2_per_million','cVDPV3_per_million','total_cVDPV_per_million','total_polio_per_million']] = pop_df[['wild_polio_cases','cVDPV1','cVDPV2','cVDPV3','total_cVDPV','total_polio']].div(pop_df.population, axis=0).mul(1000000)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self[k1] = value[k2]


For wild polio we can fill NA for 2011 with 0, for vaccine derived it is 2016 onwards. To ensure that countries with 0 cases show up in the grapher as such.

In [None]:
final_df = pop_df.merge(per_mil_df).drop(columns = 'population')
final_df['wild_polio_cases'][(final_df.year >=2011) & (final_df['wild_polio_cases'].isna())] = 0
final_df['wild_polio_cases_per_million'][(final_df.year >=2011) & (final_df['wild_polio_cases_per_million'].isna())] = 0

final_df['total_cVDPV'][(final_df.year >=2016) & (final_df['total_cVDPV'].isna())] = 0
final_df['total_cVDPV_per_million'][(final_df.year >=2016) & (final_df['total_cVDPV_per_million'].isna())] = 0

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  final_df['wild_polio_cases'][(final_df.year >=2011) & (final_df['wild_polio_cases'].isna())] = 0
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  final_df['wild_polio_cases_per_million'][(final_df.year >=2011) & (final_df['wild_polio_cases_per_million'].isna())] = 0
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  final_df['total_cVDPV'][(final_df.year >=2016) & (final_df['total_cVDPV'].isna())] = 0
A value is trying to be set on a copy of a slice from a DataFrame

See 

In [None]:
final_df

Unnamed: 0,entity,year,wild_polio_cases,cVDPV1,cVDPV2,cVDPV3,total_cVDPV,total_polio,wild_polio_cases_per_million,cVDPV1_per_million,cVDPV2_per_million,cVDPV3_per_million,total_cVDPV_per_million,total_polio_per_million
0,Africa,1980,,,,,,5126.0,,,,,,10.760177
1,Africa,1981,,,,,,4191.0,,,,,,8.552993
2,Africa,1982,,,,,,3897.0,,,,,,7.731619
3,Africa,1983,,,,,,3066.0,,,,,,5.913442
4,Africa,1984,,,,,,2968.0,,,,,,5.564877
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8479,World,2017,22.0,0.0,96.0,0.0,96.0,118.0,0.002915,0.000000,0.012718,0.000000,0.012718,0.015633
8480,World,2018,33.0,27.0,130.0,75.0,232.0,265.0,0.004324,0.003538,0.017036,0.009828,0.030402,0.034726
8481,World,2019,176.0,12.0,366.0,0.0,378.0,554.0,0.022817,0.001556,0.047449,0.000000,0.049005,0.071822
8482,World,2020,140.0,34.0,1079.0,0.0,1113.0,1253.0,0.017961,0.004362,0.138426,0.000000,0.142788,0.160748


In [None]:
final_df.to_csv(f'data/polio_cases_to_upload_{date_today}.csv', index=False)