In [1]:
import pandas as pd
import datetime
import requests
import numpy as np
from polio_utils import download_polio_data, extract_wild_cases, extract_vd_cases, owid_population, standardise_countries, extract_historical_wild_cases, get_who_data_and_regions,add_years_to_polio_status

Loading in the cases taken from the WHO extranet - https://extranet.who.int/polis/public/CaseCount.aspx . These are regularly updated and run from 2000-current year. 

We also standardise the country names in this section using the standardise_countries() function.

In [2]:
who_data = pd.read_csv('data/who_extranet_cases.csv')
who_data['entity'] = standardise_countries(who_data['country_territory_region'])
current_year = datetime.datetime.now().year 

We use only data from 2001 onwards as the values for 2000 seem incomplete. 

In [3]:
who_ext_wild = who_data[['year','entity', 'wild_poliovirus_cases', 'non_polio_afp_rate','percent_adequate_stool_collection']]
who_ext_wild = who_ext_wild[(who_ext_wild.year > 2000) & (who_ext_wild.year < current_year)]
who_ext_wild = who_ext_wild[~who_ext_wild.entity.str.contains("_REGION")]
who_ext_wild.rename(columns= {'wild_poliovirus_cases':'wild_polio_cases' }, inplace=True)
who_ext_wild

Unnamed: 0,year,entity,wild_polio_cases,non_polio_afp_rate,percent_adequate_stool_collection
172,2001,Afghanistan,11.0,1.70,73.0
173,2001,Albania,0.0,1.11,92.0
174,2001,Algeria,1.0,1.23,98.0
175,2001,Andorra,,,
176,2001,Angola,1.0,2.40,66.0
...,...,...,...,...,...
3824,2021,Vietnam,0.0,1.15,97.0
3825,2021,Palestine,0.0,0.99,100.0
3826,2021,Yemen,0.0,7.50,88.0
3827,2021,Zambia,0.0,3.95,36.0


Some values for 'percent_adequate_stool_collection' are erroneously > 100, we replace these with NA. 

In [4]:
who_ext_wild['percent_adequate_stool_collection'][who_ext_wild['percent_adequate_stool_collection'] >100] = np.nan

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  who_ext_wild['percent_adequate_stool_collection'][who_ext_wild['percent_adequate_stool_collection'] >100] = np.nan


We separate out the vaccine derived polio cases as we only want these for 2001-2015. After 2015 we will use the data from the GPEI weekly reports as this breaks down the cases by strain. 

In [5]:
who_ext_vdpv = who_data[['year','entity', 'c_vdpv_cases']]
who_ext_vdpv = who_ext_vdpv[(who_ext_vdpv.year > 2000) & (who_ext_vdpv.year < 2016)]
who_ext_vdpv = who_ext_vdpv[~who_ext_vdpv.entity.str.contains("_REGION")]
who_ext_vdpv.rename(columns= {'c_vdpv_cases':"total_cVDPV"}, inplace=True)
who_ext_vdpv

Unnamed: 0,year,entity,total_cVDPV
172,2001,Afghanistan,0.0
173,2001,Albania,0.0
174,2001,Algeria,0.0
175,2001,Andorra,
176,2001,Angola,0.0
...,...,...,...
2774,2015,Vietnam,0.0
2775,2015,Palestine,0.0
2776,2015,Yemen,0.0
2777,2015,Zambia,0.0


Data from 1980-2019 from WHO - download from http://www.who.int/entity/immunization/monitoring_surveillance/data/incidence_series.xls?ua=1

This is not broken down by wild/vaccine derived type so we will only use this where no other data is available, before 2001. 

In [6]:
who_melt, regions = get_who_data_and_regions()
who_melt_all = who_melt.rename(columns= {'total_polio':'total_polio_orig'})
who_melt_all

Unnamed: 0,entity,year,total_polio_orig
0,Afghanistan,2019,0.0
1,Albania,2019,0.0
2,Algeria,2019,0.0
3,Andorra,2019,0.0
4,Angola,2019,138.0
...,...,...,...
7755,Venezuela,1980,11.0
7756,Vietnam,1980,1741.0
7757,Yemen,1980,722.0
7758,Zambia,1980,276.0


In [7]:
extra_regions = pd.DataFrame([['EMR','Palestine'],['WPR','Macao'], ['WPR','Hong Kong']], columns=['WHO_REGION', 'entity'])
regions = regions.append(extra_regions, ignore_index=True)

Filtering out data from after 2000.

In [8]:
who_melt = who_melt[who_melt.year < 2001]
who_melt.sort_values(['entity','year'])

Unnamed: 0,entity,year,total_polio
7566,Afghanistan,1980,880.0
7372,Afghanistan,1981,837.0
7178,Afghanistan,1982,1390.0
6984,Afghanistan,1983,1991.0
6790,Afghanistan,1984,552.0
...,...,...,...
4655,Zimbabwe,1996,1.0
4461,Zimbabwe,1997,3.0
4267,Zimbabwe,1998,17.0
4073,Zimbabwe,1999,2.0


Downloading the wild type data from polioeradication.org, this is updated regularly and it probably the most fragile part of the code as the format of the pdfs reporting the cases is likely to change.

In [9]:
res = download_polio_data(url_stub='https://polioeradication.org/wp-content/uploads/2022/03/weekly-polio-analyses-WPV-')
date_today = datetime.date.today().strftime("%Y-%m-%d")
fp = f"data/polio_wild_cases_{date_today}.pdf"

with open(fp, 'wb') as f:
    f.write(res.content)

wt_df = extract_wild_cases(file_path = fp)
wt_df['entity'] = standardise_countries(wt_df['entity'])

Download a pdf summarising wild type cases by strain for 2011-2016

In [10]:

res = requests.get('https://polioeradication.org/wp-content/uploads/2017/01/WPV_2011-2016_03JAN17.pdf')

fp = f"data/polio_historical_wild_cases_{date_today}.pdf"

with open(fp, 'wb') as f:
    f.write(res.content)
wth_df = extract_historical_wild_cases(file_path = fp)
wth_df['entity'] = standardise_countries(wth_df['entity'])

wt_df = wt_df.append(wth_df)
wt_df.year = wt_df.year.astype(int)
wt_df.wild_polio_cases = wt_df.wild_polio_cases.astype(int)


Downloading the vaccine derived cases from polioeradication.org. 

I then manually added them to 'polio_cVDPV_cases.csv' which is then read in. 

In [11]:
res = download_polio_data(url_stub='http://polioeradication.org/wp-content/uploads/2022/03/weekly-polio-analyses-cVDPV-')
date = datetime.date.today().strftime("%Y-%m-%d")
fp = f"data/polio_vaccine_derived_cases_{date}.pdf"

with open(fp, 'wb') as f:
    f.write(res.content)

vd_df = extract_vd_cases()
vd_df['entity'] = standardise_countries(vd_df['entity'])
vd_df.year = vd_df.year.astype(int)
vd_df.total_cVDPV = vd_df.total_cVDPV.astype(int)

vd_df = vd_df.groupby(['entity', 'year']).sum()
vd_df = vd_df.reset_index()
vd_df

Unnamed: 0,entity,year,cVDPV1,cVDPV2,cVDPV3,total_cVDPV
0,Afghanistan,2016,0,0,0,0
1,Afghanistan,2017,0,0,0,0
2,Afghanistan,2018,0,0,0,0
3,Afghanistan,2019,0,0,0,0
4,Afghanistan,2020,0,308,0,308
...,...,...,...,...,...,...
283,Zambia,2017,0,0,0,0
284,Zambia,2018,0,0,0,0
285,Zambia,2019,0,2,0,2
286,Zambia,2020,0,0,0,0


Merging each of the polio datasets together

In [12]:
polio_df = vd_df.merge(who_ext_vdpv, on=['entity','year', 'total_cVDPV'], how='outer').merge(who_ext_wild, on=['entity','year'], how='outer').merge(who_melt, on=['entity','year'], how='outer')
polio_df.sort_values(by=['entity','year'])


Unnamed: 0,entity,year,cVDPV1,cVDPV2,cVDPV3,total_cVDPV,wild_polio_cases,non_polio_afp_rate,percent_adequate_stool_collection,total_polio
7495,Afghanistan,1980,,,,,,,,880.0
7301,Afghanistan,1981,,,,,,,,837.0
7107,Afghanistan,1982,,,,,,,,1390.0
6913,Afghanistan,1983,,,,,,,,1991.0
6719,Afghanistan,1984,,,,,,,,552.0
...,...,...,...,...,...,...,...,...,...,...
3114,Zimbabwe,2017,,,,,0.0,3.33,89.0,
3239,Zimbabwe,2018,,,,,0.0,3.74,78.0,
3364,Zimbabwe,2019,,,,,0.0,3.03,90.0,
3489,Zimbabwe,2020,,,,,0.0,2.86,70.0,


Checking there aren't duplicate year-entity pairs

In [13]:
polio_df[polio_df[['year','entity']].duplicated(keep=False)].sort_values(['year','entity'])

Unnamed: 0,entity,year,cVDPV1,cVDPV2,cVDPV3,total_cVDPV,wild_polio_cases,non_polio_afp_rate,percent_adequate_stool_collection,total_polio


Ensuring there is a country-year value for each year between 1980 and the latest year (2021 at time of writing).

In [14]:
df = polio_df.set_index(['entity','year'])
mux = pd.MultiIndex.from_product([df.index.levels[0], df.index.levels[1]],names=['entity','year'])
df = df.reindex(mux, fill_value=np.nan).reset_index()
df.sort_values(['entity','year'])

Unnamed: 0,entity,year,cVDPV1,cVDPV2,cVDPV3,total_cVDPV,wild_polio_cases,non_polio_afp_rate,percent_adequate_stool_collection,total_polio
0,Afghanistan,1980,,,,,,,,880.0
1,Afghanistan,1981,,,,,,,,837.0
2,Afghanistan,1982,,,,,,,,1390.0
3,Afghanistan,1983,,,,,,,,1991.0
4,Afghanistan,1984,,,,,,,,552.0
...,...,...,...,...,...,...,...,...,...,...
8437,Zimbabwe,2017,,,,,0.0,3.33,89.0,
8438,Zimbabwe,2018,,,,,0.0,3.74,78.0,
8439,Zimbabwe,2019,,,,,0.0,3.03,90.0,
8440,Zimbabwe,2020,,,,,0.0,2.86,70.0,


Sum the wild polio and vaccine derived cases for the year 2001 onwards. Before this the only available metric is total cases. 

In [15]:
df['total_polio'][df.year > 2000] = df[['wild_polio_cases','total_cVDPV']][df.year > 2000].sum(axis=1, min_count=1)
df[df.entity == 'Afghanistan']


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['total_polio'][df.year > 2000] = df[['wild_polio_cases','total_cVDPV']][df.year > 2000].sum(axis=1, min_count=1)


Unnamed: 0,entity,year,cVDPV1,cVDPV2,cVDPV3,total_cVDPV,wild_polio_cases,non_polio_afp_rate,percent_adequate_stool_collection,total_polio
0,Afghanistan,1980,,,,,,,,880.0
1,Afghanistan,1981,,,,,,,,837.0
2,Afghanistan,1982,,,,,,,,1390.0
3,Afghanistan,1983,,,,,,,,1991.0
4,Afghanistan,1984,,,,,,,,552.0
5,Afghanistan,1985,,,,,,,,1981.0
6,Afghanistan,1986,,,,,,,,1843.0
7,Afghanistan,1987,,,,,,,,628.0
8,Afghanistan,1988,,,,,,,,307.0
9,Afghanistan,1989,,,,,,,,55.0


Borrow missing values from the original WHO data - e.g. USA has No Data for some years here but it has 0 cases for those years in the original who data set (who_melt_all)

In [16]:
df = df.merge(who_melt_all, on=['entity','year'], how='outer')
df['total_polio_orig'][df['total_polio'].isna()].value_counts()

0.0    352
Name: total_polio_orig, dtype: int64

In [17]:

df['total_polio'] = df['total_polio'].fillna(df['total_polio_orig'])
df[df['entity'] == 'United States'][['year','total_polio']]
df.drop(columns='total_polio_orig', inplace=True)

Adding the correction factor to estimate polio cases based on reported cases. Following Tebbens et al (2011) -https://www.sciencedirect.com/science/article/pii/S0264410X10014957?via%3Dihub

Correction factor is 7 for all years before 2000. 
If the 'non_polio_afp_rate' is < 1 OR 'percent_adequate_stool_collection' < 60, then the correction factor = 7.
If the 'non_polio_afp_rate' is < 2 OR 'percent_adequate_stool_collection' < 80, then the correction factor = 2.
If the 'non_polio_afp_rate' is >= 2 OR 'percent_adequate_stool_collection' >= 80, then the correction factor = 1.11.
If both 'non_polio_afp_rate' and 'percent_adequate_stool_collection' are missing then the correction factor is 7. 

Namibia had 'percent_adequate_stool_collection' > 100 in 2011 and 2014 but for other years at this time it's correction factor is 1.11 so we set it as 1.11 for 2011 and 2014. 

For China 1989-92 we set the correction factor to 1.11 and in Oman in 1988.

We set the correction factor as NA for all of 2021 as the values of 'percent_adequate_stool_collection' seemed unreliable in this year. 


In [18]:
df['correction_factor'] = np.nan
df.loc[df['year'] < 2000, 'correction_factor'] = 7.0
df.loc[(df['non_polio_afp_rate'] < 1.0) | (df['percent_adequate_stool_collection'] < 60), 'correction_factor'] = 7.0
df.loc[(df['non_polio_afp_rate'] < 2.0) | (df['percent_adequate_stool_collection'] < 80), 'correction_factor'] = 2.0
df.loc[(df['non_polio_afp_rate'] >= 2.0) & (df['percent_adequate_stool_collection'] >= 80), 'correction_factor'] = 1.11
df.loc[(df['non_polio_afp_rate'].isna()) & (df['percent_adequate_stool_collection'].isna()), 'correction_factor'] = 7.0
df['correction_factor'][(df['entity'] == 'Namibia') & (df['year'].isin([2011,2014]))] = 1.11
df['correction_factor'][(df['entity'] == 'China') & (df['year'].isin([1989,1990,1991,1992]))] = 1.11
df['correction_factor'][(df['entity'] == 'Oman') & (df['year'].isin([1988]))] = 1.11
df.loc[df['year'] == 2021, 'correction_factor'] = np.nan


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['correction_factor'][(df['entity'] == 'Namibia') & (df['year'].isin([2011,2014]))] = 1.11
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['correction_factor'][(df['entity'] == 'China') & (df['year'].isin([1989,1990,1991,1992]))] = 1.11
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['correction_factor'][(df['entity'] == 'Oman') & (df['year'].isin([1988]))] = 1.11


Estimating the cases by multiplying the total polio variable by the correction factor.

In [19]:

df['estimated_polio_cases'] = df['total_polio']*df['correction_factor']
df[['entity','estimated_polio_cases','total_polio']][(df.year == 1980)].head()

Unnamed: 0,entity,estimated_polio_cases,total_polio
0,Afghanistan,6160.0,880.0
42,Albania,7.0,1.0
84,Algeria,812.0,116.0
126,Andorra,,
168,Angola,224.0,32.0


Add the regional totals:

- Joining the regions-entity table we got from the WHO data with the current dataframe. We then groupby and sum the polio case variables by these regions. 

In [20]:
regional_total = regions.merge(df,on = 'entity').groupby(['WHO_REGION', 'year'])[['cVDPV1', 'cVDPV2', 'cVDPV3','total_cVDPV','wild_polio_cases','total_polio', 'estimated_polio_cases']].sum(min_count=1).reset_index()
regional_total['WHO_REGION'].replace(['AFR', 'AMR', 'SEAR', 'EUR', 'EMR', 'WPR'], ['Africa', 'Americas', 'South-East Asia', 'Europe', 'Eastern Mediterranean', 'Western Pacific'], inplace = True)
regional_total.rename(columns = {'WHO_REGION':'entity'}, inplace = True)


Adding a global total 

In [21]:
global_entities = regions.copy()
global_entities['WHO_REGION'] = 'World'
global_total = global_entities.merge(df,on = 'entity').groupby(['WHO_REGION', 'year']).sum(min_count=1)[['cVDPV1', 'cVDPV2', 'cVDPV3','total_cVDPV','wild_polio_cases','total_polio','estimated_polio_cases']].reset_index()
global_total.rename(columns = {'WHO_REGION':'entity'}, inplace = True)

Joining together the country, regional and global dataframes. 

In [22]:
df.drop(df[df['entity'] == 'World'].index, inplace=True)
total_df = pd.concat([df,regional_total,  global_total])

Add per million variables:

- Load in the OWID population variable.
- Merge with existing dataframe and then divide the polio case variables by the population
- Multiply by 1 million

In [23]:
population = owid_population()

pop_df = pd.DataFrame(pd.merge(left = population, right = total_df, how="right"))
per_mil_df = pop_df[['entity', 'year']]
per_mil_df[['wild_polio_cases_per_million','cVDPV1_per_million','cVDPV2_per_million','cVDPV3_per_million','total_cVDPV_per_million','total_polio_per_million','estimated_polio_cases_per_million']] = pop_df[['wild_polio_cases','cVDPV1','cVDPV2','cVDPV3','total_cVDPV','total_polio','estimated_polio_cases']].div(pop_df.population, axis=0).mul(1000000).round(3)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self[k1] = value[k2]


Filling NAs where we can be confident this is the case, in the GPEI data we can be fairly certain that if a country doesn't have data it's because there haven't been any detected polio cases.  

For wild polio we can fill NA for 2011 with 0, for vaccine derived it is 2016 onwards. To ensure that countries with 0 cases show up in the grapher as such.


In [24]:
final_df = pop_df.merge(per_mil_df).drop(columns = 'population')

final_df['total_cVDPV'][(final_df.year >=2016) & (final_df['total_cVDPV'].isna())] = 0
final_df['total_cVDPV_per_million'][(final_df.year >=2016) & (final_df['total_cVDPV_per_million'].isna())] = 0

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  final_df['total_cVDPV'][(final_df.year >=2016) & (final_df['total_cVDPV'].isna())] = 0
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  final_df['total_cVDPV_per_million'][(final_df.year >=2016) & (final_df['total_cVDPV_per_million'].isna())] = 0


Rounding the estimated cases

In [25]:
final_df['estimated_polio_cases'] = round(final_df['estimated_polio_cases'])

In [26]:
final_df

Unnamed: 0,entity,year,cVDPV1,cVDPV2,cVDPV3,total_cVDPV,wild_polio_cases,non_polio_afp_rate,percent_adequate_stool_collection,total_polio,correction_factor,estimated_polio_cases,wild_polio_cases_per_million,cVDPV1_per_million,cVDPV2_per_million,cVDPV3_per_million,total_cVDPV_per_million,total_polio_per_million,estimated_polio_cases_per_million
0,Afghanistan,1980,,,,,,,,880.0,7.0,6160.0,,,,,,65.886,461.199
1,Afghanistan,1981,,,,,,,,837.0,7.0,5859.0,,,,,,63.545,444.818
2,Afghanistan,1982,,,,,,,,1390.0,7.0,9730.0,,,,,,107.898,755.287
3,Afghanistan,1983,,,,,,,,1991.0,7.0,13937.0,,,,,,158.801,1111.605
4,Afghanistan,1984,,,,,,,,552.0,7.0,3864.0,,,,,,45.230,316.610
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8731,World,2017,0.0,96.0,0.0,96.0,22.0,,,118.0,,131.0,0.003,0.000,0.013,0.000,0.013,0.016,0.017
8732,World,2018,27.0,71.0,7.0,105.0,33.0,,,138.0,,204.0,0.004,0.004,0.009,0.001,0.014,0.018,0.027
8733,World,2019,12.0,366.0,0.0,378.0,176.0,,,554.0,,656.0,0.023,0.002,0.047,0.000,0.049,0.072,0.085
8734,World,2020,34.0,1079.0,0.0,1113.0,140.0,,,1253.0,,1872.0,0.018,0.004,0.138,0.000,0.143,0.161,0.240


Adding a data for the charts regarding when each country's last polio case was

Reading in our existing dataset from this chart - https://ourworldindata.org/grapher/progress-towards-polio-eradication. Standardising the countries, Micronesia was previously not showing up.

In [27]:
polio_free  = pd.read_csv("data/global_year_of_last_polio_case_plus_certification_status_GPEI_2017.csv")
polio_free['Entity'] = standardise_countries(polio_free['Entity'])

For 2018-2020 the values should be the same across the board - except for Africa which was certified as Polio Free in 2020. In this code we take the last available year (2017) for each entity and copy it three times, replacing the years with 2018-2020. This happens in the 'add_years_to_polio_status()' function.

In [28]:

polio_new = add_years_to_polio_status(polio_free)

Combine this with the original polio data from the chart. Merge with the regions dataset and set countries in the African region to be polio-free in 2020.

In [30]:
polio_comb = pd.concat([polio_free, polio_new], ignore_index=True).sort_values(by = ['Entity','Year'])
polio_comb = polio_comb.merge(regions, left_on='Entity', right_on = 'entity',how = 'outer')
polio_comb['Polio status (GPEI (2017))'][(polio_comb.WHO_REGION== 'AFR') & (polio_comb.Year == 2020)] = 'WHO Region certified polio-free'

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  polio_comb['Polio status (GPEI (2017))'][(polio_comb.WHO_REGION== 'AFR') & (polio_comb.Year == 2020)] = 'WHO Region certified polio-free'


Tidying up this data, dropping unused columns and rows. Renaming columns so we can combine with our existing dataset. We also add that Nigeria was polio-free (not certified) between 2017 and 2019 inclusive. 

In [31]:
polio_comb = polio_comb[~polio_comb['Entity'].isna()]
polio_comb.drop(['WHO_REGION','entity'], axis = 1, inplace = True)
polio_comb.rename(columns={"Entity": "entity", "Year":'year',"Polio status (GPEI (2017))":"polio_status"}, inplace=True)
polio_comb['polio_status'][(polio_comb['entity'] == 'Nigeria') & (polio_comb.year.isin([2017,2018,2019]))] = ['polio-free (not certified)']
polio_comb[(polio_comb['entity'] == 'Nigeria')& (polio_comb.year.isin([2017,2018,2019]))]

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  polio_comb['polio_status'][(polio_comb['entity'] == 'Nigeria') & (polio_comb.year.isin([2017,2018,2019]))] = ['polio-free (not certified)']


Unnamed: 0,entity,year,polio_status
14870,Nigeria,2017.0,polio-free (not certified)
14871,Nigeria,2018.0,polio-free (not certified)
14872,Nigeria,2019.0,polio-free (not certified)


Adding entities missing from the data. We leave all their values as missing data except for the years after their region was declared polio-free.

In [32]:
missing_entities = ['Greenland', 'Serbia', 'Montenegro', 'Kosovo', 'West Bank', 'Western Sahara', 'Kiribati','Taiwan', 'Liechtenstein']
years = range(1910,2021)
unique_combinations = [(x,y) for x in missing_entities for y in years ]
polio_missing = pd.DataFrame(unique_combinations, columns =['entity','year'])

polio_missing['polio_status'] = ''
polio_missing['polio_status'][(polio_missing['entity'].isin(['Greenland', 'Serbia', 'Montenegro', 'Kosovo', 'Liechtenstein']))& (polio_missing['year'] >= 2002)] = 'WHO Region certified polio-free'
polio_missing['polio_status'][(polio_missing['entity'].isin(['Taiwan', 'Kiribati']))& (polio_missing['year'] >= 2000)] = 'WHO Region certified polio-free'
polio_missing

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  polio_missing['polio_status'][(polio_missing['entity'].isin(['Greenland', 'Serbia', 'Montenegro', 'Kosovo']))& (polio_missing['year'] >= 2002)] = 'WHO Region certified polio-free'
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  polio_missing['polio_status'][(polio_missing['entity'].isin(['Taiwan', 'Kiribati']))& (polio_missing['year'] >= 2000)] = 'WHO Region certified polio-free'


Unnamed: 0,entity,year,polio_status
0,Greenland,1910,
1,Greenland,1911,
2,Greenland,1912,
3,Greenland,1913,
4,Greenland,1914,
...,...,...,...
883,Taiwan,2016,WHO Region certified polio-free
884,Taiwan,2017,WHO Region certified polio-free
885,Taiwan,2018,WHO Region certified polio-free
886,Taiwan,2019,WHO Region certified polio-free


Combine these missing countries with the existing countries. 

In [33]:
polio_comb = pd.concat([polio_comb, polio_missing])

Merge with the existing dataset.

In [34]:
df_all = final_df.merge(polio_comb, on = ['entity','year'], how = 'outer').sort_values(by = ['entity','year'])


Add surveillance column:

* If a country was certified polio-free 2 years ago or before, then it says "Certified polio free"
* If non-polio AFP rate ≥2 and adequate stool collection ≥80%, then it says "Sufficient screening and testing"
* If non-polio AFP rate ≥2 but adequate stool collection <80%, then it says "Low testing"
* If non-polio AFP rate <2 but adequate stool collection ≥80%, then it says "Low screening"
* If non-polio AFP rate <2 and adequate stool collection <80%, then it says "Low screening and testing"


In [46]:
df_all['polio_surveillance_status'] = ""
df_all.loc[(df_all['non_polio_afp_rate'] >= 2.0) & (df_all['percent_adequate_stool_collection'] >= 80), 'polio_surveillance_status'] = "Sufficient screening and testing"
df_all.loc[(df_all['non_polio_afp_rate'] >= 2.0) & (df_all['percent_adequate_stool_collection'] < 80), 'polio_surveillance_status'] = "Low testing"
df_all.loc[(df_all['non_polio_afp_rate'] < 2.0) & (df_all['percent_adequate_stool_collection'] >= 80), 'polio_surveillance_status'] = "Low screening"
df_all.loc[(df_all['non_polio_afp_rate'] < 2.0) & (df_all['percent_adequate_stool_collection'] < 80), 'polio_surveillance_status'] = "Low screening and testing"


In [70]:
surveillance_threshold_year = int(datetime.date.today().strftime("%Y")) - 3
polio_free_entities = df_all['entity'][(df_all['polio_status'] == 'WHO Region certified polio-free') & (df_all['year'] == surveillance_threshold_year)].drop_duplicates()
df_all['polio_surveillance_status'][df_all['entity'].isin(polio_free_entities)] = "Certified polio free"

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_all['polio_surveillance_status'][df_all['entity'].isin(polio_free_entities)] = "Certified polio free"


For each country find the last year that the polio_status was 'endemic' and use this as the last year of recorded cases. Any country with existing polio cases should have the year as 3000. We do this manually for Malawi. As it had a case in 2021, after Africa was declared polio free. 

In [35]:
last_polio_case = df_all.loc[(df_all.polio_status == 'endemic')].sort_values('year').groupby('entity').tail(1)[['entity','year']]
last_polio_case.rename(columns = {'year':'last_polio_case'}, inplace = True)
last_polio_case['year'] = 2021
last_polio_case['last_polio_case'] = last_polio_case['last_polio_case'].replace(2020, 3000)
last_polio_case['last_polio_case'][last_polio_case['entity'] == 'Malawi'] = 3000

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  last_polio_case['last_polio_case'][last_polio_case['entity'] == 'Malawi'] = 3000


Combine the datasets together and save it. 

In [36]:
df_all = df_all.merge(last_polio_case, on = ['entity','year'], how = 'outer').sort_values(by = ['entity','year'])
df_all['year'] = df_all['year'].astype(int)
df_all.to_csv(f'data/polio_cases_to_upload_{date_today}.csv', index=False)