In [78]:
from bs4 import BeautifulSoup
import requests
import pandas as pd

Scraping the data from the second table on this page : https://apps.who.int/dracunculiasis/dradata/html/report_Countries_t0.html

In [79]:

url = 'https://apps.who.int/dracunculiasis/dradata/html/report_Countries_t0.html'
html_doc = requests.get(url).content
soup = BeautifulSoup(html_doc, 'html.parser')

In [80]:
table = soup.findAll('table')[1]
df = pd.read_html(str(table))[0]

To understand the current situation we just want the country column and the year in which countries certified GW free

In [81]:
year_certified = df.iloc[:, [0,24]]
year_certified.columns = ['entity','year_certified']


Set the year to 2021

In [82]:
year_certified.year_certified = year_certified.year_certified.str.replace(r'Countries certified in', '', regex=True)
year_certified['year'] = 2021
year_certified.rename(columns={'entity':'Entity', 'year_certified':'Year Certified Guinea Worm Free', 'year':'Year'}, inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  year_certified.year_certified = year_certified.year_certified.str.replace(r'Countries certified in', '', regex=True)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  year_certified['year'] = 2021
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  year_certified.rename(columns={'entity':'Entity', 'year_certified':'Year Certified Guinea Worm Free', 'year':'Year'

Adding the following coding to these 'year certified' categories, so they show up nicely on the grapher
Endemic = 4000, Pre-certification = 3000, and Not yet certified = 5000

In [83]:
year_certified = year_certified.replace({'Year Certified Guinea Worm Free': {"Countries at precertification stage": 3000,
"Countries currently endemic for dracunculiasis": 4000,
"Countries not known to have dracunculiasis but yet to be certified":5000}})

Changing Angola to Endemic following - https://www.who.int/news/item/23-09-2020-eradicating-dracunculiasis-human-cases-and-animal-infections-decline-as-angola-becomes-endemic

Changing Kenya to 2018 following - who.int/news/item/21-03-2018-dracunculiasis-eradication-south-sudan-claims-interruption-of-transmission-in-humans

In [84]:
year_certified['Year Certified Guinea Worm Free'][year_certified['Entity'] == 'Angola'] = 4000
year_certified['Year Certified Guinea Worm Free'][year_certified['Entity'] == 'Kenya'] = 2018

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  year_certified['Year Certified Guinea Worm Free'][year_certified['Entity'] == 'Angola'] = 4000
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  year_certified['Year Certified Guinea Worm Free'][year_certified['Entity'] == 'Kenya'] = 2018


Now we want the time-series of how certification has changed since 1996-2017

In [85]:
df_time = df.iloc[:, 0:24].drop(df.columns[[1]], axis=1)

years = [str(i) for i in range(1996,2018)]
df_time.columns = ['entity'] + years

Converting the table from wide to long

In [86]:
df_long = pd.melt(df_time, id_vars='entity', value_vars=years)

Changing the values slightly so they are what we want to show on the map.

In [87]:
df_long = df_long.replace({'value': {"Countries at precertification stage": "Pre-certification",
"Previously endemic countries certified free of dracunculiasis": "Certified Guinea worm disease free (previously endemic)",
"Certified free of dracunculiasis":"Certified Guinea worm disease free",
"Countries not known to have dracunculiasis but yet to be certified":"Not yet certified",
"Endemic for dracunculiasis":"Endemic"}})

In [88]:
df_long.rename(columns={'entity':'Entity', 'variable':'Year','value' :'Certification'}, inplace = True)
df_long.Year = df_long.Year.astype(int)

Copying 2017 for 2018-2021

In [89]:
years_to_copy = [2018, 2019, 2020,2021]

copy_year =df_long[df_long['Year'] == 2017].copy()
    
for year in years_to_copy:
    copy_year['Year'] = year
    df_long = df_long.append(copy_year, ignore_index=True)


  df_long = df_long.append(copy_year, ignore_index=True)


Changing Angola to Endemic for 2020 and 2021 following - https://www.who.int/news/item/23-09-2020-eradicating-dracunculiasis-human-cases-and-animal-infections-decline-as-angola-becomes-endemic

Changing Kenya to certified GW free from 2018, follwing - who.int/news/item/21-03-2018-dracunculiasis-eradication-south-sudan-claims-interruption-of-transmission-in-humans

In [90]:
df_long['Certification'][(df_long['Entity'] == 'Angola') & (df_long['Year'] >= 2020)] = 'Endemic'
df_long['Certification'][(df_long['Entity'] == 'Kenya') & (df_long['Year'] >= 2018)] = 'Certified Guinea worm disease free (previously endemic)'

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_long['Certification'][(df_long['Entity'] == 'Angola') & (df_long['Year'] >= 2020)] = 'Endemic'
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_long['Certification'][(df_long['Entity'] == 'Kenya') & (df_long['Year'] >= 2018)] = 'Certified Guinea worm disease free (previously endemic)'


Standardizing country names

In [91]:
entities = {'Country':pd.concat([year_certified.Entity, df_long.Entity]).drop_duplicates()}
ent_df = pd.DataFrame(data =entities)
ent_df.to_csv('data/input/countries_to_standardize.csv', index=False)

In [92]:
countries = pd.read_csv('data/input/countries_to_standardize_country_standardized.csv')
df_long_m = pd.merge(df_long, countries, left_on='Entity', right_on='Country')[['Our World In Data Name','Year','Certification']]
year_certified_m = pd.merge(year_certified, countries, left_on='Entity', right_on='Country')[['Our World In Data Name','Year','Year Certified Guinea Worm Free']]

df_cert = year_certified_m.merge(df_long_m, on=['Our World In Data Name', 'Year'], how = "outer")
df_cert.rename(columns={'Our World In Data Name':'Entity'}, inplace=True)

Reading in the case data which we gather manually from various WHO sources - see https://owid.cloud/admin/datasets/5496 for details

In [93]:
df_cases = pd.read_csv('data/input/Reported guinea worm cases - WHO (2021).csv')


Combining all the datasources into a key guinea worm dataset

In [94]:
df_all = df_cases.merge(df_cert, on=['Entity', 'Year'], how = "outer")
df_all.shape


(5277, 5)

Creating a combination of all countries and years so we can backfill guinea worms cases with 0

In [95]:
countries = df_all['Entity'].drop_duplicates()
years = range(min(df_all['Year']), max(df_all['Year'])+1)
all_entities_years = pd.DataFrame([(x, y) for x in countries for y in years])
all_entities_years.columns = ['Entity', 'Year']

In [96]:

df_all = df_all.merge(all_entities_years, on=['Entity', 'Year'], how = "outer")
df_all.shape

(7056, 5)

In [97]:
df_all['Guinea Worm Reported Cases'].fillna(0, inplace=True)
df_all['Guinea Worm Reported Cases'] = df_all['Guinea Worm Reported Cases'].astype(int)

Writing out the data

In [98]:
df_all.to_csv('data/output/guinea_worm_to_upload.csv', index=False)