In [130]:
from bs4 import BeautifulSoup
import requests
import pandas as pd

Scraping the data from the second table on this page : https://apps.who.int/dracunculiasis/dradata/html/report_Countries_t0.html

In [131]:

url = 'https://apps.who.int/dracunculiasis/dradata/html/report_Countries_t0.html'
html_doc = requests.get(url).content
soup = BeautifulSoup(html_doc, 'html.parser')

In [132]:
table = soup.findAll('table')[1]
df = pd.read_html(str(table))[0]

To understand the current situation we just want the country column and the year in which countries certified GW free

In [133]:
year_certified = df.iloc[:, [0,24]]
year_certified.columns = ['entity','year_certified']


Set the year to 2021

In [134]:
year_certified.year_certified = year_certified.year_certified.str.replace(r'Countries certified in', '', regex=True)
year_certified['year'] = 2021
year_certified.rename(columns={'entity':'Entity', 'year_certified':'Year Certified Guinea Worm Free', 'year':'Year'}, inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  year_certified.year_certified = year_certified.year_certified.str.replace(r'Countries certified in', '', regex=True)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  year_certified['year'] = 2021
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  year_certified.rename(columns={'entity':'Entity', 'year_certified':'Year Certified Guinea Worm Free', 'year':'Year'

In [135]:
year_certified = year_certified.replace({'Year Certified Guinea Worm Free': {"Countries at precertification stage": "Pre-certification",
"Countries currently endemic for dracunculiasis": "Endemic",
"Countries not known to have dracunculiasis but yet to be certified":"Not yet certified"}})

Now we want the time-series of how certification has changed since 1996-2017

In [136]:
df_time = df.iloc[:, 0:24].drop(df.columns[[1]], axis=1)

years = [str(i) for i in range(1996,2018)]
df_time.columns = ['entity'] + years

Converting the table from wide to long

In [137]:
df_long = pd.melt(df_time, id_vars='entity', value_vars=years)

Changing the values slightly so they are what we want to show on the map.

In [138]:
df_long = df_long.replace({'value': {"Countries at precertification stage": "Guinea worm disease free (pre-certification)",
"Previously endemic countries certified free of dracunculiasis": "Certified Guinea worm disease free (previously endemic)",
"Certified free of dracunculiasis":"Certified Guinea worm disease free",
"Countries not known to have dracunculiasis but yet to be certified":"Not yet certified",
"Endemic for dracunculiasis":"Endemic"}})

In [139]:
df_long.rename(columns={'entity':'Entity', 'variable':'Year','value' :'Certifcation'}, inplace = True)
df_long.Year = df_long.Year.astype(int)

Standardizing country names

In [140]:
entities = {'Country':pd.concat([year_certified.Entity, df_long.Entity]).drop_duplicates()}
ent_df = pd.DataFrame(data =entities)
ent_df.to_csv('data/input/countries_to_standardize.csv', index=False)

In [141]:
countries = pd.read_csv('data/input/countries_to_standardize_country_standardized.csv')
df_long_m = pd.merge(df_long, countries, left_on='Entity', right_on='Country')[['Our World In Data Name','Year','Certifcation']]
year_certified_m = pd.merge(year_certified, countries, left_on='Entity', right_on='Country')[['Our World In Data Name','Year','Year Certified Guinea Worm Free']]

df_cert = year_certified_m.merge(df_long_m, on=['Our World In Data Name', 'Year'], how = "outer")
df_cert.rename(columns={'Our World In Data Name':'Entity'}, inplace=True)

Reading in the case data which we gather from various WHO sources

In [142]:
df_cases = pd.read_csv('data/input/Reported guinea worm cases - WHO (2021).csv')

Combining all the datasources into a key guinea worm dataset

In [143]:
df_all = df_cases.merge(df_cert, on=['Entity', 'Year'], how = "outer")

Writing out the data

In [144]:
df_all.to_csv('data/output/guinea_worm_to_upload.csv', index=False)