## Scrape lockdowns from Wikipedia

Scrape lockdown dates from a Wikipedia page that has them listed as a table.

Rerun this notebook to get up-to-date values. The result is exported to both CSV (`data/lockdowns.csv`) and JSON (`data/lockdowns.json`).

In [1]:
import pandas as pd

In [8]:
# Fetch every table on the page whose text matches the lockdown table title.
# pd.read_html returns a list of DataFrames, one per matching table.
wikipedia_url = 'https://en.wikipedia.org/wiki/National_responses_to_the_COVID-19_pandemic'
dfs = pd.read_html(wikipedia_url, match='COVID-19 pandemic lockdowns')

In [9]:
# Take the first table that matched the title and display it for inspection.
df = dfs[0]
df

Unnamed: 0_level_0,vteCOVID-19 pandemic lockdowns,vteCOVID-19 pandemic lockdowns,vteCOVID-19 pandemic lockdowns,vteCOVID-19 pandemic lockdowns,vteCOVID-19 pandemic lockdowns
Unnamed: 0_level_1,Countries and territories,Place,Start date,End date,Level
0,Albania,Albania,2020-03-13[4],,National
1,Algeria,Algeria,2020-03-23[5],2020-05-14[6],City
2,Argentina,Argentina,2020-03-19[7],2020-05-24[8],National
3,Armenia,Armenia,2020-03-24[9],2020-05-04[10],National
4,Australia,Australia,2020-03-23[11],,National
...,...,...,...,...,...
107,United States,Oregon,2020-03-24[160],,State
108,United States,Wisconsin,2020-03-24[161],,State
109,Venezuela,Venezuela,2020-03-17[162],2020-05-13[163],National
110,Zimbabwe,Zimbabwe,2020-03-30[164],2020-05-02[165],National


In [30]:
# Drop the final row of the scraped table. The copy keeps later edits from
# touching the frame stored in `dfs`.
# NOTE(review): in the output captured in this notebook the row removed here is
# index 110 (Zimbabwe), i.e. a real entry rather than a footer -- confirm the
# Wikipedia table still ends with a row that should be discarded.
scraped_table = dfs[0]
df = scraped_table.iloc[:-1].copy()
df

Unnamed: 0_level_0,vteCOVID-19 pandemic lockdowns,vteCOVID-19 pandemic lockdowns,vteCOVID-19 pandemic lockdowns,vteCOVID-19 pandemic lockdowns,vteCOVID-19 pandemic lockdowns
Unnamed: 0_level_1,Countries and territories,Place,Start date,End date,Level
0,Albania,Albania,2020-03-13[4],,National
1,Algeria,Algeria,2020-03-23[5],2020-05-14[6],City
2,Argentina,Argentina,2020-03-19[7],2020-05-24[8],National
3,Armenia,Armenia,2020-03-24[9],2020-05-04[10],National
4,Australia,Australia,2020-03-23[11],,National
...,...,...,...,...,...
106,United States,New York,2020-03-22[158],2020-06-13[159],State
107,United States,Oregon,2020-03-24[160],,State
108,United States,Wisconsin,2020-03-24[161],,State
109,Venezuela,Venezuela,2020-03-17[162],2020-05-13[163],National


In [31]:
# The table title was parsed as an extra header row, giving a two-level column
# index. Keep only the second level, which holds the real column names.
flat_column_names = df.columns.get_level_values(1)
df.columns = flat_column_names
df

Unnamed: 0,Countries and territories,Place,Start date,End date,Level
0,Albania,Albania,2020-03-13[4],,National
1,Algeria,Algeria,2020-03-23[5],2020-05-14[6],City
2,Argentina,Argentina,2020-03-19[7],2020-05-24[8],National
3,Armenia,Armenia,2020-03-24[9],2020-05-04[10],National
4,Australia,Australia,2020-03-23[11],,National
...,...,...,...,...,...
106,United States,New York,2020-03-22[158],2020-06-13[159],State
107,United States,Oregon,2020-03-24[160],,State
108,United States,Wisconsin,2020-03-24[161],,State
109,Venezuela,Venezuela,2020-03-17[162],2020-05-13[163],National


In [32]:
# Strip Wikipedia footnote markers such as "[4]" from both date columns.
# regex=True is passed explicitly: since pandas 2.0 `Series.str.replace` treats
# the pattern as a literal string by default, which would silently leave the
# markers in place.
footnote_pattern = r"\[.*?\]"
for date_column in ['Start date', 'End date']:
    df[date_column] = df[date_column].str.replace(footnote_pattern, "", regex=True)
df

Unnamed: 0,Countries and territories,Place,Start date,End date,Level
0,Albania,Albania,2020-03-13,,National
1,Algeria,Algeria,2020-03-23,2020-05-14,City
2,Argentina,Argentina,2020-03-19,2020-05-24,National
3,Armenia,Armenia,2020-03-24,2020-05-04,National
4,Australia,Australia,2020-03-23,,National
...,...,...,...,...,...
106,United States,New York,2020-03-22,2020-06-13,State
107,United States,Oregon,2020-03-24,,State
108,United States,Wisconsin,2020-03-24,,State
109,Venezuela,Venezuela,2020-03-17,2020-05-13,National


In [34]:
# Use a shorter name for the country column; all later cells refer to 'Country'.
df = df.rename(columns={'Countries and territories': 'Country'})

In [35]:
# Countries kept when filtering the scraped table.
# TODO: copied from another notebook, factor out to a tiny module that can be shared between notebooks
COUNTRIES_INCLUDED = [
    'Austria', 'Italy', 'Belgium', 'Latvia', 'Bulgaria', 'Lithuania', 'Croatia', 'Luxembourg',
    'Cyprus', 'Malta', 'Czechia', 'Netherlands', 'Denmark', 'Poland', 'Estonia', 'Portugal',
    'Finland', 'Romania', 'France', 'Slovakia', 'Germany', 'Slovenia', 'Greece', 'Spain',
    'Hungary', 'Sweden', 'Ireland', 'Russia', 'Norway', 'Switzerland',
    'United Kingdom', 'Belarus',
]

In [38]:
# Keep only the rows whose country is on the inclusion list.
is_included = df['Country'].isin(COUNTRIES_INCLUDED)
df = df[is_included]
df

In [42]:
# Index the frame by country so single countries can be addressed with .loc.
df = df.set_index('Country')

In [49]:
# Germany's end date is scraped as a range ('2020-04-20to 2020-05-10'), an
# irregularity compared to all other countries. The assert makes a re-run fail
# loudly if the scraped value changes, so the manual override below is never
# applied to a value it was not written for.
assert df.loc['Germany', 'End date'] == '2020-04-20to 2020-05-10', \
    "Germany's scraped end date changed; review the manual fix-up below"
# Assign through a single .loc[row, column] call. The chained form
# df.loc['Germany']['End date'] = ... may write to a temporary copy and leave
# df unchanged (always the case under pandas Copy-on-Write).
df.loc['Germany', 'End date'] = '2020-04-20'

In [50]:
# Display the country-indexed frame; the Germany row should now show the
# manually corrected end date.
df

Unnamed: 0_level_0,Place,Start date,End date,Level
Country,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Austria,Austria,2020-03-16,2020-04-13,National
Belgium,Belgium,2020-03-18,2020-05-04,National
Croatia,Croatia,2020-03-18,2020-04-19,National
Finland,Uusimaa,2020-03-27,2020-04-16,Region
France,France,2020-03-17,2020-05-11,National
Germany,Germany,2020-03-23,2020-04-20,National
Greece,Greece,2020-03-23,2020-05-04,National
Hungary,Hungary,2020-03-28,2020-04-10,National
Ireland,Ireland,2020-03-12,2020-05-18,National
Italy,Italy,2020-03-09,2020-05-18,National


In [57]:
# Export only nationwide lockdowns, and only their start and end dates.
is_national = df['Level'] == 'National'
to_save = df.loc[is_national, ['Start date', 'End date']]
to_save

Unnamed: 0_level_0,Start date,End date
Country,Unnamed: 1_level_1,Unnamed: 2_level_1
Austria,2020-03-16,2020-04-13
Belgium,2020-03-18,2020-05-04
Croatia,2020-03-18,2020-04-19
France,2020-03-17,2020-05-11
Germany,2020-03-23,2020-04-20
Greece,2020-03-23,2020-05-04
Hungary,2020-03-28,2020-04-10
Ireland,2020-03-12,2020-05-18
Italy,2020-03-09,2020-05-18
Lithuania,2020-03-16,2020-04-27


In [58]:
# Write the national lockdown dates as CSV; the country index becomes the first column.
csv_path = 'data/lockdowns.csv'
to_save.to_csv(csv_path)

In [60]:
# Write the same data as JSON, keyed by the index (country) via orient='index'.
json_path = 'data/lockdowns.json'
to_save.to_json(json_path, orient='index')