In [1]:
from gazpacho import get, Soup

In [2]:
# Download the raw HTML of the Worldometers coronavirus page.
url = 'https://www.worldometers.info/coronavirus/'
html = get(url)

In [3]:
# Wrap the downloaded HTML in a gazpacho Soup object so it can be searched.
soup = Soup(html)

In [4]:
# Locate the <table> whose id matches 'main_table_countries_today'.
table = soup.find('table', {'id': 'main_table_countries_today'})

In [5]:
# Peek at the first 100 characters of the table's HTML to confirm the right element was found.
str(table)[:100]

'<table id="main_table_countries_today" class="table table-bordered table-hover main_table_countries"'

In [6]:
# Collect every <tr> inside the first <tbody>. mode='all' asks gazpacho for a
# list even when only one row matches, so slicing and iterating over `trs`
# does not depend on how many rows the page happens to contain.
trs = table.find('tbody', mode='first').find('tr', mode='all')

In [7]:
# Inspect the first two rows to see how the cells are laid out.
trs[:2]

[<tr style="">
                                 <td style="font-weight: bold; font-size:15px; text-align:left;"><a class="mt_a" href="country/china/">China</a></td>
                                 <td style="font-weight: bold; text-align:right">81,285</td>
                                                                 <td style="font-weight: bold; text-align:right;background-color:#FFEEAA;">+67</td>
                                 <td style="font-weight: bold; text-align:right;">3,287                                </td>
 
                                                                 <td style="font-weight: bold; text-align:right;background-color:red; color:white">+6</td>
                                 <td style="font-weight: bold; text-align:right">74,051</td>
 
                                                                                                 <td style="text-align:right;font-weight:bold;">3,947</td>
                                 <td style="font-weight: bold;

In [8]:
# Work with a single row while figuring out how to parse it.
tr = trs[0]

In [9]:
# The first <td> of the row holds the country name.
tr.find('td', mode='first').text

'China'

In [10]:
# The second <td> holds the total, formatted with thousands separators.
tr.find('td')[1].text

'81,285'

In [11]:
def parse_tr(tr):
    """Extract the country name and total count from one table row.

    Parameters
    ----------
    tr : gazpacho Soup
        A ``<tr>`` element whose first ``<td>`` holds the country name and
        whose second ``<td>`` holds the total, written with thousands
        separators (e.g. ``'81,285'``).

    Returns
    -------
    tuple of (str, float)
        ``(country, total)``. ``total`` is ``nan`` when the cell is blank.

    Raises
    ------
    IndexError
        If the row has fewer than two ``<td>`` cells.
    ValueError
        If the total cell contains text that is not a number.
    """
    # Search the row once; mode='all' yields a list regardless of how many
    # cells match, so indexing below is always valid for rows with >= 2 cells.
    cells = tr.find('td', mode='all')
    country = cells[0].text.strip()
    raw_total = cells[1].text.replace(',', '').strip()
    # A blank cell would make float('') raise; report it as missing instead.
    total = float(raw_total) if raw_total else float('nan')
    return country, total

In [12]:
# Parse every row into a (country, total) tuple.
totals = [parse_tr(tr) for tr in trs]

In [13]:
# Preview the first ten parsed rows.
totals[:10]

[('China', 81285.0),
 ('Italy', 74386.0),
 ('USA', 65797.0),
 ('Spain', 49515.0),
 ('Germany', 37323.0),
 ('Iran', 27017.0),
 ('France', 25233.0),
 ('Switzerland', 10897.0),
 ('UK', 9529.0),
 ('S. Korea', 9137.0)]

In [14]:
# Download the Wikipedia list of countries by total health expenditure per capita.
# Note: this rebinds `url` and `html`, which previously held the Worldometers page.
url = 'https://en.wikipedia.org/wiki/List_of_countries_by_total_health_expenditure_per_capita'
html = get(url)

In [15]:
# Parse the Wikipedia HTML; this rebinds `soup` to the new page.
soup = Soup(html)

In [16]:
import pandas as pd

In [17]:
# Take the first table whose id matches 'WHO'. mode='first' returns a single
# element however many tables match, whereas indexing the automatic-mode
# result with [0] only works when more than one table is found.
table = soup.find('table', {'id': 'WHO'}, mode='first')

In [18]:
from io import StringIO

# Let pandas parse the table's HTML into a DataFrame. The markup is wrapped in
# StringIO because passing literal HTML strings to read_html is deprecated in
# recent pandas releases; file-like input works on old and new versions alike.
who = pd.read_html(StringIO(str(table)))[0]

In [19]:
# Rename the two columns of interest and discard every other column.
who = (
    who
    .rename(columns={'Countries': 'country', '2015': 'health_expenditure'})
    .loc[:, ['country', 'health_expenditure']]
)

In [20]:
# Turn the list of (country, total) tuples into a DataFrame.
# Note: this rebinds `totals` from a list to a DataFrame.
totals = pd.DataFrame(totals, columns=['country', 'cases'])

In [21]:
# Left join on the country name: every row of `totals` is kept, and countries
# whose name has no exact match in `who` end up with a missing
# health_expenditure (see the USA row in the output below).
df = pd.merge(totals, who, on='country', how='left')

In [22]:
# Display the merged frame.
df

Unnamed: 0,country,cases,health_expenditure
0,China,81285.0,426.0
1,Italy,74386.0,2700.0
2,USA,65797.0,
3,Spain,49515.0,2354.0
4,Germany,37323.0,4592.0
...,...,...,...
194,Papua New Guinea,1.0,77.0
195,St. Vincent Grenadines,1.0,
196,Somalia,1.0,
197,Timor-Leste,1.0,72.0


### Saving results

In [23]:
# Write the merged frame to a CSV file, leaving out the index column.
df.to_csv('../data/corona.csv', index=False)

In [24]:
# Quick look at the first rows that were just written.
df.head()

Unnamed: 0,country,cases,health_expenditure
0,China,81285.0,426.0
1,Italy,74386.0,2700.0
2,USA,65797.0,
3,Spain,49515.0,2354.0
4,Germany,37323.0,4592.0


In [25]:
# Add a column recording when this snapshot was fetched (same timestamp on every row).
# Note: this mutates `df` in place.
df['date_fetched'] = pd.Timestamp('today')

In [26]:
# Confirm the new date_fetched column is present.
df.head()

Unnamed: 0,country,cases,health_expenditure,date_fetched
0,China,81285.0,426.0,2020-03-25 20:38:44.549337
1,Italy,74386.0,2700.0,2020-03-25 20:38:44.549337
2,USA,65797.0,,2020-03-25 20:38:44.549337
3,Spain,49515.0,2354.0,2020-03-25 20:38:44.549337
4,Germany,37323.0,4592.0,2020-03-25 20:38:44.549337


In [27]:
import sqlite3

# Open (or create) the SQLite database file next to the CSV output.
con = sqlite3.connect('../data/corona.db')

# Append this snapshot to the `corona` table, creating the table if needed.
# Because of if_exists='append', re-running this cell adds the same rows again.
df.to_sql('corona', con, index=False, if_exists='append')

In [28]:
# Rows with more than 1000 cases, ordered by health expenditure (highest first).
# The table accumulates snapshots, so a country can appear once per date_fetched.
pd.read_sql('''
    select 
    * 
    from corona
    where cases > 1000
    order by health_expenditure desc
''', con)

Unnamed: 0,country,cases,health_expenditure,date_fetched
0,Switzerland,3939.0,9818.0,2020-03-19 10:15:17.678212
1,Switzerland,10897.0,9818.0,2020-03-25 20:38:44.549337
2,Norway,1706.0,7464.0,2020-03-19 10:15:17.678212
3,Norway,3084.0,7464.0,2020-03-25 20:38:44.549337
4,Luxembourg,1333.0,6236.0,2020-03-25 20:38:44.549337
5,Sweden,1423.0,5600.0,2020-03-19 10:15:17.678212
6,Sweden,2526.0,5600.0,2020-03-25 20:38:44.549337
7,Denmark,1132.0,5497.0,2020-03-19 10:15:17.678212
8,Denmark,1724.0,5497.0,2020-03-25 20:38:44.549337
9,Australia,2676.0,4934.0,2020-03-25 20:38:44.549337
