In [1]:
from io import StringIO

from gazpacho import get, Soup

In [2]:
# Source page for the per-country case counts. `get` downloads it and the
# result is handed to `Soup` in the next cell.
url = 'https://www.worldometers.info/coronavirus/'
html = get(url)

In [3]:
# Wrap the downloaded page in a gazpacho Soup so it can be searched by tag.
soup = Soup(html)

In [4]:
# Locate the <table> element carrying the id 'main_table_countries_today'.
table = soup.find('table', {'id': 'main_table_countries_today'})

In [None]:
# Sanity check: show only the first 100 characters of the table's markup.
str(table)[:100]

In [5]:
# Take the table's first <tbody> and collect the <tr> rows inside it.
trs = table.find('tbody', mode='first').find('tr')

In [6]:
# Inspect the first two rows to see how the cells are laid out.
trs[:2]

[<tr style="">
                                 <td style="font-weight: bold; font-size:15px; text-align:left;"><a class="mt_a" href="country/china/">China</a></td>
                                 <td style="font-weight: bold; text-align:right">80,928</td>
                                                                 <td style="font-weight: bold; text-align:right;background-color:#FFEEAA;">+34</td>
                                 <td style="font-weight: bold; text-align:right;">3,245                                </td>
 
                                                                 <td style="font-weight: bold; text-align:right;background-color:red; color:white">+8</td>
                                 <td style="font-weight: bold; text-align:right">70,420</td>
 
                                                                                                 <td style="text-align:right;font-weight:bold;">7,263</td>
                                 <td style="font-weight: bold;

In [7]:
# Work with a single row while figuring out how to extract its fields.
tr = trs[0]

In [8]:
# The first <td> of the row holds the country name.
tr.find('td', mode='first').text

'China'

In [9]:
# The second <td> holds the total case count, as text with comma separators.
tr.find('td')[1].text

'80,928'

In [10]:
def parse_tr(tr):
    """Extract the country name and total case count from one table row.

    Parameters
    ----------
    tr
        A gazpacho ``Soup`` for a single ``<tr>``. Its first ``<td>`` is read
        as the country name and its second ``<td>`` as the total case count,
        written with comma thousands separators (e.g. ``'80,928'``).

    Returns
    -------
    tuple of (str, float)
        ``(country, total)``. ``total`` is NaN when the count cell is blank.

    Raises
    ------
    ValueError
        If the count cell contains non-blank text that is not a number.
    """
    # Look the cells up once rather than re-scanning the row for each field.
    cells = tr.find('td')
    country = cells[0].text
    raw_total = cells[1].text.replace(',', '').strip()
    # float('') raises, so a single blank cell would abort the whole scrape;
    # report it as a missing value instead.
    total = float(raw_total) if raw_total else float('nan')
    return country, total

In [11]:
# Run every table row through parse_tr, giving a list of (country, total) pairs.
totals = list(map(parse_tr, trs))

In [12]:
# Preview the first ten (country, total) pairs.
totals[:10]

[('China', 80928.0),
 ('Italy', 35713.0),
 ('Iran', 18407.0),
 ('Spain', 17147.0),
 ('Germany', 13632.0),
 ('USA', 9486.0),
 ('France', 9134.0),
 ('S. Korea', 8565.0),
 ('Switzerland', 3939.0),
 ('UK', 2626.0)]

In [13]:
# Second source: Wikipedia's list of countries by total health expenditure
# per capita.
# NOTE: this rebinds `url` and `html`, replacing the worldometers values.
url = 'https://en.wikipedia.org/wiki/List_of_countries_by_total_health_expenditure_per_capita'
html = get(url)

In [14]:
# Parse the Wikipedia page; `soup` now refers to this page, not worldometers.
soup = Soup(html)

In [15]:
import pandas as pd

In [16]:
# Look up the table(s) with id 'WHO'; the result is indexed, so the first
# match is the one kept. `table` no longer refers to the worldometers table.
table = soup.find('table', {'id': 'WHO'})[0]

In [17]:
# read_html returns a list of DataFrames, one per <table> found; the markup
# here is a single table, so take the first. Recent pandas releases deprecate
# passing literal HTML text, so the markup is wrapped in a file-like buffer,
# which read_html accepts in old and new versions alike.
who = pd.read_html(StringIO(str(table)))[0]

In [18]:
# Give the two columns of interest short snake_case names ('Countries' and
# '2015' on the source table) and drop every other column.
who = (
    who
    .rename(columns={'Countries': 'country', '2015': 'health_expenditure'})
    .loc[:, ['country', 'health_expenditure']]
)

In [19]:
# Turn the list of (country, cases) tuples into a two-column DataFrame.
# NOTE: `totals` changes type here (list -> DataFrame) under the same name.
totals = pd.DataFrame(totals, columns=['country', 'cases'])

In [20]:
# Left join on the country name: every scraped row is kept, and a country
# whose name has no exact match in `who` gets NaN for health_expenditure.
df = pd.merge(totals, who, on='country', how='left')

In [23]:
# Display the merged frame.
df

Unnamed: 0,country,cases,health_expenditure
0,China,80928.0,426.0
1,Italy,35713.0,2700.0
2,Iran,18407.0,
3,Spain,17147.0,2354.0
4,Germany,13632.0,4592.0
...,...,...,...
172,Sint Maarten,1.0,
173,Somalia,1.0,
174,Suriname,1.0,577.0
175,Eswatini,1.0,


### Saving results

In [24]:
# Write the merged frame to CSV, leaving out the index column.
df.to_csv('../data/corona.csv', index=False)

In [25]:
# Preview the first rows of the frame that was just saved.
df.head()

Unnamed: 0,country,cases,health_expenditure
0,China,80928.0,426.0
1,Italy,35713.0,2700.0
2,Iran,18407.0,
3,Spain,17147.0,2354.0
4,Germany,13632.0,4592.0


In [26]:
# Stamp every row with the time this cell runs, stored as `date_fetched`.
# NOTE: the column is added after the CSV export above, so corona.csv does
# not contain it.
df['date_fetched'] = pd.Timestamp('today')

In [27]:
# Confirm the new date_fetched column is present.
df.head()

Unnamed: 0,country,cases,health_expenditure,date_fetched
0,China,80928.0,426.0,2020-03-19 10:15:17.678212
1,Italy,35713.0,2700.0,2020-03-19 10:15:17.678212
2,Iran,18407.0,,2020-03-19 10:15:17.678212
3,Spain,17147.0,2354.0,2020-03-19 10:15:17.678212
4,Germany,13632.0,4592.0,2020-03-19 10:15:17.678212


In [28]:
import sqlite3

# Open the SQLite database file; `con` is reused by the query cell that
# follows and is not closed here.
con = sqlite3.connect('../data/corona.db')

# if_exists='append' adds these rows to the `corona` table rather than
# replacing it, so each run of this cell inserts another copy of the frame.
df.to_sql('corona', con, index=False, if_exists='append')

In [29]:
# Read back the rows with more than 1000 cases, ordered from the highest
# health expenditure to the lowest.
pd.read_sql('''
    select 
    * 
    from corona
    where cases > 1000
    order by health_expenditure desc
''', con)

Unnamed: 0,country,cases,health_expenditure,date_fetched
0,Switzerland,3939.0,9818.0,2020-03-19 10:15:17.678212
1,Norway,1706.0,7464.0,2020-03-19 10:15:17.678212
2,Sweden,1423.0,5600.0,2020-03-19 10:15:17.678212
3,Denmark,1132.0,5497.0,2020-03-19 10:15:17.678212
4,Netherlands,2460.0,4746.0,2020-03-19 10:15:17.678212
5,Germany,13632.0,4592.0,2020-03-19 10:15:17.678212
6,Austria,1843.0,4536.0,2020-03-19 10:15:17.678212
7,Belgium,1795.0,4228.0,2020-03-19 10:15:17.678212
8,France,9134.0,4026.0,2020-03-19 10:15:17.678212
9,Italy,35713.0,2700.0,2020-03-19 10:15:17.678212
