# Library

In [1]:
import datetime
import os

import pandas as pd
import requests as req
from bs4 import BeautifulSoup

# Get page content

In [2]:
URL = 'https://www.worldometers.info/coronavirus/'

# requests applies no timeout by default, so set one to avoid hanging the
# kernel, and fail loudly on a 4xx/5xx answer instead of parsing an error page.
response = req.get(URL, timeout=30)
response.raise_for_status()
html = response.content
soup = BeautifulSoup(html, 'html.parser')

# Extract the data from two days ago (the "yesterday2" table)

In [3]:
# Pick the snapshot table by its element id, then collect its header cells
# and the <tr> elements selected by the `style == ''` filter
# (presumably the rows that are not hidden -- confirm against the page markup).
TABLE_ID = "main_table_countries_yesterday2"
table = soup.find('table', attrs={'id': TABLE_ID})
table_header = table.find_all('th')
table_rows = table.find_all('tr', style='')

# Checking header

In [4]:
# Raw caption of every <th> cell, printed so the column layout can be
# inspected before any cleaning is applied.
header = [th.get_text() for th in table_header]
print(header)

['#', 'Country,Other', 'TotalCases', 'NewCases', 'TotalDeaths', 'NewDeaths', 'TotalRecovered', 'NewRecovered', 'ActiveCases', 'Serious,Critical', 'Tot\xa0Cases/1M pop', 'Deaths/1M pop', 'TotalTests', 'Tests/\n1M pop\n', 'Population', 'Continent', '1 Caseevery X ppl', '1 Deathevery X ppl', '1 Testevery X ppl', 'New Cases/1M pop', 'New Deaths/1M pop', 'Active Cases/1M pop']


# Clean header

In [5]:
# Keep the 15 captions at positions 1..15 (drops the leading '#' column and
# everything after 'Continent') and strip the non-breaking spaces / newlines.
# Built from `table_header` rather than from `header` itself, so re-running
# this cell does not slice the list a second time.
header = [
    th.text.replace("\xa0", " ").replace("\n", "")
    for th in table_header[1:16]
]
print(header)

['Country,Other', 'TotalCases', 'NewCases', 'TotalDeaths', 'NewDeaths', 'TotalRecovered', 'NewRecovered', 'ActiveCases', 'Serious,Critical', 'Tot Cases/1M pop', 'Deaths/1M pop', 'TotalTests', 'Tests/1M pop', 'Population', 'Continent']


# Extract data

In [6]:
# How many parsed rows to echo as a sanity check (printing every row floods
# the notebook output).
PREVIEW_ROWS = 5

# table_rows[2:-2] skips the first two and the last two matched <tr> elements
# (presumably header / aggregate rows -- confirm against the page markup).
# The row text is split on newlines and entries 2..16 are kept, giving the
# 15 values that line up with the cleaned header.
rows = [row_i.text.split('\n')[2:17] for row_i in table_rows[2:-2]]

for words in rows[:PREVIEW_ROWS]:
    print(words)

['China', '111,520', '+325', '4,636 ', '', '102,832', '+110', '4,052', '9', '77', '3', '160,000,000', '111,163', '1,439,323,776 ', 'Asia']
['USA', '81,024,903', '+33,615', '988,208 ', '+1,299', '55,221,462', '+202,659', '24,815,233', '5,421', '242,395', '2,956', '960,233,929', '2,872,645', '334,268,263 ', 'North America']
['India', '42,975,883', '+4,575', '515,386 ', '+145', '42,413,566', '+7,416', '46,931', '8,944', '30,635', '367', '774,310,567', '551,964', '1,402,828,373 ', 'Asia']
['Brazil', '29,144,964', '+75,495', '652,936 ', '+518', '27,344,949', '+165,757', '1,147,079', '8,318', '135,494', '3,035', '63,776,166', '296,493', '215,101,451 ', 'South America']
['France', '23,164,872', '+93,050', '139,618 ', '+167', '21,836,839', '+98,559', '1,188,415', '2,484', '353,573', '2,131', '246,629,975', '3,764,393', '65,516,532 ', 'Europe']
['UK', '19,306,725', '+61,803', '162,359  ', '+212', '18,086,708', '+63,719', '1,057,658', '268', '281,908', '2,371', '487,512,832', '7,118,431', '68,48

# Cast to DataFrame

In [7]:
# Build the table from the scraped rows, labelled with the cleaned header.
# pandas is imported together with the other libraries in the first cell.
df = pd.DataFrame(rows, columns=header)
df.head()

Unnamed: 0,"Country,Other",TotalCases,NewCases,TotalDeaths,NewDeaths,TotalRecovered,NewRecovered,ActiveCases,"Serious,Critical",Tot Cases/1M pop,Deaths/1M pop,TotalTests,Tests/1M pop,Population,Continent
0,China,111520,325,4636,,102832,110,4052,9,77,3,160000000,111163,1439323776,Asia
1,USA,81024903,33615,988208,1299.0,55221462,202659,24815233,5421,242395,2956,960233929,2872645,334268263,North America
2,India,42975883,4575,515386,145.0,42413566,7416,46931,8944,30635,367,774310567,551964,1402828373,Asia
3,Brazil,29144964,75495,652936,518.0,27344949,165757,1147079,8318,135494,3035,63776166,296493,215101451,South America
4,France,23164872,93050,139618,167.0,21836839,98559,1188415,2484,353573,2131,246629975,3764393,65516532,Europe


In [8]:
# Summarise the column dtypes and non-null counts of the scraped frame.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 218 entries, 0 to 217
Data columns (total 15 columns):
 #   Column            Non-Null Count  Dtype 
---  ------            --------------  ----- 
 0   Country,Other     218 non-null    object
 1   TotalCases        218 non-null    object
 2   NewCases          218 non-null    object
 3   TotalDeaths       218 non-null    object
 4   NewDeaths         218 non-null    object
 5   TotalRecovered    218 non-null    object
 6   NewRecovered      218 non-null    object
 7   ActiveCases       218 non-null    object
 8   Serious,Critical  218 non-null    object
 9   Tot Cases/1M pop  218 non-null    object
 10  Deaths/1M pop     218 non-null    object
 11  TotalTests        218 non-null    object
 12  Tests/1M pop      218 non-null    object
 13  Population        218 non-null    object
 14  Continent         218 non-null    object
dtypes: object(15)
memory usage: 25.7+ KB


# Crawl the last three days and write each one to CSV

In [11]:
def crawl(delta=0):
    """Scrape one worldometers country table and save it as an unclean CSV.

    Parameters
    ----------
    delta : int, default 0
        How many days back to go: 0 reads the "today" table, 1 the
        "yesterday" table and 2 the "yesterday2" table.

    Returns
    -------
    None
        The result is written to ``data/<YYYY-MM-DD>-unclean.csv``, where the
        date is the local current date minus ``delta`` days.

    Raises
    ------
    AssertionError
        If ``delta`` is outside the range 0..2.
    requests.RequestException
        If the download fails, times out or returns an error status.
    RuntimeError
        If the expected table is not present in the downloaded page.
    """
    assert 0 <= delta <= 2, "delta must be between 0 and 2"

    if delta == 0:
        id_table = "main_table_countries_today"
    elif delta == 1:
        id_table = "main_table_countries_yesterday"
    else:
        id_table = "main_table_countries_yesterday2"

    URL = 'https://www.worldometers.info/coronavirus/'
    # requests has no default timeout; also refuse to parse an error page.
    response = req.get(URL, timeout=30)
    response.raise_for_status()
    soup = BeautifulSoup(response.content, 'html.parser')

    table = soup.find('table', id=id_table)
    if table is None:
        # Without this check the failure is an opaque AttributeError on None.
        raise RuntimeError("table {!r} not found at {}".format(id_table, URL))
    table_rows = table.find_all('tr', attrs={'style': ''})
    table_header = table.find_all('th')

    # Captions at positions 1..15, with non-breaking spaces and newlines removed.
    header = [
        th.text.replace("\xa0", " ").replace("\n", "")
        for th in table_header[1:16]
    ]

    # Skip the first/last two matched rows and keep text entries 2..16,
    # i.e. the 15 values matching `header`.
    rows = [row_i.text.split('\n')[2:17] for row_i in table_rows[2:-2]]
    df = pd.DataFrame(rows, columns=header)

    target_day = datetime.datetime.now() - datetime.timedelta(days=delta)
    out_path = "data/{}-unclean.csv".format(target_day.date().isoformat())
    # to_csv does not create missing directories.
    os.makedirs(os.path.dirname(out_path), exist_ok=True)
    df.to_csv(out_path, index=False)

# Save all three snapshots: today (0), yesterday (1) and two days ago (2).
for day_offset in range(3):
    print("Crawling day", day_offset)
    crawl(day_offset)

Crawling day 0
Crawling day 1
Crawling day 2
