METHOD 1: Using lxml and requests

In [1]:
import requests, six
import lxml.html as lh
from itertools import cycle, islice
import pandas as pd

In [2]:
# Page to scrape: the worldometers coronavirus overview page
url = 'https://www.worldometers.info/coronavirus/'

In [3]:
# Download the page. The timeout stops the cell from hanging forever, and
# raise_for_status() fails loudly on a 4xx/5xx reply instead of letting an
# error page be parsed as if it were the data table.
page = requests.get(url, timeout=30)
page.raise_for_status()

In [4]:
# Parse the raw response bytes into a single lxml HTML element tree
doc = lh.fromstring(page.content)

In [5]:
# Table rows are stored in <tr> ... </tr> elements.
# This XPath selects every <tr> in the document (it is not limited to one table).
tr_elements = doc.xpath('//tr')

In [6]:
# Sanity check: the first 12 rows should all contain the same number of cells
row_lengths = [len(row) for row in tr_elements[:12]]
row_lengths

[13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13]

In [7]:
# The first rows all have the same number of cells, which suggests that they
# come from the same table and that we are reading the right part of the page.

tr_elements = doc.xpath('//tr')

# One (column name, values) tuple per header cell. The value lists start out
# empty and are filled from the data rows in a later cell.
col = []
i = 0

for i, t in enumerate(tr_elements[0], start=1):
    name = t.text_content()
    print('%d : %s'%(i,name))
    col.append((name, []))

1 : Country,Other
2 : TotalCases
3 : NewCases
4 : TotalDeaths
5 : NewDeaths
6 : TotalRecovered
7 : ActiveCases
8 : Serious,Critical
9 : Tot Cases/1M pop
10 : Deaths/1M pop
11 : TotalTests
12 : Tests/
1M pop

13 : Continent


In [8]:
# Inspect what kind of object the XPath query returned for a row
type(tr_elements[3])

lxml.html.HtmlElement

In [9]:
# Fill the (initially empty) column lists with the cell values of every data
# row, so that `col` ends up holding the complete HTML table.

def _to_number(text):
    """Return `text` as a float when it is numeric, otherwise return it unchanged.

    Thousands separators are removed first, so '1,234' and '+1,234' both
    become 1234.0. Non-numeric text (including the empty string) comes back
    exactly as it was given.
    """
    try:
        return float(text.replace(',', ''))
    except ValueError:
        return text


# Start from empty lists so that re-running this cell does not append every
# row a second time.
for _, values in col:
    values.clear()

for T in tr_elements[1:]:
    # A row whose cell count differs from the header row does not belong to
    # the table we want, so stop collecting there.
    if len(T) != len(col):
        break
    # i is the index of the column the current cell belongs to
    for i, t in enumerate(T.iterchildren()):
        data = t.text_content()
        # Column 0 holds the row label (the country name) and stays text;
        # every other column is converted to a number where possible.
        if i > 0:
            data = _to_number(data)
        # col[i] is a (name, values) tuple, so index 1 is the list of values
        col[i][1].append(data)

In [10]:
# Every column should have collected the same number of values
[len(values) for _, values in col]

[457, 457, 457, 457, 457, 457, 457, 457, 457, 457, 457, 457, 457]

In [11]:
# All columns have the same number of elements, so the HTML table was
# imported consistently.

# `col` is a list of (name, values) tuples; dict() turns it into the
# name -> values mapping from which pandas builds the DataFrame.
columns_by_name = dict(col)
df = pd.DataFrame(columns_by_name)

In [12]:
# Preview the first 30 rows of the scraped table
df.head(30)

Unnamed: 0,"Country,Other",TotalCases,NewCases,TotalDeaths,NewDeaths,TotalRecovered,ActiveCases,"Serious,Critical",Tot Cases/1M pop,Deaths/1M pop,TotalTests,Tests/\n1M pop\n,Continent
0,\nNorth America\n,881249,1382.0,48613,177.0,100507.0,732129,15268.0,,,,,North America
1,\nEurope\n,1153207,13156.0,108750,844.0,354745.0,689712,26023.0,,,,,Europe
2,\nAsia\n,415155,4895.0,15665,156.0,195644.0,203846,6228.0,,,,,Asia
3,\nSouth America\n,91789,529.0,4344,32.0,39780.0,47665,9472.0,,,,,South America
4,\nOceania\n,8201,11.0,88,4.0,6016.0,2097,51.0,,,,,Australia/Oceania
5,\nAfrica\n,25777,366.0,1202,5.0,6806.0,17769,167.0,,,,,Africa
6,\n\n,721,,15,,644.0,62,7.0,,,,,
7,World,2576099,20339.0,178677,1218.0,704142.0,1693280,57216.0,330.0,22.9,,,All
8,USA,819175,431.0,45343,25.0,82973.0,690859,14016.0,2475.0,137.0,4190002.0,12659.0,North America
9,Spain,208389,4211.0,21717,435.0,85915.0,100757,7705.0,4457.0,464.0,930230.0,19896.0,Europe


In [13]:
# Preview the last 20 rows of the scraped table
df.tail(20)

Unnamed: 0,"Country,Other",TotalCases,NewCases,TotalDeaths,NewDeaths,TotalRecovered,ActiveCases,"Serious,Critical",Tot Cases/1M pop,Deaths/1M pop,TotalTests,Tests/\n1M pop\n,Continent
437,Papua New Guinea,7,,,,,7,,0.8,,604.0,68.0,Australia/Oceania
438,Bhutan,6,1.0,,,2.0,4,,8.0,,8765.0,11359.0,Asia
439,St. Barth,6,,,,6.0,0,,607.0,,,,North America
440,Western Sahara,6,,,,,6,,10.0,,,,Africa
441,British Virgin Islands,5,,1.0,,3.0,1,,165.0,33.0,,,North America
442,Burundi,5,,1.0,,4.0,0,,0.4,0.08,80.0,7.0,Africa
443,Caribbean Netherlands,5,,,,,5,,191.0,,110.0,4195.0,North America
444,Sao Tome and Principe,4,,,,,4,,18.0,,19.0,87.0,Africa
445,South Sudan,4,,,,,4,,0.4,,,,Africa
446,Anguilla,3,,,,1.0,2,,200.0,,,,North America


In [14]:
# Drop the first 7 rows (the continent-level aggregate rows and one unlabeled
# row seen in the head() preview) so the data starts at "World". The stop
# value is larger than the number of rows collected, so the slice runs to
# the end of the frame.
df = df.iloc[7:906]

In [15]:
# Display the sliced frame (the index still starts at 7 at this point)
df

Unnamed: 0,"Country,Other",TotalCases,NewCases,TotalDeaths,NewDeaths,TotalRecovered,ActiveCases,"Serious,Critical",Tot Cases/1M pop,Deaths/1M pop,TotalTests,Tests/\n1M pop\n,Continent
7,World,2576099,+20339,178677,+1218,704142,1693280,57216,330,22.9,,,All
8,USA,819175,431,45343,25,82973,690859,14016,2475,137,4190002,12659,North America
9,Spain,208389,+4211,21717,435,85915,100757,7705,4457,464,930230,19896,Europe
10,Italy,183957,,24648,,51600,107709,2471,3043,408,1450150,23985,Europe
11,France,158050,,20796,,39181,98073,5433,2421,319,463662,7103,Europe
...,...,...,...,...,...,...,...,...,...,...,...,...,...
452,Total:,91260,+4774,4312,233,39757,47191,9472,,,,,South America
453,Total:,8190,26,84,1,5751,2355,54,,,,,Australia/Oceania
454,Total:,25411,+1240,1197,33,6770,17444,164,,,,,Africa
455,Total:,721,,15,,644,62,7,,,,,


In [16]:
# Renumber the rows from 0; the old index is kept as a new 'index' column
df = df.reset_index()

In [17]:
# List the column labels, including the 'index' column added by reset_index
df.columns

Index(['index', 'Country,Other', 'TotalCases', 'NewCases', 'TotalDeaths',
       'NewDeaths', 'TotalRecovered', 'ActiveCases', 'Serious,Critical',
       'Tot Cases/1M pop', 'Deaths/1M pop', 'TotalTests', 'Tests/\n1M pop\n',
       'Continent'],
      dtype='object')

In [18]:
# Remove the leftover 'index' column that holds the old row numbers
df = df.drop(columns='index')

In [19]:
# Display the cleaned frame, now indexed from 0
df

Unnamed: 0,"Country,Other",TotalCases,NewCases,TotalDeaths,NewDeaths,TotalRecovered,ActiveCases,"Serious,Critical",Tot Cases/1M pop,Deaths/1M pop,TotalTests,Tests/\n1M pop\n,Continent
0,World,2576099,+20339,178677,+1218,704142,1693280,57216,330,22.9,,,All
1,USA,819175,431,45343,25,82973,690859,14016,2475,137,4190002,12659,North America
2,Spain,208389,+4211,21717,435,85915,100757,7705,4457,464,930230,19896,Europe
3,Italy,183957,,24648,,51600,107709,2471,3043,408,1450150,23985,Europe
4,France,158050,,20796,,39181,98073,5433,2421,319,463662,7103,Europe
...,...,...,...,...,...,...,...,...,...,...,...,...,...
445,Total:,91260,+4774,4312,233,39757,47191,9472,,,,,South America
446,Total:,8190,26,84,1,5751,2355,54,,,,,Australia/Oceania
447,Total:,25411,+1240,1197,33,6770,17444,164,,,,,Africa
448,Total:,721,,15,,644,62,7,,,,,


-----

METHOD 2: Directly importing HTML tables as DataFrames using pandas

In [20]:
from io import StringIO

import pandas as pd
import requests

url = 'https://www.worldometers.info/coronavirus/'

# Browser-like request headers; presumably needed so that the site does not
# reject the request as coming from a bot.
header = {
  "User-Agent": "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/50.0.2661.75 Safari/537.36",
  "X-Requested-With": "XMLHttpRequest"
}

# The timeout stops the cell from hanging forever; raise_for_status() fails
# loudly on a 4xx/5xx reply instead of parsing an error page.
r = requests.get(url, headers=header, timeout=30)
r.raise_for_status()

# read_html returns a LIST of DataFrames, not a single DataFrame.
# The HTML text is wrapped in StringIO because passing a literal HTML string
# to read_html is deprecated in recent pandas versions.
df = pd.read_html(StringIO(r.text))

In [21]:
# `df` is the list returned by pd.read_html, so this shows every parsed table
df

[             Country,Other  TotalCases NewCases  TotalDeaths NewDeaths  \
 0                    World     2576099  +20,339     178677.0    +1,218   
 1                      USA      819175     +431      45343.0       +25   
 2                    Spain      208389   +4,211      21717.0      +435   
 3                    Italy      183957      NaN      24648.0       NaN   
 4                   France      158050      NaN      20796.0       NaN   
 ..                     ...         ...      ...          ...       ...   
 209               Anguilla           3      NaN          NaN       NaN   
 210  Saint Pierre Miquelon           1      NaN          NaN       NaN   
 211                  Yemen           1      NaN          NaN       NaN   
 212                  China       82788      +30       4632.0       NaN   
 213                 Total:     2576099  +20,339     178677.0    +1,218   
 
      TotalRecovered  ActiveCases  Serious,Critical  Tot Cases/1M pop  \
 0          704142.0     

In [22]:
# The first parsed table has 214 rows x 12 columns

In [23]:
# Take the first DataFrame out of the list returned by pd.read_html
dfs = df[0]

In [24]:
# Preview the first rows of the selected table
dfs.head()

Unnamed: 0,"Country,Other",TotalCases,NewCases,TotalDeaths,NewDeaths,TotalRecovered,ActiveCases,"Serious,Critical",Tot Cases/1M pop,Deaths/1M pop,TotalTests,Tests/ 1M pop
0,World,2576099,20339.0,178677.0,1218.0,704142.0,1693280,57216.0,330.0,22.9,,
1,USA,819175,431.0,45343.0,25.0,82973.0,690859,14016.0,2475.0,137.0,4190002.0,12659.0
2,Spain,208389,4211.0,21717.0,435.0,85915.0,100757,7705.0,4457.0,464.0,930230.0,19896.0
3,Italy,183957,,24648.0,,51600.0,107709,2471.0,3043.0,408.0,1450150.0,23985.0
4,France,158050,,20796.0,,39181.0,98073,5433.0,2421.0,319.0,463662.0,7103.0


------

METHOD 3: Webscraping HTML using BeautifulSoup Library

In [25]:
from bs4 import BeautifulSoup
import csv
from urllib.request import Request, urlopen

In [26]:
# Page to scrape with BeautifulSoup
url_page = 'https://www.worldometers.info/coronavirus/'

In [27]:
# First query the page WITHOUT disguising the request, to show why the next
# cell sets a User-Agent header: the site is expected to answer a plain
# request with HTTP 403 because it treats it as coming from a bot.
#
# Only `Request` and `urlopen` were imported from urllib.request, so the call
# must be `urlopen(...)`. Spelling it `urllib.request.urlopen(...)` raises
# NameError, which a bare `except:` would hide.
from urllib.error import URLError

try:
    page = urlopen(url_page)
except URLError as err:
    # HTTPError (e.g. 403 Forbidden) is a subclass of URLError; print the
    # actual reason instead of swallowing it.
    print("An error occured.", err)

An error occured.


In [28]:
# Send a browser-like User-Agent so that the request is not rejected
req = Request(url_page, headers = {'User-Agent':'Mozilla/5.0'})
# Read the body inside a `with` block so the HTTP response is always closed
with urlopen(req) as response:
    page = response.read()
# Parse the downloaded bytes with the lxml parser
soup = BeautifulSoup(page, 'lxml')

In [29]:
# Print the parsed document.
# NOTE(review): this prints the whole page; consider showing only an excerpt.
print(soup)

<!DOCTYPE html>
<!--[if IE 8]> <html lang="en" class="ie8"> <![endif]--><!--[if IE 9]> <html lang="en" class="ie9"> <![endif]--><!--[if !IE]><!--><html lang="en">
<!--<![endif]-->
<head>
<meta charset="utf-8"/>
<meta content="IE=edge" http-equiv="X-UA-Compatible"/>
<meta content="width=device-width, initial-scale=1" name="viewport"/>
<title>Coronavirus Update (Live): 2,576,099 Cases and 178,677 Deaths from COVID-19 Virus Pandemic - Worldometer</title>
<meta content="Live statistics and coronavirus news tracking the number of confirmed cases, recovered patients, tests, and death toll due to the COVID-19 coronavirus from Wuhan, China. Coronavirus counter with new cases, deaths, and number of tests per 1 Million population. Historical data and info. Daily charts, graphs, news and updates" name="description"/>
<link href="/favicon/favicon.ico" rel="shortcut icon" type="image/x-icon"/>
<link href="/favicon/apple-icon-57x57.png" rel="apple-touch-icon" sizes="57x57"/>
<link href="/favicon/app

In [30]:
# find() returns the first <table> element in the document
table = soup.find('table')

In [31]:
# Collect the text of every <th> cell of the table as the column headers
table_th = table.find_all('th')
headers = [th.text for th in table_th]
print(headers)
print(len(headers))

['Country,Other', 'TotalCases', 'NewCases', 'TotalDeaths', 'NewDeaths', 'TotalRecovered', 'ActiveCases', 'Serious,Critical', 'Tot\xa0Cases/1M pop', 'Deaths/1M pop', 'TotalTests', 'Tests/\n1M pop\n', 'Continent']
13


In [32]:
# Every <tr> inside the selected table, header row included
table_rows = table.find_all('tr')

In [33]:
# Check that the first 10 extracted rows all report the same length.
# The result is named `row_lengths` rather than `list`, so that the builtin
# list() stays usable in later cells.
# NOTE(review): the lengths are 27 while there are 13 headers; presumably
# len() on a bs4 tag counts every child node, including whitespace strings.
row_lengths = [len(row) for row in table_rows[:10]]
row_lengths

[27, 27, 27, 27, 27, 27, 27, 27, 27, 27]

In [34]:
# Turn every table row into a list of its <td> cell texts, printing each row
# and its length along the way
rows = []
for tr in table_rows:
    cells = [td.text for td in tr.find_all('td')]
    rows.append(cells)
    print(cells)
    print(len(cells))

[]
0
['\nNorth America\n', '881,249', '+1,382', '48,613', '+177', '100,507', '732,129', '15,268', '', '', '', '', 'North America']
13
['\nEurope\n', '1,153,207', '+13,156', '108,750', '+844', '354,745', '689,712', '26,023', '', '', '', '', 'Europe']
13
['\nAsia\n', '415,193', '+4,933', '15,665', '+156', '195,686', '203,842', '6,228', '', '', '', '', 'Asia']
13
['\nSouth America\n', '91,789', '+529', '4,344', '+32', '39,780', '47,665', '9,472', '', '', '', '', 'South America']
13
['\nOceania\n', '8,201', '+11', '88', '+4', '6,016', '2,097', '51', '', '', '', '', 'Australia/Oceania']
13
['\nAfrica\n', '25,777', '+366', '1,202', '+5', '6,806', '17,769', '167', '', '', '', '', 'Africa']
13
['\n\n', '721', '', '15', '', '644', '62', '7', '', '', '', '', '']
13
['World', '2,576,137', '+20,377', '178,677', '+1,218', '704,184', '1,693,276', '57,216', '330', '22.9', '', '', 'All']
13
['USA', '819,175', '+431', '45,343 ', '+25', '82,973', '690,859', '14,016', '2,475', '137', '4,190,002', '12,659

In [35]:
import pandas as pd
# Build a DataFrame from the list of row lists; columns get default integer labels
df = pd.DataFrame(rows)

In [36]:
# Preview the first 20 rows of the unlabeled frame
df.head(20)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12
0,,,,,,,,,,,,,
1,\nNorth America\n,881249.0,1382.0,48613.0,177.0,100507.0,732129.0,15268.0,,,,,North America
2,\nEurope\n,1153207.0,13156.0,108750.0,844.0,354745.0,689712.0,26023.0,,,,,Europe
3,\nAsia\n,415193.0,4933.0,15665.0,156.0,195686.0,203842.0,6228.0,,,,,Asia
4,\nSouth America\n,91789.0,529.0,4344.0,32.0,39780.0,47665.0,9472.0,,,,,South America
5,\nOceania\n,8201.0,11.0,88.0,4.0,6016.0,2097.0,51.0,,,,,Australia/Oceania
6,\nAfrica\n,25777.0,366.0,1202.0,5.0,6806.0,17769.0,167.0,,,,,Africa
7,\n\n,721.0,,15.0,,644.0,62.0,7.0,,,,,
8,World,2576137.0,20377.0,178677.0,1218.0,704184.0,1693276.0,57216.0,330.0,22.9,,,All
9,USA,819175.0,431.0,45343.0,25.0,82973.0,690859.0,14016.0,2475.0,137.0,4190002.0,12659.0,North America


In [37]:
# Skip the first 8 rows (the empty header row and the continent-level rows)
# so that the data starts at "World"
df = df.iloc[8:]
df.head(10)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12
8,World,2576137,20377.0,178677,1218.0,704184.0,1693276,57216,330,22.9,,,All
9,USA,819175,431.0,45343,25.0,82973.0,690859,14016,2475,137.0,4190002.0,12659.0,North America
10,Spain,208389,4211.0,21717,435.0,85915.0,100757,7705,4457,464.0,930230.0,19896.0,Europe
11,Italy,183957,,24648,,51600.0,107709,2471,3043,408.0,1450150.0,23985.0,Europe
12,France,158050,,20796,,39181.0,98073,5433,2421,319.0,463662.0,7103.0,Europe
13,Germany,148704,251.0,5100,14.0,99400.0,44204,2908,1775,61.0,1728357.0,20629.0,Europe
14,UK,129044,,17337,,,111363,1559,1901,255.0,535342.0,7886.0,Europe
15,Turkey,95591,,2259,,14918.0,78414,1865,1133,27.0,713409.0,8459.0,Asia
16,Iran,85996,1194.0,5391,94.0,63113.0,17492,3311,1024,64.0,377396.0,4493.0,Asia
17,Russia,57999,5236.0,513,57.0,4420.0,53066,700,397,4.0,2250000.0,15418.0,Europe


In [38]:
# Renumber the rows from 0; the old index is kept as a new 'index' column
df = df.reset_index()

In [39]:
# Remove the leftover 'index' column that holds the old row numbers
df = df.drop(columns='index')

In [40]:
# Preview the frame after re-indexing
df.head(10)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12
0,World,2576137,20377.0,178677,1218.0,704184.0,1693276,57216,330,22.9,,,All
1,USA,819175,431.0,45343,25.0,82973.0,690859,14016,2475,137.0,4190002.0,12659.0,North America
2,Spain,208389,4211.0,21717,435.0,85915.0,100757,7705,4457,464.0,930230.0,19896.0,Europe
3,Italy,183957,,24648,,51600.0,107709,2471,3043,408.0,1450150.0,23985.0,Europe
4,France,158050,,20796,,39181.0,98073,5433,2421,319.0,463662.0,7103.0,Europe
5,Germany,148704,251.0,5100,14.0,99400.0,44204,2908,1775,61.0,1728357.0,20629.0,Europe
6,UK,129044,,17337,,,111363,1559,1901,255.0,535342.0,7886.0,Europe
7,Turkey,95591,,2259,,14918.0,78414,1865,1133,27.0,713409.0,8459.0,Asia
8,Iran,85996,1194.0,5391,94.0,63113.0,17492,3311,1024,64.0,377396.0,4493.0,Asia
9,Russia,57999,5236.0,513,57.0,4420.0,53066,700,397,4.0,2250000.0,15418.0,Europe


In [41]:
# Label the columns with the header texts scraped from the table's <th> cells
df.columns = headers

In [42]:
# Preview the labeled frame
df.head()

Unnamed: 0,"Country,Other",TotalCases,NewCases,TotalDeaths,NewDeaths,TotalRecovered,ActiveCases,"Serious,Critical",Tot Cases/1M pop,Deaths/1M pop,TotalTests,Tests/\n1M pop\n,Continent
0,World,2576137,20377.0,178677,1218.0,704184,1693276,57216,330,22.9,,,All
1,USA,819175,431.0,45343,25.0,82973,690859,14016,2475,137.0,4190002.0,12659.0,North America
2,Spain,208389,4211.0,21717,435.0,85915,100757,7705,4457,464.0,930230.0,19896.0,Europe
3,Italy,183957,,24648,,51600,107709,2471,3043,408.0,1450150.0,23985.0,Europe
4,France,158050,,20796,,39181,98073,5433,2421,319.0,463662.0,7103.0,Europe


-----

METHOD 4: Webscraping HTML using Scrapy

-----

METHOD 5: Webscraping HTML using selenium