### Importing libraries

In [1]:
import pandas as pd
import numpy as np

import requests
from bs4 import BeautifulSoup

import warnings
warnings.filterwarnings("ignore")

### web-scraping

In [2]:
# Fetch the page. The timeout keeps the cell from hanging indefinitely on a
# stalled connection, and raise_for_status() stops the notebook here on an
# HTTP error instead of parsing an error page in the cells below.
src = requests.get("https://www.worldometers.info/coronavirus/", timeout=30)
src.raise_for_status()

In [3]:
# displaying the response object to check the HTTP status of the request
# (the recorded output shows 200, i.e. a successful hit of the website)
src

<Response [200]>

In [4]:
# parsing the downloaded HTML text into a BeautifulSoup tree with the lxml parser
page_html = src.text
soup = BeautifulSoup(page_html, 'lxml')

In [5]:
# finding the table
table = soup.find('table', id='main_table_countries_today')

# find() returns None when nothing matches; fail here with a clear message
# rather than with an AttributeError on None in a later cell.
if table is None:
    raise ValueError("table 'main_table_countries_today' not found in the page")

In [6]:
# collecting the text of every <th> cell in the table as the column headers
headers = []
for header_cell in table.find_all('th'):
    headers.append(header_cell.text)

In [7]:
# selecting only the countries list
# NOTE(review): the offsets presumably skip header/summary rows at the start
# and end of the table -- confirm against the current page layout.
LEADING_ROWS_SKIPPED = 9
TRAILING_ROWS_SKIPPED = 8
all_rows = table.find_all('tr')
countries_list = all_rows[LEADING_ROWS_SKIPPED:-TRAILING_ROWS_SKIPPED]

In [8]:
# extracting the rows and storing them in a dataframe
# Collect the cell texts of every row first and build the frame once;
# appending with .loc[len(df)] re-grows the frame on every iteration.
rows = [[cell.text for cell in country.find_all('td')] for country in countries_list]
countries_df = pd.DataFrame(rows, columns=headers)

In [9]:
# keeping only the columns used in the rest of the analysis
required_columns = ['Country,Other', 'TotalCases', 'TotalDeaths', 'TotalTests', 'Population']
countries_df = countries_df[required_columns]

In [10]:
# renaming the scraped column headers to short snake_case names
column_aliases = {
    'Country,Other': 'country',
    'TotalCases': 'cases',
    'TotalDeaths': 'deaths',
    'TotalTests': 'tests',
    'Population': 'population',
}
countries_df = countries_df.rename(columns=column_aliases)

<p>Some columns contain blank values: deaths and population, and the cell below also filters blanks in tests. Rows with a blank in any of these columns are removed.</p>

In [11]:
# removing the rows with blank values
# The earlier checks compared two columns against " " and one against "",
# so a blank in the other form would slip through and make int() fail in the
# conversion step. Stripping whitespace first catches both forms, and the
# same test is applied to every filtered column.
for col in ["deaths", "population", "tests"]:
    countries_df = countries_df[countries_df[col].str.strip() != ""]

In [12]:
# converting every column after the first from comma-grouped text to int
def _parse_count(text):
    """Return the integer value of a string such as '1,234,567'."""
    return int(text.replace(",", ""))


numeric_columns = countries_df.columns.tolist()[1:]
for name in numeric_columns:
    countries_df[name] = countries_df[name].apply(_parse_count)

In [13]:
# renumbering the rows from 0 after the filtering; the old index is discarded
countries_df = countries_df.reset_index(drop=True)

In [14]:
# previewing the first three rows
countries_df.head(3)

Unnamed: 0,country,cases,deaths,tests,population
0,USA,104996288,1142704,1163294129,334805269
1,India,44685132,530761,917870730,1406631776
2,France,39590190,164759,271490188,65584518


In [15]:
# calculating tests_per_case as total tests divided by total cases
tests_total = countries_df["tests"]
cases_total = countries_df["cases"]
countries_df["tests_per_case"] = tests_total.div(cases_total)

In [16]:
# rounding tests_per_case to four decimal places
# Series.round is vectorized, so no per-element Python call is needed.
countries_df["tests_per_case"] = countries_df["tests_per_case"].round(4)

In [17]:
# using the country name as the row index
countries_df = countries_df.set_index('country')

In [18]:
# displaying the 20 rows with the highest tests_per_case
ranked_by_tests_per_case = countries_df.sort_values(by=["tests_per_case"], ascending=False)
ranked_by_tests_per_case.head(20)

Unnamed: 0_level_0,cases,deaths,tests,population,tests_per_case
country,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
China,503302,5272,160000000,1448471400,317.9006
UAE,1051241,2348,199077922,10081785,189.3742
Turks and Caicos,6551,38,611527,39741,93.3486
Oman,399449,4628,25000000,5323993,62.5862
Bermuda,18791,159,1026742,61939,54.6401
Saudi Arabia,829041,9606,45121063,35844909,54.4256
Rwanda,133170,1468,5959042,13600464,44.7476
Denmark,3174758,8237,129185732,5834950,40.6915
Bhutan,62611,21,2303734,787941,36.7944
Austria,5871234,21825,211273524,9066710,35.9845


<p>Tests per case is a feature that indicates how many tests are needed to identify one positive case. For example, if 100 tests are done to identify 2 cases, then the tests per case would be 50. The general assumption is that a low value means the infection is widespread across the country and a high value means the spread is limited. However, this is not a very good indicator of the spread of the infection, because countries like China, despite facing the worst COVID outbreaks in recent months, do not fit this interpretation.</p>