## In this module, we will collect Covid-19 data for India and the world using web scraping

In [1]:
import pandas as pd
import numpy as np
from bs4 import BeautifulSoup
import requests

###### importing useful libraries:
#requests:  it requests data from the website
#BeautifulSoup: it extracts data from HTML

### Fetching covid-19 data for India

In [2]:
url="https://www.mohfw.gov.in/"

In [3]:
# Fetch the page at `url`. The timeout keeps the cell from hanging indefinitely when
# the server never answers, and raise_for_status() stops here on a 4xx/5xx status
# instead of letting an error page be parsed as if it were the data page.
response=requests.get(url,timeout=30)
response.raise_for_status()
htmlcontent=response.content

##### when request is made to url, it returns a response, this response object is used to access certain features


In [4]:
print(response)

<Response [200]>


##### Response [200] indicates that the request has succeeded

In [5]:
soup=BeautifulSoup(response.content,"html.parser")

#### getting title 

In [6]:
soup.title

<title>MoHFW | Home</title>

##### getting only text from title

In [7]:
soup.title.text

'MoHFW | Home'

###### finding table from table tag

In [8]:
get_table=soup.find_all("table")

In [9]:
#get_table

#### Finding length of get_table as there can be more than one table present

In [10]:
len(get_table)

1

In [11]:
#get_table

#### if the length of get_table is more than one, it means more than one table is present, so we use indexing to fetch the required table

In [12]:
co=get_table[0]

#### taking data from this table
#### taking empty lists of the column, finding rows using tr tag, iterating the rows and appending the values

In [13]:
# Keep every <tr> except the first one and the last five
# (presumably header and summary rows -- verify against the live page).
rows=co.find_all("tr")[1:-5]
# The <td> cells of each kept row; positions 1-4 are read below.
row_cells=[row.find_all("td") for row in rows]
Name_of_state=[cells[1].text.strip() for cells in row_cells]
Total_confirmed_cases=[cells[2].text.strip() for cells in row_cells]
Cured_or_discharged_or_migrated=[cells[3].text.strip() for cells in row_cells]
Death=[cells[4].text.strip() for cells in row_cells]
    

In [14]:
# Show each scraped column list on its own line, in table-column order.
for scraped_values in (Name_of_state,Total_confirmed_cases,Cured_or_discharged_or_migrated,Death):
    print(scraped_values)

['Andaman and Nicobar Islands', 'Andhra Pradesh', 'Arunachal Pradesh', 'Assam', 'Bihar', 'Chandigarh', 'Chhattisgarh', 'Delhi', 'Goa', 'Gujarat', 'Haryana', 'Himachal Pradesh', 'Jammu and Kashmir', 'Jharkhand', 'Karnataka', 'Kerala', 'Ladakh', 'Madhya Pradesh', 'Maharashtra', 'Manipur', 'Meghalaya', 'Mizoram', 'Odisha', 'Puducherry', 'Punjab', 'Rajasthan', 'Tamil Nadu', 'Telengana', 'Tripura', 'Uttarakhand', 'Uttar Pradesh']
['33', '1583', '1', '43', '482', '94', '43', '4122', '7', '5055', '394', '40', '666', '115', '606', '500', '40', '2846', '12296', '2', '12', '1', '160', '8', '772', '2772', '2757', '1063', '4', '59', '2626']
['17', '488', '1', '32', '117', '19', '36', '1256', '7', '896', '227', '33', '254', '22', '282', '400', '17', '798', '2000', '2', '0', '0', '56', '5', '112', '1121', '1341', '458', '2', '39', '698']
['0', '33', '0', '1', '4', '0', '0', '64', '0', '262', '4', '1', '8', '3', '25', '4', '0', '156', '521', '0', '1', '0', '1', '0', '20', '65', '29', '28', '0', '0', 

#### We want this data in ordered way so creating a dataframe

In [15]:
# Stitch the four parallel lists together row by row and label the columns.
india_columns=["State","Total_confirmed_cases","Cured_or_discharged_or_migrated","Death"]
india_records=list(zip(Name_of_state,Total_confirmed_cases,Cured_or_discharged_or_migrated,Death))
df=pd.DataFrame(india_records,columns=india_columns)

In [16]:
df

Unnamed: 0,State,Total_confirmed_cases,Cured_or_discharged_or_migrated,Death
0,Andaman and Nicobar Islands,33,17,0
1,Andhra Pradesh,1583,488,33
2,Arunachal Pradesh,1,1,0
3,Assam,43,32,1
4,Bihar,482,117,4
5,Chandigarh,94,19,0
6,Chhattisgarh,43,36,0
7,Delhi,4122,1256,64
8,Goa,7,7,0
9,Gujarat,5055,896,262


#### there are no such missing values in this data

#### checking datatype

In [17]:
df.dtypes

State                              object
Total_confirmed_cases              object
Cured_or_discharged_or_migrated    object
Death                              object
dtype: object

#### the datatype is object for the numerical values too, so we convert the columns that hold numerical values to int

In [18]:
# Cast each scraped count column from text to int, one column at a time.
for count_column in ("Total_confirmed_cases","Cured_or_discharged_or_migrated","Death"):
    df[count_column]=df[count_column].astype(int)

In [19]:
df.dtypes

State                              object
Total_confirmed_cases               int32
Cured_or_discharged_or_migrated     int32
Death                               int32
dtype: object

In this way we get covid-19 data for India






# World covid-19 data

#### in similar way, we will fetch world's covid-19 data

In [20]:
url2="https://www.worldometers.info/coronavirus/"
# The timeout keeps the cell from hanging indefinitely when the server never answers;
# raise_for_status() fails fast on a 4xx/5xx status so that an error page is not
# parsed further down as if it were the statistics page.
response2=requests.get(url2,timeout=30)
response2.raise_for_status()

In [21]:
content2=response2.content

In [22]:
soup2=BeautifulSoup(content2,"html.parser")

In [23]:
soup2.title.string

'Coronavirus Update (Live): 3,562,525 Cases and 248,103 Deaths from COVID-19 Virus Pandemic - Worldometer'

#### finding data using div tag and its class

In [24]:
# First <div class="panel_flip"> on the page, then the text of every
# <div class="panel_front"> nested inside it.
divs=soup2.find("div",class_="panel_flip")
front_panels=divs.find_all("div",class_="panel_front")
for panel in front_panels:
    print(panel.text)


2,161,351
Currently Infected Patients


2,111,308 (98%)
in Mild Condition

50,043 (2%)
Serious or Critical


Show Graph



In [25]:
coronatable=soup2.find_all("table")

In [26]:
#print(coronatable)

In [27]:
len(coronatable)

2

In [28]:
cp=coronatable[0]


In [29]:
# Keep every <tr> except the first nine and the last eight
# (presumably header/aggregate rows -- verify against the live page).
rows=cp.find_all("tr")[9:-8]
# Stripped text of every <td>, one inner list per kept row.
cell_texts=[[cell.text.strip() for cell in row.find_all("td")] for row in rows]
# Split the per-row texts into twelve per-column lists (cell positions 0..11).
(
    Country,
    Total_cases,
    New_cases,
    Total_deaths,
    New_deaths,
    Total_recovered,
    Active_cases,
    Serious,
    Case_per_million,
    Deaths_per_million,
    Total_tests,
    Tests_per_million,
)=([texts[position] for texts in cell_texts] for position in range(12))
    
#print(Country)



In [30]:
# Stitch the twelve parallel lists together row by row and label the columns.
world_columns=[
    "Country","Total_cases","New_cases","Total_deaths","New_deaths","Total_recovered",
    "Active_cases","Serious","Case_per_million","Deaths_per_million","Total_tests","Tests_per_million",
]
world_records=list(zip(
    Country,Total_cases,New_cases,Total_deaths,New_deaths,Total_recovered,
    Active_cases,Serious,Case_per_million,Deaths_per_million,Total_tests,Tests_per_million,
))
world_df=pd.DataFrame(world_records,columns=world_columns)

In [31]:
world_df

Unnamed: 0,Country,Total_cases,New_cases,Total_deaths,New_deaths,Total_recovered,Active_cases,Serious,Case_per_million,Deaths_per_million,Total_tests,Tests_per_million
0,USA,1187387,+26613,68570,+1126,178263,940554,16139,3587,207,7188538,21717
1,Spain,247122,+1533,25264,+164,148558,73300,2386,5285,540,1932455,41332
2,Italy,210717,+1389,28884,+174,81654,100179,1501,3485,478,2153772,35622
3,UK,186599,+4339,28446,+315,,157809,1559,2749,419,1206405,17771
4,France,168693,+297,24895,+135,50784,93014,3819,2584,381,1100228,16856
5,Germany,165664,+697,6866,+54,130600,28198,1979,1977,82,2547052,30400
6,Russia,134687,+10633,1280,+58,16639,116768,2300,923,9,4100000,28095
7,Turkey,126045,+1670,3397,+61,63151,59497,1424,1495,40,1135367,13462
8,Brazil,101147,+4588,7025,+275,42991,51131,8318,476,33,339552,1597
9,Iran,97424,+976,6203,+47,78422,12799,2690,1160,74,496273,5909


In [32]:
world_df.dtypes

Country               object
Total_cases           object
New_cases             object
Total_deaths          object
New_deaths            object
Total_recovered       object
Active_cases          object
Serious               object
Case_per_million      object
Deaths_per_million    object
Total_tests           object
Tests_per_million     object
dtype: object

In [33]:
world_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 214 entries, 0 to 213
Data columns (total 12 columns):
Country               214 non-null object
Total_cases           214 non-null object
New_cases             214 non-null object
Total_deaths          214 non-null object
New_deaths            214 non-null object
Total_recovered       214 non-null object
Active_cases          214 non-null object
Serious               214 non-null object
Case_per_million      214 non-null object
Deaths_per_million    214 non-null object
Total_tests           214 non-null object
Tests_per_million     214 non-null object
dtypes: object(12)
memory usage: 20.1+ KB


In [34]:
#world_df

In [35]:
world_df.shape

(214, 12)

In [36]:
# Rows with at least one missing field. The scraped cells are still plain strings
# here, and isnull() alone does not flag empty strings, so blank / whitespace-only
# cells are mapped to NaN on a temporary copy before the check (world_df itself is
# not modified by this cell).
missing=world_df[world_df.replace(r'^\s*$', np.nan, regex=True).isnull().any(axis=1)]

#### now there are lot of missing values in the data

#### replacing empty values with NaN 

In [37]:
# Turn empty / whitespace-only cells into NaN. `np.nan` is used because the `np.NaN`
# alias was removed in NumPy 2.0; the result is assigned back rather than mutating
# the frame in place.
world_df=world_df.replace(r'^\s*$', np.nan, regex=True)

In [38]:
world_df

Unnamed: 0,Country,Total_cases,New_cases,Total_deaths,New_deaths,Total_recovered,Active_cases,Serious,Case_per_million,Deaths_per_million,Total_tests,Tests_per_million
0,USA,1187387,+26613,68570,+1126,178263,940554,16139,3587,207,7188538,21717
1,Spain,247122,+1533,25264,+164,148558,73300,2386,5285,540,1932455,41332
2,Italy,210717,+1389,28884,+174,81654,100179,1501,3485,478,2153772,35622
3,UK,186599,+4339,28446,+315,,157809,1559,2749,419,1206405,17771
4,France,168693,+297,24895,+135,50784,93014,3819,2584,381,1100228,16856
5,Germany,165664,+697,6866,+54,130600,28198,1979,1977,82,2547052,30400
6,Russia,134687,+10633,1280,+58,16639,116768,2300,923,9,4100000,28095
7,Turkey,126045,+1670,3397,+61,63151,59497,1424,1495,40,1135367,13462
8,Brazil,101147,+4588,7025,+275,42991,51131,8318,476,33,339552,1597
9,Iran,97424,+976,6203,+47,78422,12799,2690,1160,74,496273,5909


#### replacing NaN values with 0 for New_cases and New_deaths
#### Also replacing N/A value with NaN present in Total_recovered column

In [39]:
# Assign the cleaned columns back instead of calling replace(..., inplace=True) on
# world_df["col"]: that form mutates a temporary Series, which pandas 2.x warns
# about and which leaves world_df unchanged when Copy-on-Write is active.
world_df["New_cases"]=world_df["New_cases"].replace(np.nan,0)
world_df["New_deaths"]=world_df["New_deaths"].replace(np.nan,0)
# Total_recovered can contain the literal text "N/A"; map it to NaN.
world_df["Total_recovered"]=world_df["Total_recovered"].replace("N/A",np.nan)
world_df

Unnamed: 0,Country,Total_cases,New_cases,Total_deaths,New_deaths,Total_recovered,Active_cases,Serious,Case_per_million,Deaths_per_million,Total_tests,Tests_per_million
0,USA,1187387,+26613,68570,+1126,178263,940554,16139,3587,207,7188538,21717
1,Spain,247122,+1533,25264,+164,148558,73300,2386,5285,540,1932455,41332
2,Italy,210717,+1389,28884,+174,81654,100179,1501,3485,478,2153772,35622
3,UK,186599,+4339,28446,+315,,157809,1559,2749,419,1206405,17771
4,France,168693,+297,24895,+135,50784,93014,3819,2584,381,1100228,16856
5,Germany,165664,+697,6866,+54,130600,28198,1979,1977,82,2547052,30400
6,Russia,134687,+10633,1280,+58,16639,116768,2300,923,9,4100000,28095
7,Turkey,126045,+1670,3397,+61,63151,59497,1424,1495,40,1135367,13462
8,Brazil,101147,+4588,7025,+275,42991,51131,8318,476,33,339552,1597
9,Iran,97424,+976,6203,+47,78422,12799,2690,1160,74,496273,5909


#### using replace function :
#### removing "+" present in New_cases and New_deaths columns
#### also removing "," from numeric values 
#### and appending into new empty list

In [40]:
# Drop the "+" sign and "," separators from every New_cases entry, then cast to int.
New_Cases=[
    int(str(entry).replace("+","").replace(",",""))
    for entry in world_df["New_cases"]
]
    
#print(New_Cases)

In [41]:
# Drop the "+" sign and "," separators from every New_deaths entry, then cast to int.
New_Deaths=[
    int(str(entry).replace("+","").replace(",",""))
    for entry in world_df["New_deaths"]
]
    
#print(New_Deaths)

In [42]:
# Remove "," separators from every Total_cases entry and cast to int.
Total_Cases=[int(str(entry).replace(",","")) for entry in world_df["Total_cases"]]
    
#print(Total_Cases)

In [43]:
# Remove "," separators from every Total_deaths entry; values stay as strings
# (no numeric cast happens in this cell).
Total_Deaths=[str(entry).replace(",","") for entry in world_df["Total_deaths"]]
#print(Total_Deaths)

In [44]:
# Remove "," separators from every Total_recovered entry; values stay as strings
# (no numeric cast happens in this cell).
Total_Recovered=[str(entry).replace(",","") for entry in world_df["Total_recovered"]]
    
#print(Total_Recovered)

In [45]:
# Remove "," separators from every Active_cases entry and cast to int.
Active_Cases=[int(str(entry).replace(",","")) for entry in world_df["Active_cases"]]
    
#print(Active_Cases)

In [46]:
# Remove "," separators from every Serious entry; values stay as strings.
Serious1=[str(entry).replace(",","") for entry in world_df["Serious"]]
    
#print(Serious1)

In [47]:
# Remove "," separators from every Case_per_million entry; values stay as strings.
Case_per_Million=[str(entry).replace(",","") for entry in world_df["Case_per_million"]]
    
#print(Case_per_Million)

In [48]:
# Remove "," separators from every Deaths_per_million entry; values stay as strings.
Deaths_per_Million=[str(entry).replace(",","") for entry in world_df["Deaths_per_million"]]
    
#print(Deaths_per_Million)

In [49]:
# Remove "," separators from every Total_tests entry; values stay as strings.
Total_Tests=[str(entry).replace(",","") for entry in world_df["Total_tests"]]
    
#print(Total_Tests)

In [50]:
# Remove "," separators from every Tests_per_million entry; values stay as strings.
Tests_per_Million=[str(entry).replace(",","") for entry in world_df["Tests_per_million"]]
    
#print(Tests_per_Million)

#### creating a new dataframe using the above data

In [51]:
# Rebuild world_df from the Country list plus the cleaned per-column lists,
# keeping the original column labels.
cleaned_columns=[
    "Country","Total_cases","New_cases","Total_deaths","New_deaths","Total_recovered",
    "Active_cases","Serious","Case_per_million","Deaths_per_million","Total_tests","Tests_per_million",
]
cleaned_records=list(zip(
    Country,Total_Cases,New_Cases,Total_Deaths,New_Deaths,Total_Recovered,
    Active_Cases,Serious1,Case_per_Million,Deaths_per_Million,Total_Tests,Tests_per_Million,
))
world_df=pd.DataFrame(cleaned_records,columns=cleaned_columns)

In [52]:
#world_df

In [53]:
world_df.dtypes

Country               object
Total_cases            int64
New_cases              int64
Total_deaths          object
New_deaths             int64
Total_recovered       object
Active_cases           int64
Serious               object
Case_per_million      object
Deaths_per_million    object
Total_tests           object
Tests_per_million     object
dtype: object

#### now the datatype for numeric value is object, so converting it into float. can't convert to int directly since it contains NaN values

In [54]:
# Cast the listed columns to float, one column at a time, in the original order.
for float_column in (
    "Total_deaths",
    "Total_recovered",
    "Active_cases",
    "Serious",
    "Case_per_million",
    "Deaths_per_million",
    "Total_tests",
    "Tests_per_million",
):
    world_df[float_column]=world_df[float_column].astype(float)



In [55]:
world_df.dtypes

Country                object
Total_cases             int64
New_cases               int64
Total_deaths          float64
New_deaths              int64
Total_recovered       float64
Active_cases          float64
Serious               float64
Case_per_million      float64
Deaths_per_million    float64
Total_tests           float64
Tests_per_million     float64
dtype: object

In [56]:
world_df.describe()

Unnamed: 0,Total_cases,New_cases,Total_deaths,New_deaths,Total_recovered,Active_cases,Serious,Case_per_million,Deaths_per_million,Total_tests,Tests_per_million
count,214.0,214.0,176.0,214.0,205.0,214.0,135.0,212.0,174.0,179.0,179.0
mean,16647.31,379.219626,1409.676136,16.074766,5621.839024,10099.771028,370.688889,894.633019,51.300805,212198.7,15375.290503
std,87425.41,2034.884743,6569.198445,85.58747,21559.980963,66487.284176,1632.779434,1950.619135,139.030765,696036.5,25247.939648
min,1.0,0.0,1.0,0.0,1.0,0.0,1.0,0.3,0.03,19.0,4.0
25%,81.25,0.0,6.0,0.0,27.0,26.25,3.0,37.0,1.0,2410.0,908.0
50%,585.5,3.0,24.0,0.0,235.0,243.0,13.0,180.0,5.0,25206.0,5909.0
75%,4705.75,77.75,194.0,2.0,1441.0,2027.75,88.5,954.75,29.75,144037.0,18117.0
max,1187387.0,26613.0,68570.0,1126.0,178263.0,940554.0,16139.0,17152.0,1208.0,7188538.0,165074.0


#### finding missing value in data

In [57]:
world_df.isnull().sum()

Country                0
Total_cases            0
New_cases              0
Total_deaths          38
New_deaths             0
Total_recovered        9
Active_cases           0
Serious               79
Case_per_million       2
Deaths_per_million    40
Total_tests           35
Tests_per_million     35
dtype: int64

#### from above "describe" function it can be seen that there is a huge difference between mean and median for every column, this can be due to the outliers in the data
#### the mean value is much higher than the median value
#### therefore, filling missing values with the median 

In [58]:
# Fill missing Total_deaths with the column median. The result is assigned back:
# fillna(..., inplace=True) on world_df["Total_deaths"] mutates a temporary Series,
# which pandas 2.x warns about and which does not update world_df under Copy-on-Write.
world_df["Total_deaths"]=world_df["Total_deaths"].fillna(world_df["Total_deaths"].median())

In [59]:
# Fill missing Serious values with the column median. The result is assigned back:
# fillna(..., inplace=True) on world_df["Serious"] mutates a temporary Series,
# which pandas 2.x warns about and which does not update world_df under Copy-on-Write.
world_df["Serious"]=world_df["Serious"].fillna(world_df["Serious"].median())

In [60]:
# Fill missing Case_per_million with the column median. The result is assigned back:
# fillna(..., inplace=True) on world_df["Case_per_million"] mutates a temporary Series,
# which pandas 2.x warns about and which does not update world_df under Copy-on-Write.
world_df["Case_per_million"]=world_df["Case_per_million"].fillna(world_df["Case_per_million"].median())

In [61]:
# Fill missing Deaths_per_million with the column median. The result is assigned back:
# fillna(..., inplace=True) on world_df["Deaths_per_million"] mutates a temporary Series,
# which pandas 2.x warns about and which does not update world_df under Copy-on-Write.
world_df["Deaths_per_million"]=world_df["Deaths_per_million"].fillna(world_df["Deaths_per_million"].median())

In [62]:
# Fill missing Total_tests with the column median. The result is assigned back:
# fillna(..., inplace=True) on world_df["Total_tests"] mutates a temporary Series,
# which pandas 2.x warns about and which does not update world_df under Copy-on-Write.
world_df["Total_tests"]=world_df["Total_tests"].fillna(world_df["Total_tests"].median())

In [63]:
# Fill missing Tests_per_million with the column median. The result is assigned back:
# fillna(..., inplace=True) on world_df["Tests_per_million"] mutates a temporary Series,
# which pandas 2.x warns about and which does not update world_df under Copy-on-Write.
world_df["Tests_per_million"]=world_df["Tests_per_million"].fillna(world_df["Tests_per_million"].median())

#### for the Total_recovered column, there are 9 missing rows and the median value is 235, but on checking the data, the values are missing for those countries which do not have many coronavirus cases, and a Google search shows that the recovered cases for these countries are very few; therefore the missing values are filled with the min value

In [64]:
# Fill missing Total_recovered with the column minimum. The result is assigned back:
# fillna(..., inplace=True) on world_df["Total_recovered"] mutates a temporary Series,
# which pandas 2.x warns about and which does not update world_df under Copy-on-Write.
world_df["Total_recovered"]=world_df["Total_recovered"].fillna(world_df["Total_recovered"].min())

#### now checking missing values:


In [65]:
world_df.isnull().sum()

Country               0
Total_cases           0
New_cases             0
Total_deaths          0
New_deaths            0
Total_recovered       0
Active_cases          0
Serious               0
Case_per_million      0
Deaths_per_million    0
Total_tests           0
Tests_per_million     0
dtype: int64

### there is no missing value
