<h1>Analysis of Global COVID-19 Pandemic Data</h1>

In [43]:
library(httr)
library(rvest)
library(magrittr)

## Getting a `COVID-19 pandemic` Wiki page using HTTP request


In [32]:
# creating a function with url and query inputs, to get an http response
get_covid19_page <- function(url,query) {
    response <- GET(url, query)
    return(response)
}  

base_url <- "https://en.wikipedia.org/w/index.php"
query_parameters <- list(title="Template:COVID-19_testing_by_country")

get_covid19_page(base_url,query_parameters)


Response [https://en.wikipedia.org/wiki/Main_Page]
  Date: 2023-02-28 13:12
  Status: 200
  Content-Type: text/html; charset=UTF-8
  Size: 93.7 kB
<!DOCTYPE html>
<html class="client-nojs vector-feature-language-in-header-enabled vector-fea...
<head>
<meta charset="UTF-8"/>
<title>Wikipedia, the free encyclopedia</title>
<script>document.documentElement.className="client-js vector-feature-language...
"wgCSPNonce":false,"wgCanonicalNamespace":"","wgCanonicalSpecialPageName":fal...
"wgMediaViewerOnClick":true,"wgMediaViewerEnabledByDefault":true,"wgPopupsFla...
"mmv.head","mmv.bootstrap.autostart","ext.visualEditor.desktopArticleTarget.i...
<script>(RLQ=window.RLQ||[]).push(function(){mw.loader.implement("user.option...
...

## Extracting COVID-19 testing data table from the wiki HTML page


Getting the root html node


In [57]:
root_node <- read_html("https://en.wikipedia.org/w/index.php?title=Template:COVID-19_testing_by_country")
root_node

{html_document}
<html class="client-nojs vector-feature-language-in-header-enabled vector-feature-language-in-main-page-header-disabled vector-feature-language-alert-in-sidebar-enabled vector-feature-sticky-header-disabled vector-feature-page-tools-disabled vector-feature-page-tools-pinned-disabled vector-feature-main-menu-pinned-disabled vector-feature-limited-width-enabled vector-feature-limited-width-content-enabled" lang="en" dir="ltr">
[1] <head>\n<meta http-equiv="Content-Type" content="text/html; charset=UTF-8 ...
[2] <body class="skin-vector skin-vector-search-vue vector-toc-pinned mediawi ...

Getting the table node and converting to a dataframe

In [66]:
read_page <- read_html("https://en.wikipedia.org/w/index.php?title=Template:COVID-19_testing_by_country",as.data.frame=T,stringsAsFactors=TRUE)
read_page %>%
html_nodes("table") %>%
.[[2]] %>%
html_table(fill=T) -> covid19

In [67]:
head(covid19)

Unnamed: 0_level_0,Country or region,Date[a],Tested,Units[b],Confirmed(cases),"Confirmed /tested,%","Tested /population,%","Confirmed /population,%",Ref.
Unnamed: 0_level_1,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>
1,Afghanistan,17 Dec 2020,154767,samples,49621,32.1,0.4,0.13,[1]
2,Albania,18 Feb 2021,428654,samples,96838,22.6,15.0,3.4,[2]
3,Algeria,2 Nov 2020,230553,samples,58574,25.4,0.53,0.13,[3][4]
4,Andorra,23 Feb 2022,300307,samples,37958,12.6,387.0,49.0,[5]
5,Angola,2 Feb 2021,399228,samples,20981,5.3,1.3,0.067,[6]
6,Antigua and Barbuda,6 Mar 2021,15268,samples,832,5.4,15.9,0.86,[7]


## Pre-processing and exporting the extracted data frame as a csv file

In [69]:
# summary of the data frame
summary(covid19)

 Country or region    Date[a]             Tested            Units[b]        
 Length:173         Length:173         Length:173         Length:173        
 Class :character   Class :character   Class :character   Class :character  
 Mode  :character   Mode  :character   Mode  :character   Mode  :character  
 Confirmed(cases)   Confirmed /tested,% Tested /population,%
 Length:173         Length:173          Length:173          
 Class :character   Class :character    Class :character    
 Mode  :character   Mode  :character    Mode  :character    
 Confirmed /population,%     Ref.          
 Length:173              Length:173        
 Class :character        Class :character  
 Mode  :character        Mode  :character  

The data frame read from HTML table will need some pre-processing such as removing irrelvant columns, renaming columns, and convert columns into proper data types.

In [70]:
preprocess_covid19_df <- function(data_frame) {
    
    shape <- dim(data_frame)

    # Removing the World row
    data_frame<-data_frame[!(data_frame$`Country or region`=="World"),]
    # Removing the last row
    data_frame <- data_frame[1:172, ]
    
    # Removing the Units and Ref columns
    data_frame["Ref."] <- NULL
    data_frame["Units[b]"] <- NULL
    
    # Renaming the columns
    names(data_frame) <- c("country", "date", "tested", "confirmed", "confirmed.tested.ratio", "tested.population.ratio", "confirmed.population.ratio")
    
    # Converting column data types
    data_frame$country <- as.factor(data_frame$country)
    data_frame$date <- as.factor(data_frame$date)
    data_frame$tested <- as.numeric(gsub(",","",data_frame$tested))
    data_frame$confirmed <- as.numeric(gsub(",","",data_frame$confirmed))
    data_frame$'confirmed.tested.ratio' <- as.numeric(gsub(",","",data_frame$`confirmed.tested.ratio`))
    data_frame$'tested.population.ratio' <- as.numeric(gsub(",","",data_frame$`tested.population.ratio`))
    data_frame$'confirmed.population.ratio' <- as.numeric(gsub(",","",data_frame$`confirmed.population.ratio`))
    
    return(data_frame)
}

In [71]:
covid19_df <- preprocess_covid19_df(covid19)
head(covid19_df)

Unnamed: 0_level_0,country,date,tested,confirmed,confirmed.tested.ratio,tested.population.ratio,confirmed.population.ratio
Unnamed: 0_level_1,<fct>,<fct>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>
1,Afghanistan,17 Dec 2020,154767,49621,32.1,0.4,0.13
2,Albania,18 Feb 2021,428654,96838,22.6,15.0,3.4
3,Algeria,2 Nov 2020,230553,58574,25.4,0.53,0.13
4,Andorra,23 Feb 2022,300307,37958,12.6,387.0,49.0
5,Angola,2 Feb 2021,399228,20981,5.3,1.3,0.067
6,Antigua and Barbuda,6 Mar 2021,15268,832,5.4,15.9,0.86


The final data frame has the following columns:

- **country** - The name of the country
- **date** - Reported date
- **tested** - Total tested cases by the reported date
- **confirmed** - Total confirmed cases by the reported date
- **confirmed.tested.ratio** - The ratio of confirmed cases to the tested cases
- **tested.population.ratio** - The ratio of tested cases to the population of the country
- **confirmed.population.ratio** - The ratio of confirmed cases to the population of the country


In [72]:
write.csv(covid19_df,"covid19_global_testing.csv")

## Getting a subset of the extracted data frame


In [73]:
covid19_data <- read.csv("covid19_global_testing.csv")
covid19_data[5:10,c("country","confirmed")]

Unnamed: 0_level_0,country,confirmed
Unnamed: 0_level_1,<fct>,<int>
5,Angola,20981
6,Antigua and Barbuda,832
7,Argentina,9060495
8,Armenia,422963
9,Australia,10112229
10,Austria,5789991


## Calculating worldwide COVID positive testing ratio


In [118]:
# Total confirmed cases worldwide
confirmed <- print(sum(covid19_df$confirmed))
# Total tested cases worldwide
tested <- print(sum(covid19_df$tested))
# Positive ratio 
print(confirmed/tested)

[1] 431226336
[1] 5392900253
[1] 0.07996186


## Getting a list of countries who reported their testing data 


In [99]:
#Converting the country column to type character and then sorting
#A to Z
sort(as.character(covid19_df$country))
#Z to A
print(sort(as.character(covid19_df$country),decreasing=T))

  [1] "Zimbabwe"               "Zambia"                 "Vietnam"               
  [4] "Venezuela"              "Uzbekistan"             "Uruguay"               
  [7] "United States"          "United Kingdom"         "United Arab Emirates"  
 [10] "Ukraine"                "Uganda"                 "Turkey"                
 [13] "Tunisia"                "Trinidad and Tobago"    "Togo"                  
 [16] "Thailand"               "Tanzania"               "Taiwan[m]"             
 [19] "Switzerland[l]"         "Sweden"                 "Sudan"                 
 [22] "Sri Lanka"              "Spain"                  "South Sudan"           
 [25] "South Korea"            "South Africa"           "Slovenia"              
 [28] "Slovakia"               "Singapore"              "Serbia"                
 [31] "Senegal"                "Saudi Arabia"           "San Marino"            
 [34] "Saint Vincent"          "Saint Lucia"            "Saint Kitts and Nevis" 
 [37] "Rwanda"              

## Identifying country names with a specific pattern

In [117]:
#Finding countries that contain "United"
strings <- covid19_df$country
pattern <- "United.+"
grep(pattern,strings,value=T)

## Selecting two countries and reviewing their testing data


In [114]:
Kenya_data <- covid19_df[85,c("country","confirmed","confirmed.population.ratio")]

Philippines_data <- covid19_df[130,c("country","confirmed","confirmed.population.ratio")]

print(Kenya_data)
print(Philippines_data)

   country confirmed confirmed.population.ratio
85   Kenya    107729                       0.23
        country confirmed confirmed.population.ratio
130 Philippines   4073980                          4


## Comparing which one of the selected countries has a larger ratio of confirmed cases to population

In [115]:
if (Kenya_data$confirmed.population.ratio > Philippines_data$confirmed.population.ratio) {
    print("Kenya has a larger ratio of confirmed cases per the population")
} else {
    print("The Philippines has a larger ratio of confirmed cases per the population")
}

[1] "The Philippines has a larger ratio of confirmed cases per the population"


## Finding countries with confirmed to population ratio rate less than 1%

In [116]:
subset(covid19_df,confirmed.population.ratio < 0.01)

Unnamed: 0_level_0,country,date,tested,confirmed,confirmed.tested.ratio,tested.population.ratio,confirmed.population.ratio
Unnamed: 0_level_1,<fct>,<fct>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>
28,Burundi,5 Jan 2021,90019,884,0.98,0.76,0.0074
34,China[c],31 Jul 2020,160000000,87655,0.055,11.1,0.0061
89,Laos,1 Mar 2021,114030,45,0.039,1.6,0.00063
119,North Korea,25 Nov 2020,16914,0,0.0,0.066,0.0
156,Tanzania,18 Nov 2020,3880,509,13.1,0.0065,0.00085
