# Project Proposal: Case study analysis: Holidays in COVID-19  #

Authors: Fares Burwag, Nikko Dumrique (63631204)

In [14]:
# Install `conflicted` only when it is not already available, so that
# re-running the notebook does not reinstall the package every time.
if (!requireNamespace("conflicted", quietly = TRUE)) {
  install.packages("conflicted")
}

Updating HTML index of packages in '.Library'

Making 'packages.html' ...
 done



In [2]:
library(tidyverse)
library(repr)
library(datateachr)
library(digest)
library(infer)
library(gridExtra)
library(cowplot)
library(dplyr)
library(conflicted)
library(lubridate)

── [1mAttaching packages[22m ─────────────────────────────────────── tidyverse 1.3.0 ──

[32m✔[39m [34mggplot2[39m 3.3.2     [32m✔[39m [34mpurrr  [39m 0.3.4
[32m✔[39m [34mtibble [39m 3.0.3     [32m✔[39m [34mdplyr  [39m 1.0.2
[32m✔[39m [34mtidyr  [39m 1.1.1     [32m✔[39m [34mstringr[39m 1.4.0
[32m✔[39m [34mreadr  [39m 1.3.1     [32m✔[39m [34mforcats[39m 0.5.0

“package ‘ggplot2’ was built under R version 4.0.1”
“package ‘tibble’ was built under R version 4.0.2”
“package ‘tidyr’ was built under R version 4.0.2”
“package ‘dplyr’ was built under R version 4.0.2”
── [1mConflicts[22m ────────────────────────────────────────── tidyverse_conflicts() ──
[31m✖[39m [34mdplyr[39m::[32mfilter()[39m masks [34mstats[39m::filter()
[31m✖[39m [34mdplyr[39m::[32mlag()[39m    masks [34mstats[39m::lag()

“package ‘infer’ was built under R version 4.0.2”

Attaching package: ‘gridExtra’


The following object is masked from ‘package:dplyr’:

    combine





In [3]:
# Settle name clashes up front: whenever another attached package also
# provides `select()` or `filter()`, the dplyr version is used.
invisible(lapply(c("select", "filter"), conflict_prefer, winner = "dplyr"))

[conflicted] Will prefer [34mdplyr::select[39m over any other package

[conflicted] Will prefer [34mdplyr::filter[39m over any other package



## Data Wrangling ##

In [4]:
# Load the public COVID-19 data from https://health-infobase.canada.ca
# and preview the first rows.
covid <- read.csv(
  file = "https://health-infobase.canada.ca/src/data/covidLive/covid19-download.csv"
)
head(covid, n = 6L)

Unnamed: 0_level_0,pruid,prname,prnameFR,date,update,numconf,numprob,numdeaths,numtotal,numtested,⋯,ratedeaths_last14,numtotal_last7,ratetotal_last7,numdeaths_last7,ratedeaths_last7,avgtotal_last7,avgincidence_last7,avgdeaths_last7,avgratedeaths_last7,raterecovered
Unnamed: 0_level_1,<int>,<chr>,<chr>,<chr>,<int>,<int>,<int>,<int>,<int>,<int>,⋯,<dbl>,<int>,<dbl>,<int>,<dbl>,<int>,<dbl>,<int>,<dbl>,<dbl>
1,35,Ontario,Ontario,2020-01-31,,3,0,0,3,,⋯,,,,,,,,,,0
2,59,British Columbia,Colombie-Britannique,2020-01-31,,1,0,0,1,,⋯,,,,,,,,,,0
3,1,Canada,Canada,2020-01-31,,4,0,0,4,,⋯,,,,,,,,,,0
4,35,Ontario,Ontario,2020-02-08,,3,0,0,3,,⋯,,,,,,,,,,0
5,59,British Columbia,Colombie-Britannique,2020-02-08,,4,0,0,4,,⋯,,,,,,,,,,0
6,1,Canada,Canada,2020-02-08,,7,0,0,7,,⋯,,,,,,,,,,0


We would like to restrict the dataset to Canadian provinces and territories, so we will remove rows whose prname is not one of the provinces/territories. Additionally, we would like to remove the observations where numtested is NA. 

In [5]:
# Names of the 13 provinces and territories; rows with any other `prname`
# (for example the national "Canada" rows) are dropped.
provinces <- c(
  "Ontario", "British Columbia", "Quebec", "Alberta",
  "Saskatchewan", "Manitoba", "New Brunswick", "Newfoundland and Labrador",
  "Nova Scotia", "Prince Edward Island", "Northwest Territories", "Nunavut",
  "Yukon"
)

# Keep provincial/territorial rows that have a non-missing `numtested`.
covid <- filter(covid, prname %in% provinces, !is.na(numtested))
head(covid, n = 6L)

Unnamed: 0_level_0,pruid,prname,prnameFR,date,update,numconf,numprob,numdeaths,numtotal,numtested,⋯,ratedeaths_last14,numtotal_last7,ratetotal_last7,numdeaths_last7,ratedeaths_last7,avgtotal_last7,avgincidence_last7,avgdeaths_last7,avgratedeaths_last7,raterecovered
Unnamed: 0_level_1,<int>,<chr>,<chr>,<chr>,<int>,<int>,<int>,<int>,<int>,<int>,⋯,<dbl>,<int>,<dbl>,<int>,<dbl>,<int>,<dbl>,<int>,<dbl>,<dbl>
1,59,British Columbia,Colombie-Britannique,2020-03-11,,39,0,1,39,4373,⋯,,,,,,,,,,0
2,48,Alberta,Alberta,2020-03-11,,14,0,0,14,1969,⋯,,,,,,,,,,0
3,47,Saskatchewan,Saskatchewan,2020-03-11,,0,0,0,0,204,⋯,,,,,,,,,,0
4,46,Manitoba,Manitoba,2020-03-11,,0,0,0,0,352,⋯,,,,,,,,,,0
5,35,Ontario,Ontario,2020-03-11,,42,0,1,42,3394,⋯,,,,,,,,,,0
6,24,Quebec,Québec,2020-03-11,,7,0,0,7,556,⋯,,,,,,,,,,0


In [6]:
# Narrow the data down to the columns we would like to work with.
covid_wrangled <- select(
  covid,
  prname, date, numconf, numdeaths, numtested, numtoday, ratetotal
)
head(covid_wrangled, n = 6L)

Unnamed: 0_level_0,prname,date,numconf,numdeaths,numtested,numtoday,ratetotal
Unnamed: 0_level_1,<chr>,<chr>,<int>,<int>,<int>,<int>,<dbl>
1,British Columbia,2020-03-11,39,1,4373,7,0.76
2,Alberta,2020-03-11,14,0,1969,7,0.32
3,Saskatchewan,2020-03-11,0,0,204,0,0.0
4,Manitoba,2020-03-11,0,0,352,0,0.0
5,Ontario,2020-03-11,42,1,3394,8,0.29
6,Quebec,2020-03-11,7,0,556,3,0.08


In [7]:
# Convert the `date` column from character to Date so that it can be
# compared against other dates.
covid_wrangled[["date"]] <- as.Date(covid_wrangled[["date"]])
head(covid_wrangled, n = 6L)

Unnamed: 0_level_0,prname,date,numconf,numdeaths,numtested,numtoday,ratetotal
Unnamed: 0_level_1,<chr>,<date>,<int>,<int>,<int>,<int>,<dbl>
1,British Columbia,2020-03-11,39,1,4373,7,0.76
2,Alberta,2020-03-11,14,0,1969,7,0.32
3,Saskatchewan,2020-03-11,0,0,204,0,0.0
4,Manitoba,2020-03-11,0,0,352,0,0.0
5,Ontario,2020-03-11,42,1,3394,8,0.29
6,Quebec,2020-03-11,7,0,556,3,0.08


Because we are comparing the timeline of confirmed cases against holidays, we will categorize and filter our data according to Canadian holidays.

In [8]:
# Canadian holiday dates, read as a tibble from the University of Waterloo
# dataset.
cdn_holidays <- read_csv(
  file = "https://raw.githubusercontent.com/uWaterloo/Datasets/master/Holidays/holidays.csv"
)
head(cdn_holidays, n = 6L)

Parsed with column specification:
cols(
  date = [34mcol_date(format = "")[39m,
  holiday = [31mcol_character()[39m
)



date,holiday
<date>,<chr>
2012-01-02,New Year's Day
2012-02-20,Family Day
2012-04-06,Good Friday
2012-05-21,Victoria Day
2012-07-02,Canada Day
2012-08-06,Civic Holiday


In [9]:
# Keep only the selected Canadian public holidays that fall between the
# first and the last observation date of the COVID dataset (both inclusive).
public_holidays <- filter(
  cdn_holidays,
  holiday %in% c(
    "New Year's Day", "Good Friday", "Canada Day",
    "Labour Day", "Thanksgiving", "Christmas Day"
  ),
  date >= min(covid_wrangled$date),
  date <= max(covid_wrangled$date)
)

public_holidays

date,holiday
<date>,<chr>
2020-04-10,Good Friday
2020-07-01,Canada Day
2020-09-07,Labour Day
2020-10-12,Thanksgiving
2020-12-25,Christmas Day
2021-01-01,New Year's Day


In [10]:
# Build an observation window around every holiday:
#   * post_holiday: 17 days after the holiday. As mentioned on cdc.gov,
#     symptoms may appear 2-14 days after first encounter; 3 extra days give
#     a little room for celebrations held close to the actual holiday.
#   * pre_holiday: 7 days (one week) before the holiday.
public_holiday_bound <- mutate(
  public_holidays,
  post_holiday = date + 17,
  pre_holiday = date - 7
)

# Split the windows by the calendar year in which the holiday itself falls.
public_holiday_bound_2020 <- public_holiday_bound %>%
  filter(year(date) == 2020)
public_holiday_bound_2021 <- public_holiday_bound %>%
  filter(year(date) == 2021)

public_holiday_bound_2020
public_holiday_bound_2021

date,holiday,post_holiday,pre_holiday
<date>,<chr>,<date>,<date>
2020-04-10,Good Friday,2020-04-27,2020-04-03
2020-07-01,Canada Day,2020-07-18,2020-06-24
2020-09-07,Labour Day,2020-09-24,2020-08-31
2020-10-12,Thanksgiving,2020-10-29,2020-10-05
2020-12-25,Christmas Day,2021-01-11,2020-12-18


date,holiday,post_holiday,pre_holiday
<date>,<chr>,<date>,<date>
2021-01-01,New Year's Day,2021-01-18,2020-12-25


In [19]:
# Look up one window boundary in a holiday-bounds table.
#   bounds:       table with `holiday`, `pre_holiday` and `post_holiday` columns
#   holiday_name: value of `holiday` to look up
#   bound_col:    name of the boundary column to read
holiday_bound <- function(bounds, holiday_name, bound_col) {
  ymd(bounds[[bound_col]][bounds$holiday == holiday_name])
}

# 2020 holiday bounds: start of each window
pre_good_friday <- holiday_bound(public_holiday_bound_2020, "Good Friday", "pre_holiday")
pre_canada_day <- holiday_bound(public_holiday_bound_2020, "Canada Day", "pre_holiday")
pre_labour_day <- holiday_bound(public_holiday_bound_2020, "Labour Day", "pre_holiday")
pre_thanksgiving <- holiday_bound(public_holiday_bound_2020, "Thanksgiving", "pre_holiday")
pre_christmas <- holiday_bound(public_holiday_bound_2020, "Christmas Day", "pre_holiday")

# 2020 holiday bounds: end of each window
post_good_friday <- holiday_bound(public_holiday_bound_2020, "Good Friday", "post_holiday")
post_canada_day <- holiday_bound(public_holiday_bound_2020, "Canada Day", "post_holiday")
post_labour_day <- holiday_bound(public_holiday_bound_2020, "Labour Day", "post_holiday")
post_thanksgiving <- holiday_bound(public_holiday_bound_2020, "Thanksgiving", "post_holiday")
post_christmas <- holiday_bound(public_holiday_bound_2020, "Christmas Day", "post_holiday")

# 2021 holiday bounds
pre_new_years <- holiday_bound(public_holiday_bound_2021, "New Year's Day", "pre_holiday")
post_new_years <- holiday_bound(public_holiday_bound_2021, "New Year's Day", "post_holiday")

# Display one pair of bounds as a quick check.
pre_good_friday
post_good_friday

In [18]:
# Label every observation with the holiday whose window
# [pre_<holiday>, post_<holiday>] contains its date. Dates outside every
# window get NA. case_when() uses the first matching condition, so where the
# Christmas Day and New Year's Day windows overlap the row is labelled
# "Christmas Day".
covid_clean <- covid_wrangled %>%
  mutate(holiday = case_when(
    date >= pre_good_friday & date <= post_good_friday ~ "Good Friday",
    date >= pre_canada_day & date <= post_canada_day ~ "Canada Day",
    date >= pre_labour_day & date <= post_labour_day ~ "Labour Day",
    date >= pre_thanksgiving & date <= post_thanksgiving ~ "Thanksgiving",
    date >= pre_christmas & date <= post_christmas ~ "Christmas Day",
    date >= pre_new_years & date <= post_new_years ~ "New Year's Day"
  )) %>%
  # NOTE(review): only the Good Friday window is kept from here on; rows
  # labelled with any other holiday (or NA) are dropped. Confirm that this
  # is intended and not a leftover from exploring a single holiday.
  filter(holiday == "Good Friday")

# Number of observations kept. length() on a data frame would return the
# number of columns instead.
nrow(covid_clean)
head(covid_clean)

Unnamed: 0_level_0,prname,date,numconf,numdeaths,numtested,numtoday,ratetotal,holiday
Unnamed: 0_level_1,<chr>,<date>,<int>,<int>,<int>,<int>,<dbl>,<chr>
1,British Columbia,2020-04-03,1174,35,39926,53,22.81,Good Friday
2,Alberta,2020-04-03,1075,18,60508,107,24.31,Good Friday
3,Saskatchewan,2020-04-03,220,3,11720,14,18.66,Good Friday
4,Manitoba,2020-04-03,164,2,11650,15,13.2,Good Friday
5,Ontario,2020-04-03,3255,67,67998,462,22.09,Good Friday
6,Quebec,2020-04-03,6101,61,83570,583,71.15,Good Friday


Additionally, we will add a column for the ratio of the number of confirmed cases to the number of tested individuals. 

In [13]:
# Add `propconfirmed`: the number of confirmed cases divided by the number
# of tested individuals.
covid_clean <- covid_clean %>%
  mutate(propconfirmed = numconf / numtested)
head(covid_clean, n = 6L)

Unnamed: 0_level_0,prname,date,numconf,numdeaths,numtested,numtoday,ratetotal,holiday,propconfirmed
Unnamed: 0_level_1,<chr>,<date>,<int>,<int>,<int>,<int>,<dbl>,<chr>,<dbl>
1,British Columbia,2020-04-03,1174,35,39926,53,22.81,Good Friday,0.0294044
2,Alberta,2020-04-03,1075,18,60508,107,24.31,Good Friday,0.01776625
3,Saskatchewan,2020-04-03,220,3,11720,14,18.66,Good Friday,0.01877133
4,Manitoba,2020-04-03,164,2,11650,15,13.2,Good Friday,0.01407725
5,Ontario,2020-04-03,3255,67,67998,462,22.09,Good Friday,0.04786905
6,Quebec,2020-04-03,6101,61,83570,583,71.15,Good Friday,0.07300467
