In [None]:
library(tidyverse)
library("ggplot2")
library("dplyr")
library("reshape2")
library("knitr")
library(e1071)
library(mosaic)

# Load the CSV inputs used throughout the notebook. Paths are relative to the
# working directory. `TRUE` is spelled out because `T` is an ordinary variable
# that can be reassigned; `<-` is used for assignment.
# NOTE(review): file contents are not visible here; the grouping comments below
# are inferred from the file names only -- confirm.

# Per-country series (2003 and 2020 files).
dfSC <- read.csv("CumulativeCases.csv", header = TRUE)
dfSD <- read.csv("TotalDeaths.csv", header = TRUE)
dfCC <- read.csv("CumulCases2020.csv")
dfCD <- read.csv("TotalDeaths2020.csv")

# WHO indicators for the seven selected countries.
dfW_7 <- read.csv("WHOselect7.csv", header = TRUE)

# Tables with one `<Country>.<Year>` column per country and outbreak.
cumulSC <- read.csv("Cumulative.csv")
totDLD <- read.csv("TotalDeathsLD.csv")

# Non-interpolated 2003 series.
dfC3 <- read.csv("CulumNon-Interpolated2003.csv", header = TRUE)
dfD3 <- read.csv("DeathNon-Interpolated2003.csv", header = TRUE)
dfR3 <- read.csv("RecoveredNon-Interpolated2003.csv", header = TRUE)

In [None]:
# Show the class of every column in cumulSC.
sapply(X = cumulSC, FUN = class)

In [None]:
# Pool the 2003 case values of the seven selected countries and describe the
# shape of their distribution (histogram, skewness, kurtosis, summary, boxplot).
t2003 <- c(
  cumulSC$Canada.2003, cumulSC$Germany.2003, cumulSC$Italy.2003,
  cumulSC$Singapore.2003, cumulSC$US.2003, cumulSC$Vietnam.2003,
  cumulSC$China.2003
)
hist(
  t2003,
  xlab = "Number of Total Cases",
  main = "Histogram of Number of Cases in 2003 SARS Outbreak"
)

skew <- skewness(t2003)
cat("Skewness: ", skew, "\n")
k <- kurtosis(t2003)
cat("Kurtosis: ", k)

favstats(t2003)
boxplot(
  t2003,
  ylab = "Number of Total Cases SARS 2003",
  main = "Boxplot of Number of Cases SARS 2003"
)

China seems to be an outlier in 2003. The outlier is removed in order to see how the distribution changes. 

In [None]:
# Same summary as for t2003, but with the China column left out.
t2003_2 <- c(
  cumulSC$Canada.2003, cumulSC$Germany.2003, cumulSC$Italy.2003,
  cumulSC$Singapore.2003, cumulSC$US.2003, cumulSC$Vietnam.2003
)
hist(
  t2003_2,
  xlab = "Number of Total Cases",
  main = "Histogram of Number of Cases in 2003: SARS Outbreak W/O Outliers"
)

skew <- skewness(t2003_2)
cat("Skewness: ", skew, "\n")
k <- kurtosis(t2003_2)
cat("Kurtosis: ", k)

favstats(t2003_2)
boxplot(
  t2003_2,
  ylab = "Number of Total Cases",
  main = "Boxplot of Number of Cases SARS 2003 W/O Outliers"
)

With China removed, the skewness drops to almost zero and the distribution is close to normal.

In [None]:
# Pool the 2020 case values of the seven selected countries and describe the
# shape of their distribution.
t2020 <- c(
  cumulSC$Canada.2020, cumulSC$Germany.2020, cumulSC$Italy.2020,
  cumulSC$Singapore.2020, cumulSC$US.2020, cumulSC$Vietnam.2020,
  cumulSC$China.2020
)
hist(
  t2020,
  xlab = "Number of Total Cases",
  main = "Histogram of Number of Cases in 2020: COVID Outbreak"
)

skew <- skewness(t2020)
cat("Skewness: ", skew, "\n")
k <- kurtosis(t2020)
cat("Kurtosis: ", k)

favstats(t2020)
boxplot(
  t2020,
  ylab = "Number of Total Cases",
  main = "Boxplot of Number of Cases in 2020: COVID Outbreak"
)

Distribution is positively skewed. The distribution is not normal. 

In [None]:
# Pool the 2003 death values of the seven selected countries and describe the
# shape of their distribution.
d2003 <- c(
  totDLD$Canada.2003, totDLD$Germany.2003, totDLD$Italy.2003,
  totDLD$Singapore.2003, totDLD$US.2003, totDLD$Vietnam.2003,
  totDLD$China.2003
)
hist(
  d2003,
  xlab = "Number of Total Deaths",
  main = "Histogram of Number of Deaths in 2003: SARS Outbreak"
)

skew <- skewness(d2003)
cat("Skewness: ", skew, "\n")
k <- kurtosis(d2003)
cat("Kurtosis: ", k)

favstats(d2003)
boxplot(
  d2003,
  ylab = "Number of Total Deaths",
  main = "Boxplot of Number of Deaths 2003 SARS Outbreak"
)

China is an outlier. The skew is large and the distribution is not normal. 

In [None]:
# 2003 death values with the China column left out. `d2003` is overwritten
# here, so from this cell on it no longer contains China.
# The plot titles carry "W/O Outliers" (as the case plots without China do) so
# these figures can be told apart from the ones that include China.
d2003 <- c(
  totDLD$Canada.2003, totDLD$Germany.2003, totDLD$Italy.2003,
  totDLD$Singapore.2003, totDLD$US.2003, totDLD$Vietnam.2003
)
hist(
  d2003,
  xlab = "Number of Total Deaths",
  main = "Histogram of Number of Deaths in 2003: SARS Outbreak W/O Outliers"
)

skew <- skewness(d2003)
cat("Skewness: ", skew, "\n")
k <- kurtosis(d2003)
cat("Kurtosis: ", k)

favstats(d2003)
boxplot(
  d2003,
  ylab = "Number of Total Deaths",
  main = "Boxplot of Number of Deaths 2003 SARS Outbreak W/O Outliers"
)

The deaths are still positively skewed. However, removing China substantially reduced the skewness of the data.

In [None]:
# Pool the 2020 death values of the seven selected countries and describe the
# shape of their distribution.
d2020 <- c(
  totDLD$Canada.2020, totDLD$Germany.2020, totDLD$Italy.2020,
  totDLD$Singapore.2020, totDLD$US.2020, totDLD$Vietnam.2020,
  totDLD$China.2020
)
hist(
  d2020,
  xlab = "Number of Total Deaths",
  main = "Histogram of Number of Deaths in 2020: COVID Outbreak"
)

skew <- skewness(d2020)
cat("Skewness: ", skew, "\n")
k <- kurtosis(d2020)
cat("Kurtosis: ", k)

favstats(d2020)
boxplot(
  d2020,
  ylab = "Number of Total Deaths",
  main = "Boxplot of Number of Deaths 2020 COVID Outbreak"
)

Italy is an outlier. The data is positively skewed. 

In [None]:
# 2020 death values with the Italy column left out. `d2020` is overwritten
# here, so from this cell on it no longer contains Italy.
# The plot titles carry "W/O Outliers" so these figures can be told apart from
# the ones that include Italy.
d2020 <- c(
  totDLD$Canada.2020, totDLD$Germany.2020, totDLD$Singapore.2020,
  totDLD$US.2020, totDLD$Vietnam.2020, totDLD$China.2020
)
hist(
  d2020,
  xlab = "Number of Total Deaths",
  main = "Histogram of Number of Deaths in 2020: COVID Outbreak W/O Outliers"
)

skew <- skewness(d2020)
cat("Skewness: ", skew, "\n")
k <- kurtosis(d2020)
cat("Kurtosis: ", k)

favstats(d2020)
boxplot(
  d2020,
  ylab = "Number of Total Deaths",
  main = "Boxplot of Number of Deaths 2020 COVID Outbreak W/O Outliers"
)

With the removal of Italy, the data is less skewed but still not normal.

In [None]:
library(ggplot2)

# One bar per country and outbreak year, built from the totDLD columns.
# Labels and values are listed in the same order (2020 first, then 2003).
data <- data.frame(
  CountryYear = c(
    "Canada 2020", "Canada 2003", "Germany 2020", "Germany 2003",
    "Italy 2020", "Italy 2003", "Singapore 2020", "Singapore 2003",
    "US 2020", "US 2003", "Vietnam 2020", "Vietnam 2003",
    "China 2020", "China 2003"
  ),
  Total = c(
    totDLD$Canada.2020, totDLD$Canada.2003,
    totDLD$Germany.2020, totDLD$Germany.2003,
    totDLD$Italy.2020, totDLD$Italy.2003,
    totDLD$Singapore.2020, totDLD$Singapore.2003,
    totDLD$US.2020, totDLD$US.2003,
    totDLD$Vietnam.2020, totDLD$Vietnam.2003,
    totDLD$China.2020, totDLD$China.2003
  )
)

barplot <- ggplot(data, aes(x = CountryYear, y = Total)) +
  geom_bar(stat = "identity", fill = "burlywood4") +
  ggtitle("Comparison of countries: Death Totals from Both Outbreaks") +
  xlab("Country And Year") +
  ylab("Total Deaths") +
  theme(axis.text.x = element_text(angle = 90))
barplot

In [None]:
# One bar per country and outbreak year, built from the cumulSC (cases)
# columns. Labels and values are listed in the same order.
data <- data.frame(
  CountryYear = c(
    "Canada 2020", "Canada 2003", "Germany 2020", "Germany 2003",
    "Italy 2020", "Italy 2003", "Singapore 2020", "Singapore 2003",
    "US 2020", "US 2003", "Vietnam 2020", "Vietnam 2003",
    "China 2020", "China 2003"
  ),
  Total = c(
    cumulSC$Canada.2020, cumulSC$Canada.2003,
    cumulSC$Germany.2020, cumulSC$Germany.2003,
    cumulSC$Italy.2020, cumulSC$Italy.2003,
    cumulSC$Singapore.2020, cumulSC$Singapore.2003,
    cumulSC$US.2020, cumulSC$US.2003,
    cumulSC$Vietnam.2020, cumulSC$Vietnam.2003,
    cumulSC$China.2020, cumulSC$China.2003
  )
)

# The y axis shows cases, so it is labelled "Total Cases" (it previously read
# "Total Deaths", copied from the deaths plot).
barplot <- ggplot(data, aes(x = CountryYear, y = Total)) +
  geom_bar(stat = "identity", fill = "orange2") +
  ggtitle("Comparison of countries: Cumulative Cases from Both Outbreaks") +
  xlab("Country And Year") +
  ylab("Total Cases") +
  theme(axis.text.x = element_text(angle = 90))
barplot

In [None]:
# One bar per country for the 2003 cumulSC (cases) columns.
data <- data.frame(
  CountryYear = c(
    "Canada", "Germany", "Italy", "Singapore", "US", "Vietnam", "China"
  ),
  Total = c(
    cumulSC$Canada.2003, cumulSC$Germany.2003, cumulSC$Italy.2003,
    cumulSC$Singapore.2003, cumulSC$US.2003, cumulSC$Vietnam.2003,
    cumulSC$China.2003
  )
)

# Axis labels describe what is plotted: countries on x, cases on y (they
# previously read "Country And Year" and "Total Deaths").
barplot <- ggplot(data, aes(x = CountryYear, y = Total)) +
  geom_bar(stat = "identity", fill = "deeppink2") +
  ggtitle("Comparison of countries: Cumulative Cases SARS 2003") +
  xlab("Country") +
  ylab("Total Cases") +
  theme(axis.text.x = element_text(angle = 90))
barplot

In [None]:
library(ggplot2)

# One bar per country for the 2003 totDLD (deaths) columns.
data <- data.frame(
  CountryYear = c(
    "Canada", "Germany", "Italy", "Singapore", "US", "Vietnam", "China"
  ),
  Total = c(
    totDLD$Canada.2003, totDLD$Germany.2003, totDLD$Italy.2003,
    totDLD$Singapore.2003, totDLD$US.2003, totDLD$Vietnam.2003,
    totDLD$China.2003
  )
)

# The x axis only carries country names here, so it is labelled "Country"
# (it previously read "Country And Year").
barplot <- ggplot(data, aes(x = CountryYear, y = Total)) +
  geom_bar(stat = "identity", fill = "steelblue") +
  ggtitle("Comparison of countries: Death Totals SARS 2003") +
  xlab("Country") +
  ylab("Total Deaths") +
  theme(axis.text.x = element_text(angle = 90))
barplot

In [None]:
library(pwr)
library(distr)

In [None]:
# Power of a one-sided ("greater") t-test with n = 96 per group, effect size
# d = 0.5 and a 5% significance level; `power` is the quantity solved for.
pwr.t.test(
  n = 96,
  d = 0.5,
  sig.level = 0.05,
  alternative = "greater"
)

The t tests comparing the death totals, in selected countries, from the 2003 SARS Outbreak will have 96.4% power. 

In [None]:
# Same test set-up, but with power fixed at 0.8 and `n` left out so that the
# required sample size per group is the quantity solved for.
pwr.t.test(
  d = 0.5,
  sig.level = 0.05,
  power = 0.8,
  alternative = "greater"
)

50 days of SARS data need to be collected in each country to have 0.8 power. 

In [None]:
# One-sided, unpaired t-test: is the mean of dfSD$China greater than the mean
# of dfSD$Canada?
ttest <- t.test(
  x = dfSD$China,
  y = dfSD$Canada,
  alternative = "greater",
  paired = FALSE
)
ttest

Null hypothesis: There is no difference in the mean deaths per day between the two countries. 
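Alternative hypothesis: There is a difference in the mean deaths per day between the two countries; because the test is run with `alternative = "greater"`, it specifically tests whether China's mean is greater than Canada's.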

China will receive the most severe classification. The t-test indicates that China has significantly more mean deaths per day than the country with the second highest mean deaths per day (Canada). The 95% confidence interval does not contain the value 0 and the p-value is below 0.05. Therefore, this conclusion is valid.

Reject the null hypothesis for the t-test between China and Canada. 

In [None]:
# One-sided, unpaired t-test: is the mean of dfSD$Canada greater than the mean
# of dfSD$Singapore?
ttest <- t.test(
  x = dfSD$Canada,
  y = dfSD$Singapore,
  alternative = "greater",
  paired = FALSE
)
ttest

Null hypothesis: There is no difference in the mean deaths per day between the two countries. 
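Alternative hypothesis: There is a difference in the mean deaths per day between the two countries; because the test is run with `alternative = "greater"`, it specifically tests whether Canada's mean is greater than Singapore's.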

Canada and Singapore will be placed in the second most severe classification. The t-test indicates that there is no significant difference between the mean deaths per day of the two countries. The 95% confidence interval contains the value 0 and the p-value is above 0.05. Therefore, this conclusion is valid.

Fail to reject the null hypothesis for the t-test between Singapore and Canada.

In [None]:
# One-sided, unpaired t-test: is the mean of dfSD$Singapore greater than the
# mean of dfSD$Vietnam?
ttest <- t.test(
  x = dfSD$Singapore,
  y = dfSD$Vietnam,
  alternative = "greater",
  paired = FALSE
)
ttest

Null hypothesis: There is no difference in the mean deaths per day between the two countries. 
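Alternative hypothesis: There is a difference in the mean deaths per day between the two countries; because the test is run with `alternative = "greater"`, it specifically tests whether Singapore's mean is greater than Vietnam's.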

Vietnam will be placed in the third most severe classification. The t-test indicates that there is a significant difference in the mean deaths per day between Vietnam and Singapore. The 95% confidence interval does not contain the value 0 and the p-value is below 0.05. Therefore, this conclusion is valid.
Singapore has a lower death total than Canada. Therefore, it can be assumed that Vietnam's mean deaths per day will have a significant difference from Canada's mean deaths per day. 

Reject the null hypothesis for the t-test between Singapore and Vietnam. 

In [None]:
# One-sided, unpaired t-test: is the mean of dfSD$Vietnam greater than the
# mean of dfSD$United.States?
ttest <- t.test(
  x = dfSD$Vietnam,
  y = dfSD$United.States,
  alternative = "greater",
  paired = FALSE
)
ttest

Null hypothesis: There is no difference in the mean deaths per day between the two countries. 
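Alternative hypothesis: There is a difference in the mean deaths per day between the two countries; because the test is run with `alternative = "greater"`, it specifically tests whether Vietnam's mean is greater than the United States' mean.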

United States will be placed in the fourth most severe classification. The t-test indicates that there is a significant difference in the mean deaths per day between Vietnam and the United States. The 95% confidence interval does not contain the value 0 and the p-value is below 0.05. Therefore, this conclusion is valid.

Reject the null hypothesis for the t-test between Vietnam and United States. 

In [None]:
# One-sided, unpaired t-test: is the mean of dfSD$United.States greater than
# the mean of dfSD$Germany?
ttest <- t.test(
  x = dfSD$United.States,
  y = dfSD$Germany,
  alternative = "greater",
  paired = FALSE
)
ttest

Null hypothesis: There is no difference in the mean deaths per day between the two countries. 
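Alternative hypothesis: There is a difference in the mean deaths per day between the two countries; because the test is run with `alternative = "greater"`, it specifically tests whether the United States' mean is greater than Germany's.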

Fail to reject the null hypothesis for the t-test between Germany and United States.

Germany, Italy, and United States did not experience a death during the outbreak. They will be placed in the fourth classification. The t-test does not have anything to compare. 

In [None]:
# Attach a severity class to each of the seven rows of dfW_7, then show the
# table.
# NOTE(review): the values are assigned by position -- confirm they line up
# with the row (country) order of WHOselect7.csv.
dfW_7$classification <- c(2, 1, 4, 4, 2, 4, 3)
dfW_7

In [None]:
library(magrittr)

In [None]:
# Load the 20-row WHO selection. `TRUE` is spelled out because `T` is an
# ordinary variable that can be reassigned.
dfW_20 <- read.csv("WHOselect20.csv", header = TRUE)

In [None]:
# Binary label for the 20 rows of dfW_20: rows 2, 3, 14 and 20 get 1, every
# other row gets 0.
# NOTE(review): the labels are assigned by position -- confirm they line up
# with the row order of WHOselect20.csv.
dfW_20$classification <- replace(numeric(20), c(2, 3, 14, 20), 1)
dfW_20

In [None]:
# Convert every integer column of dfW_20 to double, in place (%<>% assigns the
# result back to dfW_20). across(where(...)) replaces the superseded
# mutate_if().
dfW_20 %<>% mutate(across(where(is.integer), as.numeric))
dfW_20

In [None]:
# Save the labelled table to disk. The data frame built in the cells above is
# `dfW_20`; `dfW20` does not exist until the file is read back in, so writing
# `dfW20` here fails in a fresh session.
# row.names = TRUE is kept: it writes the row names as a leading column.
write.csv(dfW_20, "dfW20.csv", row.names = TRUE)

In [None]:
# Read the saved table back in as `dfW20`. `TRUE` is spelled out because `T`
# is an ordinary variable that can be reassigned.
dfW20 <- read.csv("dfW20.csv", header = TRUE)

In [None]:
library(caret)
library(randomForest)
library(varImp)
library(naivebayes)

In [None]:
# Show the class of every column in dfW20.
sapply(X = dfW20, FUN = class)

In [None]:
# Copy dfW20 into `df`, the name used by the partitioning and modelling cells,
# and display it.
df <- dfW20 
df
# Coding of the `classification` column:
#0 means not severe
#1 means moderately severe

In [None]:
# Split df 50/50 into training and testing rows with a fixed seed.
# The outcome is passed as a factor so that createDataPartition() samples
# within each class. A numeric y is instead cut into percentile groups, which
# for a 0/1 column can leave all rows in one group and lose the stratification.
set.seed(998)
indxTrain <- createDataPartition(
  y = factor(df$classification),
  p = 0.5,
  list = FALSE
)
training <- df[indxTrain, ]
testing <- df[-indxTrain, ]

In [None]:
# Show the class of every column in df.
sapply(X = df, FUN = class)

In [None]:
# Fit a cross-validated glm on the training rows and evaluate it on the
# testing rows.
df$classification <- as.factor(df$classification)

# Use one shared set of levels (taken from the full data) for both the
# training and the testing labels. Deriving the levels separately from each
# subset can produce factors that disagree when a class is missing from one
# of them, which confusionMatrix() does not accept.
class_levels <- levels(df$classification)

# NOTE(review): assumes columns 2:28 are the predictors and do not include
# `classification` -- confirm against names(df).
x <- training[, 2:28]
y <- factor(training$classification, levels = class_levels)
z <- factor(testing$classification, levels = class_levels)

# `method` is passed by name rather than by position.
model <- train(
  x, y,
  method = "glm",
  trControl = trainControl(method = "cv", number = 10)
)
print(model)

Predict <- predict(model, newdata = testing[, 2:28])

confusionMatrix(Predict, z)

In [None]:
# Show the factor levels of the test-set labels `z`.
levels(z)

In [None]:
# Partition `df` on `Outcome` (75% training), fit a 10-fold cross-validated
# glm on seven named predictors and evaluate it on the held-out rows.
# NOTE(review): `Outcome`, `Glucose`, `BloodPressure`, etc. are not created
# anywhere in the visible notebook, where `df` is a copy of dfW20 -- confirm
# that `df` holds the intended dataset before running this cell.
df$Outcome <- as.factor(df$Outcome)
set.seed(998)
indxTrain <- createDataPartition(y = df$Outcome, p = 0.75, list = FALSE)
training <- df[indxTrain, ]
testing <- df[-indxTrain, ]

z <- as.factor(testing$Outcome)

train.control <- trainControl(method = "cv", number = 10)
# Train the model
model <- train(
  Outcome ~ Glucose + BloodPressure + SkinThickness + Insulin + BMI +
    DiabetesPedigreeFunction + Age,
  data = training,
  method = "glm",
  trControl = train.control
)
# Summarize the results
print(model)

Predict <- predict(model, newdata = testing)

confusionMatrix(Predict, z)

In [None]:
# Compute variable importance for the current `model` with caret's varImp()
# (namespaced explicitly) and plot it.
X <- caret::varImp(model)
plot(X)

In [None]:
# Re-read the cumulative-cases table into dfSC. `TRUE` is spelled out because
# `T` is an ordinary variable that can be reassigned.
dfSC <- read.csv("CumulativeCases.csv", header = TRUE)

In [None]:
# Show the first value of the China column.
dfSC$China[1]

In [None]:
# Show how Date was read in, then convert it to POSIXct by way of character.
# NOTE(review): no format or time zone is given to as.POSIXct() -- confirm
# the date strings in the CSV are in a form it parses by default.
class(dfSC$Date)
dfSC$Date <- as.POSIXct(as.character(dfSC$Date))

In [None]:
# Plot China's cases as a fraction of 7084 against date, with a binomial-glm
# smoother and its standard-error band.
# NOTE(review): 7084 is presumably the final cumulative count for China --
# confirm.
log_plot <- ggplot(data = dfSC, aes(x = Date, y = China / 7084)) +
  geom_jitter(alpha = 0.5, height = 0.05) +
  geom_point() +
  stat_smooth(
    method = "glm",
    method.args = list(family = "binomial"),
    se = TRUE
  ) +
  ylab("Percentage of Total Cases")
log_plot + ggtitle("Cumulative Cases in China SARS Outbreak 2003")


In [None]:
# Binomial glm of China's cases (as a fraction of 7084) on the day index
# 1..96, then show the fitted model.
# NOTE(review): the day index is hard-coded to 96 values -- this assumes dfSC
# has 96 rows; confirm.
log_reg <- glm(
  China / 7084 ~ seq(1, 96, 1),
  family = binomial,
  data = dfSC
)

log_reg

In [None]:
library(stats)

In [None]:
# Scatter of China's case counts against date (jittered and plain points
# drawn on top of each other), without a fitted curve.
log_plot <- ggplot(data = dfSC, aes(x = Date, y = China)) +
  geom_jitter(alpha = 0.5, height = 0.05) +
  geom_point() +
  ylab("Total Cases")
log_plot + ggtitle("Cumulative Cases in China SARS Outbreak 2003")

In [None]:
#find the parameters for the equation
# getInitial() with the self-starting SSlogis model returns starting values
# named alpha, xmid and scale.
# The data frame now really contains the response: the earlier
# `dfSC$China==dfSC$China` created a column of TRUEs (a comparison, not a
# named argument), and the fit only worked because the formula read
# dfSC$China from the global environment.
# NOTE(review): the day index is hard-coded to 96 values -- this assumes dfSC
# has 96 rows; confirm.
times <- seq(1, 96, 1)
SS <- getInitial(
  China ~ SSlogis(times, alpha, xmid, scale),
  data = data.frame(China = dfSC$China, times = times)
)
SS

In [None]:
# Convert the SSlogis starting values (alpha, xmid, scale) into starting
# values for the K / R / N0 form of the logistic curve fitted below.
# `[[` is used instead of `[` so the values do not carry the "alpha"/"scale"
# element names into the start list.
K_0 <- SS[["alpha"]]
R_0 <- 1 / SS[["scale"]]
N0 <- SS[["alpha"]] / (exp(SS[["xmid"]] / SS[["scale"]]) + 1)
y <- dfSC$China

m <- nls(
  y ~ K * N0 * exp(R * times) / (K + N0 * (exp(R * times) - 1)),
  start = list(K = K_0, R = R_0, N0 = N0)
)
#estimated parameters
summary(m)

# Correlation between observed and fitted values. Stored as `fit_cor` rather
# than `c`, so the base function c() is not masked by a variable.
fit_cor <- cor(y, predict(m))
cat("correlation coefficient: ", fit_cor)

In [None]:
# Plot the observed counts and overlay the fitted nls curve in red.
Days <- times
Total_cases <- dfSC$China

# Base graphics calls are issued one after another; they are not combined
# with `+` (that is ggplot2 syntax). `plot(...) + lines(...)` added two NULL
# return values and printed a stray `integer(0)`.
plot(Days, Total_cases)
lines(times, predict(m), col = "red", lty = 1, lwd = 3)
title(main = "Cumulative Cases in China SARS Outbreak 2003")

In [None]:
# Load the 2003-vs-2020 comparison table.
yC <- read.csv(file = "Comp20032020.csv")

In [None]:
# Show the structure (column names and types) of yC.
str(yC)

In [None]:
# Display the full yC table.
yC

In [None]:
library(lattice)

# Histogram of Cumulative.Cases with one panel per Year, stacked in a single
# column of two rows. Bin edges are fixed at 0, 10000, ..., 150000, and the
# panel labels are drawn on the left instead of on top.
histogram(
  ~ Cumulative.Cases | factor(Year),
  data = yC,
  layout = c(1, 2),
  nint = 1,
  xlab = "Cumulative Cases",
  strip = FALSE,
  strip.left = TRUE,
  breaks = seq(from = 0, to = 150000, by = 10000),
  ylab = "Percentage of the Data"
)

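In 2003, 100 percent of the data falls in the lowest bin of the histogram, i.e. below 10,000 cumulative cases (the bins are 10,000 wide, so the plot alone cannot show a 5,000 threshold).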