-
Notifications
You must be signed in to change notification settings - Fork 0
/
MED3007_Lab1_exercise_solution.R
129 lines (90 loc) · 4.38 KB
/
MED3007_Lab1_exercise_solution.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
###-------------------------###
### SOLUTIONS TO EXERCISES ###
###-------------------------###
###--------------###
### 2. EXERCISE ###
###--------------###
# Load the data from the file "Testfil_Rcourse.xlsx"
# and consider the variable "vitD_v1"
# 1. plot the histogram of this variable and save it, and then also plot the
# boxplot of vitD_v1 stratified according to gender.
# Does this variable look normally distributed? test it.
# 2. If the data are gaussian, perform a t-test to verify
# that vitD_v1 is different across gender groups (otherwise, which test?).
library(readxl)
Testfil_Rcourse <- read_excel("data/Testfil_Rcourse.xlsx")
View(Testfil_Rcourse)
Testfil_Rcourse <- as.data.frame(Testfil_Rcourse)
png('./Rplots/ex2_histogram_vitaminD.png', height = 500, width = 500)
hist(Testfil_Rcourse$vitD_v1, prob = TRUE, xlab = 'vitamin D', main = 'histogram of vitamin D')
dev.off() # for closing the plot file
png('./ex2_boxplot_vitaminD.png', height = 500, width = 500)
boxplot(vitD_v1 ~ gender, Testfil_Rcourse)
dev.off() # for closing the plot file
vit1 <- Testfil_Rcourse[which(Testfil_Rcourse$gender==0),'vitD_v1']
vit2 <- Testfil_Rcourse[which(Testfil_Rcourse$gender==1),'vitD_v1']
# alternative syntax:
vit1 <- Testfil_Rcourse$vitD_v1[which(Testfil_Rcourse$gender==0)]
vit2 <- Testfil_Rcourse$vitD_v1[which(Testfil_Rcourse$gender==1)]
qqnorm(vit1, main = 'vitamin D - females')
qqline(vit1)
qqnorm(vit2, main = 'vitamin D - males')
qqline(vit2)
shapiro.test(vit1)
shapiro.test(vit2) # data is normally distb.
t.test(vitD_v1 ~ gender, Testfil_Rcourse) # there is a difference in vitD between groups (gender)
###-------------###
### 4. EXERCISE ###
###-------------###
## Consider the gene expression data set "Ch10Ex11.csv"
## that consists of 40 tissue samples with measurements on 1,000 genes.
## The first 20 samples are from healthy patients,
## while the second 20 are from a diseased group.
## 1. Load in the data using read.csv(). You will need to select header=F.
exp.data <- read.csv("./data/Ch10Ex11.csv",header=F)
dim(exp.data)
# I want each row to represent a sample, and each column a gene
exp.data <- t(exp.data)
dim(exp.data)
# should have n=40 samples/rows, and 1000 columns --> OK!
## 2. Have a look at the data and describe them with appropriate descriptive measures.
genes.means <- apply(exp.data, 2, mean)
genes.var <- apply(exp.data, 2, var)
plot(genes.means, xlab = 'genes', main = 'mean across samples', ylab = 'mean expr')
abline(h=0)
# we can try to compute the within group means and plot in different colors..
groups <- c(rep(1,20), rep(2,20))
genes.gr.means <- apply(exp.data, 2, function(x){tapply(x, groups, mean)})
plot(genes.gr.means[1,], xlab = 'genes', main = 'within-group mean across samples',
ylim = range(genes.gr.means), ylab = 'mean expr')
points(genes.gr.means[2,], col=2)
abline(h=0)
## 3. Your collaborator wants to know which genes differ the most across the two groups.
## Suggest a way to answer this question, and apply it here.
## We have to apply t-test to all genes, and then correct for multiple testing
# let us first apply the t-test and get an idea of the significance in the data
# How many significant p-values (without any correction)
pval.ttest <- apply(exp.data, 2,
function(x){t.test(x[which(groups==1)], x[which(groups==2)])$p.value})
sum(pval.ttest < alpha)
# how many expected false positives?
Vexp <- alpha*dim(exp.data)[2] # this is E(V) from the slides
Vexp
# Simple way of calculation adjusted p-values (using p.adjust()):
pval.fwer <- p.adjust(pval.ttest, method = "bonferroni")
pval.fdr <- p.adjust(pval.ttest, method = "BH")
# Number of significant p-values after Bonferroni correction
sum(pval.fwer < alpha) # conservative
# Number of significant p-values after BH correction
sum(pval.fdr < alpha)
# Get the significant genes (here the genes are numbered from 1 to 1000)
sign.genes <- which(pval.fdr < alpha)
# or
sign.genes <- which(pval.fwer < alpha)
# Plot them
plot(genes.gr.means[1,], xlab = 'genes', main = 'significant genes in blue',
ylim = range(genes.gr.means), ylab = 'mean expr')
points(genes.gr.means[2,], col=2)
points(sign.genes, genes.gr.means[1,sign.genes], col=4, pch=4, lwd=2)
points(sign.genes,genes.gr.means[2,sign.genes], col=4, pch=4, lwd=2)
abline(h=0)