-
Notifications
You must be signed in to change notification settings - Fork 3
/
gs.wrapper.R
161 lines (155 loc) · 8 KB
/
gs.wrapper.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
gs.wrapper = function(setlist, backgroundset, include.identifiers=FALSE,
filename=NULL, resourceset=NULL,
functiondir=NULL, resourcedir=NULL, return.OM=FALSE,
gs.range=c(25,2500), ecut=0.05, ocut=5, anno.uni=TRUE,
annolist = c('kegg','msigdb5','GOBP','GOMF','GOCC',
'pC','nci','BioCarta','Reactome'),
idtypes = c(rep('Entrez.ID',5),rep('Anno.Symbol',4)),
use.gsr = c(F,T,T,T,T,T,F,F,F) ){
# function to run basic pathway annotation on an input list of gene sets, using predefined annotation sets.
# gs.range limits the sizes of gene sets used for annotation
# ecut sets the maximum E-value for annotations to return
# ocut sets the minimum identifier number for annotations to return
# return.OM is a flag indicating whether occurrence matrix is to be returned
# occurrence matrix == logical matrix, genes x sets
# anno.uni is a flag: if TRUE, use the annotation source's universe size;
# or if FALSE, use the overlap of backgroundset & anno universes
# functiondir = optional directory with functions to be loaded (default wd)
# resourcedir = optional directory with genesets to be loaded (default wd)
# please adjust locations of functions and annotation files for your system
# if resourceset is NULL (default) a default set of annotation sources
# is loaded, with all arguments ready as defaults
# otherwise:
# resourceset is a vector of filenames containing annotation
# sources as lists of named elements, each of which contains a vector
# of identifiers. Each file is an RData file. The filename should be
# the same as that of the list varname inside the file.
# idtypes is a vector of the same length as resourceset that contains
# backgroundset colnames corresponding to each resourceset's id type
# annolist is a vector of annotation source variable names as strings
# use.gsr is a logical vector setting whether to use gs.range to restrict
# annotation source by gene set size
# backgroundset is a data.table with fields ID, Entrez.ID and Anno.Symbol
# Entrez.ID must be HUMAN IDs to use default annotation sources
# any other identifier types required by anno sources are in addnl columns
# setlist is a list containing vectors of identifiers in backgroundset ID
# each element is named with the display name for the gene set
# Note that setlist MUST include a names() attribute.
if(is.null(functiondir)) functiondir=getwd()
if(is.null(resourcedir)) resourcedir=getwd()
# imports
source(file.path(functiondir,'gs.pval.R'))
tmp=suppressPackageStartupMessages(require(data.table))
# check arguments
# setlist should be list of nonzero-length vectors
if(!is.list(setlist) || sum(lengths(setlist)>0) != length(setlist) ){
stop("setlist must be a list of nonzero-length vectors")
}
# get ready to save occurrence matrices if requested
if( return.OM ){ OM_ls = NULL }
# load annotations
if( is.null(resourceset) ){
message("Using default annotation sets")
# annotations with EntrezGeneIDs
load(file.path(resourcedir,"hsa.kegg.gs.RData")); kegg=hsa.kegg.gs
kegg = lapply(hsa.kegg.gs,function(x) as.numeric(x))
load(file.path(resourcedir,"msigdb.v5.0.entrez.gmt.RData"))
msigdb5 = lapply(msigdb5,function(x) as.numeric(x))
load(file.path(resourcedir,"GO_processed.gmt.RData"))
# annotations with geneSymbols; not a robust identifier type
load(file.path(resourcedir,"pathwayCommons.gmt.RData"));
load(file.path(resourcedir,"NCI-Nature_Curated.gmt.RData"));nci=nci_gmt; rm(nci_gmt)
load(file.path(resourcedir,"BioCarta.gmt.RData"));BioCarta=BioCarta_gmt;rm(BioCarta_gmt)
load(file.path(resourcedir,"Reactome.gmt.RData"));Reactome=Reactome_gmt;rm(Reactome_gmt)
# define universes for each gene set source
uni = list(uni.kegg = unique(unlist(kegg,F,F)),
uni.pC = unique(unlist(pC,F,F)),
uni.msigdb5 = unique(unlist(msigdb5,F,F)),
uni.GOBP = unique(unlist(GOBP,F,F)),
uni.GOMF = unique(unlist(GOMF,F,F)),
uni.GOCC = unique(unlist(GOCC,F,F)),
uni.nci = toupper(unique(unlist(nci,F,F))),
uni.BioCarta = toupper(unique(unlist(BioCarta,F,F))),
uni.Reactome = toupper(unique(unlist(Reactome,F,F)))
)
} else { #non-default load
uni = NULL
for( i in 1:length(resourceset) ){
resourcename = sub('.RData','',resourceset[i])
message("loading ",file.path(resourcedir,resourceset[i]))
stopifnot( file.exists(file.path(resourcedir,resourceset[i])) )
load(file.path(resourcedir,resourceset[i]))
uni[[ paste("uni",resourcename,sep='.') ]] =
unique(unlist(get(resourcename)))
}
for( src in annolist ){
message(src," universe size: ",length(uni[[paste('uni',src,sep='.')]]))
}
}
# annotate up- and down-regulated selected genes in files
out.dt = data.table(rn="",E.Value="",P.Value="",Set.Size="",Overlap="",Sig.Size="",Uni.Size="",Source="",Condx="",FoldEnrich="")
if(include.identifiers){
out.dt = cbind(out.dt,ID.list="",alt.ID.list="")
}
out.dt = out.dt[0,]
# loop over anno sources
for( src in annolist ){
a = which(annolist==src)
# set anno parameters
# range of gene set sizes
if( use.gsr[a] ){ min.size=gs.range[1]; max.size=gs.range[2]
}else{ min.size=ocut; max.size=Inf }
# ID type(s) to use
tmp = backgroundset[,get(idtypes[a])]
alt.idtype = "Anno.Symbol" # default == Symbol for human readability
if( idtypes[a]=="Anno.Symbol" ){ alt.idtype = "Entrez.ID" }
# universe size to use
if( anno.uni ){
uni.size=length(uni[[paste('uni',src,sep='.')]])
} else { uni.size = NULL }
# annotate each set
for( i in 1:length(setlist) ){
signame = names(setlist)[i]; sig = setlist[[i]]
if(include.identifiers){
anno = gs.pval(gs=get(src), rnames=tmp,
is.sig = backgroundset$ID %in% sig, uni.size=uni.size,
min.size=min.size, max.size=max.size, include.identifiers=TRUE,
alt.rnames=backgroundset$Anno.Symbol,return.OM=return.OM)
} else {
anno = gs.pval(gs=get(src), rnames=tmp,
is.sig = backgroundset$ID %in% sig, uni.size=uni.size,
min.size=min.size, max.size=max.size, include.identifiers=FALSE,
return.OM=return.OM)
}
GS = anno$GS
if( return.OM ){
OM_ls = c(OM_ls, anno['OM'])
names(OM_ls)[length(OM_ls)] = signame
}
sig.n = sum(GS[,"E.Value"]<ecut & GS[,"Overlap"]>=ocut,na.rm=T)
print(paste(signame, src,"uni.size",attr(GS,"Uni.Size"),"sig.n", sig.n))
if(sig.n){
# convert slice of GS to data.frame to preserve structure if n==1
out.dt = rbindlist(list(out.dt,data.table(GS,keep.rownames=T)[GS[,"E.Value"]<ecut & GS[,"Overlap"]>=ocut,]),fill=T)
out.dt[(nrow(out.dt)-sig.n+1):(nrow(out.dt)),"Sig.Size"] = attr(GS,"Sig.Size")
out.dt[(nrow(out.dt)-sig.n+1):(nrow(out.dt)),"Uni.Size"] = attr(GS,"Uni.Size")
out.dt[(nrow(out.dt)-sig.n+1):(nrow(out.dt)),"Source"] = src
out.dt[(nrow(out.dt)-sig.n+1):(nrow(out.dt)),"Condx"] = signame
out.dt[(nrow(out.dt)-sig.n+1):(nrow(out.dt)),"FoldEnrich"] =
( as.numeric(out.dt[(nrow(out.dt)-sig.n+1):(nrow(out.dt)),"Overlap"]) /
as.numeric(out.dt[(nrow(out.dt)-sig.n+1):(nrow(out.dt)),"Sig.Size"]) ) /
( as.numeric(out.dt[(nrow(out.dt)-sig.n+1):(nrow(out.dt)),"Set.Size"]) /
as.numeric(out.dt[(nrow(out.dt)-sig.n+1):(nrow(out.dt)),"Uni.Size"]) )
if(include.identifiers){
out.dt[(nrow(out.dt)-sig.n+1):(nrow(out.dt)),"ID.list"] = attr(GS,"ID.list")[GS[,"E.Value"]<ecut & GS[,"Overlap"]>=ocut]
out.dt[(nrow(out.dt)-sig.n+1):(nrow(out.dt)),"alt.ID.list"] = attr(GS,"alt.ID.list")[GS[,"E.Value"]<ecut & GS[,"Overlap"]>=ocut]
}
}
}
}
names(out.dt)[names(out.dt)=="rn"] = "Set.Name"
if(!is.null(filename)){
write.csv(out.dt,file=filename,row.names=F,quote=F)
}
return(list(out.dt=out.dt, OM_ls=OM_ls))
}