/
PANDA.R
203 lines (173 loc) · 10.3 KB
/
PANDA.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
#' Run Python implementation PANDA in R
#'
#' \strong{PANDA}(Passing Attributes between Networks for Data Assimilation) is a message-passing model
#' to reconstruct gene regulatory network, which integrates multiple sources of biological data-including protein-protein interaction data,
#' gene expression data, and transcription factor binding motifs data to reconstruct genome-wide, condition-specific regulatory networks.
#' \href{http://journals.plos.org/plosone/article?id=10.1371/journal.pone.0064832}{[(Glass et al. 2013)])}
#' This function is designed to run the a derived PANDA implementation in Python Library "netZooPy" \href{https://github.com/netZoo/netZooPy}{netZooPy}.
#'
#' @param expr_file Character string indicating the file path of expression values file, with each gene(in rows) across samples(in columns).
#' @param motif_file An optional character string indicating the file path of a prior transcription factor binding motifs dataset.
#' When this argument is not provided, analysis will continue with Pearson correlation matrix.
#' @param ppi_file An optional character string indicating the file path of protein-protein interaction edge dataset.
#' Also, this can be generated with a list of proteins of interest by \code{\link{sourcePPI}}.
#'
#' @param computing 'cpu' uses Central Processing Unit (CPU) to run PANDA; 'gpu' use the Graphical Processing Unit (GPU) to run PANDA. The default value is "cpu".
#'
#' @param precision 'double' computes the regulatory network in double precision (15 decimal digits); 'single' computes the regulatory network in single precision (7 decimal digits) which is fastaer, requires half the memory but less accurate. The default value is 'double'.
#' @param save_memory 'TRUE' removes temporary results from memory. The result network is weighted adjacency matrix of size (nTFs, nGenes); 'FALSE' keeps the temporary files in memory. The result network has 4 columns in the form gene - TF - weight in motif prior - PANDA edge. PANDA indegree/outdegree of panda network, only if save_memory = FALSE. The default value is 'FALSE'.
#' @param save_tmp 'TRUE' saves middle data like expression matrix and normalized networks; 'FALSE' deletes the middle data. The default value is 'TURE'.
#' @param keep_expression_matrix 'TRUE' keeps the input expression matrix as an attribute in the result Panda object.'FALSE' deletes the expression matrix attribute in the Panda object. The default value is 'FALSE'.
#' @param modeProcess 'legacy' refers to the processing mode in netZooPy<=0.5, 'union': takes the union of all TFs and genes across priors and fills the missing genes in the priors with zeros; 'intersection': intersects the input genes and TFs across priors and removes the missing TFs/genes. Default values is 'union'.
#' @param remove_missing Only when modeProcess='legacy': remove_missing='TRUE' removes all unmatched TF and genes; remove_missing='FALSE' keeps all tf and genes. The default value is 'FALSE'.
#' @param with_header Boolean to read gene expression file with a header for sample names
#' @return When save_memory=FALSE(default), this function will return a list of three items:
#' Use \code{$panda} to access the standard output of PANDA as data frame, which consists of four columns:
#' "TF", "Gene", "Motif" using 0 or 1 to indicate if this edge belongs to prior motif dataset, and "Score".
#'
#' Use \code{$indegree} to access the indegree of PANDA network as data frame, which consists of two columns: "Gene", "Score".
#'
#' Use \code{$outdegree} to access the outdegree of PANDA network as data frame, which consists of two columns: "TF", "Score".
#'
#' When save_memory=TRUE, this function will return a weigheted adjacency matirx of size (nTFs, nGenes), use \code{$WAMpanda} to access.
#'
#' @examples
#' # take the treated TB dataset as example here.
#' # refer to the datasets files path in inst/extdat
#'
#' treated_expression_file_path <- system.file("extdata", "expr4_matched.txt",
#' package = "netZooR", mustWork = TRUE)
#' treated_expression_file_path <- system.file("extdata", "expr4_matched.txt",
#' package = "netZooR", mustWork = TRUE)
#' motif_file_path <- system.file("extdata", "chip_matched.txt", package = "netZooR", mustWork = TRUE)
#' ppi_file_path <- system.file("extdata", "ppi_matched.txt", package = "netZooR", mustWork = TRUE)
#'
#'
#' # Run PANDA for treated and control network
#' \donttest{
#' treated_all_panda_result <- pandaPy(expr_file = treated_expression_file_path,
#' motif_file = motif_file_path, ppi_file = ppi_file_path,
#' modeProcess="legacy", remove_missing = TRUE )
#'
#' # access PANDA regulatory network
#' treated_net <- treated_all_panda_result$panda
#'
#' # access PANDA regulatory indegree network.
#' indegree_net <- treated_all_panda_result$indegree
#'
#' # access PANDA regulatory outdegree networks
#' outdegree_net <- treated_all_panda_result$outdegree
#' }
#'
#' @import reticulate
#' @export
#'
pandaPy <- function(expr_file, motif_file=NULL, ppi_file=NULL, computing="cpu", precision="double",save_memory=FALSE, save_tmp=TRUE, keep_expression_matrix=FALSE, modeProcess="union", remove_missing=FALSE, with_header=FALSE){
if(missing(expr_file)){
stop("Please provide the path of gene expression data file to 'expr_file' variable") }
else{ expr.str <- paste("\'", expr_file, "\'", sep = '') }
if(is.null(motif_file)){
motif.str <- 'None'
message("The prior motif network is not provided, so analysis continues with Pearson correlation matrix and gene coexpression mastrix is returned as a result network")
} else{ motif.str <- paste("\'", motif_file,"\'", sep = '') }
if(is.null(ppi_file)){
ppi.str <- 'None'
message("No protein-protein interaction network provided.")
} else{ ppi.str <- paste("\'", ppi_file, "\'", sep = '') }
# computing variable
if(computing=="gpu"){
computing.str <- "computing='gpu'"
} else{ computing.str <- "computing='cpu'"}
# precision variable
if(precision == "single" ){
precision.str <- "precision='single'"
} else{ precision.str <- "precision='double'"}
# save_memory variable
if(save_memory==TRUE){
savememory.str <- "save_memory= True"
} else{ savememory.str <- "save_memory=False" }
# save_tmp variable
if(save_tmp==FALSE){
savetmp.str <- "save_tmp= False"
} else{ savetmp.str <- "save_tmp=True" }
# keep_expression_matrix variable
if(keep_expression_matrix==TRUE){
keepexpression.str <- "keep_expression_matrix=True"
} else{ keepexpression.str <- "keep_expression_matrix=False" }
# with header option
if(with_header==FALSE){
withheader.str <- "with_header=False"
}else if (with_header==TRUE){
withheader.str <- "with_header=True"
}
# when pre-processing mode is legacy
if(modeProcess == "legacy"){
if(remove_missing == TRUE){
message("Use the legacy mode to pre-process the input dataset and keep only the matched TFs or Genes")
mode.str <- "modeProcess ='legacy', remove_missing = True"
} else{ message("Use the legacy mode (netZooPy version <= 0.5) to pre-process the input dataset and keep all the unmatched TFs or Genes")
mode.str <- "modeProcess ='legacy', remove_missing = False"}
}
# when pre-processing mode is union
else if(modeProcess == "union"){
mode.str <- "modeProcess ='union'"
}
else if(modeProcess == "intersection"){
mode.str <- "modeProcess ='intersection'"
}
# source the pypanda from github raw website.
pandapath <- system.file("extdata", "panda.py", package = "netZooR", mustWork = TRUE)
reticulate::source_python(pandapath,convert = TRUE)
# invoke Python script to create a Panda object
obj.str <- paste("panda_obj=Panda(", expr.str, ",", motif.str,",", ppi.str, ",",
computing.str, ",", precision.str, ",", savememory.str, ",", savetmp.str, "," ,
keepexpression.str, ",", mode.str, "," , withheader.str, ")", sep ='')
# run Python code
py_run_string(obj.str)
# run PAMDA
if(save_memory == FALSE){
py_run_string("panda_network=panda_obj.export_panda_results",local = FALSE, convert = TRUE)
# convert python object to R vector
panda_net <- py$panda_network
# re-assign data type
panda_net$tf <- as.character(panda_net$tf)
panda_net$gene <- as.character(panda_net$gene)
if("motif" %in% names(panda_net)){
panda_net$motif <- as.numeric(panda_net$motif)
}
panda_net$force <- as.numeric(panda_net$force)
if("motif" %in% names(panda_net)){
# adjust column order
panda_net <- panda_net[,c("tf","gene","motif","force")]
# rename the PANDA output colnames
colnames(panda_net) <- c("TF","Gene","Motif","Score")
}else{
# adjust column order
panda_net <- panda_net[,c("tf","gene","force")]
# rename the PANDA output colnames
colnames(panda_net) <- c("TF","Gene","Score")
}
# in-degree of panda network
py_run_string(paste("indegree=panda_obj.return_panda_indegree()"))
indegree_net <- py$indegree
indegree_net <- as.data.frame(cbind(Target = rownames(indegree_net), Target_Score = indegree_net$force), stringsAsFactors =FALSE)
indegree_net$`Target_Score` <- as.numeric(indegree_net$`Target_Score`)
# out-degree of panda netwook
py_run_string(paste("outdegree=panda_obj.return_panda_outdegree()"))
outdegree_net <- py$outdegree
outdegree_net <- as.data.frame(cbind(Regulator = rownames(outdegree_net), Regulator_Score = outdegree_net$force), stringsAsFactors =FALSE)
outdegree_net$`Regulator_Score` <- as.numeric(outdegree_net$`Regulator_Score`)
if( length(intersect(panda_net$Gene, panda_net$TF))>0){
panda_net$TF <- paste('reg_', panda_net$TF, sep='')
panda_net$Gene <- paste('tar_', panda_net$Gene, sep='')
message("Rename the content of first two columns with prefix 'reg_' and 'tar_' as there are some duplicate node names between the first two columns" )
}
output <- list("panda" = panda_net, "indegree" = indegree_net, "outdegree" = outdegree_net)
} else{ py_run_string("panda_network=panda_obj.panda_network",local = FALSE, convert = TRUE)
panda_net <- py$panda_network
# weighted adjacency matrix of PANDA network
output <- list("WAMpanda" = panda_net)
}
message ("...Finish PANDA...")
return(output)
}