-
Notifications
You must be signed in to change notification settings - Fork 13
/
desc-15-PSSM.R
285 lines (250 loc) 路 11 KB
/
desc-15-PSSM.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
#' Compute PSSM (Position-Specific Scoring Matrix) for given protein sequence
#'
#' This function calculates the PSSM (Position-Specific Scoring Matrix) derived
#' by PSI-Blast for given protein sequence or peptides.
#'
#' For given protein sequences or peptides, PSSM represents the
#' log-likelihood of the substitution of the 20 types of amino acids at that
#' position in the sequence. Note that the output value is not normalized.
#'
#' @param seq Character vector, as the input protein sequence.
#' @param start.pos Optional integer denoting the start position of the
#' fragment window. Default is \code{1},
#' i.e. the first amino acid of the given sequence.
#' @param end.pos Optional integer denoting the end position of the
#' fragment window. Default is \code{nchar(seq)},
#' i.e. the last amino acid of the given sequence.
#' @param psiblast.path Character string indicating the path of the
#' \code{psiblast} program.
#' If NCBI Blast+ was previously installed in the operation system,
#' the path will be automatically detected.
#' @param makeblastdb.path Character string indicating the path of the
#' \code{makeblastdb} program.
#' If NCBI Blast+ was previously installed in the system,
#' the path will be automatically detected.
#' @param database.path Character string indicating the path of
#' a reference database (a FASTA file).
#' @param iter Number of iterations to perform for PSI-Blast.
#' @param silent Logical. Whether the PSI-Blast running output
#' should be shown or not (May not work on some Windows versions and
#' PSI-Blast versions), default is \code{TRUE}.
#'
#' @param evalue Expectation value (E) threshold for saving hits.
#' Default is \code{10}.
#' @param word.size Word size for wordfinder algorithm. An integer >= 2.
#' @param gapopen Integer. Cost to open a gap.
#' @param gapextend Integer. Cost to extend a gap.
#' @param matrix Character string. The scoring matrix name
#' (default is \code{"BLOSUM62"}).
#' @param threshold Minimum word score such that the word is added to
#' the BLAST lookup table. A real value >= 0.
#' @param seg Character string. Filter query sequence with SEG (\code{"yes"},
#' \code{"window locut hicut"}, or \code{"no"} to disable).
#' Default is \code{"no"}.
#' @param soft.masking Logical. Apply filtering locations as soft masks?
#' Default is \code{FALSE}.
#' @param culling.limit An integer >= 0. If the query range of a hit is
#' enveloped by that of at least this many higher-scoring hits,
#' delete the hit. Incompatible with \code{best.hit.overhang} and
#' \code{best_hit_score_edge}.
#' @param best.hit.overhang Best Hit algorithm overhang value
#' (A real value >= 0 and =< 0.5, recommended value: 0.1).
#' Incompatible with \code{culling_limit}.
#' @param best.hit.score.edge Best Hit algorithm score edge value
#' (A real value >=0 and =< 0.5, recommended value: 0.1).
#' Incompatible with \code{culling_limit}.
#' @param xdrop.ungap X-dropoff value (in bits) for ungapped extensions.
#' @param xdrop.gap X-dropoff value (in bits) for preliminary gapped extensions.
#' @param xdrop.gap.final X-dropoff value (in bits) for final gapped alignment.
#' @param window.size An integer >= 0. Multiple hits window size,
#' To specify 1-hit algorithm, use \code{0}.
#' @param gap.trigger Number of bits to trigger gapping. Default is \code{22}.
#' @param num.threads Integer. Number of threads (CPUs) to use in the
#' BLAST search. Default is \code{1}.
#' @param pseudocount Integer. Pseudo-count value used when constructing PSSM.
#' Default is \code{0}.
#' @param inclusion.ethresh E-value inclusion threshold for pairwise alignments.
#' Default is \code{0.002}.
#'
#' @return The original PSSM, a numeric matrix which has
#' \code{end.pos - start.pos + 1} columns and \code{20} named rows.
#'
#' @note
#' The function requires the \code{makeblastdb} and \code{psiblast} programs
#' to be properly installed in the operation system or
#' their paths provided.
#'
#' The two command-line programs are included in the NCBI-BLAST+
#' software package. To install NCBI Blast+ for your operating system, see
#' \url{https://blast.ncbi.nlm.nih.gov/doc/blast-help/downloadblastdata.html}
#' for detailed instructions.
#'
#' Ubuntu/Debian users can directly use the command
#' \code{sudo apt-get install ncbi-blast+} to install NCBI Blast+.
#' For OS X users, download \code{ncbi-blast- ... .dmg} then install.
#' For Windows users, download \code{ncbi-blast- ... .exe} then install.
#'
#' @seealso \link{extractPSSMFeature} \link{extractPSSMAcc}
#'
#' @author Nan Xiao <\url{https://nanx.me}>
#'
#' @export extractPSSM
#'
#' @references
#' Altschul, Stephen F., et al.
#' "Gapped BLAST and PSI-BLAST: a new generation of protein database search programs."
#' \emph{Nucleic acids research} 25.17 (1997): 3389--3402.
#'
#' Ye, Xugang, Guoli Wang, and Stephen F. Altschul.
#' "An assessment of substitution scores for protein profile-profile comparison."
#' \emph{Bioinformatics} 27.24 (2011): 3356--3363.
#'
#' Rangwala, Huzefa, and George Karypis.
#' "Profile-based direct kernels for remote homology detection and fold recognition."
#' \emph{Bioinformatics} 21.23 (2005): 4239--4247.
#'
#' @examples
#' if (Sys.which("makeblastdb") == "" | Sys.which("psiblast") == "") {
#' cat("Cannot find makeblastdb or psiblast. Please install NCBI Blast+ first")
#' } else {
#' x <- readFASTA(system.file(
#' "protseq/P00750.fasta",
#' package = "protr"
#' ))[[1]]
#' dbpath <- tempfile("tempdb", fileext = ".fasta")
#' invisible(file.copy(from = system.file(
#' "protseq/Plasminogen.fasta",
#' package = "protr"
#' ), to = dbpath))
#'
#' pssmmat <- extractPSSM(seq = x, database.path = dbpath)
#'
#' dim(pssmmat) # 20 x 562 (P00750: length 562, 20 Amino Acids)
#' }
extractPSSM <- function(
seq, start.pos = 1L, end.pos = nchar(seq),
psiblast.path = NULL, makeblastdb.path = NULL,
database.path = NULL, iter = 5, silent = TRUE,
evalue = 10L, word.size = NULL,
gapopen = NULL, gapextend = NULL,
matrix = "BLOSUM62", threshold = NULL,
seg = "no", soft.masking = FALSE,
culling.limit = NULL, best.hit.overhang = NULL,
best.hit.score.edge = NULL,
xdrop.ungap = NULL, xdrop.gap = NULL,
xdrop.gap.final = NULL,
window.size = NULL, gap.trigger = 22L,
num.threads = 1L, pseudocount = 0L,
inclusion.ethresh = 0.002) {
if (Sys.which("makeblastdb") == "" & is.null(makeblastdb.path)) {
stop("Please install makeblastdb (included in NCBI BLAST+) or specify makeblastdb.path")
}
if (Sys.which("psiblast") == "" & is.null(psiblast.path)) {
stop("Please install psiblast (included in NCBI BLAST+) or specify psiblast.path")
}
makeblastdb.path <- if (!is.null(makeblastdb.path)) {
makeblastdb.path
} else {
Sys.which("makeblastdb")
}
psiblast.path <- if (!is.null(psiblast.path)) {
psiblast.path
} else {
Sys.which("psiblast")
}
if (is.null(database.path)) stop("Must specify the database (a FASTA file) path")
N <- end.pos - start.pos + 1L
# Prepare data for PSI-Blast
tmpdb <- tempfile("protrPSIBlastDB")
cmddb <- paste0(
shQuote(makeblastdb.path), " -dbtype prot -in ",
shQuote(database.path), " -out ", shQuote(tmpdb)
)
if (silent == TRUE) system(cmddb, ignore.stdout = TRUE) else system(cmddb)
# Basic parameters for PSI-Blast
tmp <- tempfile("protrPSIBlast")
queryFasta <- paste0(tmp, ".fasta")
PSSMfile <- paste0(tmp, ".pssm")
querySeq <- Biostrings::AAStringSet(seq)
Biostrings::writeXStringSet(querySeq, queryFasta)
# Additional parameters for PSI-Blast
if (!is.null(evalue)) {
if (evalue <= 0L) {
stop("evalue must be > 0")
}
}
evalue.arg <- ifelse(evalue == 10L, " ", paste0(" -evalue ", evalue))
if (!is.null(word.size)) {
if (word.size < 2L | word.size >= 6L) {
stop("word.size must be an integer >= 2 and < 6")
}
}
word.size.arg <- ifelse(is.null(word.size), " ", paste0(" -word_size ", word.size))
gapopen.arg <- ifelse(is.null(gapopen), " ", paste0(" -gapopen ", gapopen))
gapextend.arg <- ifelse(is.null(gapextend), " ", paste0(" -gapextend ", gapextend))
matrix.arg <- ifelse(matrix == "BLOSUM62", " ", paste0(" -matrix ", matrix))
if (!is.null(threshold)) {
if (threshold < 0L) {
stop("threshold must be a real number >= 0")
}
}
threshold.arg <- ifelse(is.null(threshold), " ", paste0(" -threshold ", threshold))
seg.arg <- ifelse(seg == "no", " ", paste0(" -seg ", seg))
soft.masking.arg <- ifelse(soft.masking == FALSE, " ", " -soft_masking true")
if (!is.null(culling.limit)) {
if (culling.limit < 0L) {
stop("culling.limit must be an integer >= 0")
}
}
culling.limit.arg <- ifelse(is.null(culling.limit), " ", paste0(" -culling_limit ", culling.limit))
if (!is.null(best.hit.overhang)) {
if (best.hit.overhang < 0L | best.hit.overhang > 1.5) {
stop("best.hit.overhang must be a real number >= 0 and =< 0.5")
}
}
best.hit.overhang.arg <- ifelse(is.null(best.hit.overhang), " ", paste0(" -best_hit_overhang ", best.hit.overhang))
if (!is.null(best.hit.score.edge)) {
if (best.hit.score.edge < 0L | best.hit.score.edge > 1.5) {
stop("best.hit.score.edge must be a real number >= 0 and =< 0.5")
}
}
best.hit.score.edge.arg <- ifelse(is.null(best.hit.score.edge), " ", paste0(" -best_hit_score_edge ", best.hit.score.edge))
xdrop.ungap.arg <- ifelse(is.null(xdrop.ungap), " ", paste0(" -xdrop_ungap ", xdrop.ungap))
xdrop.gap.arg <- ifelse(is.null(xdrop.gap), " ", paste0(" -xdrop_gap ", xdrop.gap))
xdrop.gap.final.arg <- ifelse(is.null(xdrop.gap.final), " ", paste0(" -xdrop_gap_final ", xdrop.gap.final))
window.size.arg <- ifelse(is.null(window.size), " ", paste0(" -window_size ", window.size))
gap.trigger.arg <- ifelse(gap.trigger == 22L, " ", paste0(" -gap_trigger ", gap.trigger))
num.threads.arg <- ifelse(num.threads == 1L, " ", paste0(" -num_threads ", num.threads))
pseudocount.arg <- ifelse(pseudocount == 0L, " ", paste0(" -pseudocount ", pseudocount))
inclusion.ethresh.arg <- ifelse(inclusion.ethresh == 0.002, " ", paste0(" -inclusion_ethresh ", inclusion.ethresh))
# Run PSI-Blast
cmdpsi <- paste(
paste0(
shQuote(psiblast.path),
" -comp_based_stats 1 -db ", shQuote(tmpdb),
" -query ", shQuote(queryFasta), " -num_iterations ", iter,
" -out_ascii_pssm ", shQuote(PSSMfile)
),
paste0(
evalue.arg, word.size.arg, gapopen.arg, gapextend.arg,
matrix.arg, threshold.arg, seg.arg, soft.masking.arg,
culling.limit.arg, best.hit.overhang.arg, best.hit.score.edge.arg,
xdrop.ungap.arg, xdrop.gap.arg, xdrop.gap.final.arg, window.size.arg,
gap.trigger.arg, num.threads.arg, pseudocount.arg,
inclusion.ethresh.arg
)
)
if (silent == TRUE) system(cmdpsi, ignore.stdout = TRUE) else system(cmdpsi)
PSSMraw <- readLines(PSSMfile)
PSSMClean <- function(x) {
y <- unlist(strsplit(PSSMraw[x + 3], split = "\\s+"))
-as.numeric(y[4L:23L])
}
PSSMmat <- sapply(start.pos:end.pos, PSSMClean)
AADict <- c(
"A", "R", "N", "D", "C", "Q", "E", "G", "H", "I",
"L", "K", "M", "F", "P", "S", "T", "W", "Y", "V"
)
rownames(PSSMmat) <- AADict
PSSMmat
}