/
scoremats.r
305 lines (279 loc) · 12.1 KB
/
scoremats.r
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
#' Return scores (or distances) for given query and target neurons
#'
#' Scores can either be returned as raw numbers, normalised such that a self-hit
#' has score 1, or as the average of the normalised scores in both the forwards
#' & reverse directions (i.e. \code{|query->target| + |target->query| / 2}).
#' Distances are returned as either \code{1 - normscore} in the forwards
#' direction, or as \code{1 - normscorebar}, where \code{normscorebar} is
#' \code{normscore} averaged across both directions.
#' @param query,target character vectors of neuron identifiers.
#' @param scoremat a matrix, ff matrix, \code{bigmatrix} or a character vector
#' specifying the name of an ff matrix containing the all by all score
#' matrix.
#' @param distance logical indicating whether to return distances or scores.
#' @param normalisation the type of normalisation procedure that should be
#' carried out, selected from \code{'raw'}, \code{'normalised'} or
#' \code{'mean'} (i.e. the average of normalised scores in both directions).
#' If \code{distance=TRUE} then this cannot be raw.
#' @importFrom spam as.matrix
#' @export
#' @seealso \code{\link{sub_dist_mat}}
sub_score_mat <- function(query, target, scoremat=NULL, distance=FALSE, normalisation=c('raw', 'normalised', 'mean')) {
# Check arguments
normalisation <- match.arg(normalisation)
if(distance && normalisation == 'raw') stop("Raw scores are always similarity scores.")
if(is.null(scoremat)) stop("A score matrix must be provided!")
if(!identical(length(dim(scoremat)),2L)) stop("scoremat must be a matrix!")
# Check for missing query and target neurons
rnames <- rownames(scoremat)
if(missing(target)) target <- rnames
else {
target_missing <- setdiff(target, rnames)
if(length(target_missing) > 0) {
warning("Dropping ", length(target_missing), " target neurons.")
target <- intersect(target, rnames)
}
}
cnames=colnames(scoremat)
if(missing(query)) query <- cnames
else {
query_missing <- setdiff(query, cnames)
if(length(query_missing) > 0) {
warning("Dropping ", length(query_missing), " query neurons.")
query <- intersect(query, cnames)
}
}
# Subsetting large matrices by name is slow, so pre-calculate indices
qidxs <- match(query, cnames)
tidxs <- match(target, rnames)
fwd_scores <- scoremat[tidxs, qidxs, drop=FALSE]
if(inherits(fwd_scores, 'spam')) {
fwd_scores <- as.matrix(fwd_scores)
rownames(fwd_scores) <- rnames[tidxs]
colnames(fwd_scores) <- cnames[qidxs]
}
# Check if we have been asked to provide a square matrix
square_mat <- length(qidxs) == length(tidxs) && all(qidxs==tidxs)
x <- if(normalisation %in% c('mean', 'normalised')) {
# Normalise forward scores
self_matches <- if(square_mat) diag(fwd_scores) else diagonal(scoremat, qidxs)
fwd_scores <- scale(fwd_scores, center=FALSE, scale=self_matches)
if(normalisation == 'mean') {
if(square_mat) {
(fwd_scores + t(fwd_scores)) / 2
} else {
rev_scores <- scoremat[qidxs, tidxs, drop=FALSE]
if(inherits(rev_scores, 'spam')) {
rev_scores <- as.matrix(rev_scores)
rownames(rev_scores) <- rnames[qidxs]
colnames(rev_scores) <- cnames[tidxs]
}
self_matches <- diagonal(scoremat, tidxs)
rev_scores <- scale(rev_scores, center=FALSE, scale=self_matches)
(fwd_scores + t(rev_scores)) / 2
}
} else {
fwd_scores
}
} else {
fwd_scores
}
# Drop dimensions in the standard R way (including names, etc.)
if(inherits(scoremat, 'spam')) x <- as.matrix(x)
if(nrow(x) == 1 || ncol(x) == 1) x <- x[seq_len(nrow(x)), seq_len(ncol(x))]
if(distance) 1-x else x
}
#' Convert (a subset of) a raw score matrix to a distance matrix
#'
#' @description This function can convert a raw score matrix returned by
#' \code{nblast} into a square distance matrix or \code{dist} object. It can
#' be used with file-backed matrices as well as regular R matrices resident in
#' memory.
#'
#' @details Note that if \code{neuron_names} is missing then the rownames of
#' \code{scoremat} will be used i.e. every neuron in \code{scoremat} will be
#' used.
#'
#' @inheritParams nhclust
#' @param form the type of object to return.
#' @param maxneurons set this to a sensible value to avoid loading huge (order
#' N^2) distances directly into memory.
#' @return return An object of class matrix or dist (as determined by the form
#' argument), corresponding to a subset of the distance matrix
#' @export
#' @family scoremats
#' @importFrom stats as.dist
sub_dist_mat <- function(neuron_names, scoremat=NULL, form=c('matrix', 'dist'), maxneurons=NA){
form <- match.arg(form)
if(missing(neuron_names) || is.null(neuron_names)){
if(nrow(scoremat)!=ncol(scoremat))
stop("scoremat must be square if neuron_names is missing")
neuron_names=rownames(scoremat)
}
if(!is.na(maxneurons) && length(neuron_names) > maxneurons) {
stop("Too many neurons! Use maxneurons to override if you're sure.")
}
d <- sub_score_mat(neuron_names, neuron_names, scoremat=scoremat, distance=TRUE, normalisation='mean')
if(form=='matrix') d
else as.dist(d)
}
#' Extract diagonal terms from a variety of matrix types
#'
#' @details Insists that input matrix is square. Uses the \code{'diagonal'}
#' attribute when available and has specialised handling of \code{ff},
#' \code{big.matrix}, \code{dgCMatrix} matrices. Does not check that row and
#' column names are identical for those matrix classes (unlike the base
#' \code{\link{diag}} function, but always uses rownames.
#' @param x A square matrix
#' @param indices specifies a subset of the diagonal using a character vector of
#' names, a logical vector or integer indices. The default (\code{NULL})
#' implies all elements.
#' @return a named vector containing the diagonal elements.
#' @export
#' @examples
#' m=fill_in_sparse_score_mat(letters[1:5])
#' diagonal(m)
diagonal <- function(x, indices=NULL) UseMethod('diagonal')
#' @importFrom spam diag
#' @rdname diagonal
#' @export
diagonal.default <- function(x, indices=NULL) {
if(!isTRUE(nrow(x)==ncol(x))) stop("x is not a square matrix!")
if(is.character(indices)) indices=match(indices,rownames(x))
if(!is.null(xdiag<-attr(x,'diagonal'))){
return(if(is.null(indices)) xdiag else xdiag[indices])
}
if(is.logical(indices)) indices=which(indices)
if(inherits(x,"ff")){
# convert array indices to vector indices
if(is.null(indices)) indices=seq_len(nrow(x))
vidxs=ff::arrayIndex2vectorIndex(cbind(indices,indices),dim=dim(x))
# by default we don't get the names back
structure(x[vidxs], .Names=rownames(x)[indices])
} else if(inherits(x,"big.matrix")) {
fast_disk_diag(x, indices, use.names=TRUE)
} else if(inherits(x, 'spam')) {
spam_diag <- diag(x)
names(spam_diag) <- rownames(x)
if(is.null(indices)) spam_diag else spam_diag[indices]
} else {
if(is.null(indices)) diag(x) else diag(x)[indices]
}
}
# Utility function to get diagonal of on disk faster than it can manage by itself
# by reading in reasonble size chunks
fast_disk_diag<-function(x, indices=NULL, chunksize=300, use.names=TRUE) {
if(is.null(indices)) indices=seq_len(nrow(x))
ninds=length(indices)
diags=rep(NA,ninds)
if(ninds>chunksize){
for(i in seq.int(from=0,by=chunksize,to=ninds-chunksize)) {
sq=x[indices[i+1:chunksize],indices[i+1:chunksize]]
diags[i+1:chunksize]=diag(sq)
}
# next index will be
i=i+chunksize+1
} else {
# we'll be starting from scratch
i=1
}
if(i<=ninds){
sq=x[indices[i:ninds],indices[i:ninds], drop=FALSE]
diags[i:ninds]=diag(sq)
}
if(use.names) names(diags)=rownames(x)[indices]
diags
}
#' Convert a subset of a square score matrix to a sparse representation
#'
#' This can be useful for storing raw forwards and reverse NBLAST scores for a
#' set of neurons without having to store all the uncomputed elements in the
#' full score matrix.
#'
#' @param neuron_names a character vector of neuron names to save scores for.
#' @param dense_matrix the original, dense version of the full score matrix.
#'
#' @return A spare matrix, in compressed, column-oriented form, as an R object
#' inheriting from both \code{\link[Matrix]{CsparseMatrix-class}} and
#' \code{\link[Matrix]{generalMatrix-class}}.
#' @export
#' @importFrom spam spam
#' @seealso fill_in_sparse_score_mat
#' @examples
#' data(kcs20, package = "nat")
#' scores=nblast_allbyall(kcs20)
#' scores.3.sparse=sparse_score_mat(names(kcs20)[3], scores)
#' scores.3.sparse
#' # can also add additional submatrices
#' fill_in_sparse_score_mat(scores.3.sparse,scores[3:6,3:4])
sparse_score_mat <- function(neuron_names, dense_matrix) {
col_num <- which(colnames(dense_matrix) %in% neuron_names)
row_num <- which(rownames(dense_matrix) %in% neuron_names)
spmat <- spam(list(i=1:nrow(dense_matrix), j=1:nrow(dense_matrix), diagonal(dense_matrix)))
spmat[row_num, ] <- dense_matrix[row_num, ]
spmat[, col_num] <- dense_matrix[, col_num]
dimnames(spmat) <- dimnames(dense_matrix)
spmat
}
# Utility function to convert a vector of scores into named row or column matrix
neuron_scores_to_mat <- function(scores, query, target) {
if(!missing(query)){
matrix(scores, nrow=length(scores), dimnames=c(names(scores), query))
} else {
matrix(scores, ncol=length(scores), dimnames=c(target, names(scores)))
}
}
#' Add one or more submatrices to a sparse score matrix
#'
#' @param sparse_matrix either an existing (square) sparse matrix or a character
#' vector of names that will be used to define an empty sparse matrix.
#' @param diag optional full diagonal for sparse matrix i.e. self-match scores.
#' @param ... Additional matrices to insert into \code{sparse_matrix}. Row and
#' column names must have matches in \code{sparse_matrix}.
#' @seealso sparse_score_mat
#' @importFrom spam spam
#' @export
fill_in_sparse_score_mat <- function(sparse_matrix, ..., diag=NULL) {
if(is.character(sparse_matrix)) {
spmat <- spam(0, ncol=length(sparse_matrix), nrow=length(sparse_matrix))
dimnames(spmat) <- list(sparse_matrix, sparse_matrix)
sparse_matrix <- spmat
}
stored_dimnames <- dimnames(sparse_matrix)
if(!is.null(diag)) {
diag_inds <- seq.int(from=1, by = nrow(sparse_matrix)+1, length.out=ncol(sparse_matrix))
sparse_matrix[diag_inds] <- diag
}
dense_matrices <- list(...)
for(dense_matrix in dense_matrices) {
if(!is.matrix(dense_matrix)) stop("I only work with matrices.\\nMaybe you need to use neuron_scores_to_mat to convert a vector of scores to a matrix!")
row_nums <- match(rownames(dense_matrix), rownames(sparse_matrix))
col_nums <- match(colnames(dense_matrix), colnames(sparse_matrix))
sparse_matrix[row_nums, col_nums] <- dense_matrix
}
dimnames(sparse_matrix) <- stored_dimnames
sparse_matrix
}
#' Add forwards, reverse and self scores for a pair of neurons to a sparse score matrix
#'
#' @param sparse_matrix the sparse matrix to fill in.
#' @param n1 the name of the query neuron.
#' @param n2 the name of the target neuron.
#' @param dense_matrix the score matrix from which to extract scores.
#' @param reverse logical indicating that the reverse score should also be filled in (default \code{TRUE}).
#' @param self logical indicating that the self-score of the query should also be filled in (used for normalised scores; default \code{TRUE}).
#' @param reverse_self logical indicating that the self-score of the target should also be filled in (used for mean scores; default \code{TRUE}).
#'
#' @return A sparse matrix (of class \code{\link[spam]{spam}}) with the specified score entries filled.
#' @export
fill_pairs_sparse_score_mat <- function(sparse_matrix, n1, n2, dense_matrix, reverse=TRUE, self=TRUE, reverse_self=(reverse && self)) {
stored_dimnames <- dimnames(sparse_matrix)
n1s <- match(n1, colnames(sparse_matrix))
n2s <- match(n2, rownames(sparse_matrix))
sparse_matrix[n1s, n2s] <- dense_matrix[n1, n2]
if(reverse) sparse_matrix[n2s, n1s] <- dense_matrix[n2, n1]
if(self) sparse_matrix[n1s, n1s] <- dense_matrix[n1, n1]
if(reverse_self) sparse_matrix[n2s, n2s] <- dense_matrix[n2, n2]
dimnames(sparse_matrix) <- stored_dimnames
sparse_matrix
}