#' Compute the similarity matrix
#'
#' Computes a low-dimensional embedding of the data and constructs a KNN graph
#' on that embedding to obtain an unweighted adjacency matrix. It then calls
#' the manifold-learning step, which uses the normalized sample vectors and the
#' unweighted adjacency matrix to compute a low-rank approximation of the data.
#'
#' @param data the expression data, where each column is treated as a normalized vector
#' @param lambda the balance term between the rank of Z and the error; default is 0.5
#' @param pre_embed_method how the initial non-linear embedding is performed,
#'   either 'umap' (the default) or 'tsne'
#' @param comps_knn number of embedding components to use for KNN; overrides
#'   the eigengap-based inference when supplied
#' @param use_umap_indices use the KNN indices computed during the umap
#'   embedding to impose sparsity on L2R2, instead of recomputing them from the layout
#' @param ... extra arguments passed to umap or Rtsne
#'
#' @return a list containing
#' \item{W}{the similarity matrix}
#' \item{E}{the error of the ADMM step}
#' \item{nl_embedding}{the non-linear embedding on which the KNN sparsity constraint is based}
#'
#' @importFrom Rtsne Rtsne
#' @importFrom Matrix as.matrix norm
#' @importFrom FNN get.knnx
#' @importFrom stats prcomp
#' @import umap
#'
#' @export
#'
SimilarityM <- function(lambda = 0.5, data, comps_knn = NULL,
                        use_umap_indices = FALSE, pre_embed_method = 'umap', ...) {
  set.seed(1)  # for reproducibility of the embedding
  m <- nrow(data)
  n <- ncol(data)
  # Shift and scale the data into [0, 1], then normalize each column (sample)
  # to unit length
  X <- data - min(data)
  X <- X / max(X)
  for (i in 1:n) {
    X[, i] <- X[, i] / norm(X[, i], "2")
  }
  # PCA eigenvalues drive both the number of embedding components and the
  # KNN size chosen below
  pca_data <- prcomp(t(X))
  pca_eigvalue1 <- pca_data$sdev^2
  # Eigengap heuristic: find the largest gap between consecutive eigenvalues
  # (skipping the first); the +2 offset maps the gap index back to the spectrum
  eigengaps <- abs(pca_eigvalue1[2:(length(pca_eigvalue1) - 1)] - pca_eigvalue1[-(1:2)])
  No_Comps1 <- which.max(eigengaps)
  No_Comps1 <- No_Comps1 + 2
  # A user-supplied comps_knn overrides the eigengap-based inference
  if (is.null(comps_knn)) {
    comps_knn <- No_Comps1
  }
  # Choose the KNN size K from the cumulative variance of the trailing
  # eigenvalues, clamped to the range [10, 30]
  cc <- cumsum(pca_eigvalue1[-1])
  dd <- cc[-1] / sum(pca_eigvalue1[-1])
  K1 <- sum(dd <= 0.3)
  if (K1 <= 10) {
    K <- 10
  } else if (K1 >= 30) {
    K <- 30
  } else {
    K <- K1 + 1
  }
  if (pre_embed_method == 'tsne') {
    X2 <- Rtsne(X = t(as.matrix(X)), ...)
    D <- matrix(1, n, n)
    # Guard against requesting more layout columns than Rtsne produced
    kdims <- min(comps_knn, ncol(X2$Y))
    knn_object <- get.knnx(X2$Y[, 1:kdims], X2$Y[, 1:kdims], k = K)
    IDX <- knn_object$nn.index
  } else if (pre_embed_method == 'umap') {
    data.umap <- umap(d = t(as.matrix(data)), n_components = comps_knn, ...)
    X2 <- data.umap$layout
    D <- matrix(1, n, n)
    knn_object <- get.knnx(X2[, 1:comps_knn], X2[, 1:comps_knn], k = K)
    IDX <- knn_object$nn.index
    if (use_umap_indices) {
      IDX <- data.umap$knn$indexes
    }
  } else {
    stop("pre_embed_method must be 'umap' or 'tsne'")
  }
  # D is 1 everywhere except at each sample's K nearest neighbours; the zero
  # entries mark the only positions where Z is allowed to be non-zero
  for (jj in 1:n) {
    D[jj, IDX[jj, ]] <- 0
  }
  Z <- computeM(D, X, lambda)
  # Zero out numerical noise, then symmetrize to obtain the similarity matrix
  Z$Z[Z$Z <= .Machine$double.eps] <- 0
  W <- 0.5 * (abs(Z$Z) + abs(t(Z$Z)))
  return(list(W = W, E = Z$E, nl_embedding = X2))
}
#' Perform ADMM
#'
#' Compute cell-to-cell similarity matrix by solving the following
#' optimization problem via ADMM
#' \deqn{\min_{Z,E} \|Z\|_* + \lambda \|E\|_{2,1} \quad \mathrm{s.t.} \quad
#' X = XZ + E, \quad Z^\top \mathbf{1} = \mathbf{1}, \quad
#' Z_{ij} = 0 \ \forall (i,j) \in \Omega}
#'
#' @param D the KNN sparsity pattern: 0 at each sample's nearest neighbours,
#'   1 elsewhere; entries of Z where D > 0 are constrained to zero
#' @param X normalized sample vectors (one per column)
#' @param lambda the balance term between the rank of Z and the error
#' @return a list containing the low-rank coefficient matrix Z and the
#'   manifold-learning error
computeM <- function(D, X, lambda) {
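  # ADMM on the augmented Lagrangian with multipliers Y1 (for X = XZ + E),
  # Y2 (for 1'Z = 1') and Y3 (for Z = J, the nuclear-norm surrogate);
  # mu is the penalty weight, grown by a factor rho up to mumax.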
  ## ADMM iteration
  maxiter <- 100
  Err <- matrix(0, maxiter, 2)
  rho <- 5
  mu <- 1e-6
  mumax <- 1e6
  epsilon <- 1e-5
  m <- nrow(X)
  n <- ncol(X)
  Z <- matrix(0, n, n)
  E <- matrix(0, m, n)
  Y1 <- matrix(0, m, n)
  Y2 <- matrix(0, 1, n)
  Y3 <- matrix(0, n, n)
  iter <- 0
  XXZ <- X - X %*% Z
  while (TRUE) {
    iter <- iter + 1
    if (iter >= maxiter) {
      print("reached max iter")
      return(list(Z = Z, E = norm(XXZ, "f")))
    }
    # Step 1: update J by singular value thresholding of Z - Y3/mu
    mu <- min(rho * mu, mumax)
    svd_data <- svd(Z - Y3 / mu)
    U <- svd_data$u
    V <- svd_data$v
    # Soft-threshold the singular values at 1/mu
    Dmu <- diag(pmax(svd_data$d - 1 / mu, 0), nrow = length(svd_data$d))
    J <- U %*% Dmu %*% t(V)
    # Stop once the primal error stops decreasing or the fit is good enough
    if (iter >= 3) {
      if (Err[iter - 1, 1] >= Err[iter - 2, 1] || norm(XXZ, "2") <= epsilon) {
        print("convergence by error min")
        return(list(Z = Z, E = norm(XXZ, "f")))
      }
    }
    # Step 2: update E via the column-wise l2,1 proximal operator: each column
    # q of QQ shrinks to ((||q|| - lambda/mu) / ||q||) * q, and columns with
    # ||q|| <= lambda/mu are set to zero
    QQ <- XXZ + Y1 / mu
    E <- apply(QQ, 2, function(x) {
      normX <- norm(as.matrix(x), type = '2')
      as.vector((normX - lambda / mu) / normX) * x
    })
    normsQQ <- apply(QQ, 2, function(x) {
      norm(as.matrix(x), type = '2')
    })
    for (j in 1:n) {
      if (normsQQ[j] <= lambda / mu) {
        E[, j] <- 0
      }
    }
    # Step 3: gradient step on Z (eta bounds the Lipschitz constant), then
    # enforce the sparsity constraint Z[i, j] = 0 outside the KNN set
    eta <- norm(X, "2")^2 + norm(matrix(1, n, 1), "2")^2 + 1
    H <- -t(X) %*% (XXZ - E + (1 / mu) * Y1) -
      matrix(1, n, 1) %*% (matrix(1, 1, n) - matrix(1, 1, n) %*% Z + (1 / mu) * Y2) +
      (Z - J + (1 / mu) * Y3)
    Z <- Z - (1 / eta) * H
    Z[D > 0] <- 0
    XXZ <- X - X %*% Z
    # Step 4: update the dual variables
    Y1 <- Y1 + mu * (XXZ - E)
    Y2 <- Y2 + mu * (matrix(1, 1, n) - matrix(1, 1, n) %*% Z)
    Y3 <- Y3 + mu * (Z - J)
    Err[iter, ] <- c(norm(XXZ - E, "2"), norm(Z - J, "2"))
    print(paste0("Err = ", Err[iter, 1]))
    if (max(Err[iter, ]) <= epsilon) {
      print("reached epsilon")
      return(list(Z = Z, E = norm(XXZ, "f")))
    }
  }
}
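
# ---------------------------------------------------------------------------
# Illustrative sketches (not part of the package API): standalone versions of
# the two proximal operators used inside computeM. The names `svt_sketch` and
# `prox_l21_sketch` are hypothetical and exist only for exposition.

# Singular value thresholding, as in the J-update: soft-threshold the singular
# values of A at tau and reassemble the matrix.
svt_sketch <- function(A, tau) {
  s <- svd(A)
  d <- pmax(s$d - tau, 0)  # shrink the singular values towards zero
  s$u %*% (d * t(s$v))     # equivalent to s$u %*% diag(d) %*% t(s$v)
}

# Column-wise l2,1 proximal operator, as in the E-update: shrink each column's
# norm by tau, zeroing columns whose norm falls below tau. For example,
# prox_l21_sketch(QQ, lambda / mu) reproduces the E-update above.
prox_l21_sketch <- function(Q, tau) {
  apply(Q, 2, function(q) {
    nq <- sqrt(sum(q^2))
    if (nq <= tau) rep(0, length(q)) else (1 - tau / nq) * q
  })
}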