forked from cran/SNFtool
-
Notifications
You must be signed in to change notification settings - Fork 15
/
SNF.R
175 lines (162 loc) · 6.84 KB
/
SNF.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
#' Similarity Network Fusion
#'
#' Similarity Network Fusion takes multiple views of a network and fuses them
#' together to construct an overall status matrix. The input to our algorithm
#' can be feature vectors, pairwise distances, or pairwise similarities. The
#' learned status matrix can then be used for retrieval, clustering, and
#' classification.
#'
#'
#' @param Wall List of matrices. Each element of the list is a square,
#' symmetric matrix that shows affinities of the data points from a certain
#' view.
#' @param K Number of neighbors in K-nearest neighbors part of the algorithm.
#' @param t Number of iterations for the diffusion process.
#' @param layer_bias Numerical vector of length length(Wall) containing the
#' relative weights for each layer.
#' @param parallel Should the algorithm attempt to run in parallel? A parallel
#' backend needs to be set up first.
#' @param auto_stop Should the algorithm stop early if it believes it has
#' converged?
#' @return W is the overall status matrix derived
#' @author Dr. Anna Goldenberg, Bo Wang, Aziz Mezlini, Feyyaz Demir
#' @references B Wang, A Mezlini, F Demir, M Fiume, T Zu, M Brudno, B
#' Haibe-Kains, A Goldenberg (2014) Similarity Network Fusion: a fast and
#' effective method to aggregate multiple data types on a genome wide scale.
#' Nature Methods. Online. Jan 26, 2014
#'
#' Concise description can be found here:
#' http://compbio.cs.toronto.edu/SNF/SNF/Software.html
#' @examples
#'
#'
#' ## First, set all the parameters:
#' K = 20; # number of neighbors, usually (10~30)
#' alpha = 0.5; # hyperparameter, usually (0.3~0.8)
#' T = 20; # Number of Iterations, usually (10~20)
#'
#' ## Data1 is of size n x d_1,
#' ## where n is the number of patients, d_1 is the number of genes,
#' ## Data2 is of size n x d_2,
#' ## where n is the number of patients, d_2 is the number of methylation
#' data(Data1)
#' data(Data2)
#'
#' ## Here, the simulation data (SNFdata) has two data types. They are complementary to each other.
#' ## And two data types have the same number of points.
#' ## The first half data belongs to the first cluster; the rest belongs to the second cluster.
#' truelabel = c(matrix(1,100,1),matrix(2,100,1)); ## the ground truth of the simulated data
#'
#' ## Calculate distance matrices
#' ## (here we calculate Euclidean Distance, you can use other distance, e.g,correlation)
#'
#' ## If the data are all continuous values, we recommend the users to perform
#' ## standard normalization before using SNF,
#' ## though it is optional depending on the data the users want to use.
#' # Data1 = standardNormalization(Data1);
#' # Data2 = standardNormalization(Data2);
#'
#'
#'
#' ## Calculate the pair-wise distance;
#' ## If the data is continuous, we recommend to use the function "dist2" as follows
#' Dist1 = dist2(as.matrix(Data1),as.matrix(Data1));
#' Dist2 = dist2(as.matrix(Data2),as.matrix(Data2));
#'
#' ## next, construct similarity graphs
#' W1 = affinityMatrix(Dist1, K, alpha)
#' W2 = affinityMatrix(Dist2, K, alpha)
#'
#' ## These similarity graphs have complementary information about clusters.
#' displayClusters(W1,truelabel);
#' displayClusters(W2,truelabel);
#'
#' ## next, we fuse all the graphs
#' ## then the overall matrix can be computed by similarity network fusion(SNF):
#' W = SNF(list(W1,W2), K, T)
#'
#' ## With this unified graph W of size n x n,
#' ## you can do either spectral clustering or Kernel NMF.
#' ## If you need help with further clustering, please let us know.
#'
#' ## You can display clusters in the data by the following function
#' ## where C is the number of clusters.
#' C = 2 # number of clusters
#' group = spectralClustering(W,C); # the final subtypes information
#' displayClusters(W, group)
#'
#' ## You can get cluster labels for each data point by spectral clustering
#' labels = spectralClustering(W, C)
#'
#' plot(Data1, col=labels, main='Data type 1')
#' plot(Data2, col=labels, main='Data type 2')
#'
SNF <- function(Wall,K=20,t=20,layer_bias = rep.int(1, length(Wall)),parallel=FALSE, auto_stop=FALSE) {
###This function is the main function of our software. The inputs are as follows:
# Wall : List of affinity matrices
# K : number of neighbors
# t : number of iterations for fusion
# layer_bias : Numerical vector of length length(Wall) containing the relative weights for each layer.
# parallel : Should the algorithm attempt to run in parallel? A parallel backend needs to be set up first.
# auto_stop : Should the algorithm stop early if it believes it has converged?
###The output is a unified similarity graph. It contains both complementary information and common structures from all individual network.
###You can do various applications on this graph, such as clustering(subtyping), classification, prediction.
LW = length(Wall)
normalize <- function(X) X / rowSums(X)
# makes elements other than largest K zero
newW <- vector("list", LW)
nextW <- vector("list", LW)
###First, normalize different networks to avoid scale problems.
for( i in 1: LW){
Wall[[i]] = normalize(Wall[[i]]);
Wall[[i]] = (Wall[[i]]+t(Wall[[i]]))/2;
}
### Calculate the local transition matrix.
for( i in 1: LW){
newW[[i]] = (.dominateset(Wall[[i]],K))
}
# Set up convergence detection
devs <- NULL
converged <- FALSE
# perform the diffusion for t iterations
for (i in 1:t) {
nextW <- plyr::llply(1:LW, function(j){
sumWJ = matrix(0, dim(Wall[[j]])[1], dim(Wall[[j]])[2])
adjusted_bias <- layer_bias/sum(layer_bias[-j])
for (k in setdiff(1:LW, j)) {
sumWJ = sumWJ + (Wall[[k]]*adjusted_bias[k])
}
return(newW[[j]] %*% (sumWJ/(LW - 1)) %*% t(newW[[j]]))
}, .parallel=parallel)
###Normalize each new obtained networks.
for(j in 1 : LW){
Wall[[j]] = nextW[[j]] + diag(nrow(Wall[[j]]));
Wall[[j]] = (Wall[[j]] + t(Wall[[j]]))/2;
}
# Check for convergence
wallarray <- array(unlist(Wall),dim=c(dim(Wall[[1]]),length(Wall)))
means <- base::rowMeans(wallarray, dims=2)
dev <- mean(abs(wallarray-array(means,dim=dim(wallarray))) / mean(means))
devs <- c(devs,dev)
d_devs <- (devs-lag(devs))
d2_devs <- (d_devs-lag(d_devs))
if(!is.na(d2_devs[length(d2_devs)]) &
abs(d_devs[length(d_devs)])<0.01 &
abs(d2_devs[length(d2_devs)])<0.01){
converged <- TRUE
}
if(auto_stop & converged){
break
}
}
# construct the combined affinity matrix by summing diffused matrices
W = matrix(0,nrow(Wall[[1]]), ncol(Wall[[1]]))
for(i in 1:LW){
W = W + Wall[[i]]
}
W = W/LW;
W = normalize(W);
# ensure affinity matrix is symmetrical
W = (W + t(W)+diag(nrow(W))) / 2;
return(W)
}