#' Logistic Non-negative Matrix Factorization
#'
#' This function performs Logistic Non-negative Matrix Factorization (NMF) on a binary matrix.
#'
#' @param X A binary matrix (m x n) to be factorized.
#' @param k The number of factors (components, topics).
#' @param optimizer Type of updating algorithm. `mur` for NMF multiplicative update rules, `gradient` for gradient descent, `sgd` for stochastic gradient descent.
#' @param init Method for initializing the factorization: `nndsvd` (the default; Nonnegative Double Singular Value Decomposition with average densification), `svd`, or `random`.
#' @param max_iter Maximum number of iterations for optimization.
#' @param tol Convergence tolerance. The optimization stops when the change in loss is less than this value.
#' @param learning_rate Learning rate (step size) for the `gradient` and `sgd` optimizers.
#' @param verbose Print convergence if `TRUE`.
#' @param loss_fun Choice of loss function: `logloss` (negative log-likelihood, also known as binary cross-entropy) or `mse` (mean squared error).
#' @param loss_normalize Normalize loss by matrix dimensions if `TRUE`.
#' @param epsilon Constant to avoid log(0).
#'
#' @return A list containing:
#' \itemize{
#' \item \code{W}: The basis matrix (m x k). The document-topic matrix in topic modelling.
#' \item \code{H}: The coefficient matrix (k x n). Contribution of features to factors (topics).
#' \item \code{c}: The global threshold. A constant.
#' \item \code{convergence}: Divergence (loss) from `X` at every `iter` until `tol` or `max_iter` is reached.
#' }
#'
#' @examples
#' \dontrun{
#' # Generate a binary matrix
#' m <- 100
#' n <- 50
#' X <- matrix(sample(c(0, 1), m * n, replace = TRUE), m, n)
#'
#' # Set the number of factors
#' k <- 4
#'
#' # Factorize the matrix with default settings
#' result <- nmfbin(X, k)
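#'
#' # Illustrative follow-up sketch: rebuild the fitted probability matrix from
#' # the returned factors using the same sigmoid link applied internally, and
#' # plot the recorded loss per iteration.
#' X_prob <- 1 / (1 + exp(-(result$W %*% result$H + result$c)))
#' plot(result$convergence, type = "l", xlab = "Iteration", ylab = "Loss")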
#' }
#' @export
nmfbin <- function(X, k, optimizer = "mur", init = "nndsvd", max_iter = 1000, tol = 1e-6, learning_rate = 0.001,
                   verbose = FALSE, loss_fun = "logloss", loss_normalize = TRUE, epsilon = 1e-10) {
  # Check if X contains only 0s and 1s
  if (!all(X %in% c(0, 1))) {
    stop("The input matrix X must only contain 0s and 1s.")
  }
  # Check if X contains at least some 1s
  if (all(X %in% c(0))) {
    stop("The input matrix X must contain at least some 1s.")
  }
  # matrix dimensions
  m <- nrow(X)
  n <- ncol(X)
  # Check if k < ncol(X)
  if (k >= n) {
    stop("Chosen k must be smaller than ncol(X).")
  }
  # Initialize W, H, and threshold c
  init_vals <- initial_values(X, k, m, n, method = init)
  W <- init_vals$W
  H <- init_vals$H
  c <- init_vals$c
  # SGD does not require an initial X_hat
  if (optimizer == "sgd") {
    # placeholder value
    X_hat <- 0
  } else {
    # Compose into initial X_hat
    X_hat <- recombine_matrix(W, H, c)
  }
  # previous loss
  prev_loss <- Inf
  # record convergence
  convergence <- NA
  # iterate
  for (iter in 1:max_iter) {
    # update W, H, c
    updated <- update_step(X, W, H, c, X_hat, m, n, type = optimizer, max_iter = max_iter, tol = tol, learning_rate = learning_rate, epsilon = epsilon)
    W <- updated$W
    H <- updated$H
    c <- updated$c
    # Compose into new X_hat
    X_hat <- recombine_matrix(W, H, c)
    # compute loss
    loss <- loss_function(X_true = X, X_hat = X_hat, type = loss_fun, normalize = loss_normalize, epsilon = epsilon)
    # record loss
    convergence[iter] <- loss
    # Print loss and change
    if (verbose == TRUE) {
      cat(sprintf("Iteration %d | Divergence from source matrix = %f | Change from previous iteration = %f \n", iter, loss, loss - prev_loss))
    }
    # Check for convergence
    if (abs(prev_loss - loss) < tol) {
      break
    }
    prev_loss <- loss
  }
  # return
  return(list(W = updated$W, H = updated$H, c = updated$c, convergence = convergence))
}
#' Recombine factors into sigmoidal matrix
#'
#' @noRd
#'
recombine_matrix <- function(W, H, c) {
  # matrix multiplication with constant added
  WHc <- W %*% H + c
  # sigmoid transformation
  X_hat <- 1 / (1 + exp(-WHc))
  # return
  return(X_hat)
}
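# Minimal illustrative sketch (kept inert with if (FALSE)): on a small random
# pair of factors, every entry of the recombined matrix is a probability in
# (0, 1), since W %*% H + c is passed elementwise through the logistic function.
# The toy objects below are illustrative only.
if (FALSE) {
  W_toy <- matrix(stats::runif(6), 3, 2)
  H_toy <- matrix(stats::runif(8), 2, 4)
  range(recombine_matrix(W_toy, H_toy, c = 0.5))
}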
#' Loss functions
#'
#' @noRd
#'
loss_function <- function(X_true, X_hat, type = "logloss", normalize = TRUE, epsilon = 1e-10) {
  # dimensions for normalization
  m <- nrow(X_true)
  n <- ncol(X_true)
  # log likelihood loss function, also known as binary cross-entropy loss
  if (type == "logloss") {
    # epsilon is added to avoid log(0)
    loss <- -sum(X_true * log(X_hat + epsilon) + (1 - X_true) * log(1 - X_hat + epsilon))
  } else if (type == "mse") {
    # mean squared error
    loss <- sum((X_true - X_hat)^2)
  }
  # normalization (to give per element loss)
  if (normalize == TRUE) {
    loss <- loss / (m * n)
  }
  # return
  return(loss)
}
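# Minimal illustrative sketch (kept inert with if (FALSE)): the normalized log
# loss for a single row with one hit and one miss is -(log(0.9) + log(0.8)) / 2,
# up to the epsilon guard used above. The toy matrices are illustrative only.
if (FALSE) {
  X_true_toy <- matrix(c(1, 0), 1, 2)
  X_hat_toy <- matrix(c(0.9, 0.2), 1, 2)
  loss_function(X_true_toy, X_hat_toy, type = "logloss")
}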
#' Optimization
#'
#' @noRd
#'
update_step <- function(X_true, W, H, c, X_hat, m, n, type = "mur", max_iter = 1000, tol = 1e-5, learning_rate = 0.001, epsilon = 1e-10) {
  # optimize with NMF multiplicative update rules
  if (type == "mur") {
    # Update rules for W, H, and c
    W <- W * (X_true %*% t(H)) / (X_hat %*% t(H) + epsilon)
    H <- H * (t(W) %*% X_true) / (t(W) %*% X_hat + epsilon)
    c <- c * sum(X_true - X_hat) / (m * n)
  } else if (type == "gradient") {
    # optimize with gradient descent
    # Compute the gradients
    dW <- -(X_true - X_hat) %*% t(H)
    dH <- -t(W) %*% (X_true - X_hat)
    dc <- -sum(X_true - X_hat)
    # Update W, H, and c using gradient descent
    W <- W - learning_rate * dW
    H <- H - learning_rate * dH
    c <- c - learning_rate * dc
    # Ensure non-negativity by setting negative values to a small positive value
    W[W < 0] <- epsilon
    H[H < 0] <- epsilon
  } else if (type == "sgd") {
    # optimize with stochastic gradient descent
    # Randomly select a data point from X
    i <- sample(1:m, 1)
    j <- sample(1:n, 1)
    xij_true <- X_true[i, j]
    # Compute the current estimate for the selected data point
    WHc_ij <- sum(W[i, ] * H[, j]) + c
    xij_hat <- 1 / (1 + exp(-WHc_ij))
    # Compute the gradient for the selected data point
    error <- xij_true - xij_hat
    dW <- -error * H[, j]
    dH <- -error * W[i, ]
    dc <- -error
    # Update W, H, and c
    W[i, ] <- W[i, ] - learning_rate * dW
    H[, j] <- H[, j] - learning_rate * dH
    c <- c - learning_rate * dc
    # Ensure non-negativity
    W[W < 0] <- epsilon
    H[H < 0] <- epsilon
  }
  # return
  return(list(W = W, H = H, c = c))
}
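# Minimal illustrative sketch (kept inert with if (FALSE)): one multiplicative
# update on random factors. The toy objects below are illustrative only; in
# nmfbin() this helper is called once per outer iteration with the current
# W, H, c and X_hat.
if (FALSE) {
  m <- 6; n <- 4; k <- 2
  X <- matrix(sample(c(0, 1), m * n, replace = TRUE), m, n)
  W <- matrix(stats::runif(m * k), m, k)
  H <- matrix(stats::runif(k * n), k, n)
  c0 <- stats::runif(1)
  X_hat <- recombine_matrix(W, H, c0)
  str(update_step(X, W, H, c0, X_hat, m, n, type = "mur"))
}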
#' Initialization
#' NNDSVD credit to Renaud Gaujoux for R code
#' and C. Boutsidis and E. Gallopoulos for paper and original MATLAB code
#'
#' @noRd
#'
initial_values <- function(X, k, m, n, method = "nndsvd", densify = c("base", "average", "random")) {
  # constant always random
  c <- stats::runif(1) # Global constant approach
  # check densify argument, average by default
  if (missing(densify)) {
    densify <- "average"
  }
  # random non-negative values
  if (method == "random") {
    W <- matrix(stats::runif(m * k), m, k)
    H <- matrix(stats::runif(k * n), k, n)
  } else if (method == "svd") {
    # SVD initialization
    # Perform SVD on X
    s <- svd(X)
    # Use the first k singular values and vectors
    U_k <- s$u[, 1:k]
    # nrow/ncol keep this a k x k matrix even when k = 1
    S_k <- diag(sqrt(s$d[1:k]), nrow = k, ncol = k)
    V_k <- s$v[, 1:k]
    # Initialize matrices W and H using absolute values
    W <- abs(U_k %*% S_k)
    H <- abs(t(V_k))
  } else if (method == "nndsvd") {
    # Nonnegative Double Singular Value Decomposition
    # Perform SVD on X
    s <- svd(X)
    U <- s$u
    S <- s$d
    V <- s$v
    # Initialize matrices W and H
    W <- matrix(0, m, k)
    H <- matrix(0, k, n)
    # Handle the first singular triplet, must be nonnegative
    W[, 1] <- sqrt(S[1]) * abs(U[, 1])
    H[1, ] <- sqrt(S[1]) * abs(t(V[, 1]))
    # second SVD step for the remaining k - 1 singular triplets (loop is empty when k = 1)
    for (i in seq_len(k)[-1]) {
      uu <- U[, i]
      vv <- V[, i]
      uup <- pmax(uu, 0)
      uun <- pmax(-uu, 0)
      vvp <- pmax(vv, 0)
      vvn <- pmax(-vv, 0)
      n_uup <- sqrt(drop(crossprod(uup)))
      n_vvp <- sqrt(drop(crossprod(vvp)))
      n_uun <- sqrt(drop(crossprod(uun)))
      n_vvn <- sqrt(drop(crossprod(vvn)))
      termp <- n_uup * n_vvp
      termn <- n_uun * n_vvn
      if (termp >= termn) {
        W[, i] <- sqrt(S[i] * termp) * uup / n_uup
        H[i, ] <- sqrt(S[i] * termp) * vvp / n_vvp
      } else {
        W[, i] <- sqrt(S[i] * termn) * uun / n_uun
        H[i, ] <- sqrt(S[i] * termn) * vvn / n_vvn
      }
    }
    # average variant of NNDSVD
    if (densify == "average") {
      average <- mean(X)
      W[W == 0] <- average
      H[H == 0] <- average
    } else if (densify == "random") {
      # random fill variant of NNDSVD
      n1 <- sum(W == 0)
      n2 <- sum(H == 0)
      average <- mean(X)
      W[W == 0] <- average * stats::runif(n1, min = 0, max = 1) / 100
      H[H == 0] <- average * stats::runif(n2, min = 0, max = 1) / 100
    }
  }
  # return
  return(list(W = W, H = H, c = c))
}
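# Minimal illustrative sketch (kept inert with if (FALSE)): NNDSVD
# initialization with the default "average" densification on a small random
# binary matrix; the toy dimensions are illustrative only.
if (FALSE) {
  m <- 20; n <- 10; k <- 3
  X <- matrix(sample(c(0, 1), m * n, replace = TRUE), m, n)
  str(initial_values(X, k, m, n, method = "nndsvd"))
}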