-
Notifications
You must be signed in to change notification settings - Fork 0
/
query-simil.R
73 lines (72 loc) · 2.45 KB
/
query-simil.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
#' Compute pairwise similarities between two sets of keys
#'
#' Looks up the embeddings of \code{keys} and of \code{q}, converts each
#' result into a matrix whose row names are the keys, and hands both
#' matrices to \code{proxyC::simil}.
#'
#' @param conn a Magnitude connection.
#' @param keys character vector; keys for the first matrix.
#' @param q character vector; keys for the second matrix.
#' @param normalized logical;
#' should the vector embeddings be normalized?
#' @param method string; name of the similarity measure.
#' @param ... further arguments forwarded to \code{proxyC::simil}.
#' @returns a sparse Matrix of 'Matrix' package.
#' @export
calc_simil <- function(conn, keys, q, normalized = TRUE,
                       method = c(
                         "cosine",
                         "correlation",
                         "jaccard",
                         "ejaccard",
                         "dice",
                         "edice",
                         "hamann",
                         "simple matching",
                         "faith"
                       ),
                       ...) {
  method <- rlang::arg_match(method)

  # Embeddings for `keys`, keyed by row name.
  key_vectors <- query(conn, keys, normalized)
  lhs <- as.matrix(tibble::column_to_rownames(key_vectors, "key"))

  # Embeddings for `q`, keyed by row name.
  q_vectors <- query(conn, q, normalized)
  rhs <- as.matrix(tibble::column_to_rownames(q_vectors, "key"))

  proxyC::simil(lhs, rhs, method = method, ...)
}
#' Order keys by their similarity to a key
#'
#' Computes the similarity between \code{key} and every candidate in
#' \code{q} with \code{calc_simil} and returns the \code{n} most similar
#' candidates.
#'
#' @param conn a Magnitude connection.
#' @param key string. If it does not have exactly one element, a warning
#' is raised and only the first element is used.
#' @param q character vector of candidate keys.
#' Duplicates are removed, and elements exactly the same as \code{key}
#' are dropped from the result.
#' @param n integer; maximum number of rows to return.
#' @param normalized logical;
#' whether or not vector embeddings should be normalized?
#' @param method string; method to compute similarity.
#' @returns a tibble with columns \code{keys} and \code{similarity},
#' sorted by descending similarity.
#' @export
most_similar <- function(conn, key, q, n = 1L,
                         normalized = TRUE,
                         method = c(
                           "cosine",
                           "correlation",
                           "jaccard",
                           "ejaccard",
                           "dice",
                           "edice",
                           "hamann",
                           "simple matching",
                           "faith"
                         )) {
  # Resolve the choice up front so calc_simil() receives a single string
  # rather than the full vector of defaults.
  method <- rlang::arg_match(method)

  if (length(key) != 1L) {
    rlang::warn("length of `key` is not 1L. the first element will be used.")
    # Make the rest of the function agree with the warning: only the first
    # element takes part in filtering and in the similarity computation.
    key <- key[1]
  }

  candidates <- unique(q[!q %in% key])
  if (length(candidates) == 0L) {
    # Nothing left to rank; return an empty result with the usual columns
    # instead of querying with an empty set of keys.
    return(tibble::tibble(keys = character(), similarity = numeric()))
  }

  # `n` is a scalar, so a plain min() is enough to cap it.
  n <- min(n, length(candidates))

  simil <- as.matrix(
    calc_simil(conn, key, candidates, normalized = normalized, method = method)
  )

  tibble::tibble(
    keys = colnames(simil),
    similarity = simil[1, ]
  ) %>%
    dplyr::arrange(dplyr::desc(.data$similarity)) %>%
    dplyr::slice_head(n = n)
}