/
phenotypes.R
147 lines (143 loc) · 6.58 KB
/
phenotypes.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
#' Retrieve phenotypes by entity, quality, taxon, and study
#'
#' Retrieves "semantic phenotypes", i.e., phenotypes encoded as ontological
#' expressions. Filtering is possible by anatomical entity (optionally including
#' entities related by certain properties, see `includeRels`), phenotypic
#' quality, taxonomic group where the phenotypes have been recorded, and study
#' (a.k.a. publication).
#'
#' Entity, quality, and taxon can be given as IRI or as name (i.e, term label).
#' In the latter case, names will be resolved to IRIs against anatomy ontologies,
#' PATO, and taxonomy ontologies, respectively. Warnings will be issued if only
#' a partial match can be found. The study must be given as IRI.
#' @param entity character, the anatomical entity by which to filter, if any.
#' @param quality character, the phenotypic quality by which to filter, if any.
#' @param taxon character, the taxon by which to filter, if any.
#' @param study character, the identifier of the study by which to filter, if any.
#' @param includeRels character or vector of characters. The names of relationships
#' for anatomical entities to include in addition to subtype (`rdfs:subClassOf`).
#' Defaults to `"part of"`. Set to `FALSE` to not include any additional relationships.
#' Otherwise one or more of `"part of"`, `"historical homologous to"`, and
#' `"serially homologous to"`, or set to `TRUE` to include all possible ones. It
#' is acceptable to use unambiguous prefixes, for example `"historical"`.
#' @param .withTaxon logical, whether to include taxa in the result if `taxon` is
#' provided. If TRUE, only the combination of phenotype and taxon will be
#' unique in the returned data frame. Default is FALSE, meaning by default
#' providing a value for `taxon` only acts as another filter but does not
#' change format or redundancy of the result. Ignored if `taxon` is
#' not provided as a character value.
#' @param verbose logical, whether to print messages informing about potentially
#' time-consuming operations. Default is FALSE.
#' @examples
#' phens1 <- get_phenotypes(entity = "pelvic fin")
#' head(phens1)
#'
#' # by default, parts are already included
#' phens2 <- get_phenotypes(entity = "pelvic fin", includeRels = c("part"))
#' nrow(phens1) == nrow(phens2)
#' table(phens2$id %in% phens1$id)
#'
#' # but historical homologues are not
#' phens2 <- get_phenotypes(entity = "pelvic fin", includeRels = c("part", "hist"))
#' table(phens2$id %in% phens1$id)
#'
#' # neither are serially homologous
#' phens2 <- get_phenotypes(entity = "pelvic fin", includeRels = TRUE)
#' table(phens2$id %in% phens1$id)
#'
#' # filter also by quality
#' phens2 <- get_phenotypes(entity = "pelvic fin", quality = "shape")
#' table(phens1$id %in% phens2$id)
#'
#' # filter also by quality and taxon
#' phens2 <- get_phenotypes(entity = "pelvic fin", quality = "shape", taxon = "Siluriformes")
#' table(phens1$id %in% phens2$id)
#'
#' # filter by entity, quality and taxon, and return taxa as well (resulting in
#' # (phenotype, taxon) "tuples")
#' phens2a <- get_phenotypes(entity = "pelvic fin", quality = "shape", taxon = "Siluriformes",
#' .withTaxon = TRUE)
#' head(phens2a)
#' nrow(phens2a) - nrow(phens2) # lots of redundancy due to n:n relationship
#' nrow(unique(phens2a[,c("id", "label")])) == nrow(phens2) # but same #phenotypes
#'
#' # can compute and visualize similarity
#' sm <- jaccard_similarity(terms = phens2$id, .labels = phens2$label, .colnames = "label")
#' plot(hclust(as.dist(1-sm)))
#' @return A data frame with columns "id" and "label".
#'
#' If a character value for `taxon` was provided, and `.withTaxon` is TRUE’,
#' columns "taxon.id" and "taxon.label" will be returned as well. While
#' (phenotypes, taxon) tuples will be unique, both phenotypes and taxa
#' individually will then be redundant in the returned data frame (the
#' association is n:n).
#' @export
get_phenotypes <- function(entity = NA, quality = NA, taxon = NA, study = NA,
includeRels = c("part of"),
.withTaxon = FALSE,
verbose = FALSE) {
argsInCall <- as.list(match.call())[-1]
# need to make sure to apply our defaults where they differ
argsInCall$includeRels <- includeRels
# note that evaluation needs to be in this function's parent frame, or
# otherwise using it in apply() and friends won't work
queryseq <- do.call(pkb_args_to_query, argsInCall, envir = parent.frame())
queryseq <- c(queryseq, limit = "0")
mssg(verbose, "Querying for phenotypes ...")
if (is.na(taxon) || ! .withTaxon)
endp <- "/phenotype/query"
else
endp <- "/taxon/annotations"
out <- get_json_data(pkb_api(endp), queryseq)
res <- out$results
if (length(res) > 0) {
nms <- sub("@", "", x = colnames(res))
isTaxonCol <- startsWith(nms, "taxon")
if (any(isTaxonCol)) {
# remove 'phenotype' prefix for phenotype ID and label
nms <- sub("phenotype.", "", x = nms)
# ensure that the additional taxon ID and label columns are last, so
# we're only adding columns to the end
i <- seq(1, ncol(res))
reordering <- c(i[! isTaxonCol], i[isTaxonCol])
res <- res[, reordering]
nms <- nms[reordering]
}
colnames(res) <- nms
}
res
}
#' Which phenotypes match filter
#'
#' Determines which of one or more phenotype IDs match the given filter.
#'
#' At present, the only supported query filter is a list of studies (as their
#' IDs). A phenotype matches if it is linked to at least one of the studies.
#' @param x character, vector of phenotype IDs to test. A data.frame is
#' allowed if it has a column labeled "id".
#' @param studies character, vector of study IDs for which to test. A data.frame
#' is allowed if it has a column labeled "id".
#' @return a logical vector of the same length as the vector of phenotypes, with
#' TRUE as element if the corresponding phenotype matches, and FALSE otherwise.
#' @examples
#' x <- get_phenotypes(entity = "basihyal bone")
#' nrow(x)
#' # which of these are in the same study or studies as the first one?
#' phenotype_matches(x, get_studies(phenotype = x$id[1]))
#' @export
phenotype_matches <- function(x, studies) {
if ("id" %in% colnames(x)) x <- x$id
if ("id" %in% colnames(studies)) studies <- studies$id
if (! is.character(x))
stop("x is not a vector of phenotype IDs", call. = FALSE)
if (length(studies) == 0)
rep(FALSE, times = length(x))
else
sapply(x, function(phen) {
phen.studies <- get_studies(phenotype = phen)
if (is.logical(phen.studies))
FALSE
else
any(phen.studies$id %in% studies)
}, USE.NAMES = FALSE)
}