/
term-type.R
187 lines (182 loc) · 7.35 KB
/
term-type.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
#' Determine the general category of terms
#'
#' Terms in the Phenoscape KB fall into different general categories: entity
#' (anatomical entities), quality, phenotype (which typically are entity-quality
#' compositions), and taxon. The category is sometimes needed to plug a term IRI
#' into the right parameter for a function or API call.
#'
#' The implementation uses the following successive steps until a determination
#' is made, or all possibilities are exhausted:
#' - Try infer the category from the object type and the ontology for terms of
#' certain OBO ontologies.
#' - Consider subsumption by specific upper ontology terms, specifically
#' the BFO terms "independent continuant" (for entity terms) and "quality"
#' (for quality terms).
#' - If a label can be obtained, and it matches the pattern "p some X" with
#' p being a property used for composing classes (e.g., part of, has part, etc),
#' extract X and recursively apply the algorithm to X.
#' - If superclasses are retrievable and any of them has a label starting with
#' "phenotype of", determine category as phenotype.
#' - If superclasses are retrievable, apply the algorithm recursively to each
#' superclass until a positive determination is made.
#'
#' Due to requiring potentially multiple KB API calls per term for those for which
#' the first step fails, this algorithm can be slow.
#' @param x a vector of one or more term IRIs, or a list of such IRIs or term
#' objects (such as phenotype objects)
#' @return A character vector with the term categories ("entity", "quality",
#' "phenotype", or "taxon") of the terms in the input list. The category is
#' NA for terms for which no determination could be made.
#' @examples
#' term_category(c("http://purl.obolibrary.org/obo/UBERON_0011618",
#' "http://purl.obolibrary.org/obo/PATO_0002279",
#' "http://purl.obolibrary.org/obo/VTO_0071642",
#' "http://purl.obolibrary.org/obo/MP_0030825"))
#' phens <- get_phenotypes("basihyal bone")
#' term_category(phens$id[1:3])
#' @export
term_category <- function(x) {
if (is.phenotype(x)) return("phenotype")
# try to infer from OBO ontology ID
onts <- obo_prefix(x)
types <- obo_ont_type(onts)
# for those unresolved, check for phenotype objects
terms <- x[is.na(types)]
if (length(terms) > 0) {
is.phen <- sapply(terms, is.phenotype)
types[is.na(types)] <- ifelse(is.phen, "phenotype", NA)
}
# for those remaining unresolved, try to determine by upper ontology ancestor
terms <- x[is.na(types)]
if (length(terms) > 0) {
types[is.na(types)] <- sapply(terms, term_category_detective)
}
types
}
#' Perform some detective work to determine the category of a term
#'
#' This function does the following steps as part of the algorithm used by
#' [term_category()].
#' - If a label can be obtained, and it matches the pattern "p some X" with
#' p being a property used for composing classes (e.g., part of, has part, etc),
#' extract X and recursively apply the algorithm to X.
#' - If superclasses are retrievable and any of them has a label starting with
#' "phenotype of", determine category as phenotype.
#' - If superclasses are retrievable, apply the algorithm recursively to each
#' superclass until a positive determination is made.
#' @param term The IRI of the term to do detective work for.
#' @return The term category as a character vector, or NA if category determination
#' was unsuccessful.
#' @seealso [term_category()]
#' @noRd
term_category_detective <- function(term) {
if (term %in% entity_roots())
"entity"
else if (term %in% quality_roots())
"quality"
else if (any(startsWith(term, semweb_ns())))
"entity"
else {
isE <- is_ancestor(term, candidates = entity_roots(), includeRels = "part_of")
if (all(isE))
"entity"
else if (any(is_ancestor(term, candidates = quality_roots())))
"quality"
else if (any(isE))
"entity"
else {
ti <- term_classification(term)
if (length(ti) == 0)
NA
else {
categ <- NA
# if {part of, has part, develops from} some X, then should be of same
# category as X (provided we can find X)
ent_expr_prefixes <- c("part of some ", "has part some ", "develops from some ")
if (! is.null(ti$label)) {
label_matches <- startsWith(ti$label, ent_expr_prefixes)
if (any(label_matches)) {
cand_ent_label <- sub(ent_expr_prefixes[label_matches], "", ti$label)
cand_ents <- find_term(cand_ent_label, matchBy = "rdfs:label", matchTypes = c("exact"))
# no match by default returns NA
if (is.data.frame(cand_ents))
categ <- term_category(cand_ents$id[1])
}
}
# is that failed
if (is.na(categ)) {
# if subClassOf of X, then should be of same category as X
superCls <- ti$subClassOf
if (is.data.frame(superCls) && nrow(superCls) > 0) {
# are the superclasses with labels starting with "phenotype of"?
if (any(startsWith(as.character(superCls$label), "phenotype of"))) {
categ <- "phenotype"
} else {
# otherwise recursively try superclasses
for (cls in superCls$id) {
categ <- term_category(cls)
if (! is.na(categ)) break
}
}
}
}
categ
}
}
}
}
entity_roots <- function() {
c(term_iri("independent continuant", preferOntologies = c("BFO")),
term_iri("anatomical structure", firstOnly = FALSE))
}
quality_roots <- function() {
term_iri("quality", preferOntologies = c("BFO", "PATO"), firstOnly = FALSE)
}
obo_ont_type <- function(ont) {
ont[is.na(ont)] <- ""
ontIRIs <- ontology_iri(ont)
is.ent <- ontIRIs %in% anatomy_ontology_iris()
is.taxon <- ontIRIs %in% taxon_ontology_iris()
ifelse(is.ent,
"entity",
ifelse(is.taxon,
"taxon",
sapply(ont, function(o) {
switch(o,
PATO = "quality",
MP = "phenotype",
HP = "phenotype",
NCBITaxon = "taxon",
NA)
})))
}
#' Extract the OBO ontology prefix from IRIs
#'
#' @param x a list or vector of IRIs, and/or objects that have an "id" key.
#' @return
#' A character vector of the same length as the input
#' vector or list, with NA in the positions where extracting the OBO
#' ontology prefix failed.
#' @examples
#' tt <- c("http://purl.obolibrary.org/obo/UBERON_0011618",
#' "http://purl.obolibrary.org/obo/PATO_0002279",
#' "http://purl.obolibrary.org/obo/VTO_0071642",
#' "http://purl.obolibrary.org/obo/MP_0030825",
#' "http://purl.obolibrary.org/obo/NCBITaxon_7955")
#' obo_prefix(tt)
#' @importFrom stringi stri_match_first_regex
#' @export
obo_prefix <- function(x) {
is.iri <- sapply(x, is.character)
if (! all(is.iri))
x[! is.iri] <- sapply(x[! is.iri],
function(term) {
if ("id" %in% names(term))
term$id
else
as.character(term)
})
m <- stringi::stri_match_first_regex(x,
"^https?://purl.obolibrary.org/obo/([A-Za-z0-9]+)_[0-9]+")
m[, 2]
}