-
Notifications
You must be signed in to change notification settings - Fork 5
/
mutual_exclusivity.R
437 lines (394 loc) · 22 KB
/
mutual_exclusivity.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
# define mutual exclusivity types
exclusivity_types <- list(strong_compatibility='strong_compatibility',
weak_compatibility='weak_compatibility',
inconclusive_evidence='inconclusive_evidence',
weak_exclusivity='weak_exclusivity',
strong_exclusivity='strong_exclusivity')
#' Determine whether two phenotypes are mutually exclusive
#'
#' Determines whether the data in the KB includes evidence for mutual exclusivity
#' between two given phenotypes.
#'
#' See [mutually_exclusive()] for details on which links in the data are considered
#' as strong or weak evidence for mutual exclusivity and compatibility.
#'
#' @param phenotype.a character or phenotype object, phenotype ID (IRI) if character, or
#' a phenotype object obtained by passing phenotype ID to as.phenotype() function.
#' @param phenotype.b character or phenotype object, phenotype ID (IRI) if character, or
#' a phenotype object obtained by passing phenotype ID to as.phenotype() function.
#' @param studies character, a vector of study IDs. This is an optional
#' parameter that acts as a filter in case the determination of
#' mutual exclusivity is to be based only on the evidence found in a particular set of studies.
#' The default is NULL, which means
#' that all studies present in the KB database will be considered for evidence.
#' @param charstates dataframe, a dataframe obtained from [charstates()] by
#' passing a list of the two phenotypes as argument. This optional parameter exists to speed up
#' the computation for the [mutually_exclusive()] function since `mutually_exclusive()`
#' repeatedly calls mutual_exclusivity_pairwise(). The default is NULL. _Note that passing this argument
#' but doing so incorrectly can result in wrong output._
#' @param quality_opposites dataframe, an optional dataframe containing columns
#' "quality.a" and "quality.b" to denote pairs of phenotypic quality terms, in
#' the form of their term IRIs, to be considered opposites of each others.
#' See documentation under [mutually_exclusive()] for more details.
#'
#' @return A character (string), the mutual exclusivity type among the two phenotypes.
#' See [mutually_exclusive()] for documentation on the possible values, although note
#' that this function returns these as a character vector, not levels of an ordered factor.
mutual_exclusivity_pairwise <- function(phenotype.a, phenotype.b, studies=NULL, charstates=NULL,
quality_opposites=NULL){
# convert phenotypes to phenotype objects for faster computation
if (!is.phenotype(phenotype.a)) {
phenotype.a <- as.phenotype(phenotype.a, withTaxa=TRUE)
}
if (!is.phenotype(phenotype.b)) {
phenotype.b <- as.phenotype(phenotype.b, withTaxa=TRUE)
}
if (!is.null(quality_opposites)) {
# check validity of the quality_opposites dataframe
if (!all(c("quality.a","quality.b") %in% colnames(quality_opposites))) {
stop("Missing a required column for the quality_opposites parameter. The quality_opposites dataframe parameter requires 'quality.a' and 'quality.b' columns to be present.")
}
# trim white space from quality opposite IRIs
quality_opposites$quality.a <- trimws(quality_opposites$quality.a)
quality_opposites$quality.b <- trimws(quality_opposites$quality.b)
}
is_pair_mutually_exclusive <- exclusivity_types$inconclusive_evidence
# load charstates dataframe that contains both the phenotypes
if (is.null(charstates)) {
character_states <- charstates(list(phenotype.a, phenotype.b))
} else {
character_states <- charstates
}
# filter the character_states dataframe by studies
if (!is.null(studies)) {
character_states <- character_states[character_states$study.id %in% studies, ]
}
# store the charstate dataframe associated with each phenotype separately for easier computation
charstates.a <- character_states[(character_states$phenotype.id == phenotype.a$id), ]
charstates.b <- character_states[(character_states$phenotype.id == phenotype.b$id), ]
# determine mutual exclusivity only if both phenotypes have non-empty states
num_states.a <- length(charstates.a$state.id)
num_states.b <- length(charstates.b$state.id)
if (num_states.a > 0 && num_states.b > 0) {
# compute the intersection of the two characters vectors
chars.a <- unique(charstates.a$character.id)
chars.b <- unique(charstates.b$character.id)
mutual_characters_present <- length(intersect(chars.a, chars.b)) > 0
if (mutual_characters_present) {
# If the two phenotypes have mutual characters, then
# there is enough evidence to decide mutual exclusivity among the two phenotypes.
# Therefore, the mutual exclusion in this case is called 'strong'.
# Mutual exclusivity is decided by looking at whether the two phenotypes have mutual states.
states.a <- charstates.a$state.id
states.b <- charstates.b$state.id
mutual_states_present <- length(intersect(states.a, states.b)) > 0
if (mutual_states_present) {
# strong compatibility if phenotypes share states
is_pair_mutually_exclusive <- exclusivity_types$strong_compatibility
} else {
# strong exclusivity if phenotypes do not share states
is_pair_mutually_exclusive <- exclusivity_types$strong_exclusivity
}
} else {
# If there are no mutal characters among the two phenotypes, then
# there is lack (but not absence) of evidence to determine mutual exclusivity.
# Therefore, the mutual exclusion in this case is called 'weak'.
# Mutual exclusivity is decided by looking at whether the two phenotypes have mutual taxa.
mutual_taxa_present <- length(intersect(phenotype.a$taxa$id, phenotype.b$taxa$id)) > 0
if(mutual_taxa_present){
# weak compatibility
is_pair_mutually_exclusive <- exclusivity_types$weak_compatibility
} else {
# weak exclusivity
is_pair_mutually_exclusive <- exclusivity_types$weak_exclusivity
}
}
} else {
# If either (or both) phenotype(s) have empty state(s), then we say that
# there is no evidence to compute mutual exclusivity.
# This block raises a warning and is_pair_mutually_exclusive is returned
# with the default value: 'inconclusive_evidence'.
if (num_states.a == 0) {
warning("Phenotype A does not have any states in the charstates dataframe.")
} else if (num_states.b == 0){
warning("Phenotype B does not have any states in the charstates dataframe.")
} else {
warning("Both phenotypes do not have any states in the charstates dataframe.")
}
}
# if we have quality opposite data and phenotype entities match
if (!is.null(quality_opposites) && phenotype.a$eqs$entities == phenotype.b$eqs$entities) {
# only check phenotypes that both have a single quality
if (length(phenotype.a$eqs$qualities) == 1 && length(phenotype.b$eqs$qualities) == 1) {
# find the list of opposites for phenotype.a
phenotype.a.opposites <- find_quality_opposites(phenotype.a$eqs$qualities, quality_opposites)
if (any(phenotype.a.opposites == phenotype.b$eqs$qualities)) {
# strong exclusivity if phenotypes are opposite
is_pair_mutually_exclusive <- exclusivity_types$strong_exclusivity
}
}
}
# return mutual exclusivity
is_pair_mutually_exclusive
}
find_quality_opposites <- function(phenotype_quality, quality_opposites) {
# returns a vector of quality IRIs that are opposite of phenotype_qualities
union(
# for phenotype_quality == quality.a return matching quality.b opposites
quality_opposites[phenotype_quality == quality_opposites$quality.a]$quality.b,
# for phenotype_quality == quality.b return matching quality.a opposites
quality_opposites[phenotype_quality == quality_opposites$quality.b]$quality.a
)
}
#' Determine mutual exclusivity between two or more phenotypes
#'
#' Determines whether the data in the KB includes evidence for mutual exclusivity
#' between all distinct pairs of phenotypes in the provided list.
#'
#' Evidence for mutual exclusivity or compatibility between a pair of phenotypes
#' is taken from how they are or aren't linked to the same taxon, character, and
#' character state. If two phenotypes are found to be linked to the same character
#' state at least once, it is considered _strong evidence for mutual compatibility_.
#' If they are found to be linked to the same character at least once, but not
#' to the same character state (implying they must be linked to different states of
#' the same character at least once), it is considered _strong evidence for mutual exclusivity_.
#' Otherwise, if there is no character to which both are linked, the presence and absence
#' of data showing co-occurrence in taxa is used as _weak evidence_. That is, if they are
#' linked to character states of different characters, but the states are both exhibited
#' by the same taxon at least once, it is considered as _weak evidence for mutual compatibility_
#' (weak because the co-occurrence may be due to taxon-level polymorphism),
#' and if there is no such taxon, this is considered as _weak evidence for mutual exclusivity_
#' (because lack of evidence could still be due to gaps in the data, and does therefore not
#' imply evidence of absence).
#'
#' It is possible that for any pair of phenotypes the KB contains conflicting
#' evidence on mutual exclusivity. For example, two phenotypes could be linked to
#' the same character state for one character, and different character states for
#' another character (possibly from another study by another author), the first of
#' which would present strong evidence for mutual compatibility and the latter
#' strong evidence of mutual exclusivity. The current implementation does not detect
#' and thus ignores this possibility.
#'
#' @param phenotypes character or phenotype, a vector of phenotype IDs, or
#' phenotype objects for which one wants to determine the mutual exclusivity.
#' @param studies character, a vector of study IDs (optional) if one is
#' interested in determining mutual exclusivity only based on the evidence from a
#' particular set of studies. By default _all_ studies in the KB are considered.
#' @param progress_bar logical, whether to print progress of the function.
#'
#' WARNING: setting progress_bar to TRUE clears the R console by executing the
#' cat('\014') command before printing the progress.
#'
#' @param quality_opposites dataframe, an optional dataframe containing columns
#' "quality.a" and "quality.b" to denote pairs of phenotypic quality terms, in
#' the form of their term IRIs, to be considered opposites of each others. If
#' provided, two phenotypes will be determined to have _strong exclusivity_ if
#' their qualities match a pair of opposites. The test will only be applied to
#' pairs of phenotypes in which the EQ expressions of both comprise of the same
#' number of entities and only a single quality term. Columns included in the
#' dataframe other than "quality.a" and "quality.b" will be ignored.
#'
#' @return A list consisting a matrix and a dataframe that contain mutual exclusivity
#' results for the phenotypes.
#'
#' The mutual exclusivity type between two phenotypes is encoded as an ordered factor with the following
#' levels (in this order):
#' - 'strong_compatibility' : the data show strong evidence of mutual compatibility;
#' - 'weak_compatibility' : the data suggest mutual compatibility, but the evidence is weak;
#' - 'inconclusive_evidence' : the default for phenotypes that lack links to character states;
#' - 'weak_exclusivity' : the data suggest mutual exclusivity, but the evidence is weak;
#' - 'strong_exclusivity' : the data show strong evidence of mutual exclusivity.
#'
#' The first item in the list is a square matrix with each dimension equal to the
#' number of phenotypes. The cells (_i_,_j_) of the matrix are the integer representation of
#' the factor level for the mutual exclusivity type between the _i_ th and _j_ th phenotype
#' in the list of phenotypes provided as input. The matrix is symmetric.
#'
#' The second item in the list is a dataframe with five columns, the labels and ids
#' of each phenotype of a pair, and the mutual exclusivity type of the pair is in the
#' fifth column.
#'
#' @examples
#' # Example 1: pass phenotypes as characters with phenotype IDs
#' # get a specific study of interest based on which mutual exclusivity is to be determined
#' studies <- get_studies()
#' study <- studies$id[studies$label == 'Dillman et al. (2016)']
#'
#' # get phenotypes ids present in the study
#' phenotypes <- get_phenotypes(study = study)
#' # select 10 phenotype ids randomly from those in the study
#' phenotypes_ids <- sample(phenotypes$id, 10)
#'
#' # determine mutual exclusivity
#' exclusivity <- mutually_exclusive(phenotypes_ids)
#'
#' # resultant matrix
#' exclusivity$matrix
#'
#' # resultant dataframe
#' str(exclusivity$dataframe)
#'
#' # filter dataframe to contain only mutually exclusive phenotypes
#' exclusivity_df <- exclusivity$dataframe
#' exclusive_phenotype_pairs <- exclusivity_df[
#' exclusivity_df$mutual_exclusivity == "weak_exclusivity" |
#' exclusivity_df$mutual_exclusivity == "strong_exclusivity", ]
#'
#' # Example 2: pass phenotypes as phenotype objects
#' # sample some phenotypes objects present in the study
#' phenotypes_objs <- as.phenotype(sample(phenotypes$id, 4), withTaxa=TRUE)
#'
#' # determine mutual exclusivity
#' exclusivity <- mutually_exclusive(phenotypes_objs)
#'
#' # resultant matrix
#' exclusivity$matrix
#'
#' # resultant dataframe
#' str(exclusivity$dataframe)
#'
#' # Example 3: determine mutual exclusivity for two phenotypes
#' # fetch phenotypes for an entity/taxon combination
#' phenotypes <- get_phenotypes(entity="hyomandibular bone", taxon="Hypancistrus")
#'
#' # select two phenotype ids based on the labels
#' phenotypes_ids <- c(
#' phenotypes$id[phenotypes$label == 'hyomandibular bone in contact with prootic bone'],
#' phenotypes$id[phenotypes$label == 'hyomandibular bone in contact with quadrate bone']
#' )
#'
#' # determine mutual exclusivity
#' exclusivity <- mutually_exclusive(phenotypes_ids)
#'
#' # exclusivity value
#' exclusivity$dataframe$mutual_exclusivity
#'
#' # Example 4: determine mutual exclusivity for two phenotypes including opposite quality data
#'
#' # create a list of phenotypes to compare (femur elongated vs femur decreased length)
#' phens <- get_phenotypes(entity="femur", quality="elongated")
#' femur_elongated_iri <- phens$id[phens$label == "femur elongated"]
#' phens <- get_phenotypes(entity="femur", quality="decreased length")
#' femur_decreased_length_iri <- phens$id[phens$label == "femur decreased length"]
#' phenotypes_to_compare <- c(femur_elongated_iri, femur_decreased_length_iri)
#'
#' # compare the phenotypes without using opposite quality data
#' exclusivity <- mutually_exclusive(phenotypes_to_compare)
#' exclusivity$dataframe$mutual_exclusivity
#'
#' # create a dataframe containing the quality opposites
#' elongated_iri <- find_term("elongated", matchTypes = "exact")$id
#' decreased_length_iri <- find_term("decreased length", matchTypes = "exact")$id
#' quality_opposites <- data.frame(
#' quality.a = elongated_iri,
#' quality.b = decreased_length_iri
#' )
#'
#' # compare the phenotypes using opposite quality data
#' exclusivity <- mutually_exclusive(phenotypes_to_compare, quality_opposites=quality_opposites)
#' exclusivity$dataframe$mutual_exclusivity
#' @export
mutually_exclusive <- function(phenotypes, studies=NULL, progress_bar=FALSE, quality_opposites=NULL){
# make sure that at least two phenotypes are passed
if (is.null(phenotypes) || length(phenotypes) == 1) {
stop("Less than two phenotypes passed. The function expects at least two phenotypes to compute mutual exclusivity.")
}
# initialize a square matrix to store mutual exclusivity result among each pair of phenotypes
num_phenotypes <- length(phenotypes)
mutual_exclusivity_matrix <- matrix(as.integer(factor(exclusivity_types$inconclusive_evidence,
levels = exclusivity_types,
ordered = TRUE)),
nrow=num_phenotypes,
ncol=num_phenotypes)
rownames(mutual_exclusivity_matrix) <- 1:num_phenotypes
colnames(mutual_exclusivity_matrix) <- 1:num_phenotypes
# determine the number of combinations of phenotypes taken 2 at a time
num_combinations = choose(length(phenotypes), 2)
# create a pre-allocated set of vectors to store mutual exclusive pair data
id.1 <- character(num_combinations)
label.1 <- character(num_combinations)
id.2 <- character(num_combinations)
label.2 = character(num_combinations)
mutual_exclusivity_vector = character(num_combinations)
# initialize variables to keep track of progress of the function
if (progress_bar) {
iteration <- 1
total_iterations <- choose(num_phenotypes, 2) + num_phenotypes
num_mutual_exclusive_pairs <- 0
num_mutual_compatible_pairs <- 0
num_inconclusive_evidence <- 0
}
# convert phenotypes to phenotype objects for faster computation
if (!all(sapply(phenotypes, is.phenotype))) {
phenotypes <- as.phenotype(phenotypes, withTaxa=TRUE)
}
# load charstates dataframe that contains all the phenotypes
character_states <- charstates(phenotypes)
vec_idx <- 1
# find mutual exclusivity for each pair in the phenotype vector
for (row in 1:num_phenotypes) {
for (column in row:num_phenotypes) {
# print progress of the loop
if (progress_bar) {
# prevent flickering by only displaying every 100 iterations
if (iteration %% 100 == 1) {
cat('\014')
cat(paste0(round(iteration / (total_iterations) * 100), '% completed\n',
'Mutually exclusive pairs = ', num_mutual_exclusive_pairs, '\n',
'Mutually compatible pairs = ', num_mutual_compatible_pairs, '\n',
'Inconclusive evidence = ', num_inconclusive_evidence, '\n'))
}
iteration <- iteration + 1
}
# compute mutual exclusivity among distinct phenotypes only
if (row != column) {
# compute mutual exclusivity
mutual_exclusivity <- mutual_exclusivity_pairwise(phenotypes[[row]],
phenotypes[[column]],
studies=studies,
charstates=character_states,
quality_opposites=quality_opposites)
# store exclusivity result in matrix
mutual_exclusivity_integer <- match(mutual_exclusivity, exclusivity_types)
mutual_exclusivity_matrix[row, column] <- mutual_exclusivity_integer
mutual_exclusivity_matrix[column, row] <- mutual_exclusivity_integer
# store exclusivity result in vectors
id.1[vec_idx] <- phenotypes[[row]]$id
label.1[vec_idx] <- phenotypes[[row]]$label
id.2[vec_idx] <- phenotypes[[column]]$id
label.2[vec_idx] <- phenotypes[[column]]$label
mutual_exclusivity_vector[vec_idx] <- mutual_exclusivity
# increment vectors index
vec_idx <- vec_idx + 1
# update progress bar statistics
if (progress_bar) {
if (mutual_exclusivity == exclusivity_types$strong_exclusivity || mutual_exclusivity == exclusivity_types$weak_exclusivity) {
# keep track of the number of exclusive phenotype pairs for printing in the progress bar
num_mutual_exclusive_pairs <- num_mutual_exclusive_pairs + 1
} else if (mutual_exclusivity == exclusivity_types$inconclusive_evidence) {
# keep track of the number of phenotype pairs with inconclusive evidence for printing in the progress bar
num_inconclusive_evidence <- num_inconclusive_evidence + 1
} else if (mutual_exclusivity == exclusivity_types$strong_compatibility || mutual_exclusivity == exclusivity_types$weak_compatibility) {
# keep track of the number of compatible phenotype pairs for printing in the progress bar
num_mutual_compatible_pairs <- num_mutual_compatible_pairs + 1
}
}
} else if (row == column) {
# a phenotype is strongly compatible with itself
mutual_exclusivity_matrix[row, column] <- factor(exclusivity_types$strong_compatibility,
levels = exclusivity_types,
ordered = TRUE)
}
}
}
mutual_exclusivity_factors <- factor(mutual_exclusivity_vector, levels = exclusivity_types, ordered = TRUE)
mutual_exclusivity_df <- data.frame(id.1 = id.1,
label.1 = label.1,
id.2 = id.2,
label.2 = label.2,
mutual_exclusivity = mutual_exclusivity_factors,
stringsAsFactors = FALSE)
# return a list containing the resultant matrix and the dataframe
list(matrix=mutual_exclusivity_matrix,
dataframe=mutual_exclusivity_df)
}