-
Notifications
You must be signed in to change notification settings - Fork 1
/
left_censorship.R
173 lines (149 loc) · 5.73 KB
/
left_censorship.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
#' Test for left censorship
#'
#' Does a binomial test to check if the most likely cause of missing values
#' is due to values being below the limit of detection, or coming from a
#' left-censored distribution.
#'
#' @param data_matrix matrix or data.frame of numeric data
#' @param global_na what represents zero or missing?
#' @param sample_classes which samples are in which class
#'
#' @details
#' For each feature that is missing in a group of samples, we save as a possibility
#' to test. For each sample, we calculate the median value with any missing values
#' removed. Each feature that had a missing value, we test whether the remaining
#' non-missing values are below the sample median for those samples where the
#' feature is non-missing. A binomial test considers the total number of features
#' instances (minus missing values) as the number of trials, and the number of
#' of features below the sample medians as the number of successes.
#'
#' There is a bit more detail in the vignette: `vignette("testing-for-left-censorship", package = "ICIKendallTau")`
#'
#' @seealso [ici_kendalltau()]
#'
#' @examples
#' # this example has 80% missing due to left-censorship
#' data(missing_dataset)
#' missingness = test_left_censorship(missing_dataset)
#' missingness$values
#' missingness$binomial_test
#'
#' @export
#' @return data.frame of trials / successes, and binom.test result
test_left_censorship = function(data_matrix,
global_na = c(NA, Inf, 0),
sample_classes = NULL)
{
if (inherits(data_matrix, "data.frame")) {
data_matrix = as.matrix(data_matrix)
}
if (is.null(sample_classes)) {
sample_classes = rep("A", ncol(data_matrix))
}
split_indices = split(seq_len(ncol(data_matrix)), sample_classes)
missing_loc = setup_missing_matrix(data_matrix, global_na)
data_matrix_missing = data_matrix
data_matrix_missing[missing_loc] = NA
# split the dataset by group
split_counts = purrr::imap(split_indices, \(in_split, split_id){
# in_split = split_indices[[1]]
# grab the group we want to work with
split_missing = data_matrix_missing[, in_split, drop = FALSE]
# count the number of missing samples for each feature,
# and keep those that have at least one
n_miss = rowSums(is.na(split_missing))
if (sum(n_miss) == 0) {
out_res = data.frame(trials = 0, success = 0, class = split_id)
return(out_res)
}
keep_miss = split_missing[n_miss > 0, , drop = FALSE]
# get sample medians
sample_medians = calculate_matrix_medians(split_missing, use = "col", na.rm = TRUE)
# turn the medians into a matrix to make life easier
median_matrix = matrix(sample_medians, nrow = nrow(keep_miss),
ncol = ncol(keep_miss), byrow = TRUE)
# do the comparison
keep_miss_updown = keep_miss < median_matrix
# count how many trials we ran, and how many successes we have
all_trials = (nrow(keep_miss_updown) * ncol(keep_miss_updown)) - sum(is.na(keep_miss_updown))
all_success = sum(keep_miss_updown, na.rm = TRUE)
out_res = data.frame(trials = all_trials, success = all_success, class = split_id)
return(out_res)
}) |>
purrr::list_rbind()
total_trials = sum(split_counts$trials)
total_success = sum(split_counts$success)
binom_res = stats::binom.test(total_success, total_trials, p = 0.5, alternative = "greater")
if (is.null(sample_classes)) {
total_success$class = NULL
}
return(list(values = split_counts,
binomial_test = binom_res))
}
#' Calculate matrix medians
#'
#' Given a matrix of data, calculates the median value in each column or row.
#'
#' @param in_matrix numeric matrix of values
#' @param use character of "col" or "row" defining columns or rows
#' @param ... extra parameters to the median function
#'
#' @export
#' @return numeric
calculate_matrix_medians = function(in_matrix, use = "col", ...)
{
if (use %in% "row") {
in_matrix = t(in_matrix)
}
out_medians = purrr::map_dbl(seq_len(ncol(in_matrix)), \(in_col){
stats::median(in_matrix[, in_col], ...)
})
return(out_medians)
}
#' Add uniform noise
#'
#' Adds uniform noise to values, generating replicates with noise added to
#' the original.
#'
#' @param value a single or vector of numeric values
#' @param n_rep the number of replicates to make (numeric). Default is 1.
#' @param sd the standard deviation of the data
#' @param use_zero logical, should returned values be around zero or not?
#'
#' @export
#'
#' @return numeric matrix
add_uniform_noise = function(value, n_rep, sd, use_zero = FALSE){
n_value = length(value)
n_sd = n_rep * n_value
out_sd = stats::rnorm(n_sd, 0, sd)
out_sd = matrix(out_sd, nrow = n_value, ncol = n_rep)
if (!use_zero) {
tmp_value = matrix(value, nrow = n_value, ncol = n_rep, byrow = FALSE)
out_value = tmp_value + out_sd
} else {
out_value = out_sd
}
return(out_value)
}
#' Example Dataset With Missingness
#'
#' An example dataset that has missingness from left-censorship
#'
#' @format ## `missing_dataset`
#' A matrix with 1000 rows and 20 columns, where rows are features and
#' columns are samples.
#'
#' @source Robert M Flight
"missing_dataset"
#' Example RNA-Seq Dataset With Missingness
#'
#' An example dataset from RNA-seq experiment on yeast, created by Gierliński et al.,
#' "Statistical models for RNA-seq data derived from a two-condition 48-replicate experiment",
#' Bioinformatics, 31, 2015 <https://doi.org/10.1093/bioinformatics/btv425>.
#'
#' @format ## `yeast_missing`
#' A matrix with 6887 rows (genes) and 96 columns (samples).
#'
#' @source <https://dx.doi.org/10.6084/M9.FIGSHARE.1425502.V1> <https://dx.doi.org/10.6084/M9.FIGSHARE.1425503.V1>
"yeast_missing"