/
indexing.R
198 lines (177 loc) · 6.48 KB
/
indexing.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
#' Create index of subsets of a data
#'
#' Index of the samples grouped by batches.
#' @param size_subset A numeric value with the amount of samples per batch.
#' @param n A numeric value with the number of batches.
#' @param size_data A numeric value of the amount of samples to distribute.
#' @param name A character used to name the subsets, either a single one or a
#' vector the same size as `n`.
#' @return A random list of indices of the samples.
#' @seealso [batch_names()], [use_index()] if you already
#' have a factor to be used as index.
#' @export
#' @examples
#' index <- create_subset(100, 50, 2)
create_subset <- function(size_data, size_subset = NULL, n = NULL, name = "SubSet") {
if (is.null(size_subset) && is.null(n)) {
stop("Either size.subset or n should numeric")
}
if (is.null(n)) {
n <- optimum_batches(size_data, size_subset)
}
if (is.null(size_subset)) {
size_subset <- optimum_subset(size_data, n)
}
if (!valid_sizes(size_data, size_subset, n)) {
stop("Please provide a higher number of batches or more samples per batch.")
}
size_batches <- internal_batches(size_data, size_subset, n)
create_index(size_data, size_batches, n, name)
}
# The workhorse function without any check
# size_batches is a vector with the number of elements in each batch.
create_index <- function(size_data, size_batches, n, name = "SubSet") {
# The size of each batch
stopifnot("Batches match the length" = length(size_batches) == n)
i <- distribute_samples(size_data, size_batches)
names(i) <- id2batch_names(name, n)
i
}
# Shuffle sample within index to improve positioning
create_index4index <- function(index, name) {
m <- matrix(data = NA, nrow = length(name), ncol = length(index),
dimnames = list(name, names(index)))
# Assign each row number from each batch to a position:
for (batch in seq_along(index)) {
positions <- sample(index[[batch]])
rows <- sample(seq_along(positions))
m[rows, batch] <- positions
}
# Transform to a list omitting the empty values
index_out <- apply(m, 1, function(x){x[!is.na(x)]}, simplify = FALSE)
index_out[lengths(index_out) != 0]
}
id2batch_names <- function(name, n) {
if (length(name) != 1L && length(name) != n) {
stop("Provide a single character or a vector the same size of the batches.",
call. = FALSE)
}
if (length(name) == 1L) {
name <- paste0(name, seq_len(n))
}
name
}
distribute_samples <- function(size_data, size_subsets) {
# Create the subsets
i <- vector("list", length = length(size_subsets))
vec <- seq_len(size_data)
for (j in seq_along(size_subsets)) {
out <- sort(sample(vec, size = size_subsets[j]))
vec <- vec[!vec %in% out]
i[[j]] <- out
}
i
}
#' Convert a factor to index
#'
#' Convert a given factor to an accepted index
#' @param x A character or a factor to be used as index
#' @export
#' @seealso You can use [evaluate_index()] to evaluate how good an
#' index is. For the inverse look at [batch_names()].
#' @examples
#' plates <- c("P1", "P2", "P1", "P2", "P2", "P3", "P1", "P3", "P1", "P1")
#' use_index(plates)
use_index <- function(x) {
stopifnot(is.character(x))
if (anyNA(x)) {
warning("NAs present in the index. Some samples weren't assigned to a batch?")
}
.use_index(x)
}
.use_index <- function(x) {
factors <- x
factors[is.na(x)] <- "NA"
split(seq_along(x), factors)
}
#' Name the batch
#'
#' Given an index return the name of the batches the samples are in
#' @param i A list of numeric indices.
#' @return A character vector with the names of the batch for each the index.
#' @seealso [create_subset()], for the inverse look at
#' [use_index()].
#' @export
#' @examples
#' index <- create_subset(100, 50, 2)
#' batch <- batch_names(index)
#' head(batch)
batch_names <- function(i) {
ui <- unlist(i, use.names = FALSE)
if (any(table(ui) > 1L)) {
warning("There are replicates measures.\n\tUpdating index to the expected output")
i <- translate_index(i)
ui <- unlist(i, use.names = FALSE)
}
names <- rep(names(i), lengths(i))
names[order(ui)]
}
#' Compares two indexes
#'
#' Compare the distribution of samples with two different batches.
#' @param index1,index2 A list with the index for each sample, the name of the
#' column in `pheno` with the batch subset or the character .
#' @param pheno A data.frame of the samples with the characteristics to normalize.
#' @returns A matrix with the variables and the columns of of each batch.
#' Negative values indicate `index1` was better.
#' @export
#' @seealso [check_index()]
#' @examples
#' index1 <- create_subset(50, 24)
#' index2 <- batch_names(create_subset(50, 24))
#' metadata <- expand.grid(height = seq(60, 80, 5), weight = seq(100, 300, 50),
#' sex = c("Male","Female"))
#' compare_index(metadata, index1, index2)
compare_index <- function(pheno, index1, index2) {
if (is.character(index1) && length(index1) == nrow(pheno)) {
index1 <- use_index(index1)
} else if (is.character(index1) && length(index1) == 1 && index1 %in% colnames(pheno)) {
index0 <- index1
index1 <- use_index(pheno[[index1]])
pheno <- pheno[, !colnames(pheno) %in% index0, drop = FALSE]
} else if (is.character(index1)) {
stop("index1 is not present")
}
if (is.character(index2) && length(index2) == nrow(pheno)) {
index2 <- use_index(index2)
} else if (is.character(index2) && length(index2) == 1 && index2 %in% colnames(pheno)) {
index0 <- index2
index2 <- use_index(pheno[[index2]])
pheno <- pheno[, !colnames(pheno) %in% index0, drop = FALSE]
} else if (is.character(index1)) {
stop("index2 is not present")
}
if (sum(lengths(index1)) != nrow(pheno)) {
stop("Indices do not match the number of samples in pheno.")
}
if (sum(lengths(index1)) != sum(lengths(index2))) {
stop("Indices don't seem from the same data, their numbers are not equivalent.")
}
if (length(index1) != length(index2)) {
stop("Different number of batches in the indices.")
}
batches <- length(index1)
num <- is_num(pheno)
eval_n <- evaluations(num)
original_pheno <- .evaluate_orig(pheno, num)
original_pheno["na", ] <- original_pheno["na", ]/batches
ci1 <- .check_index(index1, pheno, num, eval_n, original_pheno)
ci2 <- .check_index(index2, pheno, num, eval_n, original_pheno)
ci1 - ci2
}
apply_index <- function(pheno, index, name = "old_rows") {
stopifnot(is.character(name) && length(name) == 1)
old_rows <- sort(unlist(index, FALSE, FALSE))
pheno <- pheno[old_rows, , drop = FALSE]
add_column(pheno, old_rows, name)
}