/
tax_expand_time.R
139 lines (129 loc) · 5.59 KB
/
tax_expand_time.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
#' Generate pseudo-occurrences from temporal range data
#'
#' A function to generate interval-level pseudo-occurrences for taxa based on
#' temporal ranges (e.g. the output of \code{\link{tax_range_time}}). While the
#' resulting pseudo-occurrences should not be treated as equivalent to actual
#' occurrence data (e.g. like that from the Paleobiology Database), such
#' pseudo-occurrences may be useful for performing statistical analyses where
#' the row representing a taxon must be replicated for each interval through
#' which the taxon persisted.
#'
#' @param taxdf \code{dataframe}. A dataframe of taxa (such as that
#' produced by \code{\link{tax_range_time}}) with columns for the maximum and
#' minimum ages (FADs and LADs). Each row should represent a unique taxon.
#' Additional columns may be included (e.g. taxon names, additional taxonomy,
#' etc) and will be included in the returned \code{data.frame}. If required,
#' `numeric` ages can be generated from interval names via the
#' \code{\link{look_up}} function.
#' @param max_ma \code{character}. The name of the column you wish to be treated
#' as the maximum limit (FADs) of the age range (e.g. "max_ma").
#' @param min_ma \code{character}. The name of the column you wish to be treated
#' as the minimum limit (LADs) of the age range (e.g. "min_ma").
#' @param bins \code{dataframe}. A dataframe of the bins that you wish to
#' allocate pseudo-occurrences to such as that returned by
#' \code{\link[palaeoverse:time_bins]{time_bins()}}. This dataframe must
#' contain at least the following named columns: "bin", "max_ma" and
#' "min_ma". Columns "max_ma" and "min_ma" must be `numeric` values.
#' @param scale \code{character}. Specify the desired geological timescale to be
#' used, either "GTS2020" or "GTS2012". Passed to
#' \code{\link[palaeoverse:time_bins]{time_bins()}} if \code{bins} is not
#' specified.
#' @param rank \code{character}. Specify the desired stratigraphic rank. Choose
#' from: "stage", "epoch", "period", "era", and "eon". Passed to
#' \code{\link[palaeoverse:time_bins]{time_bins()}} if \code{bins} is not
#' specified.
#' @param ext_orig \code{logical}. Should two additional columns be added to
#' identify the intervals in which taxa originated and went extinct?
#'
#' @return A \code{dataframe} where each row represents an interval during which
#' a taxon in the original user-supplied data persisted. The columns are
#' identical to those in the user-supplied data with additional columns
#' included to identify the intervals. If \code{ext_orig} is \code{TRUE},
#' two additional columns are added to identify in which intervals taxa
#' originated and went extinct.
#' @section Developer(s):
#' William Gearty & Lewis A. Jones
#' @section Reviewer(s):
#' Lewis A. Jones
#' @export
#' @examples
#' taxdf <- data.frame(name = c("A", "B", "C"),
#' max_ma = c(150, 60, 30),
#' min_ma = c(110, 20, 0))
#' ex <- tax_expand_time(taxdf)
#'
#' bins <- time_bins(scale = "GTS2012", rank = "stage")
#' ex2 <- tax_expand_time(taxdf, bins = bins)
tax_expand_time <- function(
taxdf,
max_ma = "max_ma",
min_ma = "min_ma",
bins = NULL,
scale = "GTS2020",
rank = "stage",
ext_orig = TRUE) {
# Handle errors
if (is.data.frame(taxdf) == FALSE) {
stop("`taxdf` should be a dataframe")
}
if (!all(c(min_ma, max_ma) %in% colnames(taxdf))) {
stop("Either `min_ma` or `max_ma` is not a named column in `taxdf`")
}
if (!is.numeric(taxdf[, max_ma])) {
stop("The class of the max_ma column must be numeric.")
}
if (!is.numeric(taxdf[, min_ma])) {
stop("The class of the min_ma column must be numeric.")
}
if (any(taxdf[, c(min_ma, max_ma)] < 0)) {
stop("Maximum and minimum ages must be positive.")
}
if (any(taxdf[, max_ma] < taxdf[, min_ma])) {
stop("Maximum ages must be larger than or equal to minimum ages.")
}
if (length(rank) > 1 ||
!(rank %in% c("stage", "epoch", "period", "era", "eon"))) {
stop("`rank` must be either: stage, epoch, period, era, or eon")
}
if (!is.logical(ext_orig)) {
stop("`ext_orig` should be logical (TRUE/FALSE)")
}
if (is.null(bins) && (is.null(scale) || is.null(rank))) {
stop("Either `bin` or `scale` and `rank` must be specified.")
}
if (!is.null(bins)) {
if (is.data.frame(bins) == FALSE) {
stop("`bins` should be a dataframe.")
}
if (!all(c("bin", "max_ma", "min_ma") %in% colnames(bins))) {
stop("bin, max_ma and/or min_ma do not exist in `bins`.")
}
} else {
# get the desired timescale at the desired rank
bins <- time_bins(rank = rank, scale = scale)
}
if (any(duplicated(taxdf))) {
stop("Not all rows in `taxdf` are unique!")
}
# add a taxon index column (since we can't guarantee there's a "name" column)
# use a very unique column name so we don't clobber any existing columns
taxdf$this_is_a_unique_index_column_name <- seq_len(nrow(taxdf))
# replicate taxon rows for each interval they span
dat_list <- lapply(seq_len(nrow(bins)), function(i) {
int_tax <- taxdf[taxdf[, min_ma] < bins$max_ma[i] &
taxdf[, max_ma] > bins$min_ma[i], ]
if (ext_orig) {
int_tax$ext <- int_tax[, min_ma] >= bins$min_ma[i] &
int_tax[, min_ma] > 0
int_tax$orig <- int_tax[, max_ma] <= bins$max_ma[i]
}
if (nrow(int_tax) == 0) return(NULL)
cbind(int_tax, bins[i, ])
})
dat <- do.call(rbind, dat_list)
rownames(dat) <- NULL
dat <- dat[order(dat$this_is_a_unique_index_column_name), ]
# remove the index column
dat$this_is_a_unique_index_column_name <- NULL
dat
}