Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
9 changes: 8 additions & 1 deletion .github/workflows/ASan.yml
Original file line number Diff line number Diff line change
Expand Up @@ -52,4 +52,11 @@ jobs:
# adds no coverage to TreeSearch's own compiled code. All Rogue use
# (two vignettes + the Shiny consensus module) is requireNamespace-
# guarded and skips cleanly when absent.
exclude-packages: Rogue
#
# MaxMin is GitHub-only (Remotes: ms609/MaxMin); pak::pkg_install()
# resolves the vignettes leg's Suggests as plain CRAN refs and can't
# see the local DESCRIPTION's Remotes mapping, so it fails the whole
# dependency solve with "Can't find package called MaxMin". All
# MaxMin use (WideSample()) is requireNamespace-guarded and skips
# cleanly when absent.
exclude-packages: Rogue,MaxMin
2 changes: 1 addition & 1 deletion DESCRIPTION
Original file line number Diff line number Diff line change
Expand Up @@ -77,7 +77,7 @@ Config/Needs/check:
pkgbuild,
rcmdcheck,
Config/Needs/coverage: covr, spelling
Config/Needs/memcheck: devtools, pkgdown, remotes, testthat
Config/Needs/memcheck: devtools, pkgdown, testthat, phangorn
Config/Needs/metadata: codemeta
Config/Needs/revdeps: revdepcheck
Config/Needs/website:
Expand Down
37 changes: 37 additions & 0 deletions R/MaximizeParsimony.R
Original file line number Diff line number Diff line change
Expand Up @@ -92,13 +92,22 @@

consMat <- matrix(unlist(constraint, use.names = FALSE),
nrow = length(constraint), byrow = TRUE)
# For each constraint character, record the tips unambiguously in the "1"
# group (derived state present, ancestral absent) and, separately, those in
# the "0" group (ancestral present, derived absent). Tips ambiguous for the
# character ("?", or unconstrained taxa) belong to neither group and are free
# to plot on either side of the split.
consSplits <- matrix(0L, nrow = ncol(consMat), ncol = length(constraint))
consZero <- matrix(0L, nrow = ncol(consMat), ncol = length(constraint))
for (ch in seq_len(ncol(consMat))) {
for (tip in seq_len(length(constraint))) {
token <- consMat[tip, ch]
if (consContrast[token, nConsStates] == 1 &&
consContrast[token, 1] == 0) {
consSplits[ch, tip] <- 1L
} else if (consContrast[token, 1] == 1 &&
consContrast[token, nConsStates] == 0) {
consZero[ch, tip] <- 1L
}
}
}
Expand All @@ -108,8 +117,36 @@
s >= 1 && s < length(constraint) - 1
})
consSplits <- consSplits[keep, , drop = FALSE]
consZero <- consZero[keep, , drop = FALSE]
if (nrow(consSplits) == 0L) return(list())

# Every returned tree must display all constraint splits simultaneously.
# Two splits are jointly displayable iff they are compatible in the
# four-gamete sense: treating each as a bipartition of the tips it
# constrains (its "1" group vs its "0" group, ambiguous tips excluded), the
# pair is compatible iff at least one of the four group intersections is
# empty. A laminar (nested-or-disjoint) test alone is too strict: it rejects
# the case where the two "0" groups are disjoint -- i.e. the splits' "1"
# sides jointly cover the constrained tips -- which is perfectly displayable,
# e.g. ab | cef and abcd | ef coexist on ((a,b),(d,(c,(e,f)))).
nSplits <- nrow(consSplits)
if (nSplits > 1L) {
for (i in seq_len(nSplits - 1L)) {
aOne <- consSplits[i, ] == 1L
aZero <- consZero[i, ] == 1L
for (j in seq(i + 1L, nSplits)) {
bOne <- consSplits[j, ] == 1L
bZero <- consZero[j, ] == 1L
compatible <- !any(aOne & bOne) || !any(aOne & bZero) ||
!any(aZero & bOne) || !any(aZero & bZero)
if (!compatible) {
stop("Constraint is impossible to satisfy: splits ", i, " and ", j,
" are incompatible (all four taxon groupings co-occur)")
}
}
}
}

consWeight <- attr(constraint, "weight")
consExpectedScore <- sum(
MinimumLength(constraint, compress = TRUE) * consWeight
Expand Down
9 changes: 7 additions & 2 deletions src/ts_tree.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -52,8 +52,13 @@ void TreeState::load_tip_states(const DataSet& ds) {
// T-262: bulk memcpy replaces per-element loop. Tip states occupy the
// first n_tip * total_words entries of prelim/final_ (contiguous).
size_t tip_bytes = static_cast<size_t>(n_tip) * total_words * sizeof(uint64_t);
std::memcpy(prelim.data(), ds.tip_states.data(), tip_bytes);
std::memcpy(final_.data(), ds.tip_states.data(), tip_bytes);
// All characters can be uninformative (e.g. every state a singleton),
// leaving zero blocks and a zero-length tip_states vector; its .data()
// is then permitted to be null, which memcpy's nonnull attribute forbids.
if (tip_bytes > 0) {
std::memcpy(prelim.data(), ds.tip_states.data(), tip_bytes);
std::memcpy(final_.data(), ds.tip_states.data(), tip_bytes);
}

// Initialise tip subtree_actives: applicable states only (NA word = 0).
// For {-,X} tips, applicable state bits are preserved here; the
Expand Down
37 changes: 27 additions & 10 deletions src/ts_wagner.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -478,13 +478,26 @@ WagnerResult wagner_tree(TreeState& tree, const DataSet& ds,
int tip = order[i];
int new_internal = n_tip + i - 1;

const uint64_t* tip_prelim =
&ds.tip_states[static_cast<size_t>(tip) * ds.total_words];
// When there are no Fitch words — e.g. every character is a hierarchy
// character recoded to a Sankoff character under inapplicable = "xform",
// leaving the equal-weights dataset with zero blocks — the per-word state
// vectors (ds.tip_states, edge_set) are empty and every Fitch insertion
// cost is identically zero. Skip the edge-set precompute and the indirect
// length evaluation, which would otherwise take the address of element 0 of
// an empty vector (undefined behaviour; aborts under _GLIBCXX_ASSERTIONS).
// The DFS below still runs, so constraints are honoured and — all costs
// being equal — the first legal edge is chosen.
const bool have_words = ds.total_words > 0;
const uint64_t* tip_prelim = have_words
? &ds.tip_states[static_cast<size_t>(tip) * ds.total_words]
: nullptr;

// Exact insertion cost: edge_set[D] = combine(prelim[D], up[D]). Replaces
// the union-of-finals (final_[node] | final_[child]) approximation that
// undercut insertion cost and produced ~+30% Wagner trees.
compute_insertion_edge_sets(tree, ds, edge_set, edge_set_up, edge_set_pre);
if (have_words) {
compute_insertion_edge_sets(tree, ds, edge_set, edge_set_up, edge_set_pre);
}
const int tw = tree.total_words;

// Find best insertion edge via DFS from root
Expand All @@ -508,9 +521,11 @@ WagnerResult wagner_tree(TreeState& tree, const DataSet& ds,
if (!constrained ||
!wagner_edge_violates_constraint(tree, lc, tip, *cd,
added_tips)) {
int extra = fitch_indirect_length_cached(
tip_prelim, &edge_set[static_cast<size_t>(lc) * tw],
ds, best_extra);
int extra = have_words
? fitch_indirect_length_cached(
tip_prelim, &edge_set[static_cast<size_t>(lc) * tw],
ds, best_extra)
: 0;
if (extra < best_extra) {
best_extra = extra;
best_above = node;
Expand All @@ -522,9 +537,11 @@ WagnerResult wagner_tree(TreeState& tree, const DataSet& ds,
if (!constrained ||
!wagner_edge_violates_constraint(tree, rc, tip, *cd,
added_tips)) {
int extra = fitch_indirect_length_cached(
tip_prelim, &edge_set[static_cast<size_t>(rc) * tw],
ds, best_extra);
int extra = have_words
? fitch_indirect_length_cached(
tip_prelim, &edge_set[static_cast<size_t>(rc) * tw],
ds, best_extra)
: 0;
if (extra < best_extra) {
best_extra = extra;
best_above = node;
Expand Down Expand Up @@ -1068,7 +1085,7 @@ void random_constrained_tree(TreeState& tree, const DataSet& ds,

// Child splits whose parent is this split
for (int j = 0; j < i; ++j) {
if (parent_split[j] == i) {
if (parent_split[j] == i && split_root[j] >= 0) {
items.push_back(split_root[j]);
}
}
Expand Down
38 changes: 38 additions & 0 deletions tests/testthat/test-ts-hsj.R
Original file line number Diff line number Diff line change
Expand Up @@ -768,3 +768,41 @@ test_that("MaximizeParsimony HSJ + sectorial search stays score-consistent", {
expect_equal(attr(result2, "score"), reported)
expect_equal(length(result2), length(result))
})


# =========================================================================
# Test: all-hierarchy data -> zero Fitch words (regression)
# =========================================================================
# HSJ, like xform, zero-weights every hierarchy character, so a dataset whose
# characters are ALL hierarchical leaves the equal-weights dataset empty
# (DataSet::total_words == 0, empty per-word state vectors). wagner_tree()
# indexed element 0 of those empty vectors (&ds.tip_states[0]) — undefined
# behaviour that aborted under the hardened libstdc++ assertions in the
# gcc-ASAN CI (run 28662381835). The HSJ search must still build a valid tree
# from the hierarchy DP term alone.

test_that("HSJ search handles all-hierarchy data (zero Fitch words)", {
mat <- matrix(c(
"1", "0", "0", "-", "-",
"1", "1", "1", "0", "1",
"0", "-", "1", "1", "0",
"1", "0", "1", "0", "0",
"0", "-", "0", "-", "-",
"1", "1", "0", "-", "-"
), nrow = 6, byrow = TRUE, dimnames = list(paste0("t", 1:6), NULL))
ds <- make_hsj_dat(mat)
# Two blocks span ALL five characters -> no non-hierarchy chars remain.
h <- CharacterHierarchy("1" = 2L, "3" = 4:5)
expect_length(setdiff(seq_len(5L), HierarchyChars(h)), 0L)

set.seed(42)
res <- MaximizeParsimony(ds, hierarchy = h, inapplicable = "hsj",
hsj_alpha = 1.0, maxReplicates = 4L,
targetHits = 3L, verbosity = 0L)
expect_s3_class(res[[1]], "phylo")
expect_equal(length(res[[1]]$tip.label), 6L)
for (tr in res) {
expect_s3_class(tr, "phylo")
expect_true(TreeIsRooted(tr))
}
})
105 changes: 105 additions & 0 deletions tests/testthat/test-ts-random-constrained.R
Original file line number Diff line number Diff line change
Expand Up @@ -151,6 +151,111 @@ test_that("RANDOM_TREE with IW scoring + constraints", {
}
})

## T-329: a genuinely incompatible constraint must error cleanly rather than
## reach random_constrained_tree(), which cannot represent splits that no tree
## can display simultaneously. Incompatibility is the four-gamete condition
## (all four taxon groupings co-occur), NOT mere non-laminarity: non-laminar
## splits whose "0" sides are disjoint (e.g. {t1,t2,t3} & {t3,t4,t5}, which
## coexist on ((t1,t2),t3,(t4,t5))) ARE displayable and must be accepted.
test_that("impossible (four-gamete-incompatible) constraint errors cleanly", {
ds5 <- make_ds5()

# Split A = {t1,t2,t3}, split B = {t2,t3,t4}: every taxon grouping
# (A1&B1={t2,t3}, A1&B0={t1}, A0&B1={t4}, A0&B0={t5}) is non-empty, so the
# four-gamete test fails and no tree can display both splits at once.
consMat <- matrix(c("1", "1", "1", "0", "0",
"0", "1", "1", "1", "0"),
nrow = 5, dimnames = list(paste0("t", 1:5), NULL))
cons <- phangorn::phyDat(consMat, type = "USER", levels = c("0", "1"))

expect_error(
TreeSearch:::.PrepareConstraint(cons, ds5),
"impossible"
)
expect_error(
MaximizeParsimony(ds5, constraint = cons,
maxReplicates = 1L, verbosity = 0L),
"impossible"
)
})

## T-329 (regression): a non-laminar constraint whose splits are nevertheless
## four-gamete-COMPATIBLE (their "0" sides are disjoint) must be ACCEPTED. The
## original laminar-only check wrongly rejected {t1,t2,t3} & {t3,t4,t5}, which
## coexist on the ordinary unrooted tree ((t1,t2),t3,(t4,t5)).
test_that("non-laminar but compatible constraint is accepted", {
ds5 <- make_ds5()

consMat <- matrix(c("1", "1", "1", "0", "0",
"0", "0", "1", "1", "1"),
nrow = 5, dimnames = list(paste0("t", 1:5), NULL))
cons <- phangorn::phyDat(consMat, type = "USER", levels = c("0", "1"))

expect_silent(TreeSearch:::.PrepareConstraint(cons, ds5))
})

## T-329 (defensive fix): random_constrained_tree() must never inject a
## collapsed split's phantom node (-1) into its parent's item list.
##
## A split "collapses" (split_root == -1) when every tip it owns is stolen
## by tighter, non-laminar splits that are not its formal children. If that
## collapsed split still has a strict-superset parent, the parent's child
## collection loop (src/ts_wagner.cpp, "Child splits whose parent is this
## split") must skip it rather than push -1.
##
## This constructs that scenario directly via the internal flat
## ts_driven_search() interface, bypassing .PrepareConstraint()'s R-level
## laminarity check (T-329's primary fix) so the C++ guard itself is
## exercised:
## D1 = {t1,t6}, D2 = {t2,t7}, D3 = {t3,t8} (tight, non-laminar splits)
## C = {t1,t2,t3} (loses every tip to D1-D3)
## S = {t1,t2,t3,t6,t7,t8} (strict superset of C, D1-D3)
## C collapses (split_root == -1) but is still S's formal child, so S's
## build loop must skip it. wagnerBias = 3L forces the RANDOM_TREE
## strategy so random_constrained_tree() is actually invoked.
test_that("collapsed constraint split with superset parent does not corrupt tree", {
tipNames <- c("t9", "t1", "t2", "t3", "t6", "t7", "t8")
ds <- phangorn::phyDat(
matrix(c("0", "1", "0", "1", "0", "1", "0",
"1", "0", "1", "0", "1", "0", "1"),
nrow = 7, dimnames = list(tipNames, NULL)),
type = "USER", levels = c("0", "1")
)
at <- attributes(ds)

consSplitMatrix <- matrix(c(
0, 1, 0, 0, 1, 0, 0, # D1 = {t1, t6}
0, 0, 1, 0, 0, 1, 0, # D2 = {t2, t7}
0, 0, 0, 1, 0, 0, 1, # D3 = {t3, t8}
0, 1, 1, 1, 0, 0, 0, # C = {t1, t2, t3} (collapses)
0, 1, 1, 1, 1, 1, 1 # S = {t1, t2, t3, t6, t7, t8}
), nrow = 5, byrow = TRUE)

for (s in c(1L, 2L, 3L)) {
set.seed(s)
res <- TreeSearch:::ts_driven_search(
contrast = at$contrast,
tip_data = matrix(unlist(ds, use.names = FALSE), nrow = 7,
byrow = TRUE),
weight = at$weight,
levels = at$levels,
maxReplicates = 1L, targetHits = 1L, verbosity = 0L,
wagnerBias = 3L,
consSplitMatrix = consSplitMatrix
)
edge <- res$trees[[1]]
n_tip <- 7L
n_node <- 2L * n_tip - 1L

expect_equal(nrow(edge), 2L * n_tip - 2L, info = paste("seed", s))
expect_true(all(edge >= 1L & edge <= n_node), info = paste("seed", s))
# Every node 1..n_node bar the root appears as a child exactly once.
expect_equal(sort(unique(edge[, 2])), setdiff(seq_len(n_node),
n_tip + 1L),
info = paste("seed", s))
}
})

test_that("parallel RANDOM_TREE with multiple seeds (nThreads=2)", {
ds5 <- make_ds5()
cons <- ape::read.tree(text = "((t1,t2),(t3,(t4,t5)));")
Expand Down
37 changes: 37 additions & 0 deletions tests/testthat/test-ts-xform.R
Original file line number Diff line number Diff line change
Expand Up @@ -304,3 +304,40 @@ test_that("Xform scores heterogeneous-n_states blocks consistently (SK-01)", {
TreeLength(res[[1]], ds, hierarchy = h, inapplicable = "xform")
)
})


# ===== All-hierarchy data: zero Fitch words (regression) =====================
# When EVERY character belongs to a hierarchy block, recoding removes all
# characters from the equal-weights (Fitch) dataset, so DataSet::total_words
# is 0 and the per-word state vectors (tip_states, edge_set) are empty.
# wagner_tree() took the address of element 0 of these empty vectors
# (&ds.tip_states[0], &edge_set[0]) — undefined behaviour that aborted under
# the hardened libstdc++ assertions in the gcc-ASAN CI (run 28662381835,
# ts-xform group). With no Fitch signal the search must still build a valid
# tree from the Sankoff term alone. This test exercises that path so the
# hardened/ASAN CI covers it.

test_that("Xform search handles all-hierarchy data (zero Fitch words)", {
mat <- matrix(c(
"1", "0", "0", "-", "-",
"1", "1", "1", "0", "1",
"0", "-", "1", "1", "0",
"1", "0", "1", "0", "0",
"0", "-", "0", "-", "-",
"1", "1", "0", "-", "-"
), nrow = 6, byrow = TRUE, dimnames = list(paste0("t", 1:6), NULL))
ds <- make_dat(mat)
# Two blocks span ALL five characters -> no non-hierarchy chars remain.
h <- CharacterHierarchy("1" = 2L, "3" = 4:5)
expect_length(setdiff(seq_len(5L), HierarchyChars(h)), 0L)

set.seed(42)
res <- MaximizeParsimony(ds, hierarchy = h, inapplicable = "xform",
maxReplicates = 4L, targetHits = 3L, verbosity = 0L)
expect_s3_class(res[[1]], "phylo")
expect_equal(length(res[[1]]$tip.label), 6L)
for (tr in res) {
expect_s3_class(tr, "phylo")
expect_true(TreeIsRooted(tr))
}
})
Loading