diff --git a/DESCRIPTION b/DESCRIPTION index 5f92b1dcb..04beda37a 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -1,6 +1,6 @@ Package: TreeTools Title: Create, Modify and Analyse Phylogenetic Trees -Version: 2.1.0.9003 +Version: 2.1.0.9004 Authors@R: c( person("Martin R.", 'Smith', role = c("aut", "cre", "cph"), email = "martin.smith@durham.ac.uk", diff --git a/NEWS.md b/NEWS.md index b5dd67246..bf4bb119a 100644 --- a/NEWS.md +++ b/NEWS.md @@ -1,3 +1,7 @@ +# TreeTools 2.1.0.9004 (2026-03-12) # + +- Rewrite popcount calculation for more efficient `TipsInSplits()`. + # TreeTools 2.1.0.9003 (2026-03-09) # - `inst/include/TreeTools/tree_number.h` added to support downstream packages diff --git a/benchmark/_compare_results.R b/benchmark/_compare_results.R index be1f731f9..b0aded68c 100644 --- a/benchmark/_compare_results.R +++ b/benchmark/_compare_results.R @@ -50,12 +50,18 @@ for (pr_file in pr_files) { main_iqr) threshold_percent <- 6 # Changes of ~5% are frequent + # Sub-millisecond benchmarks are dominated by system jitter on CI runners; + # require an absolute difference floor before flagging. + min_meaningful_diff <- 2e-4 # 0.2 ms (times are in seconds) + abs_diff <- abs(median_pr - median_main) is_faster <- matched && + abs_diff > min_meaningful_diff && median_pr < median_main - 2 * mad_main && median_pr < noise_range[[1]] is_slower <- matched && + abs_diff > min_meaningful_diff && median_pr > median_main + 2 * mad_main && median_pr > noise_range[[2]] diff --git a/benchmark/_init.R b/benchmark/_init.R index 170ac736e..11af7c836 100644 --- a/benchmark/_init.R +++ b/benchmark/_init.R @@ -1,8 +1,9 @@ library("TreeTools") -Benchmark <- function(..., min_iterations = NULL) { - result <- bench::mark(..., min_iterations = min_iterations %||% 3, - time_unit = "us") +Benchmark <- function(..., min_iterations = NULL, min_time = NULL) { + args <- list(..., min_iterations = min_iterations %||% 3, time_unit = "us") + if (!is.null(min_time)) args[["min_time"]] <- min_time + result <- do.call(bench::mark, args) if (interactive()) { print(result) } else { diff --git a/benchmark/bench-PathLengths.R b/benchmark/bench-PathLengths.R index 5bac89394..82a3b8e72 100644 --- a/benchmark/bench-PathLengths.R +++ b/benchmark/bench-PathLengths.R @@ -2,12 +2,13 @@ source("benchmark/_init.R") # sets seed tr80 <- rtree(80) -Benchmark(TreeTools:::path_lengths(tr80$edge, tr80$edge.length, FALSE)) -Benchmark(PathLengths(tr80, full = TRUE)) +Benchmark(TreeTools:::path_lengths(tr80$edge, tr80$edge.length, FALSE), + min_time = 2) +Benchmark(PathLengths(tr80, full = TRUE), min_time = 2) tr80Unif <- tr80 tr80Unif[["edge.length"]] <- NULL -Benchmark(PathLengths(tr80Unif, full = TRUE)) +Benchmark(PathLengths(tr80Unif, full = TRUE), min_time = 2) tr2000 <- rtree(2000) Benchmark(PathLengths(tr2000, full = TRUE)) diff --git a/inst/include/TreeTools/SplitList.h b/inst/include/TreeTools/SplitList.h index e9217d562..e827d8d9d 100644 --- a/inst/include/TreeTools/SplitList.h +++ b/inst/include/TreeTools/SplitList.h @@ -38,26 +38,25 @@ namespace TreeTools { return T(1) << bit_pos; } -#if __cplusplus >= 202002L -#include // C++20 header for std::popcount +// Hardware POPCNT: available on all x86-64 since 2008 (Nehalem / Barcelona). + // Inline asm emits the instruction directly, without requiring -mpopcnt. +#if (defined(__GNUC__) || defined(__clang__)) && defined(__x86_64__) inline int32 count_bits(splitbit x) { - return static_cast(std::popcount(x)); + uint64_t result; + __asm__ ("popcnt %1, %0" : "=r" (result) : "r" (x)); + return static_cast(result); } - // Option 2: Fallback for C++17 and older -#else -#if defined(__GNUC__) || defined(__clang__) - // GCC and Clang support __builtin_popcountll for long long - inline int32 count_bits(splitbit x) { - return static_cast(__builtin_popcountll(x)); - } -#elif defined(_MSC_VER) +#elif defined(_MSC_VER) && defined(_M_X64) #include inline int32 count_bits(splitbit x) { return static_cast(__popcnt64(x)); } +#elif defined(__GNUC__) || defined(__clang__) + // Non-x86 (ARM, etc.): builtin maps to efficient native instruction + inline int32 count_bits(splitbit x) { + return static_cast(__builtin_popcountll(x)); + } #else - // A slower, but safe and highly portable fallback for all other compilers - // This is a last resort if no built-in is available. inline int32_t count_bits(splitbit x) { int32_t count = 0; while (x != 0) { @@ -66,9 +65,7 @@ namespace TreeTools { } return count; } -#endif // Compiler check for builtins - -#endif // C++20 check +#endif class SplitList { public: diff --git a/src/tips_in_splits.cpp b/src/tips_in_splits.cpp index 0b46014e5..942f23e48 100644 --- a/src/tips_in_splits.cpp +++ b/src/tips_in_splits.cpp @@ -22,6 +22,10 @@ Rcpp::IntegerVector tips_in_splits(Rcpp::RawMatrix splits) { const unsigned char* splits_i = splits.begin() + i; for (int32 bin = 0; bin < n_bin; ++bin) { const unsigned char* in_bin = splits_i + bin * n_split; + // __builtin_popcount on 8-bit values; the inline asm POPCNT in + // count_bits (SplitList.h) handles 64-bit splitbit values. + // For 8-bit values, __builtin_popcount is fine — it's a tiny + // fraction of runtime and not worth inline asm. ret[i] += static_cast(__builtin_popcount(*in_bin)); } }