diff --git a/.Rbuildignore b/.Rbuildignore index 9c0acea0..2ed440ba 100644 --- a/.Rbuildignore +++ b/.Rbuildignore @@ -9,6 +9,7 @@ CONTRIBUTING README.md ^\.github +^\.lintr cran-comments.md man-roxygen data-raw diff --git a/.github/workflows/R-CMD-check.yml b/.github/workflows/R-CMD-check.yml index d77ec88f..9784dbfd 100644 --- a/.github/workflows/R-CMD-check.yml +++ b/.github/workflows/R-CMD-check.yml @@ -50,7 +50,7 @@ jobs: - {os: macOS-latest, r: 'release'} - {os: ubuntu-22.04, r: '4.1', rspm: "https://packagemanager.rstudio.com/cran/__linux__/jammy/latest"} - - {os: ubuntu-latest, r: 'release', + - {os: ubuntu-24.04, r: 'release', rspm: "https://packagemanager.rstudio.com/cran/__linux__/noble/latest"} - {os: ubuntu-latest, r: 'devel', rspm: "https://packagemanager.rstudio.com/cran/__linux__/noble/latest"} @@ -76,12 +76,6 @@ jobs: r-version: ${{ matrix.config.r }} extra-repositories: https://ms609.github.io/packages/ - - name: Install apt packages (Linux) - if: runner.os == 'Linux' - run: | - sudo apt-get update - sudo apt-get install -y texlive-latex-base libglpk-dev texlive-fonts-recommended - - name: Set up R dependencies (Windows) if: runner.os == 'Windows' uses: r-lib/actions/setup-r-dependencies@v2 @@ -126,3 +120,23 @@ jobs: if: runner.os == 'Windows' run: covr::codecov() shell: Rscript {0} + + - name: Notify on failure + if: failure() && github.event_name == 'schedule' + uses: actions/github-script@v7 + with: + github-token: ${{ secrets.GITHUB_TOKEN }} + script: | + await github.rest.issues.createComment({ + owner: context.repo.owner, + repo: context.repo.repo, + issue_number: 164, + body: 'Scheduled workflow has failed: https://github.com/${{ github.repository }}/actions/runs/${{ github.run_id }}' + }); + + await github.rest.issues.update({ + owner: context.repo.owner, + repo: context.repo.repo, + issue_number: 164, + state: 'open' + }); diff --git a/.github/workflows/benchmark.yml b/.github/workflows/benchmark.yml index 6e6334e2..5e8c0e2f 100644 --- a/.github/workflows/benchmark.yml +++ b/.github/workflows/benchmark.yml @@ -3,10 +3,28 @@ name: Benchmark on: workflow_dispatch: pull_request: + paths: + - "src/**" + - "R/**" + - "**.R" + - "**.cpp" + - "**.c" + - "**.h" + - "**.hpp" + - "configure*" + - "Makevars*" paths-ignore: + - "DESCRIPTION" + - ".**" - "Meta**" - "memcheck**" - "docs**" + - "inst**" + - "man**" + - "man-roxygen**" + - "memcheck**" + - "tests**" + - "vignettes**" - "**.git" - "**.json" - "**.md" diff --git a/.github/workflows/memcheck.yml b/.github/workflows/memcheck.yml index 4c5f687d..4b9665a6 100644 --- a/.github/workflows/memcheck.yml +++ b/.github/workflows/memcheck.yml @@ -34,12 +34,8 @@ on: name: mem-check jobs: - mem-check: - runs-on: ubuntu-20.04 - # stringi requires libicui18n - apt get libicu-dev too recent, - # libicu66 deprecated in ubuntu 22.04 - # Reinstalling stringi seems not to help - + mem-check-templated: + runs-on: ubuntu-24.04 name: valgrind ${{ matrix.config.test }}, ubuntu, R release strategy: @@ -53,10 +49,32 @@ jobs: env: R_REMOTES_NO_ERRORS_FROM_WARNINGS: true _R_CHECK_FORCE_SUGGESTS_: false - RSPM: https://packagemanager.rstudio.com/cran/__linux__/focal/latest + RSPM: https://packagemanager.rstudio.com/cran/__linux__/noble/latest GITHUB_PAT: ${{ secrets.GITHUB_TOKEN }} steps: + - uses: ms609/actions/memcheck@main + with: + test: ${{ matrix.config.test}} + + mem-check-legacy: + runs-on: ubuntu-24.04 + name: valgrind ${{ matrix.config.test }}, ubuntu, R release + + strategy: + fail-fast: false + matrix: + config: + - {test: 'tests'} + - {test: 'examples'} + - {test: 'vignettes'} + + env: + R_REMOTES_NO_ERRORS_FROM_WARNINGS: true + _R_CHECK_FORCE_SUGGESTS_: false + RSPM: https://packagemanager.rstudio.com/cran/__linux__/noble/latest + GITHUB_PAT: ${{ secrets.GITHUB_TOKEN }} + - uses: actions/checkout@v5 - uses: r-lib/actions/setup-r@v2 diff --git a/.github/workflows/rhub.yaml b/.github/workflows/rhub.yaml index 053f72c9..843b1ad9 100644 --- a/.github/workflows/rhub.yaml +++ b/.github/workflows/rhub.yaml @@ -1,8 +1,8 @@ -# R-hub's generic GitHub Actions workflow file. Its canonical location is at -# https://github.com/r-hub/rhub2/blob/v1/inst/workflow/rhub.yaml +# R-hub's generic GitHub Actions workflow file. It's canonical location is at +# https://github.com/r-hub/actions/blob/v1/workflows/rhub.yaml # You can update this file to a newer version using the rhub2 package: # -# rhub2::rhub_setup() +# rhub::rhub_setup() # # It is unlikely that you need to modify this file manually. @@ -33,7 +33,7 @@ jobs: steps: # NO NEED TO CHECKOUT HERE - - uses: r-hub/rhub2/actions/rhub-setup@v1 + - uses: r-hub/actions/setup@v1 with: config: ${{ github.event.inputs.config }} id: rhub-setup @@ -51,27 +51,17 @@ jobs: image: ${{ matrix.config.container }} steps: - - name: Check distribution - run: | - echo "distribution=$(awk -F= '/^ID=/{print $2}' /etc/os-release)" \ - >> $GITHUB_OUTPUT; - id: check_distribution - - name: apt-get install sudo (Ubuntu, for clang-asan) - run: apt-get install sudo - if: ${{ steps.check_distribution.outputs.distribution == 'ubuntu' }} - - uses: r-lib/actions/setup-pandoc@v2 - if: ${{ steps.check_distribution.outputs.distribution != 'fedora' }} - - uses: r-hub/rhub2/actions/rhub-checkout@v1 - - uses: r-hub/rhub2/actions/rhub-platform-info@v1 + - uses: r-hub/actions/checkout@v1 + - uses: r-hub/actions/platform-info@v1 with: token: ${{ secrets.RHUB_TOKEN }} job-config: ${{ matrix.config.job-config }} - - uses: r-hub/rhub2/actions/rhub-setup-deps@v1 + - uses: r-hub/actions/setup-deps@v1 with: token: ${{ secrets.RHUB_TOKEN }} job-config: ${{ matrix.config.job-config }} extra-packages: TreeDistData=?ignore-before-r=99.0.0 - - uses: r-hub/rhub2/actions/rhub-run-check@v1 + - uses: r-hub/actions/run-check@v1 with: token: ${{ secrets.RHUB_TOKEN }} job-config: ${{ matrix.config.job-config }} @@ -87,25 +77,21 @@ jobs: config: ${{ fromJson(needs.setup.outputs.platforms) }} steps: - - uses: teatimeguest/setup-texlive-action@v3 - with: - packages: scheme-basic - - uses: r-lib/actions/setup-pandoc@v2 - - uses: r-hub/rhub2/actions/rhub-checkout@v1 - - uses: r-hub/rhub2/actions/rhub-setup-r@v1 + - uses: r-hub/actions/checkout@v1 + - uses: r-hub/actions/setup-r@v1 with: job-config: ${{ matrix.config.job-config }} token: ${{ secrets.RHUB_TOKEN }} - - uses: r-hub/rhub2/actions/rhub-platform-info@v1 + - uses: r-hub/actions/platform-info@v1 with: token: ${{ secrets.RHUB_TOKEN }} job-config: ${{ matrix.config.job-config }} - - uses: r-hub/rhub2/actions/rhub-setup-deps@v1 + - uses: r-hub/actions/setup-deps@v1 with: job-config: ${{ matrix.config.job-config }} token: ${{ secrets.RHUB_TOKEN }} extra-packages: TreeDistData=?ignore-before-r=99.0.0 - - uses: r-hub/rhub2/actions/rhub-run-check@v1 + - uses: r-hub/actions/run-check@v1 with: job-config: ${{ matrix.config.job-config }} token: ${{ secrets.RHUB_TOKEN }} diff --git a/.gitignore b/.gitignore index e713cc8c..d0a0bafd 100644 --- a/.gitignore +++ b/.gitignore @@ -18,6 +18,7 @@ src/*.dll *.utf8.md /* *.Rd */ revdep/ +inst/__pycache__* vignettes/*.html vignettes/*.pdf vignettes/*_files diff --git a/DESCRIPTION b/DESCRIPTION index 02f42848..1d378c98 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -1,7 +1,7 @@ Package: TreeDist Type: Package Title: Calculate and Map Distances Between Phylogenetic Trees -Version: 2.10.1.9001 +Version: 2.10.1.9002 Authors@R: c(person("Martin R.", "Smith", email = "martin.smith@durham.ac.uk", role = c("aut", "cre", "cph", "prg"), @@ -23,6 +23,8 @@ Description: Implements measures of tree similarity, including including the Nye et al. (2006) metric ; the Matching Split Distance (Bogdanowicz & Giaro 2012) ; + the Hierarchical Mutual Information (Perotti et al. 2015) + ; Maximum Agreement Subtree distances; the Kendall-Colijn (2016) distance , and the Nearest Neighbour Interchange (NNI) distance, approximated per Li et al. diff --git a/NAMESPACE b/NAMESPACE index a38cb730..fbbb11bb 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -27,6 +27,8 @@ S3method(NNIDiameter,list) S3method(NNIDiameter,multiPhylo) S3method(NNIDiameter,numeric) S3method(NNIDiameter,phylo) +S3method(NTip,HPart) +S3method(RenumberTips,HPart) S3method(SPRDist,list) S3method(SPRDist,multiPhylo) S3method(SPRDist,phylo) @@ -35,8 +37,17 @@ S3method(SplitwiseInfo,Splits) S3method(SplitwiseInfo,list) S3method(SplitwiseInfo,multiPhylo) S3method(SplitwiseInfo,phylo) +S3method(as.HPart,HPart) +S3method(as.HPart,default) +S3method(as.HPart,list) +S3method(as.HPart,phylo) +S3method(as.phylo,HPart) +S3method(clone,HPart) S3method(median,multiPhylo) +S3method(plot,HPart) +S3method(print,HPart) export(.TreeDistance) +export(AHMI) export(AllSplitPairings) export(CalculateTreeDistance) export(ClusteringEntropy) @@ -49,10 +60,15 @@ export(DifferentPhylogeneticInfo) export(DisplayMatching) export(DistFromMed) export(DistanceFromMedian) +export(EHMI) export(Entropy) export(ExpectedVariation) export(GeneralizedRF) export(GetParallel) +export(HH) +export(HMI) +export(HierarchicalMutualInfo) +export(HierarchicalMutualInformation) export(InfoRobinsonFoulds) export(InfoRobinsonFouldsSplits) export(Islands) @@ -105,6 +121,7 @@ export(RobinsonFouldsInfo) export(RobinsonFouldsMatching) export(RobinsonFouldsSplits) export(SPRDist) +export(SelfHMI) export(SetParallel) export(SharedPhylogeneticInfo) export(SharedPhylogeneticInfoSplits) @@ -127,7 +144,10 @@ export(TreeDistance) export(TreesConsistentWithTwoSplits) export(VisualiseMatching) export(VisualizeMatching) +export(as.HPart) +export(clone) export(entropy_int) +export(is.HPart) importFrom(Rdpack,reprompt) importFrom(TreeTools,AllAncestors) importFrom(TreeTools,DropTip) @@ -141,6 +161,7 @@ importFrom(TreeTools,Log2Unrooted) importFrom(TreeTools,Log2Unrooted.int) importFrom(TreeTools,MSTEdges) importFrom(TreeTools,MSTLength) +importFrom(TreeTools,MatchStrings) importFrom(TreeTools,NRooted) importFrom(TreeTools,NSplits) importFrom(TreeTools,NTip) @@ -162,6 +183,7 @@ importFrom(TreeTools,as.ClusterTable) importFrom(TreeTools,as.Splits) importFrom(TreeTools,edge_to_splits) importFrom(ape,Nnode.phylo) +importFrom(ape,as.phylo) importFrom(ape,drop.tip) importFrom(ape,edgelabels) importFrom(ape,nodelabels) diff --git a/NEWS.md b/NEWS.md index a8cb9b6f..f20ca04a 100644 --- a/NEWS.md +++ b/NEWS.md @@ -1,5 +1,7 @@ -# TreeDist 2.10.1.9001 (development) +# TreeDist 2.10.1.9002 (development) +- `HierarchicalMutualInformation()` calculates the information shared between + pairs of hierarchical partition structures . - Fix crash in `robinson_foulds_all_pairs()` and `RobinsonFoulds(list)`. - Fix bug in calculation of `MutualClusteringInfo()`: greedy optimization was not guaranteed to find globally optimal matching, causing distances to be diff --git a/R/HPart.R b/R/HPart.R new file mode 100644 index 00000000..fc4ce13a --- /dev/null +++ b/R/HPart.R @@ -0,0 +1,175 @@ +#' Hierarchical partition structure +#' +#' A structure of class `HPart` comprises a pointer to a C++ representation of +#' hierarchical partitions, with the attribute `tip.label` recording the +#' character labels of its leaves. `HPart` objects with identical tip labels +#' can be compared using [`HierarchicalMutualInfo()`]. +#' +#' +#' An `HPart` object may be created from various representations of hierarchical +#' structures: +#' +#' - a tree (possibly phylogenetic) of class `phylo` +#' - A hierarchical list of lists, in which elements are represented by integers +#' 1\dots{}n +#' - A vector, which will be interpreted as a flat structure +#' in which all elements bearing the same label are assigned to the same cluster +#' +#' @param tree An object to convert to an HPart structure, in a supported format +#' (see details). +#' @returns `HPart()` returns a structure containing a pointer to a C++ +#' representation of a hierarchical partition structure. +#' @name HPart +#' @export +as.HPart <- function(tree, tipLabels) { + UseMethod("as.HPart") +} + +#' @export +#' @rdname HPart +as.HPart.HPart <- function(tree, tipLabels = NULL) { + if (is.null(tipLabels) || identical(tipLabels, TipLabels(tree))) { + tree + } else { + RenumberTips(tree, tipLabels) + } +} + +#' @rdname HPart +#' @export +as.HPart.default <- function(tree, tipLabels = NULL) { + if (is.null(dim(tree))) { + structure(build_hpart_from_list( + lapply(unique(tree), function(x) as.list(which(tree == x))), + length(tree)), + tip.label = seq_along(tree), + class = "HPart" + ) + } else { + stop("no applicable method for 'as.HPart' applied to an object of class \"", + paste(class(tree), collapse = "\", \""), "\"") + } +} + + +#' @rdname HPart +#' @export +as.HPart.list <- function(tree, tipLabels = NULL) { + # Flatten to verify leaves + leaves <- unlist(tree, recursive = TRUE) + if (!all(is.numeric(leaves)) || any(leaves != as.integer(leaves))) { + stop("All leaves must be integers.") + } + tree <- rapply(tree, as.integer, how = "replace") + if (0 %in% leaves) { + tree <- rapply(tree, function(x) x + 1L, how = "replace") + leaves <- leaves + 1 + } + n_tip <- length(unique(leaves)) + expected <- seq_len(n_tip) + if (!isTRUE(all.equal(sort(leaves), expected))) { + stop("Leaves must contain each integer 1..n exactly once") + } + + hpart_ptr <- build_hpart_from_list(tree, n_tip) + ret <- structure(hpart_ptr, tip.label = as.character(expected), class = "HPart") + if (!is.null(tipLabels) && !is.list(tipLabels)) { + RenumberTips(ret, tipLabels) + } + ret +} + +#' @export +#' @inheritParams TreeTools::as.ClusterTable +#' @rdname HPart +as.HPart.phylo <- function(tree, tipLabels = TipLabels(tree)) { + if (!identical(TipLabels(tree), tipLabels)) { + tree <- RenumberTips(tree, tipLabels) + } + structure(build_hpart_from_phylo(tree), tip.label = tipLabels, + class = "HPart") +} + +#' @rdname HPart +#' @export +is.HPart <- function(x) { + inherits(x, "HPart") +} + +#' @export +NTip.HPart <- function(phy) { + length(TipLabels(phy)) +} + +#' @rdname HPart +#' @export +print.HPart <- function(x, ...) { + nTip <- NTip(x) + tips <- TipLabels(x) + cat("Hierarchical partition on", nTip, "leaves: ") + if (nTip > 5) { + cat(paste0(c(tips[1:2], "...", tips[length(tips) - 1:0]), collapse = ", ")) + } else { + cat(paste0(tips, collapse = ", ")) + } +} + +#' @rdname HPart +#' @importFrom ape as.phylo +#' @export +as.phylo.HPart <- function(x, ...) { + edge <- hpart_to_edge(x) + labels <- TipLabels(x) + nNode <- dim(edge)[[1]] - length(labels) + 1 + structure(list(edge = edge, Nnode = nNode, tip.label = labels), + class = "phylo", + order = "cladewise") +} + +#' @rdname HPart +#' @param x `HPart` object to plot. +#' @param \dots Additional arguments to \code{\link[ape:plot.phylo]{plot.phylo}}. +#' @export +plot.HPart <- function(x, ...) { + plot(as.phylo(x), ...) +} + +#' Clone / duplicate an object +#' `clone()` physically duplicates objects +#' @param x the object to be cloned +#' @param \dots additional parameters for methods +#' @return `clone()` typically returns an object of the same class and "value" +#' as the input `x`. +#' @export +clone <- function(x, ...) UseMethod("clone") + +#' @template MRS +#' @rdname clone +#' @inheritParams TreeTools::as.ClusterTable +#' @export +clone.HPart <- function(x, tipLabels = attr(x, "tip.label"), ...) { + structure(clone_hpart(x), tip.label = tipLabels, + class = "HPart") +} + +#' @importFrom TreeTools MatchStrings +#' @inheritParams TreeTools::RenumberTips +#' @export +RenumberTips.HPart <- function(tree, tipOrder) { + startOrder <- TipLabels(tree) + newOrder <- MatchStrings(TipLabels(tipOrder, single = TRUE), startOrder) + + if (!identical(newOrder, startOrder)) { + if (length(newOrder) != length(startOrder)) { + stop("Tree labels ", paste0(setdiff(startOrder, tipOrder), collapse = ", "), + " missing from `tipOrder`") + } + newIndices <- match(newOrder, startOrder) + tree <- clone(tree, newOrder) + relabel_hpart(tree, newIndices - 1L) + # Return: + tree + } else { + clone(tree) + } +} diff --git a/R/RcppExports.R b/R/RcppExports.R index 0b9352f3..e931b10e 100644 --- a/R/RcppExports.R +++ b/R/RcppExports.R @@ -13,6 +13,38 @@ robinson_foulds_all_pairs <- function(tables) { .Call(`_TreeDist_robinson_foulds_all_pairs`, tables) } +HMI_xptr <- function(ptr1, ptr2) { + .Call(`_TreeDist_HMI_xptr`, ptr1, ptr2) +} + +HH_xptr <- function(ptr) { + .Call(`_TreeDist_HH_xptr`, ptr) +} + +EHMI_xptr <- function(hp1_ptr, hp2_ptr, tolerance = 0.01, minResample = 36L) { + .Call(`_TreeDist_EHMI_xptr`, hp1_ptr, hp2_ptr, tolerance, minResample) +} + +build_hpart_from_phylo <- function(phy) { + .Call(`_TreeDist_build_hpart_from_phylo`, phy) +} + +build_hpart_from_list <- function(tree, n_tip) { + .Call(`_TreeDist_build_hpart_from_list`, tree, n_tip) +} + +hpart_to_edge <- function(hpart_xptr) { + .Call(`_TreeDist_hpart_to_edge`, hpart_xptr) +} + +clone_hpart <- function(hpart_ptr) { + .Call(`_TreeDist_clone_hpart`, hpart_ptr) +} + +relabel_hpart <- function(hpart_ptr, map) { + invisible(.Call(`_TreeDist_relabel_hpart`, hpart_ptr, map)) +} + #' Calculate entropy of integer vector of counts #' #' Wrapper for C++ function; no input checking is performed. diff --git a/R/hierarchical_mutual_information.R b/R/hierarchical_mutual_information.R new file mode 100644 index 00000000..a8242410 --- /dev/null +++ b/R/hierarchical_mutual_information.R @@ -0,0 +1,179 @@ +#' Hierarchical Mutual Information +#' +#' Calculate the Hierarchical Mutual Information (\acronym{HMI}) +#' between two trees, following the recursive algorithm of +#' \insertCite{Perotti2020;textual}{TreeDist}. +#' +#' `HierarchicalMutualInfo()` computes the hierarchical mutual content of trees +#' \insertCite{Perotti2015,Perotti2020}{TreeDist}, which accounts for the +#' non-independence of information represented by nested splits. +#' +#' `tree` is converted to a set of hierarchical partitions, and the mutual +#' information (in bits) is computed recursively; the contribution of a node is +#' given by: +#' +#' \deqn{I(t,s) = \log_2(n_{ts}) - \dfrac{H_{us} + H_{tv} - H_{uv}}{n_{ts}} + +#' \text{mean}(I_{uv})} +#' +#' Where: +#' \itemize{ +#' \item \eqn{n_{ts}} is the number of common elements between partitions +#' \item \eqn{H_{us}, H_{tv}, H_{uv}} are entropy terms from child comparisons +#' \item \eqn{I_{uv}} is the recursive \acronym{HMI} for child pairs +#' } +#' +#' @template sprint +#' +#' @param tree,tree1,tree2 An object that can be coerced to an [`HPart`] +#' object. +# (Not yet implemented: ) object, or a list of such objects. +# (Not yet implemented: ) If \code{tree2} is not provided, distances will be +# calculated between each pair of trees in the list \code{tree1}. +#' @param normalize If `FALSE`, return the raw \acronym{HMI}, in bits. +#' If `TRUE`, normalize to range \[0,1\] by dividing by +#' `max(SelfHMI(tree1), SelfHMI(tree2))`. +#' If a function, divide by `normalize(SelfHMI(tree1), SelfHMI(tree2))`. +#' +#' @return `HierarchicalMutualInfo()` returns a numeric value representing the +#' hierarchical mutual information between the input trees, in bits, +#' normalized as specified. +#' Higher values indicate more shared hierarchical structure. +#' +#' @examples +#' library("TreeTools", quietly = TRUE) +#' +#' tree1 <- BalancedTree(8) +#' tree2 <- PectinateTree(8) +#' +#' # Calculate HMI between two trees +#' HierarchicalMutualInfo(tree1, tree2) +#' +#' # HMI normalized against the mean information content of tree1 and tree2 +#' HierarchicalMutualInfo(tree1, tree2, normalize = mean) +#' +#' @references +#' \insertAllCited{} +#' +#' @family tree distances +#' @export +HierarchicalMutualInfo <- function(tree1, tree2 = NULL, normalize = FALSE) { + hp1 <- as.HPart(tree1) + if (is.null(tree2)) { + if (isFALSE(normalize)) { + SelfHMI(hp1) + } else { + warning("Normalized self-information == 1; did you mean to provide tree2?") + 1 + } + } else { + hp2 <- as.HPart(tree2, tree1) + hmi <- HMI_xptr(hp1, hp2) + if (isFALSE(normalize)) { + hmi / log(2) + } else { + if (isTRUE(normalize)) { + normalize <- max + } + if (!is.function(normalize)) { + stop("`normalize` must be logical, or a function") + } + denom <- normalize(HH_xptr(hp1), HH_xptr(hp2)) + hmi / denom + } + } +} + +#' Hierarchical mutual information +#' +#' An alias of `HierarchicalMutualInfo()` +#' @keywords internal +#' @export +HierarchicalMutualInformation <- HierarchicalMutualInfo + +#' @rdname HierarchicalMutualInfo +#' @export +HMI <- HierarchicalMutualInfo + +#' @return `SelfHMI()` returns the hierarchical mutual information of a tree +#' compared with itself, i.e. its hierarchical entropy (\acronym{HH}). +#' @examples +#' # Normalized HMI above is equivalent to: +#' HMI(tree1, tree2) / mean(SelfHMI(tree1), SelfHMI(tree2)) +#' @rdname HierarchicalMutualInfo +#' @export +SelfHMI <- function(tree) { + part <- as.HPart(tree) + HH_xptr(part) / log(2) +} + +#' Self hierarchical mutual information +#' +#' An alias of `SelfHMI()` +#' @export +#' @keywords internal +HH <- SelfHMI + +#' @param precision Numeric; Monte Carlo sampling will terminate once the +#' relative standard error falls below this value. +#' @param minResample Integer specifying minimum number of Monte Carlo samples +#' to conduct. Avoids early termination when sample size is too small to +#' reliably estimate the standard error of the mean. +#' @return `EHMI()` returns the expected \acronym{HMI} against a uniform +#' shuffling of element labels, estimated by performing Monte Carlo resampling +#' on the same hierarchical structure until the relative standard error of the +#' estimate falls below `precision`. +#' The attributes of the returned object list the variance (`var`), +#' standard deviation (`sd`), standard error of the mean (`sem`) and +#' relative error (`relativeError`) of the estimate, and the number of Monte +#' Carlo samples used to obtain it (`samples`). +#' @examples +#' # Expected mutual info for this pair of hierarchies +#' EHMI(tree1, tree2, precision = 0.1) +#' @rdname HierarchicalMutualInfo +#' @export +EHMI <- function(tree1, tree2, precision = 0.01, minResample = 36) { + EHMI_xptr(as.HPart(tree1), as.HPart(tree2), as.numeric(precision), + as.integer(minResample)) / log(2) +} + +.AHMISEM <- function(hmi, M, ehmi, ehmi_sem) { + deriv <- (hmi - M) / (M - ehmi)^2 + abs(deriv) * ehmi_sem +} + +#' @details `AHMI()` calculates the adjusted hierarchical mutual information: +#' \deqn{\text{AHMI}(t, s) = \dfrac{I(t, s) - \hat{I}(t, s)}{ +#' \text{mean}(H(t), H(s)) - \hat{I}(t, s)}} +#' Where: +#' - \eqn{I(t, s)} is the hierarchical mutual information between `tree1` and +#' `tree2` +#' - \eqn{\hat{I}(t, s)} is the expected \acronym{HMI} between `tree1` and +#' `tree2`, estimated by Monte Carlo sampling +#' - \eqn{H(t), H(s)} is the entropy (self-mutual information) of each tree +#' @param Mean Function by which to combine the self-information of the +#' two input hierarchies, in order to normalize the \acronym{HMI}. +#' @return `AHMI()` returns the adjusted \acronym{HMI}, normalized such that +#' zero corresponds to the expected \acronym{HMI} given a random shuffling +#' of elements on the same hierarchical structure. The attribute `sem` gives +#' the standard error of the estimate. +#' @examples +#' # The adjusted HMI normalizes against this expectation +#' AHMI(tree1, tree2, precision = 0.1) +#' @rdname HierarchicalMutualInfo +#' @export +AHMI <- function(tree1, tree2, Mean = max, precision = 0.01, minResample = 36) { + hp1 <- as.HPart(tree1) + hp2 <- as.HPart(tree2, hp1) + + ehmi <- EHMI_xptr(hp1, hp2, as.numeric(precision), as.integer(minResample)) + hmi <- HMI_xptr(hp1, hp2) + hh1 <- HH_xptr(hp1) + hh2 <- HH_xptr(hp2) + M <- Mean(hh1, hh2) + + num <- hmi - ehmi[[1]] + denom <- M - ehmi[[1]] + # Return: + structure(if (num < sqrt(.Machine$double.eps)) 0 else num / denom, + sem = .AHMISEM(hmi, M, ehmi[[1]], attr(ehmi, "sem"))) +} diff --git a/R/tree_distance.R b/R/tree_distance.R index 451680e0..10031413 100644 --- a/R/tree_distance.R +++ b/R/tree_distance.R @@ -12,7 +12,7 @@ #' @param PairScorer function taking four arguments, `splits1`, `splits2`, #' `nSplits1`, `nSplits2`, which should return the score of each pair of splits #' in a two-dimensional matrix. Additional parameters may be specified via -#' \code{\dots}. +#' \code{...}. #' @param maximize Logical specifying whether the optimal matching maximizes #' or minimizes the scores obtained by `PairScorer()`. #' @param \dots Additional parameters to `PairScorer()`. diff --git a/inst/REFERENCES.bib b/inst/REFERENCES.bib index fa6826a1..6614f645 100644 --- a/inst/REFERENCES.bib +++ b/inst/REFERENCES.bib @@ -415,6 +415,28 @@ @article{Nye2006 year = {2006} } +@article{Perotti2015, + title = {Hierarchical Mutual Information for the Comparison of Hierarchical Community Structures in Complex Networks}, + author = {Perotti, Juan Ignacio and Tessone, Claudio Juan and Caldarelli, Guido}, + year = {2015}, + journal = {Physical Review E - Statistical, Nonlinear, and Soft Matter Physics}, + volume = {92}, + number = {6}, + pages = {062825-1--062825-13}, + doi = {10.1103/PhysRevE.92.062825} +} + +@article{Perotti2020, + title = {Towards a Generalization of Information Theory for Hierarchical Partitions}, + author = {Perotti, Juan I. and Almeira, Nahuel and Saracco, Fabio}, + year = {2020}, + journal = {Physical Review E}, + volume = {101}, + number = {6}, + pages = {062148}, + doi = {10.1103/PhysRevE.101.062148} +} + @article{Phipps1971, title = {Dendrogram topology}, author = {Phipps, J. B.}, diff --git a/inst/WORDLIST b/inst/WORDLIST index 3691acba..ac56518c 100644 --- a/inst/WORDLIST +++ b/inst/WORDLIST @@ -3,8 +3,6 @@ Bocker Bogdanowicz Böcker Cai -Cao -ClusteringInfoDistance Colijn Colijn's DEFGH @@ -15,20 +13,19 @@ Farris Foulds Giaro Goos +HH +HMI +HPart Hartmanis Inc JRF JV Jaccard -JaccardRobinsonFoulds Kaski Kawa -KendallColijn LAPJV -Leeuwen LSAP -MASTInfo -MASTSize +Leeuwen MDS MKL MacKay @@ -46,20 +43,20 @@ NNIDist NyeTreeSimilarity OEIS ORCID +Perotti +PhysRevE PlotTools R's Rcpp RStudio -RdPack +Rcpp RdMacros +RdPack Regraft Sammon Sammon's Soneson's SPR -SharedPhylogeneticInfo -SmithDist -SmithSpace SPI Stamatakis TBR @@ -67,7 +64,6 @@ TBRDist TCBB TreeDistData TreeSearch -TreeSpace TreeTools Tromp Valiente @@ -86,11 +82,9 @@ codecov com cpp csoneson -deOliveira dist distory doi -dreval durham equiprobable etc @@ -102,14 +96,11 @@ hypervolumes ingroup interdecile jonker -leq magiclogic -matlab mergesort molbev msw multiPhylo -ndash outgroup partitionwise pectinate @@ -123,20 +114,16 @@ scic sensu shinyjs splitwise -spic syab sysbio textrm th tqDist -treeDist treespace uk -uninstall unrooted unsampled uspr vdiffr -yongyanghz yongyanglink -zig \ No newline at end of file +zig diff --git a/man-roxygen/sprint.R b/man-roxygen/sprint.R new file mode 100644 index 00000000..e0688690 --- /dev/null +++ b/man-roxygen/sprint.R @@ -0,0 +1,6 @@ +#' ## Experimental status +#' +#' This function was written during a code sprint: its documentation and test +#' cases have not yet been carefully scrutinized, and its implementation may +#' change without notice. +#' Please alert the maintainer to any issues you encounter. diff --git a/man/GeneralizedRF.Rd b/man/GeneralizedRF.Rd index 400e5581..23ff5229 100644 --- a/man/GeneralizedRF.Rd +++ b/man/GeneralizedRF.Rd @@ -27,7 +27,7 @@ split.} \item{PairScorer}{function taking four arguments, \code{splits1}, \code{splits2}, \code{nSplits1}, \code{nSplits2}, which should return the score of each pair of splits in a two-dimensional matrix. Additional parameters may be specified via -\code{\dots}.} +\code{...}.} \item{maximize}{Logical specifying whether the optimal matching maximizes or minimizes the scores obtained by \code{PairScorer()}.} diff --git a/man/HH.Rd b/man/HH.Rd new file mode 100644 index 00000000..953b8ec9 --- /dev/null +++ b/man/HH.Rd @@ -0,0 +1,12 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/hierarchical_mutual_information.R +\name{HH} +\alias{HH} +\title{Self hierarchical mutual information} +\usage{ +HH(tree) +} +\description{ +An alias of \code{SelfHMI()} +} +\keyword{internal} diff --git a/man/HPart.Rd b/man/HPart.Rd new file mode 100644 index 00000000..df6aebd2 --- /dev/null +++ b/man/HPart.Rd @@ -0,0 +1,65 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/HPart.R +\name{HPart} +\alias{HPart} +\alias{as.HPart} +\alias{as.HPart.HPart} +\alias{as.HPart.default} +\alias{as.HPart.list} +\alias{as.HPart.phylo} +\alias{is.HPart} +\alias{print.HPart} +\alias{as.phylo.HPart} +\alias{plot.HPart} +\title{Hierarchical partition structure} +\usage{ +as.HPart(tree, tipLabels) + +\method{as.HPart}{HPart}(tree, tipLabels = NULL) + +\method{as.HPart}{default}(tree, tipLabels = NULL) + +\method{as.HPart}{list}(tree, tipLabels = NULL) + +\method{as.HPart}{phylo}(tree, tipLabels = TipLabels(tree)) + +is.HPart(x) + +\method{print}{HPart}(x, ...) + +\method{as.phylo}{HPart}(x, ...) + +\method{plot}{HPart}(x, ...) +} +\arguments{ +\item{tree}{An object to convert to an HPart structure, in a supported format +(see details).} + +\item{tipLabels}{Character vector specifying sequence in which to order +tip labels.} + +\item{x}{\code{HPart} object to plot.} + +\item{\dots}{Additional arguments to \code{\link[ape:plot.phylo]{plot.phylo}}.} +} +\value{ +\code{HPart()} returns a structure containing a pointer to a C++ +representation of a hierarchical partition structure. +} +\description{ +A structure of class \code{HPart} comprises a pointer to a C++ representation of +hierarchical partitions, with the attribute \code{tip.label} recording the +character labels of its leaves. \code{HPart} objects with identical tip labels +can be compared using \code{\link[=HierarchicalMutualInfo]{HierarchicalMutualInfo()}}. +} +\details{ +An \code{HPart} object may be created from various representations of hierarchical +structures: +\itemize{ +\item a tree (possibly phylogenetic) of class \code{phylo} +\item A hierarchical list of lists, in which elements are represented by integers +1\dots{}n +\item A vector, which will be interpreted as a flat structure +in which all elements bearing the same label are assigned to the same cluster +} +} diff --git a/man/HierarchicalMutualInfo.Rd b/man/HierarchicalMutualInfo.Rd new file mode 100644 index 00000000..9e4788d7 --- /dev/null +++ b/man/HierarchicalMutualInfo.Rd @@ -0,0 +1,139 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/hierarchical_mutual_information.R +\name{HierarchicalMutualInfo} +\alias{HierarchicalMutualInfo} +\alias{HMI} +\alias{SelfHMI} +\alias{EHMI} +\alias{AHMI} +\title{Hierarchical Mutual Information} +\usage{ +HierarchicalMutualInfo(tree1, tree2 = NULL, normalize = FALSE) + +HMI(tree1, tree2 = NULL, normalize = FALSE) + +SelfHMI(tree) + +EHMI(tree1, tree2, precision = 0.01, minResample = 36) + +AHMI(tree1, tree2, Mean = max, precision = 0.01, minResample = 36) +} +\arguments{ +\item{normalize}{If \code{FALSE}, return the raw \acronym{HMI}, in bits. +If \code{TRUE}, normalize to range [0,1] by dividing by +\code{max(SelfHMI(tree1), SelfHMI(tree2))}. +If a function, divide by \code{normalize(SelfHMI(tree1), SelfHMI(tree2))}.} + +\item{tree, tree1, tree2}{An object that can be coerced to an \code{\link{HPart}} +object.} + +\item{precision}{Numeric; Monte Carlo sampling will terminate once the +relative standard error falls below this value.} + +\item{minResample}{Integer specifying minimum number of Monte Carlo samples +to conduct. Avoids early termination when sample size is too small to +reliably estimate the standard error of the mean.} + +\item{Mean}{Function by which to combine the self-information of the +two input hierarchies, in order to normalize the \acronym{HMI}.} +} +\value{ +\code{HierarchicalMutualInfo()} returns a numeric value representing the +hierarchical mutual information between the input trees, in bits, +normalized as specified. +Higher values indicate more shared hierarchical structure. + +\code{SelfHMI()} returns the hierarchical mutual information of a tree +compared with itself, i.e. its hierarchical entropy (\acronym{HH}). + +\code{EHMI()} returns the expected \acronym{HMI} against a uniform +shuffling of element labels, estimated by performing Monte Carlo resampling +on the same hierarchical structure until the relative standard error of the +estimate falls below \code{precision}. +The attributes of the returned object list the variance (\code{var}), +standard deviation (\code{sd}), standard error of the mean (\code{sem}) and +relative error (\code{relativeError}) of the estimate, and the number of Monte +Carlo samples used to obtain it (\code{samples}). + +\code{AHMI()} returns the adjusted \acronym{HMI}, normalized such that +zero corresponds to the expected \acronym{HMI} given a random shuffling +of elements on the same hierarchical structure. The attribute \code{sem} gives +the standard error of the estimate. +} +\description{ +Calculate the Hierarchical Mutual Information (\acronym{HMI}) +between two trees, following the recursive algorithm of +\insertCite{Perotti2020;textual}{TreeDist}. + +This function was written during a code sprint: its documentation and test +cases have not yet been carefully scrutinized, and its implementation may +change without notice. +Please alert the maintainer to any issues you encounter. +} +\details{ +\code{HierarchicalMutualInfo()} computes the hierarchical mutual content of trees +\insertCite{Perotti2015,Perotti2020}{TreeDist}, which accounts for the +non-independence of information represented by nested splits. + +\code{tree} is converted to a set of hierarchical partitions, and the mutual +information (in bits) is computed recursively; the contribution of a node is +given by: + +\deqn{I(t,s) = \log_2(n_{ts}) - \dfrac{H_{us} + H_{tv} - H_{uv}}{n_{ts}} + +\text{mean}(I_{uv})} + +Where: +\itemize{ +\item \eqn{n_{ts}} is the number of common elements between partitions +\item \eqn{H_{us}, H_{tv}, H_{uv}} are entropy terms from child comparisons +\item \eqn{I_{uv}} is the recursive \acronym{HMI} for child pairs +} + +\code{AHMI()} calculates the adjusted hierarchical mutual information: +\deqn{\text{AHMI}(t, s) = \dfrac{I(t, s) - \hat{I}(t, s)}{ + \text{mean}(H(t), H(s)) - \hat{I}(t, s)}} +Where: +\itemize{ +\item \eqn{I(t, s)} is the hierarchical mutual information between \code{tree1} and +\code{tree2} +\item \eqn{\hat{I}(t, s)} is the expected \acronym{HMI} between \code{tree1} and +\code{tree2}, estimated by Monte Carlo sampling +\item \eqn{H(t), H(s)} is the entropy (self-mutual information) of each tree +} +} +\examples{ +library("TreeTools", quietly = TRUE) + +tree1 <- BalancedTree(8) +tree2 <- PectinateTree(8) + +# Calculate HMI between two trees +HierarchicalMutualInfo(tree1, tree2) + +# HMI normalized against the mean information content of tree1 and tree2 +HierarchicalMutualInfo(tree1, tree2, normalize = mean) + +# Normalized HMI above is equivalent to: +HMI(tree1, tree2) / mean(SelfHMI(tree1), SelfHMI(tree2)) +# Expected mutual info for this pair of hierarchies +EHMI(tree1, tree2, precision = 0.1) +# The adjusted HMI normalizes against this expectation +AHMI(tree1, tree2, precision = 0.1) +} +\references{ +\insertAllCited{} +} +\seealso{ +Other tree distances: +\code{\link{JaccardRobinsonFoulds}()}, +\code{\link{KendallColijn}()}, +\code{\link{MASTSize}()}, +\code{\link{MatchingSplitDistance}()}, +\code{\link{NNIDist}()}, +\code{\link{NyeSimilarity}()}, +\code{\link{PathDist}()}, +\code{\link{Robinson-Foulds}}, +\code{\link{SPRDist}()}, +\code{\link{TreeDistance}()} +} +\concept{tree distances} diff --git a/man/HierarchicalMutualInformation.Rd b/man/HierarchicalMutualInformation.Rd new file mode 100644 index 00000000..998094c6 --- /dev/null +++ b/man/HierarchicalMutualInformation.Rd @@ -0,0 +1,12 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/hierarchical_mutual_information.R +\name{HierarchicalMutualInformation} +\alias{HierarchicalMutualInformation} +\title{Hierarchical mutual information} +\usage{ +HierarchicalMutualInformation(tree1, tree2 = NULL, normalize = FALSE) +} +\description{ +An alias of \code{HierarchicalMutualInfo()} +} +\keyword{internal} diff --git a/man/JaccardRobinsonFoulds.Rd b/man/JaccardRobinsonFoulds.Rd index 2a701cb1..5df32914 100644 --- a/man/JaccardRobinsonFoulds.Rd +++ b/man/JaccardRobinsonFoulds.Rd @@ -127,6 +127,7 @@ VisualizeMatching(JRF2, tree1, tree2, matchZeros = FALSE) } \seealso{ Other tree distances: +\code{\link{HierarchicalMutualInfo}()}, \code{\link{KendallColijn}()}, \code{\link{MASTSize}()}, \code{\link{MatchingSplitDistance}()}, diff --git a/man/KendallColijn.Rd b/man/KendallColijn.Rd index 54e912df..cfd506b5 100644 --- a/man/KendallColijn.Rd +++ b/man/KendallColijn.Rd @@ -126,6 +126,7 @@ is a more sophisticated, if more cumbersome, implementation that supports lambda > 0, i.e. use of edge lengths in tree comparison. Other tree distances: +\code{\link{HierarchicalMutualInfo}()}, \code{\link{JaccardRobinsonFoulds}()}, \code{\link{MASTSize}()}, \code{\link{MatchingSplitDistance}()}, diff --git a/man/MASTSize.Rd b/man/MASTSize.Rd index d4f4d425..5ecdd271 100644 --- a/man/MASTSize.Rd +++ b/man/MASTSize.Rd @@ -63,6 +63,7 @@ CompareAll(as.phylo(0:4, 8), MASTInfo) leaves contained within the subtree. Other tree distances: +\code{\link{HierarchicalMutualInfo}()}, \code{\link{JaccardRobinsonFoulds}()}, \code{\link{KendallColijn}()}, \code{\link{MatchingSplitDistance}()}, diff --git a/man/MatchingSplitDistance.Rd b/man/MatchingSplitDistance.Rd index 685bff3f..83af446f 100644 --- a/man/MatchingSplitDistance.Rd +++ b/man/MatchingSplitDistance.Rd @@ -85,6 +85,7 @@ VisualizeMatching(MatchingSplitDistance, TreeTools::BalancedTree(6), } \seealso{ Other tree distances: +\code{\link{HierarchicalMutualInfo}()}, \code{\link{JaccardRobinsonFoulds}()}, \code{\link{KendallColijn}()}, \code{\link{MASTSize}()}, diff --git a/man/NNIDist.Rd b/man/NNIDist.Rd index a0d8bc8f..70895e04 100644 --- a/man/NNIDist.Rd +++ b/man/NNIDist.Rd @@ -102,6 +102,7 @@ CompareAll(as.phylo(30:33, 8), NNIDist) } \seealso{ Other tree distances: +\code{\link{HierarchicalMutualInfo}()}, \code{\link{JaccardRobinsonFoulds}()}, \code{\link{KendallColijn}()}, \code{\link{MASTSize}()}, diff --git a/man/NyeSimilarity.Rd b/man/NyeSimilarity.Rd index 2e741925..892ed185 100644 --- a/man/NyeSimilarity.Rd +++ b/man/NyeSimilarity.Rd @@ -123,6 +123,7 @@ NyeSimilarity(as.phylo(0:5, nTip = 8), similarity = FALSE) } \seealso{ Other tree distances: +\code{\link{HierarchicalMutualInfo}()}, \code{\link{JaccardRobinsonFoulds}()}, \code{\link{KendallColijn}()}, \code{\link{MASTSize}()}, diff --git a/man/PathDist.Rd b/man/PathDist.Rd index 791c7018..5cbfe90c 100644 --- a/man/PathDist.Rd +++ b/man/PathDist.Rd @@ -69,6 +69,7 @@ PathDist(as.phylo(30:33, 8)) } \seealso{ Other tree distances: +\code{\link{HierarchicalMutualInfo}()}, \code{\link{JaccardRobinsonFoulds}()}, \code{\link{KendallColijn}()}, \code{\link{MASTSize}()}, diff --git a/man/Robinson-Foulds.Rd b/man/Robinson-Foulds.Rd index 6ffbad44..a425e8d1 100644 --- a/man/Robinson-Foulds.Rd +++ b/man/Robinson-Foulds.Rd @@ -145,6 +145,7 @@ VisualizeMatching(InfoRobinsonFoulds, balanced7, pectinate7) Display paired splits: \code{\link[=VisualizeMatching]{VisualizeMatching()}} Other tree distances: +\code{\link{HierarchicalMutualInfo}()}, \code{\link{JaccardRobinsonFoulds}()}, \code{\link{KendallColijn}()}, \code{\link{MASTSize}()}, diff --git a/man/SPRDist.Rd b/man/SPRDist.Rd index cad1671c..3ab40775 100644 --- a/man/SPRDist.Rd +++ b/man/SPRDist.Rd @@ -67,6 +67,7 @@ the \insertCite{deOliveira2008;textual}{TreeDist} algorithm but can crash when sent trees of certain formats, and tends to have a longer running time. Other tree distances: +\code{\link{HierarchicalMutualInfo}()}, \code{\link{JaccardRobinsonFoulds}()}, \code{\link{KendallColijn}()}, \code{\link{MASTSize}()}, diff --git a/man/TreeDistance.Rd b/man/TreeDistance.Rd index 3a4fa284..feede9e8 100644 --- a/man/TreeDistance.Rd +++ b/man/TreeDistance.Rd @@ -304,6 +304,7 @@ MutualClusteringInfoSplits(splits1, splits2) } \seealso{ Other tree distances: +\code{\link{HierarchicalMutualInfo}()}, \code{\link{JaccardRobinsonFoulds}()}, \code{\link{KendallColijn}()}, \code{\link{MASTSize}()}, diff --git a/man/clone.Rd b/man/clone.Rd new file mode 100644 index 00000000..79a20acb --- /dev/null +++ b/man/clone.Rd @@ -0,0 +1,32 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/HPart.R +\name{clone} +\alias{clone} +\alias{clone.HPart} +\title{Clone / duplicate an object +\code{clone()} physically duplicates objects} +\usage{ +clone(x, ...) + +\method{clone}{HPart}(x, tipLabels = attr(x, "tip.label"), ...) +} +\arguments{ +\item{x}{the object to be cloned} + +\item{\dots}{additional parameters for methods} + +\item{tipLabels}{Character vector specifying sequence in which to order +tip labels.} +} +\value{ +\code{clone()} typically returns an object of the same class and "value" +as the input \code{x}. +} +\description{ +Clone / duplicate an object +\code{clone()} physically duplicates objects +} +\author{ +\href{https://orcid.org/0000-0001-5660-1727}{Martin R. Smith} +(\href{mailto:martin.smith@durham.ac.uk}{martin.smith@durham.ac.uk}) +} diff --git a/src/RcppExports.cpp b/src/RcppExports.cpp index e04f32f9..fd04a6c3 100644 --- a/src/RcppExports.cpp +++ b/src/RcppExports.cpp @@ -45,6 +45,99 @@ BEGIN_RCPP return rcpp_result_gen; END_RCPP } +// HMI_xptr +double HMI_xptr(SEXP ptr1, SEXP ptr2); +RcppExport SEXP _TreeDist_HMI_xptr(SEXP ptr1SEXP, SEXP ptr2SEXP) { +BEGIN_RCPP + Rcpp::RObject rcpp_result_gen; + Rcpp::RNGScope rcpp_rngScope_gen; + Rcpp::traits::input_parameter< SEXP >::type ptr1(ptr1SEXP); + Rcpp::traits::input_parameter< SEXP >::type ptr2(ptr2SEXP); + rcpp_result_gen = Rcpp::wrap(HMI_xptr(ptr1, ptr2)); + return rcpp_result_gen; +END_RCPP +} +// HH_xptr +double HH_xptr(SEXP ptr); +RcppExport SEXP _TreeDist_HH_xptr(SEXP ptrSEXP) { +BEGIN_RCPP + Rcpp::RObject rcpp_result_gen; + Rcpp::RNGScope rcpp_rngScope_gen; + Rcpp::traits::input_parameter< SEXP >::type ptr(ptrSEXP); + rcpp_result_gen = Rcpp::wrap(HH_xptr(ptr)); + return rcpp_result_gen; +END_RCPP +} +// EHMI_xptr +Rcpp::NumericVector EHMI_xptr(SEXP hp1_ptr, SEXP hp2_ptr, double tolerance, int minResample); +RcppExport SEXP _TreeDist_EHMI_xptr(SEXP hp1_ptrSEXP, SEXP hp2_ptrSEXP, SEXP toleranceSEXP, SEXP minResampleSEXP) { +BEGIN_RCPP + Rcpp::RObject rcpp_result_gen; + Rcpp::RNGScope rcpp_rngScope_gen; + Rcpp::traits::input_parameter< SEXP >::type hp1_ptr(hp1_ptrSEXP); + Rcpp::traits::input_parameter< SEXP >::type hp2_ptr(hp2_ptrSEXP); + Rcpp::traits::input_parameter< double >::type tolerance(toleranceSEXP); + Rcpp::traits::input_parameter< int >::type minResample(minResampleSEXP); + rcpp_result_gen = Rcpp::wrap(EHMI_xptr(hp1_ptr, hp2_ptr, tolerance, minResample)); + return rcpp_result_gen; +END_RCPP +} +// build_hpart_from_phylo +SEXP build_hpart_from_phylo(List phy); +RcppExport SEXP _TreeDist_build_hpart_from_phylo(SEXP phySEXP) { +BEGIN_RCPP + Rcpp::RObject rcpp_result_gen; + Rcpp::RNGScope rcpp_rngScope_gen; + Rcpp::traits::input_parameter< List >::type phy(phySEXP); + rcpp_result_gen = Rcpp::wrap(build_hpart_from_phylo(phy)); + return rcpp_result_gen; +END_RCPP +} +// build_hpart_from_list +SEXP build_hpart_from_list(RObject tree, const int n_tip); +RcppExport SEXP _TreeDist_build_hpart_from_list(SEXP treeSEXP, SEXP n_tipSEXP) { +BEGIN_RCPP + Rcpp::RObject rcpp_result_gen; + Rcpp::RNGScope rcpp_rngScope_gen; + Rcpp::traits::input_parameter< RObject >::type tree(treeSEXP); + Rcpp::traits::input_parameter< const int >::type n_tip(n_tipSEXP); + rcpp_result_gen = Rcpp::wrap(build_hpart_from_list(tree, n_tip)); + return rcpp_result_gen; +END_RCPP +} +// hpart_to_edge +Rcpp::IntegerMatrix hpart_to_edge(SEXP hpart_xptr); +RcppExport SEXP _TreeDist_hpart_to_edge(SEXP hpart_xptrSEXP) { +BEGIN_RCPP + Rcpp::RObject rcpp_result_gen; + Rcpp::RNGScope rcpp_rngScope_gen; + Rcpp::traits::input_parameter< SEXP >::type hpart_xptr(hpart_xptrSEXP); + rcpp_result_gen = Rcpp::wrap(hpart_to_edge(hpart_xptr)); + return rcpp_result_gen; +END_RCPP +} +// clone_hpart +SEXP clone_hpart(SEXP hpart_ptr); +RcppExport SEXP _TreeDist_clone_hpart(SEXP hpart_ptrSEXP) { +BEGIN_RCPP + Rcpp::RObject rcpp_result_gen; + Rcpp::RNGScope rcpp_rngScope_gen; + Rcpp::traits::input_parameter< SEXP >::type hpart_ptr(hpart_ptrSEXP); + rcpp_result_gen = Rcpp::wrap(clone_hpart(hpart_ptr)); + return rcpp_result_gen; +END_RCPP +} +// relabel_hpart +void relabel_hpart(SEXP hpart_ptr, const std::vector& map); +RcppExport SEXP _TreeDist_relabel_hpart(SEXP hpart_ptrSEXP, SEXP mapSEXP) { +BEGIN_RCPP + Rcpp::RNGScope rcpp_rngScope_gen; + Rcpp::traits::input_parameter< SEXP >::type hpart_ptr(hpart_ptrSEXP); + Rcpp::traits::input_parameter< const std::vector& >::type map(mapSEXP); + relabel_hpart(hpart_ptr, map); + return R_NilValue; +END_RCPP +} // entropy_int double entropy_int(const Rcpp::IntegerVector& n); RcppExport SEXP _TreeDist_entropy_int(SEXP nSEXP) { @@ -289,6 +382,14 @@ static const R_CallMethodDef CallEntries[] = { {"_TreeDist_COMCLUST", (DL_FUNC) &_TreeDist_COMCLUST, 1}, {"_TreeDist_consensus_info", (DL_FUNC) &_TreeDist_consensus_info, 3}, {"_TreeDist_robinson_foulds_all_pairs", (DL_FUNC) &_TreeDist_robinson_foulds_all_pairs, 1}, + {"_TreeDist_HMI_xptr", (DL_FUNC) &_TreeDist_HMI_xptr, 2}, + {"_TreeDist_HH_xptr", (DL_FUNC) &_TreeDist_HH_xptr, 1}, + {"_TreeDist_EHMI_xptr", (DL_FUNC) &_TreeDist_EHMI_xptr, 4}, + {"_TreeDist_build_hpart_from_phylo", (DL_FUNC) &_TreeDist_build_hpart_from_phylo, 1}, + {"_TreeDist_build_hpart_from_list", (DL_FUNC) &_TreeDist_build_hpart_from_list, 2}, + {"_TreeDist_hpart_to_edge", (DL_FUNC) &_TreeDist_hpart_to_edge, 1}, + {"_TreeDist_clone_hpart", (DL_FUNC) &_TreeDist_clone_hpart, 1}, + {"_TreeDist_relabel_hpart", (DL_FUNC) &_TreeDist_relabel_hpart, 2}, {"_TreeDist_entropy_int", (DL_FUNC) &_TreeDist_entropy_int, 1}, {"_TreeDist_lapjv", (DL_FUNC) &_TreeDist_lapjv, 2}, {"_TreeDist_cpp_mast", (DL_FUNC) &_TreeDist_cpp_mast, 3}, diff --git a/src/hmi.cpp b/src/hmi.cpp new file mode 100644 index 00000000..85b9f2d0 --- /dev/null +++ b/src/hmi.cpp @@ -0,0 +1,227 @@ +#include "hpart.h" +#include +#include +using namespace Rcpp; + +namespace TreeDist { + +static inline double x_log_x(size_t x) { + return x > 1 ? x * std::log(x) : 0.0; +} + +static inline size_t intersection_size(const std::vector& A, + const std::vector& B) { + size_t count = 0; + ASSERT(A.size() == B.size()); + size_t size = A.size(); + + for (size_t i = 0; i < size; ++i) { + count += __builtin_popcountll(A[i] & B[i]); + } + return count; +} + +// Hierarchical Mutual Information core +std::pair hierarchical_mutual_info( + const std::vector& u_nodes, + size_t u_idx, + const std::vector& v_nodes, + size_t v_idx +) { + + const auto& Ut = u_nodes[u_idx]; + const auto& Us = v_nodes[v_idx]; + + if (Ut.all_kids_leaves || Us.all_kids_leaves) { + return {intersection_size(Ut.bitset, Us.bitset), 0.0}; + } + + const size_t Us_size = Us.children.size(); + + size_t n_ts = 0; + double H_uv = 0.0; + double H_us = 0.0; + double H_tv = 0.0; + std::vector n_tv(Us_size, 0); + double mean_I_ts = 0.0; + + for (size_t u_child_idx : Ut.children) { + size_t n_us = 0; + + for (size_t v = 0; v < Us_size; ++v) { + size_t v_child_idx = Us.children[v]; + + auto [n_uv, I_uv] = hierarchical_mutual_info(u_nodes, u_child_idx, + v_nodes, v_child_idx); + + n_ts += n_uv; + n_tv[v] += n_uv; + n_us += n_uv; + H_uv += x_log_x(n_uv); + mean_I_ts += n_uv * I_uv; + } + H_us += x_log_x(n_us); + } + + for (auto _n_tv : n_tv) { + H_tv += x_log_x(_n_tv); + } + + if (n_ts == 0) { + return {0, 0.0}; + } + + double local_I_ts = std::log(static_cast(n_ts)) - (H_us + H_tv - H_uv) / static_cast(n_ts); + mean_I_ts /= static_cast(n_ts); + double I_ts = local_I_ts + mean_I_ts; + + return {n_ts, I_ts}; +} + + +double hierarchical_self_info(const std::vector& nodes, size_t idx) { + const auto& n = nodes[idx]; + + if (n.all_kids_leaves) return 0.0; + + size_t n_ts = 0; + double H_uv = 0.0; + double H_us = 0.0; + double H_tv = 0.0; + const size_t n_children = n.children.size(); + std::vector n_tv(n_children, 0); + double mean_I_ts = 0.0; + + for (size_t i = 0; i < n_children; ++i) { + size_t u_idx = n.children[i]; + size_t n_us = 0; + for (size_t j = 0; j < n_children; ++j) { + size_t v_idx = n.children[j]; + size_t n_uv = (u_idx == v_idx) ? nodes[u_idx].leaf_count + : intersection_size(nodes[u_idx].bitset, nodes[v_idx].bitset); + + n_ts += n_uv; + n_tv[j] += n_uv; + n_us += n_uv; + H_uv += x_log_x(n_uv); + mean_I_ts += n_uv * hierarchical_self_info(nodes, u_idx); // recurse + } + H_us += x_log_x(n_us); + } + + for (auto _n_tv : n_tv) H_tv += x_log_x(_n_tv); + + if (n_ts == 0) return 0.0; + double local_I_ts = std::log(static_cast(n_ts)) - (H_us + H_tv - H_uv) / static_cast(n_ts); + mean_I_ts /= static_cast(n_ts); + + return local_I_ts + mean_I_ts; +} + + + +} // namespace TreeDist + +// [[Rcpp::export]] +double HMI_xptr(SEXP ptr1, SEXP ptr2) { + Rcpp::XPtr hp1(ptr1); + Rcpp::XPtr hp2(ptr2); + if (hp1->nodes[hp1->root].n_tip != hp2->nodes[hp2->root].n_tip) { + Rcpp::stop("Trees must have the same number of leaves"); + } + return TreeDist::hierarchical_mutual_info(hp1->nodes, hp1->root, + hp2->nodes, hp2->root).second; +} + +// [[Rcpp::export]] +double HH_xptr(SEXP ptr) { + Rcpp::XPtr hp(ptr); + if (hp->entropy == std::numeric_limits::min()) { + // When requring C++26, update to constexpr + const double eps = std::sqrt(std::numeric_limits::epsilon()); + const double value = hierarchical_self_info(hp->nodes, hp->root); + hp->entropy = std::abs(value) < eps ? 0 : value; + } + return hp->entropy; +} + +inline void fisher_yates_shuffle(std::vector& v) noexcept { + for (size_t i = v.size() - 1; i > 0; --i) { + size_t j = static_cast(std::floor(R::unif_rand() * (i + 1))); + std::swap(v[i], v[j]); + } +} + +// [[Rcpp::export]] +Rcpp::NumericVector EHMI_xptr(SEXP hp1_ptr, SEXP hp2_ptr, + double tolerance = 0.01, + int minResample = 36) { + + if (minResample < 2) { + Rcpp::stop("Must perform at least one resampling"); + } + if (tolerance < 1e-8) { + Rcpp::stop("Tolerance too low"); + } + + Rcpp::XPtr hp1(hp1_ptr); + Rcpp::XPtr hp2(hp2_ptr); + + const size_t n_tip = hp1->nodes[hp1->root].n_tip; + ASSERT(hp2->nodes[hp2->root].n_tip == n_tip); + + // Collect original leaf labels (1-based) + std::vector leaves; + for (size_t i = 1; i < hp1->nodes.size(); ++i) { + if (hp1->nodes[i].leaf_count == 1) + leaves.push_back(hp1->nodes[i].label + 1); // R 1-based + } + + double runMean = 0.0; + double runS = 0.0; + int runN = 0; + double relativeError = tolerance * 2; // Avoid -Wmaybe-uninitialized + + Rcpp::RNGScope scope; + + SEXP hp1_shuf = clone_hpart(hp1_ptr); + std::vector shuffled(n_tip); + std::iota(shuffled.begin(), shuffled.end(), 0); + + while (relativeError > tolerance || runN < minResample) { + // Shuffle leaves + fisher_yates_shuffle(shuffled); + + // Apply shuffled labels + relabel_hpart(hp1_shuf, shuffled); + + // Compute HMI + double x = HMI_xptr(hp1_shuf, hp2); + + // Welford update + runN++; + double delta = x - runMean; + runMean += delta / runN; + runS += delta * (x - runMean); + + double runVar = (runN > 1) ? runS / (runN - 1) : 0.0; + double runSD = std::sqrt(runVar); + double runSEM = runSD / std::sqrt(runN); + relativeError = std::abs(runMean) < 1e-6 ? + runSEM : + runSEM / std::abs(runMean); + } + + double runVar = (runN > 1) ? runS / (runN - 1) : 0.0; + double runSD = std::sqrt(runVar); + double runSEM = runSD / std::sqrt(runN); + + Rcpp::NumericVector result = Rcpp::NumericVector::create(runMean); + result.attr("var") = runVar; + result.attr("sd") = runSD; + result.attr("sem") = runSEM; + result.attr("samples") = runN; + result.attr("relativeError") = relativeError; + + return result; +} diff --git a/src/hpart.cpp b/src/hpart.cpp new file mode 100644 index 00000000..f26847fe --- /dev/null +++ b/src/hpart.cpp @@ -0,0 +1,232 @@ +#include "hpart.h" +#include + +// [[Rcpp::depends(TreeTools)]] +#include // for preorder_edges_and_nodes + +using namespace Rcpp; + +// [[Rcpp::export]] +SEXP build_hpart_from_phylo(List phy) { + IntegerMatrix edge = phy["edge"]; + CharacterVector tip_label = phy["tip.label"]; + int n_tip = tip_label.size(); + int n_node = phy["Nnode"]; + + IntegerMatrix reordered = TreeTools::preorder_edges_and_nodes(edge(_,0), edge(_,1)); + + const size_t vec_size = n_tip + n_node + 1; + std::vector> children(vec_size); + for (size_t i = 0; i < children.size(); ++i) { + children[i].reserve(2); + } + for (size_t i = 0; i < (size_t)reordered.nrow(); ++i) { + size_t p = reordered(i, 0); + size_t c = reordered(i, 1); + children[p].push_back(c); + } + + TreeDist::HPart* hpart = new TreeDist::HPart(); + hpart->nodes.resize(vec_size); + + // Initialize all nodes to empty + const int n_block = (n_tip + 63) / 64; + for (size_t i = 1; i < vec_size; ++i) { + hpart->nodes[i].n_tip = n_tip; + hpart->nodes[i].bitset.resize(n_block, 0); + } + + // Initialize tips + for (size_t i = 1; i <= (size_t)n_tip; ++i) { + const size_t bit_index = i - 1; // 0-based indexing + const size_t vector_pos = bit_index / 64; + const size_t bit_pos_in_block = bit_index % 64; + auto &node_i = hpart->nodes[i]; + node_i.bitset[vector_pos] = 1ULL << bit_pos_in_block; + node_i.leaf_count = 1; + node_i.label = i - 1; + } + + // Traverse nodes in postorder + for (size_t i = vec_size - 1; i > (size_t)n_tip; --i) { + auto &node_i = hpart->nodes[i]; + node_i.children.reserve(children[i].size()); + + for (size_t child_id : children[i]) { + const auto child_node = &hpart->nodes[child_id]; + + node_i.children.push_back(child_id); + const size_t child_leaves = child_node->leaf_count; + if (child_leaves > 1) { + node_i.all_kids_leaves = false; + } + node_i.leaf_count += child_leaves; + + for (size_t chunk = 0; chunk < node_i.bitset.size(); ++chunk) { + node_i.bitset[chunk] |= child_node->bitset[chunk]; + } + } + } + + hpart->root = n_tip + 1; + + return Rcpp::XPtr(hpart, true); +} + +#include +#include "hpart.h" + +using namespace Rcpp; + +// Forward declaration +size_t build_node_from_list(const RObject& node, + std::vector& nodes, + const int n_tip, + int& next_index, + const size_t n_block); + +// [[Rcpp::export]] +SEXP build_hpart_from_list(RObject tree, const int n_tip) { + const size_t vec_size = n_tip + n_tip + 2; // max nodes for a binary tree + 1 + + auto hpart = new TreeDist::HPart(); + hpart->nodes.resize(vec_size); + + const size_t n_block = (n_tip + 63) / 64; + + // Initialize leaves and internal nodes + for (size_t i = 1; i < vec_size; ++i) { + TreeDist::HNode& n = hpart->nodes[i]; + n.n_tip = n_tip; + n.bitset.assign(n_block, 0ULL); + n.leaf_count = 0; + n.all_kids_leaves = true; + n.label = -1; + n.children.clear(); + } + + // Start internal nodes at n_tip + 1 + int next_index = n_tip + 1; + + // Build recursively; returns index into nodes vector (1-based) + hpart->root = build_node_from_list(tree, hpart->nodes, n_tip, next_index, n_block); + + return Rcpp::XPtr(hpart, true); +} + + +// Recursive builder +size_t build_node_from_list(const RObject& node, + std::vector& nodes, + int n_tip, + int& next_index, + size_t n_block) { + + if (Rf_isInteger(node) || Rf_isReal(node)) { + const IntegerVector leaf_vec(node); + if (leaf_vec.size() != 1) { + Rcpp::stop("List must only contain integers, not vectors of integers"); // #nocov + } + const int leaf_label = leaf_vec[0]; // 1-based R leaf label + const size_t leaf_idx = leaf_label - 1; // 0-based label for HNode + TreeDist::HNode& leaf = nodes[leaf_label]; + leaf.label = leaf_idx; + leaf.leaf_count = 1; + leaf.bitset[leaf_idx / 64] = 1ULL << (leaf_idx % 64); + leaf.all_kids_leaves = true; + return leaf_label; + } else if (Rf_isVectorList(node)) { + int n_elements = Rf_length(node); + + // Special case: a single leaf inside a length-1 list + if (n_elements == 1 && + (Rf_isInteger(VECTOR_ELT(node,0)) || Rf_isReal(VECTOR_ELT(node,0)))) { + return build_node_from_list(VECTOR_ELT(node, 0), nodes, n_tip, next_index, n_block); + } + + // Allocate a new internal node + const size_t my_idx = static_cast(next_index++); + TreeDist::HNode& n = nodes[my_idx]; + n.children.reserve(n_elements); + n.leaf_count = 0; + n.all_kids_leaves = true; + n.n_tip = n_tip; + + // Recurse over children + for (int i = 0; i < n_elements; ++i) { + SEXP child = VECTOR_ELT(node, i); + const size_t child_idx = build_node_from_list(child, nodes, n_tip, next_index, n_block); + n.children.push_back(child_idx); + + // Merge bitsets + for (size_t j = 0; j < n.bitset.size(); ++j) { + n.bitset[j] |= nodes[child_idx].bitset[j]; + } + + n.leaf_count += nodes[child_idx].leaf_count; + if (nodes[child_idx].leaf_count > 1) n.all_kids_leaves = false; + } + + return my_idx; + } + + // Invalid node type + Rcpp::stop("Invalid node type"); // #nocov +} + + +// helper: get index of a node pointer within hpart->nodes +inline size_t node_index(const TreeDist::HNode* node, + const std::vector& nodes) { + return static_cast(node - &nodes[0]); +} + +// [[Rcpp::export]] +Rcpp::IntegerMatrix hpart_to_edge(SEXP hpart_xptr) { + Rcpp::XPtr hpart(hpart_xptr); + TreeDist::HPart* hp = hpart.get(); + + int n_tip = hp->nodes[1].n_tip; + int n_node = static_cast(hp->nodes.size()) - n_tip - 1; + int n_edge = n_tip + n_node - 1; + + // Assign IDs: tips get their label, internal nodes get sequential IDs + std::vector node_ids(hp->nodes.size(), -1); + int next_index = n_tip + 1; + for (int i = 1; i <= n_tip; ++i) { + node_ids[i] = hp->nodes[i].label + 1; // 1-based R tip index + } + for (size_t i = n_tip + 1; i < hp->nodes.size(); ++i) { + node_ids[i] = next_index++; + } + + // Collect edges + std::vector> edges; + edges.reserve(n_edge); + + for (size_t i = n_tip + 1; i < hp->nodes.size(); ++i) { + int parent_id = node_ids[i]; + for (size_t cidx : hp->nodes[i].children) { + int child_id = node_ids[cidx]; + edges.emplace_back(parent_id, child_id); + } + } + + // Build R matrix + IntegerMatrix edge_mat(edges.size(), 2); + for (size_t i = 0; i < edges.size(); ++i) { + edge_mat(i, 0) = edges[i].first; + edge_mat(i, 1) = edges[i].second; + } + + return edge_mat; +} + +// [[Rcpp::export]] +SEXP clone_hpart(SEXP hpart_ptr) { + Rcpp::XPtr src(hpart_ptr); + + TreeDist::HPart* copy = new TreeDist::HPart(*src); + + return Rcpp::XPtr(copy, true); +} diff --git a/src/hpart.h b/src/hpart.h new file mode 100644 index 00000000..063a2be8 --- /dev/null +++ b/src/hpart.h @@ -0,0 +1,26 @@ +// src/hpart.h +#pragma once +#include +#include +#include // for ASSERT +#include + +namespace TreeDist { +struct HNode { + std::vector children; // indices of children in HPart.nodes + int label = -1; // for tips; counting from zero + std::vector bitset; // leaf set + int leaf_count = 0; + bool all_kids_leaves = true; + int n_tip = 0; +}; + +struct HPart { + std::vector nodes; // owns all nodes + size_t root; + double entropy = std::numeric_limits::min(); +}; +} + +SEXP clone_hpart(SEXP hpart_ptr); +void relabel_hpart(SEXP hpart_ptr, const std::vector& map); diff --git a/src/hpart_relabel.cpp b/src/hpart_relabel.cpp new file mode 100644 index 00000000..6f7906ef --- /dev/null +++ b/src/hpart_relabel.cpp @@ -0,0 +1,59 @@ +#include "hpart.h" +#include +using namespace Rcpp; + +namespace TreeDist { + +// --- postorder recompute internal nodes --- +void recompute_bitsets_postorder(TreeDist::HPart &hpart, const size_t node_idx, + const std::vector &mapping, + const size_t n_block) { + auto &node = hpart.nodes[node_idx]; + + if (node.children.empty()) { + // Leaf node + if (node.leaf_count != 1) { + Rcpp::stop("Leaf node has leaf_count != 1"); // #nocov + } + int new_index = mapping[node.label]; // mapping is 0-based + node.label = new_index; + node.bitset.assign(n_block, 0ULL); + node.bitset[new_index / 64] = 1ULL << (new_index % 64); + } else { + // Postorder: first child + recompute_bitsets_postorder(hpart, node.children[0], mapping, n_block); + auto &first_child = hpart.nodes[node.children[0]]; + node.bitset = first_child.bitset; + node.leaf_count = first_child.leaf_count; + node.all_kids_leaves = (node.leaf_count == 1); + + // Remaining children + for (size_t ci = 1; ci < node.children.size(); ++ci) { + size_t child_idx = node.children[ci]; + recompute_bitsets_postorder(hpart, child_idx, mapping, n_block); + auto &child = hpart.nodes[child_idx]; + + for (size_t chunk = 0; chunk < node.bitset.size(); ++chunk) { + node.bitset[chunk] |= child.bitset[chunk]; + } + + node.leaf_count += child.leaf_count; + if (child.leaf_count > 1) { + node.all_kids_leaves = false; + } + } + } +} + +} // namespace TreeDist + + +// [[Rcpp::export]] +void relabel_hpart(SEXP hpart_ptr, const std::vector& map) { + Rcpp::XPtr hpart_xptr(hpart_ptr); + TreeDist::HPart &hpart = *hpart_xptr; + + const size_t n_block = hpart.nodes[1].bitset.size(); + + TreeDist::recompute_bitsets_postorder(hpart, hpart.root, map, n_block); +} diff --git a/tests/testthat/_snaps/HPart/plot-hpart.svg b/tests/testthat/_snaps/HPart/plot-hpart.svg new file mode 100644 index 00000000..285ac14f --- /dev/null +++ b/tests/testthat/_snaps/HPart/plot-hpart.svg @@ -0,0 +1,48 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +1 +2 +3 +4 +5 +6 + + diff --git a/tests/testthat/test-HPart.R b/tests/testthat/test-HPart.R new file mode 100644 index 00000000..5ea55eb5 --- /dev/null +++ b/tests/testthat/test-HPart.R @@ -0,0 +1,75 @@ +library("TreeTools") + +test_that("is.HPart() succeeds", { + expect_true(is.HPart(as.HPart(TreeTools::BalancedTree(7)))) + expect_true(is.HPart(structure(class = "HPart", + list(list("t1"), list("t2", "t3"))))) + expect_false(is.HPart(structure(class = "NonPart", + list(list("t1"), list("t2", "t3"))))) +}) + +test_that("as.phylo.HPart", { + bal7 <- BalancedTree(7) + hb7 <- as.HPart(bal7) + expect_equal(Preorder(as.phylo.HPart(hb7)), bal7) + + bal17 <- BalancedTree(17) + hb17 <- as.HPart(bal17) + expect_equal(Preorder(as.phylo.HPart(hb17)), bal17) +}) + +test_that("as.HPart.numeric", { + hpList <- as.HPart(list(list(1, 3, 9), + list(2, 4, 8), + list(5, 6, 7))) + expect_equal(class(hpList), "HPart") + + hpNum <- as.HPart(c(1, 2, 1, 2, 3, 3, 3, 2, 1)) + expect_equal(class(hpNum), "HPart") + + expect_equal(SelfHMI(hpNum), SelfHMI(hpList)) + expect_equal(HMI(hpNum, hpList), SelfHMI(hpNum)) + + flatP <- as.HPart(list(as.list(1:5), as.list(6:9))) + hp9 <- as.HPart(BalancedTree(1:9)) + expect_equal(HMI(flatP, hp9), 0.99107606) + +}) + +test_that("as.HPart.unimplemented", { + expect_error(as.HPart(matrix()), "no applicable method") + expect_error(as.HPart(list(letters, LETTERS)), "leaves must be integers") + expect_error(as.HPart(list(list(1, 2, 3), list(0, 1, 2))), + ".eaves must contain each integer") + expect_error(as.HPart(list(list(1, 2, 3), list(3, 1, 2))), + ".eaves must contain each integer") +}) + +test_that("HParts are relabelled correctly", { + bal7 <- BalancedTree(7) + hb7 <- as.HPart(bal7) + + map <- c(7:4, 1:3) + mappedLabels <- paste0("t", map) + + hbMap <- RenumberTips(hb7, mappedLabels) + # Here we want only to map the internal node IDs + attr(hbMap, "tip.label") <- TipLabels(hb7) + + bal7tl <- bal7 + bal7tl$tip.label <- bal7$tip[map] + + expect_equal(SortTree(Preorder(as.phylo.HPart(hbMap))), SortTree(bal7tl)) +}) + +test_that("plot.HPart", { + skip_if_not_installed("vdiffr") + vdiffr::expect_doppelganger("plot-HPart", function() + plot(as.HPart(list(list(1, 2, 3), list(4, list(5, 6))))) + ) +}) + +test_that("Renumber.HPart", { + expect_error(RenumberTips(as.HPart(list(1, 2, 4, 3)), 4:2), + "labels 1 missing from `tipOrder`") +}) diff --git a/tests/testthat/test-hierarchical_mutual_information.R b/tests/testthat/test-hierarchical_mutual_information.R new file mode 100644 index 00000000..3399e08e --- /dev/null +++ b/tests/testthat/test-hierarchical_mutual_information.R @@ -0,0 +1,62 @@ +library("TreeTools", quietly = TRUE) + +test_that("Hierarchical Mutual Information", { + + # Create test trees + tree1 <- BalancedTree(8) + tree2 <- PectinateTree(8) + tree3 <- BalancedTree(8) # Identical to tree1 + + # Test basic functionality + expect_no_error(HierarchicalMutualInfo(tree1, tree2)) + + # Test that HMI is numeric and non-negative + hmi <- HierarchicalMutualInfo(tree1, tree2) + expect_true(is.numeric(hmi)) + expect_true(hmi >= 0) + + # Test normalization + hmi_norm <- HierarchicalMutualInfo(tree1, tree2, normalize = TRUE) + expect_true(is.numeric(hmi_norm)) + expect_true(hmi_norm >= 0) + expect_true(hmi_norm <= 1) + + # Test symmetry + hmi12 <- HierarchicalMutualInfo(tree1, tree2) + hmi21 <- HierarchicalMutualInfo(tree2, tree1) + expect_equal(hmi12, hmi21, tolerance = 1e-10) + + # Test identity property - normalized self-comparison should be 1 + hmi_self1_norm <- HierarchicalMutualInfo(tree1, tree1, normalize = TRUE) + hmi_self2_norm <- HierarchicalMutualInfo(tree2, tree2, normalize = TRUE) + + expect_equal(hmi_self1_norm, 1, tolerance = 1e-10) + expect_equal(hmi_self2_norm, 1, tolerance = 1e-10) + + # Test normalized identity + hmi_self_norm <- HierarchicalMutualInfo(tree1, tree1, normalize = TRUE) + expect_equal(hmi_self_norm, 1, tolerance = 1e-10) + + # Test error handling + expect_equal(HierarchicalMutualInfo(tree1), SelfHMI(tree1)) + expect_warning(expect_equal(HierarchicalMutualInfo(tree1, norm = TRUE), 1), + "tree2") + + # Test with different tip numbers (should error) + tree_small <- BalancedTree(6) + expect_error(HierarchicalMutualInfo(tree1, tree_small), + "number of leaves") +}) + +test_that("HMI edge cases", { + bal9 <- BalancedTree(9) + bal9b <- BalancedTree(paste0("t", c(3:1, 7:9, 6:4))) + + expect_lt(HMI(bal9, bal9b), HMI(bal9, bal9)) + + expect_lt(HMI(bal9, bal9b, normalize = TRUE), 0.05) + + expect_equal(AHMI(StarTree(6), BalancedTree(6))[[1]], 0) + expect_equal(AHMI(StarTree(2), BalancedTree(2)), structure(0, sem = NaN)) +}) + diff --git a/tests/testthat/test-hmi.cpp.R b/tests/testthat/test-hmi.cpp.R new file mode 100644 index 00000000..78ad0f22 --- /dev/null +++ b/tests/testthat/test-hmi.cpp.R @@ -0,0 +1,158 @@ +library("TreeTools") + +test_that("HMI fails nicely", { + bal5 <- as.HPart(BalancedTree(5)) + pec5 <- as.HPart(PectinateTree(5)) + expect_error(HierarchicalMutualInfo(bal5, pec5, normalize = "Error"), + "`normalize` must be logical, or a function") + + expect_error(EHMI_xptr(bal5, pec5, tolerance = 1e-16), + ".olerance too low") + expect_error(EHMI_xptr(bal5, pec5, minResample = 1), + "Must perform at least one resampl") +}) + +test_that("HMI examples from Perotti et al. 2015", { + p1 <- list(list(1, list(2, 3)), list(4, 5, 6)) + p2 <- list(1, list(2, 3), list(4, 5, 6)) + expect_equal(SelfHMI(p1), 1.011 / log(2), tolerance = 0.01) + expect_equal(SelfHMI(p2), 1.011 / log(2), tolerance = 0.01) + expect_equal(HMI(p1, p2), log(2) / log(2)) + expect_equal(HMI(p1, p2, normalize = TRUE), 0.685, tolerance = 0.01) +}) + +test_that("HMI is dependent on root position", { + bal9 <- BalancedTree(9) + expect_lt(HMI(RootTree(bal9, 1), RootTree(bal9, 9), normalize = TRUE), 1) + expect_gt(SelfHMI(RootTree(bal9, 1)), SelfHMI(bal9)) # Pectination -> information +}) + +test_that("HMI results match hmi.pynb", { + # Non-hierarchical + p1 <- list(list(19, 18, 5), list(14, 16, 3), list(7), list(10, 8), list(1, 17, 9, 4, 6, 15), list(2, 13, 11), list(12, 0)) + p2 <- list( list(12, 9), list(4, 2, 0, 7), list(16), list(5), list(8, 3, 1, 14), list(11, 6, 10), list(18, 17, 19), list(13, 15)) + expect_equal(HMI(p1, p2), 0.9410980357245466 / log(2)) + + + # Hierarchical + hp0 <- list(list(23), + list(list(list(list(list(list(16), list(17)))))), # Tips above order 2 nodes + list(list(12), list(22, 13)), list(5), list(7), list(24), list(list(list(9), + list(list(14, 2))), + list(list(list(list(list(list(27), list(3))))))), + list(20, 29, 18), list(4), list(26, 15), list(list(10), list(21, 25)), + list(11), list(list(0, 28), list(1), list(6)), list(19, 8)) + + hp1 <- list(list(23), list(list(list(list(list(16, 17))))), list(list(12), list(22, 13)), list(5), list(7), list(24), list(list(list(9), list(list(14, 2))), list(list(list(list(list(27, 3)))))), list(20, 29, 18), list(4), list(26, 15), list(list(10), list(21, 25)), list(11), list(list(0, 28), list(1), list(6)), list(19, 8)) + + hp1Collapsed <- list( + list(23), list(16, 17), list(list(12), list(22, 13)), list(5), list(7), + list(24), list(list(list(9), list(14, 2)), list(27, 3)), list(20, 29, 18), + list(4), list(26, 15), list(list(10), list(21, 25)), list(11), + list(list(0, 28), list(1), list(6)), list(19, 8)) + + hp2 <- list( + list(list(list(0, 25), list(24)), list(6), list(11, 28), list(8)), + list(list(list(19), list(list(list(list(21), list(4), + list(list(list(list(list(22, 7))))))))), + list(5)), list(list(3), list(10, 23, 14)), + list(list(27, 1, 16, 13, 18, 26, 9), + list(list(list(list(15), list(list(list(list(list(list(12, 17)))))))), + list(2, 20)), list(29))) + + expect_equal(HMI(hp1, hp2), 1.0591260408329395 / log(2)) + + # expect_equal(HMI(hp0, hp0), 3.0140772805713665 / log(2)) + # Note that hp0 contains [[16], [17]] and [[27], [3]], whereas hp1 has + # [16, 17] and [27, 3]. I haven't yet worked through why this should give a + # different result. But I don't think we are likely to encounter this case + # in our work. + expect_equal(HMI(hp1, hp1Collapsed), 2.921657656496707 / log(2)) + expect_equal(HMI(hp2, hp2), 2.606241391162456 / log(2)) + + expect_equal(SelfHMI(hp1), HMI(hp1, hp1)) + + ehmi <- structure(0.7806 / log(2), # Calculated from py with tol = 0.001 + var = 0.01, + sd = 0.1, + sem = 0.008, + relativeError = 0.01) + ehmi_cpp <- EHMI(hp1, hp2, precision = 0.01) + expect_gt(attr(ehmi_cpp, "samples"), 36) + attr(ehmi_cpp, "samples") <- NULL # Could vary; no point in testing + expect_equal(ehmi_cpp, ehmi, tolerance = 0.1) + + set.seed(13000) + pyAHMI <- 0.13000 # Calculated with tol = 0.001 + expect_equal(AHMI(hp1, hp2)[[1]], pyAHMI, tolerance = 0.05) + + set.seed(1) + ahmi1 <- AHMI(hp1, hp2) + set.seed(1) + expect_equal(AHMI(hp1, hp2), ahmi1) + nRep <- 100 + ahmis <- replicate(nRep, AHMI(hp1, hp2)) + expect_lt(abs(attr(ahmi1, "sem") - sd(ahmis)), 0.1 * sd(ahmis)) +}) + +test_that("HMI calculated correctly", { + bal6 <- BalancedTree(6) + pec6 <- PectinateTree(6) + + hp1 <- as.HPart(BalancedTree(6)) + hp2 <- as.HPart(PectinateTree(6)) + expect_equal(capture_output(print(hp2)), + "Hierarchical partition on 6 leaves: t1, t2, ..., t5, t6") + expect_equal(capture_output(print(as.HPart(BalancedTree(4)))), + "Hierarchical partition on 4 leaves: t1, t2, t3, t4") + expect_equal(HMI_xptr(hp1, hp2), 0.363353185) + bal8 <- BalancedTree(8) + pec8 <- PectinateTree(8) + star8 <- StarTree(8) + + expect_equal(HierarchicalMutualInfo(bal6, pec6, normalize = TRUE), + HMI_xptr(hp1, hp2) / max(HH_xptr(hp1), HH_xptr(hp2))) + + hp1 <- build_hpart_from_phylo(BalancedTree(8)) + hp2 <- build_hpart_from_phylo(PectinateTree(8)) + expect_equal(HMI_xptr(hp1, hp1), 1.38629436) + expect_equal(HMI_xptr(hp1, hp2), 0.3342954) + +}) + +test_that("HMI_cpp equals SelfHMI for same partition", { + set.seed(1) + tr <- BalancedTree(8) + hp <- as.HPart(tr) + expect_equal(SelfHMI(hp), HMI(hp, hp), tolerance = 1e-12) +}) + +test_that("HMI works with real dataset", { + ch <- c(1L, 2L, 2L, 2L, 2L, 2L, 1L, 1L, 1L, 1L, 1L) + tr <- structure(list( + edge = structure(c(12L, 12L, 13L, 14L, 15L, 16L, 16L, 17L, 17L, 18L, 18L, + 15L, 14L, 19L, 20L, 20L, 19L, 13L, 21L, 21L, 1L, 13L, + 14L, 15L, 16L, 2L, 17L, 3L, 18L, 4L, 5L, 6L, 19L, 20L, + 7L, 8L, 9L, 21L, 10L, 11L), dim = c(20L, 2L)), + Nnode = 10L, + tip.label = c("Nem", "Sco", "Eun", "Aph", "Chr", "Can", "Hel", "Cha", + "Lep", "Ter", "Lin")), + class = "phylo", order = "preorder") + chPart <- as.HPart(ch) + + # Build HPart from tree, then relabel + trPart <- as.HPart(tr) + attr(trPart, "tip.label") <- seq_along(attr(trPart, "tip.label")) + expect_equal(attr(chPart, "tip.label"), attr(trPart, "tip.label")) + + # Because of the difference in levels, this test should NOT pass (!) + # expect_equal(HMI(chPart, trPart), SelfHMI(chPart)) + + # Relabel tree first, then build HPart + tree <- tr + tree$tip.label <- seq_along(tree[["tip.label"]]) + treePart <- as.HPart(tree) + treePart + expect_equal(HMI(trPart, treePart), SelfHMI(treePart)) + expect_equal(HMI(chPart, trPart), HMI(chPart, treePart)) +})