mhahsler · moredatapls · May 26, 2019 · May 26, 2019 · Jun 4, 2019
diff --git a/NAMESPACE b/NAMESPACE
@@ -40,6 +40,7 @@ S3method(as.reachability, optics)
 
 S3method(print, hdbscan)
 S3method(plot, hdbscan)
+S3method(predict, hdbscan)
 
 S3method(as.reachability, dendrogram)
 

diff --git a/R/hdbscan.R b/R/hdbscan.R
@@ -57,7 +57,6 @@ hdbscan <- function(x, minPts, xdist = NULL,
   ## Generate membership 'probabilities' using core distance as the measure of density
   prob <- rep(0, length(cl))
   for (cid in sl){
-    ccl <- res[[as.character(cid)]]
     max_f <- max(core_dist[which(cl == cid)])
     pr <- (max_f - core_dist[which(cl == cid)])/max_f
     prob[cl == cid] <- pr

diff --git a/R/predict.R b/R/predict.R
@@ -19,3 +19,31 @@ predict.optics <- function (object, newdata = NULL, data, ...) {
   if (is.null(object$cluster)) stop("no extracted clustering available in object! run extractDBSCAN or extractXi first.")
   .predict_frNN(newdata, data, object$cluster, object$eps_cl, ...)
 }
+
+# Predict cluster labels for `newdata` given an hdbscan `object` fitted on `data`.
+# A new point takes the label of its closest `data` row by mutual reachability
+# distance (0 = noise); `newdata = NULL` returns the labels in `object$cluster`.
+predict.hdbscan <- function(object, newdata = NULL, data, ...) {
+  if (is.null(newdata)) return(object$cluster)
+  n_data <- nrow(data)
+  k <- 2 * object$minPts
+  all_pts <- rbind(data, newdata)
+  new_rows <- n_data + seq_len(nrow(all_pts) - n_data)
+
+  # neighbor ids/distances of new points; drop = FALSE keeps a lone point a matrix
+  nn <- kNN(all_pts, k = k, sort = TRUE)$id[new_rows, , drop = FALSE]
+  nn_dist <- kNNdist(all_pts, k = k, all = TRUE)[new_rows, , drop = FALSE]
+  core_dist <- nn_dist[, object$minPts - 1]
+
+  # per new point: neighbor columns ordered by MRD (as computed by mrd_m)
+  nn_order <- t(apply(mrd_m(nn_dist, core_dist), 1, order))
+
+  # id of the MRD-closest `data` row (noise rows included); NA if there is none
+  closest <- vapply(seq_len(nrow(nn)), function(i) {
+    ids <- nn[i, nn_order[i, ]]
+    as.integer(ids[ids <= n_data][1L])
+  }, integer(1L))
+  pred <- object$cluster[closest]
+  pred[is.na(pred)] <- 0L  # no usable neighbor -> noise cluster 0
+  pred
+}
diff --git a/man/hdbscan.Rd b/man/hdbscan.Rd
@@ -3,6 +3,7 @@
 \alias{HDBSCAN}
 \alias{print.hdbscan}
 \alias{plot.hdbscan}
+\alias{predict.hdbscan}
 \title{
   HDBSCAN
 }
@@ -18,6 +19,7 @@ hdbscan(x, minPts, xdist = NULL,
 \method{print}{hdbscan}(x, ...)
 \method{plot}{hdbscan}(x, scale="suggest",
     gradient=c("yellow", "red"), show_flat = FALSE, ...)
+\method{predict}{hdbscan}(object, newdata = NULL, data, ...)
 }
 %- maybe also 'usage' for other objects documented here.
 \arguments{
@@ -32,7 +34,9 @@ hdbscan(x, minPts, xdist = NULL,
   \item{scale}{ integer; used to scale condensed tree based on the graphics device. Lower scale results in wider trees. }
   \item{gradient}{ character vector; the colors to build the condensed tree coloring with. }
   \item{show_flat}{ logical; whether to draw boxes indicating the most stable clusters. }
-
+  \item{object}{ an HDBSCAN clustering object.}
+  \item{data}{ the data set used to create the HDBSCAN clustering object.}
+  \item{newdata}{ new data set for which the cluster membership should be predicted.}
 }
 \details{
 Computes the hierarchical cluster tree representing density estimates along with the stability-based flat cluster extraction
@@ -41,6 +45,8 @@ proposed by Campello et al. (2013). HDBSCAN essentially computes the hierarchy o
 Additional, related algorithms including the "Global-Local Outlier Score from Hierarchies" (GLOSH) (see section 6 of Campello et al. 2015) outlier scores and ability to cluster based on instance-level constraints (see section 5.3 of Campello et al. 2015) are supported. The algorithms only need the parameter \code{minPts}.
 
 Note that \code{minPts} not only acts as a minimum cluster size to detect, but also as a "smoothing" factor of the density estimates implicitly computed from HDBSCAN.
+
+\code{predict} can be used to predict cluster membership for new data points. A point is considered a member of a cluster if the closest data point (by mutual reachability distance) is a member of the cluster. Points which cannot be assigned to a cluster will be reported as members of the noise cluster 0.
 }
 \value{
   A object of class 'hdbscan' with the following components: