Skip to content

Commit

Permalink
Add Sparse Implementation of is.subset with C (#24)
Browse files Browse the repository at this point in the history
* C implementation of is_subset

* add issubset.c

* Finish sparse is.subset implementation in C

* Small bugfix with Dim info

* More bug chasing for sparse is.subset

* Remove maintainer issue

* provide colnames to sparse subset matrices

* add proper subsetting to sparse is.subset
  • Loading branch information
Ian Johnson authored and mhahsler committed Feb 18, 2017
1 parent 8aab736 commit 1744055
Show file tree
Hide file tree
Showing 4 changed files with 143 additions and 42 deletions.
6 changes: 3 additions & 3 deletions DESCRIPTION
Expand Up @@ -8,9 +8,9 @@ Authors@R: c(person("Michael", "Hahsler", role = c("aut", "cre", "cph"),
person("Bettina", "Gruen", role = c("aut", "cph")),
person("Kurt", "Hornik", role = c("aut", "cph")),
person("Christian", "Borgelt", role = c("ctb", "cph")))
Description: Provides the infrastructure for representing,
manipulating and analyzing transaction data and patterns (frequent
itemsets and association rules). Also provides interfaces to
Description: Provides the infrastructure for representing,
manipulating and analyzing transaction data and patterns (frequent
itemsets and association rules). Also provides interfaces to
C implementations of the association mining algorithms Apriori and Eclat
by C. Borgelt.
Classification/ACM: G.4, H.2.8, I.5.1
Expand Down
62 changes: 25 additions & 37 deletions R/is.superset.R
@@ -1,6 +1,6 @@
#######################################################################
# arules - Mining Association Rules and Frequent Itemsets
# Copyright (C) 2011-2015 Michael Hahsler, Christian Buchta,
# Copyright (C) 2011-2015 Michael Hahsler, Christian Buchta,
# Bettina Gruen and Kurt Hornik
#
# This program is free software; you can redistribute it and/or modify
Expand All @@ -22,7 +22,7 @@
## arules specific set methods: is.superset, is.subset (only for itemMatrix)
##
setMethod("is.superset", signature(x = "itemMatrix"),
function(x, y = NULL, proper = FALSE, sparse = FALSE)
function(x, y = NULL, proper = FALSE, sparse = FALSE)
if (is.null(y)) t(is.subset(x, NULL, proper, sparse))
else t(is.subset(y, x, proper, sparse))
)
Expand All @@ -39,7 +39,7 @@ setMethod("is.subset", signature(x = "itemMatrix"),
function(x, y = NULL, proper = FALSE, sparse = FALSE) {
if (length(x) == 0 || (!is.null(y) && length(y) == 0))
return(logical(0))

## y needs to be itemMatrix and x has to conform!
if(!is.null(y)) {
if(is(y, "associations")) y <- items(y)
Expand All @@ -48,60 +48,48 @@ setMethod("is.subset", signature(x = "itemMatrix"),
x <- recode(x, itemLabels = il)
y <- recode(y, itemLabels = il)
}

if(sparse) return(.is.subset_sparse(x, y, proper))

if (is.null(y)) m <- .Call(R_crosstab_ngCMatrix, x@data, NULL, FALSE)
else m <- .Call(R_crosstab_ngCMatrix, x@data, y@data, FALSE)

m <- m == size(x)
if (proper == TRUE)
if (is.null(y))

if (proper == TRUE)
if (is.null(y))
m <- m & outer(size(x), size(x), "<")
else
else
m <- m & outer(size(x), size(y), "<")

rownames(m) <- labels(x)
if(is.null(y)) colnames(m) <- labels(x)
else colnames(m) <- labels(y)

m
}
)

setMethod("is.subset", signature(x = "associations"),
function(x, y = NULL, proper = FALSE, sparse = FALSE)
function(x, y = NULL, proper = FALSE, sparse = FALSE)
is.subset(items(x), y, proper, sparse)
)

### use tidlist intersection
.is.subset_sparse <- function(x, y = NULL, proper=FALSE) {
### use tidlist intersection
.is.subset_sparse <- function(x, y = NULL, proper=FALSE) {

if(is.null(y)) y <- x

xitems <- LIST(x, decode = FALSE)
ylists <- LIST(as(y, "tidLists"), decode = FALSE)

## select tid-lists for items and do intersection
contained <- lapply(xitems, FUN = function(i) {
if(length(i)==0) ssid <- 1:nrow(y)
else {
tidls <- unlist(ylists[i])
if(is.null(tidls)) ssid <- integer(0)
else ssid <- which(tabulate(tidls) == length(i))
}

if(proper) ssid <- ssid[size(y)[ssid] > length(i)]
ssid
})

m <- .list2ngCMatrix(contained, nrow(y))


p <- as.integer(rep(0, x@data@Dim[2]+1))
i <- .Call(R_is_subset, x@data@p, x@data@i, x@data@Dim, y@data@p, y@data@i, y@data@Dim, proper, p, PACKAGE = "arules")

m <- t(new("ngCMatrix", p=p, i=i, Dim=c(as.integer(max(i)+1), as.integer(x@data@Dim[2]))))

if(!is.null(y)) colnames(m) <- labels(y)
rownames(m) <- labels(x)

m

return(m)

}


Expand Down
5 changes: 3 additions & 2 deletions src/dll.c
Expand Up @@ -3,7 +3,8 @@
#include <Rinternals.h>
#include <R_ext/Rdynload.h>

extern SEXP reclat(SEXP x, SEXP y, SEXP dim, SEXP parms, SEXP control,
extern SEXP is_subset(SEXP X_P, SEXP X_I, SEXP X_DIM, SEXP Y_P, SEXP Y_I, SEXP Y_DIM, SEXP PROPER, SEXP OUT_P);
extern SEXP reclat(SEXP x, SEXP y, SEXP dim, SEXP parms, SEXP control,
SEXP itemInfo);
extern SEXP rapriori(SEXP x, SEXP y, SEXP dim, SEXP parms, SEXP control,
SEXP app, SEXP itemInfo);
Expand Down Expand Up @@ -39,6 +40,7 @@ extern SEXP R_tid_rules(SEXP tidLists, SEXP itemsets);
void R_init_arules(DllInfo *dll) {

const R_CallMethodDef CallEntries[] = {
{"R_is_subset", (DL_FUNC) is_subset, 8},
{"R_reclat", (DL_FUNC) reclat, 6},
{"R_rapriori", (DL_FUNC) rapriori, 7},
{"R_transpose_ngCMatrix", (DL_FUNC) R_transpose_ngCMatrix, 1},
Expand Down Expand Up @@ -82,4 +84,3 @@ void R_init_arules(DllInfo *dll) {
R_RegisterCCallable("arules", "R_pnindex", (DL_FUNC) R_pnindex);
R_RegisterCCallable("arules", "R_pnrindex", (DL_FUNC) R_pnrindex);
}

112 changes: 112 additions & 0 deletions src/issubset.c
@@ -0,0 +1,112 @@
/*
C implementation of sparse matrix subset
Author: Ian Johnson
*/

#include <R.h>
#include <Rdefines.h>


void populateMatches(int* matches_for_y, int* x_i, int* x_p, int* y_p, int* y_i, int y_index, int num_rows, int proper){

int y_start_index = x_p[y_index], y_end_index = x_p[y_index+1];

int num_matches = 0;

for(int x_index = 0; x_index < num_rows; x_index++){

int loc = y_p[x_index], end_loc = y_p[x_index+1], curr_col;

if(proper && (end_loc - loc == y_end_index - y_start_index)) continue;

curr_col = y_start_index;

while(loc < end_loc){

if (y_i[loc] == x_i[curr_col]) curr_col++;
if(curr_col == y_end_index) break;

loc++;

}


if(curr_col == y_end_index){
matches_for_y[num_matches++] = x_index;
}

}

matches_for_y[num_matches] = -1;

}

int copyMatches(int* y_matches, int** output_i, int* output_i_length, int* output_i_last){

int index = 0;

while(y_matches[index] != -1){

if(*output_i_last == *output_i_length - 1){
int* tmp = malloc(2*(*output_i_length) * sizeof(int));
memcpy(tmp, *output_i, *output_i_length*sizeof(int));
*output_i_length *= 2;
free(*output_i);
*output_i = tmp;
}

(*output_i)[++(*output_i_last)] = y_matches[index++];

}

return index;

}


SEXP is_subset(SEXP X_P, SEXP X_I, SEXP X_DIM, SEXP Y_P, SEXP Y_I, SEXP Y_DIM, SEXP PROPER, SEXP OUT_P){

int* x_p = INTEGER(X_P);
int* x_i = INTEGER(X_I);

int proper = LOGICAL(PROPER)[0];

int* y_p = INTEGER(Y_P);
int* y_i = INTEGER(Y_I);

int x_p_length = INTEGER(X_DIM)[1];

int y_p_length = INTEGER(Y_DIM)[1];
int y_i_max = INTEGER(Y_DIM)[0];

int output_i_length = y_p_length;
int output_i_last = -1;
int* output_i = malloc((output_i_length+1) * sizeof(int));

int* output_p = INTEGER(OUT_P);
int curr_p = 0;

int* y_matches = malloc((output_i_length+1) * sizeof(int));

//For every item in y, list all matches in x
for(int y_index = 0; y_index < x_p_length; y_index++){

populateMatches(y_matches, x_i, x_p, y_p, y_i, y_index, y_p_length, proper);

curr_p += copyMatches(y_matches, &output_i, &output_i_length, &output_i_last);
output_p[y_index+1] = curr_p;

}

free(y_matches);

SEXP OUT_I = allocVector(INTSXP, output_i_last+1);
for(int i = 0; i < output_i_last+1; i++){
INTEGER(OUT_I)[i] = output_i[i];
}

free(output_i);

return OUT_I;

}

0 comments on commit 1744055

Please sign in to comment.