Merge pull request #309 from mlverse/sparsemax

Sparsemax

dfalbel committed Oct 19, 2020
2 parents 544d39c + 9a906b2 commit effadbc

Showing 16 changed files with 332 additions and 13 deletions.
2 changes: 2 additions & 0 deletions NAMESPACE
@@ -134,6 +134,7 @@ export(nn_bce_with_logits_loss)
export(nn_bilinear)
export(nn_buffer)
export(nn_celu)
export(nn_contrib_sparsemax)
export(nn_conv1d)
export(nn_conv2d)
export(nn_conv3d)
@@ -243,6 +244,7 @@ export(nnf_binary_cross_entropy)
export(nnf_binary_cross_entropy_with_logits)
export(nnf_celu)
export(nnf_celu_)
export(nnf_contrib_sparsemax)
export(nnf_conv1d)
export(nnf_conv2d)
export(nnf_conv3d)
1 change: 1 addition & 0 deletions NEWS.md
@@ -28,6 +28,7 @@
- Fixed `nn_batchnorm*` so it returns the same results as PyTorch (#302)
- Fixed a bug in `nn_module$parameter` when there were shared parameters
  between layers. (#306)
- Added `nnf_contrib_sparsemax` and `nn_contrib_sparsemax`. (#309)
- Added ASGD optimizer (@krzjoa #307)

# torch 0.1.0
4 changes: 4 additions & 0 deletions R/RcppExports.R
@@ -97,6 +97,10 @@ cpp_autograd_grad <- function(outputs, inputs, grad_outputs, retain_graph, creat
.Call('_torch_cpp_autograd_grad', PACKAGE = 'torchpkg', outputs, inputs, grad_outputs, retain_graph, create_graph, allow_unused)
}

cpp_contrib_torch_sparsemax <- function(input, dim) {
.Call('_torch_cpp_contrib_torch_sparsemax', PACKAGE = 'torchpkg', input, dim)
}

cpp_cuda_is_available <- function() {
.Call('_torch_cpp_cuda_is_available', PACKAGE = 'torchpkg')
}
22 changes: 22 additions & 0 deletions R/nn-activation.R
@@ -1096,3 +1096,25 @@ nn_log_softmax <- nn_module(
nnf_log_softmax(input, self$dim)
}
)

#' Sparsemax activation
#'
#' Sparsemax activation module.
#'
#' @details
#' The sparsemax activation is described in
#' ['From Softmax to Sparsemax: A Sparse Model of Attention and Multi-Label Classification'](https://arxiv.org/abs/1602.02068).
#' The implementation is based on [aced125/sparsemax](https://github.com/aced125/sparsemax/tree/master/sparsemax).
#'
#' @param dim The dimension over which to apply the sparsemax function. Default: -1 (the last dimension).
#'
#' @export
nn_contrib_sparsemax <- nn_module(
"nn_contrib_sparsemax",
initialize = function(dim = -1) {
    self$dim <- dim
},
forward = function(input) {
nnf_contrib_sparsemax(input, self$dim)
}
)
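For context, not part of the diff: a minimal usage sketch of the new module, assuming a torch build that includes this commit is installed and attached.

library(torch)

# scores for 2 observations over 5 classes
x <- torch_randn(c(2, 5))

# sparsemax over the last dimension: each row sums to 1 and may contain exact zeros
m <- nn_contrib_sparsemax(dim = -1)
m(x)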
27 changes: 15 additions & 12 deletions R/nn.R
@@ -389,22 +389,25 @@ create_nn_module_callable <- function(instance) {
  if (is.numeric(y))
    return(x[[".__enclos_env__"]][["private"]][["modules_"]][[y]])

- if (!is.null(x[[".__enclos_env__"]][["private"]][["parameters_"]])) {
-   pars <- x[[".__enclos_env__"]][["private"]][["parameters_"]]
-   if (y %in% names(pars))
-     return(pars[[y]])
+ pars <- x[[".__enclos_env__"]][["private"]][["parameters_"]]
+ if (!is.null(pars)) {
+   o <- pars[[y]]
+   if (!is.null(o))
+     return(o)
  }

- if (!is.null(x[[".__enclos_env__"]][["private"]][["buffers_"]])) {
-   bufs <- x[[".__enclos_env__"]][["private"]][["buffers_"]]
-   if (y %in% names(bufs))
-     return(bufs[[y]])
+ bufs <- x[[".__enclos_env__"]][["private"]][["buffers_"]]
+ if (!is.null(bufs)) {
+   o <- bufs[[y]]
+   if (!is.null(o))
+     return(o)
  }

- if (!is.null(x[[".__enclos_env__"]][["private"]][["modules_"]])) {
-   mods <- x[[".__enclos_env__"]][["private"]][["modules_"]]
-   if (y %in% names(mods))
-     return(mods[[y]])
+ mods <- x[[".__enclos_env__"]][["private"]][["modules_"]]
+ if (!is.null(mods)) {
+   o <- mods[[y]]
+   if (!is.null(o))
+     return(o)
  }

NextMethod("[[", x)
27 changes: 27 additions & 0 deletions R/nnf-activation.R
@@ -739,3 +739,30 @@ nnf_sigmoid <- function(input) {
torch_sigmoid(input)
}

#' Sparsemax
#'
#' Applies the SparseMax activation.
#'
#' @details
#' The sparsemax activation is described in
#' ['From Softmax to Sparsemax: A Sparse Model of Attention and Multi-Label Classification'](https://arxiv.org/abs/1602.02068).
#' The implementation is based on [aced125/sparsemax](https://github.com/aced125/sparsemax/tree/master/sparsemax).
#'
#' @param input The input tensor.
#' @param dim The dimension over which to apply the sparsemax function. Default: -1 (the last dimension).
#'
#' @export
nnf_contrib_sparsemax <- function(input, dim = -1) {
if (!is_torch_tensor(input))
    value_error("Input should be a tensor, got '{class(input)}'.")

dim <- as_1_based_dim(dim)

ptr <- cpp_contrib_torch_sparsemax(input$ptr, dim)

Tensor$new(ptr = ptr)
}
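As an illustrative sketch (not part of the diff), sparsemax differs from softmax in that it can assign exactly zero probability to low-scoring entries:

x <- torch_tensor(matrix(c( 3, 1, 0.2, -2,
                           -2, 0, 0,    0), nrow = 2, byrow = TRUE))
nnf_contrib_sparsemax(x, dim = -1)  # some entries are exactly 0
nnf_softmax(x, dim = 2)             # all entries are strictly positive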




2 changes: 1 addition & 1 deletion R/scalar.R
@@ -1,4 +1,4 @@
-Scalar <- R6::R6Class(
+Scalar <- R7Class(
classname = "torch_scalar",

public = list(
1 change: 1 addition & 0 deletions lantern/CMakeLists.txt
@@ -94,6 +94,7 @@ add_library(lantern SHARED
src/NNUtilsRnn.cpp
src/Storage.cpp
src/Save.cpp
src/Contrib/Sparsemax.cpp
)
add_library(lantern::library ALIAS lantern)

10 changes: 10 additions & 0 deletions lantern/include/lantern/lantern.h
@@ -37,6 +37,7 @@

#include <stdint.h>
#include <stdio.h>
#include <string>

extern int lanternLogEnabled;
#define LLOG(...) if ((lanternLogEnabled & 1) == 1) { \
@@ -545,6 +546,14 @@ extern "C"
LANTERN_HOST_HANDLER;
}

LANTERN_API void * (LANTERN_PTR _lantern_contrib_torch_sparsemax) (void * input, int dim);
HOST_API void * lantern_contrib_torch_sparsemax (void* input, int dim)
{
void * ret = _lantern_contrib_torch_sparsemax(input, dim);
LANTERN_HOST_HANDLER;
return ret;
}
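  // _lantern_contrib_torch_sparsemax is a function pointer: lanternInit() resolves it
  // from the lantern shared library via LOAD_SYMBOL (see further below), and R-facing
  // code calls it through the lantern_contrib_torch_sparsemax() wrapper above.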

/* Autogen Headers -- Start */
LANTERN_API void* (LANTERN_PTR _lantern__cast_byte_tensor_bool)(void* self, void* non_blocking);
HOST_API void* lantern__cast_byte_tensor_bool(void* self, void* non_blocking) { void* ret = _lantern__cast_byte_tensor_bool(self, non_blocking); LANTERN_HOST_HANDLER return ret; }
@@ -4263,6 +4272,7 @@ bool lanternInit(const std::string &libPath, std::string *pError)
LOAD_SYMBOL(_lantern_Tensor_names);
LOAD_SYMBOL(_lantern_string_new);
LOAD_SYMBOL(_lantern_string_delete);
LOAD_SYMBOL(_lantern_contrib_torch_sparsemax);
/* Autogen Symbols -- Start */
LOAD_SYMBOL(_lantern__cast_byte_tensor_bool)
LOAD_SYMBOL(_lantern__cast_char_tensor_bool)
133 changes: 133 additions & 0 deletions lantern/src/Contrib/Sparsemax.cpp
@@ -0,0 +1,133 @@
#define LANTERN_BUILD
#include "lantern/lantern.h"
#include <torch/torch.h>
#include <string>
#include <iostream>
#include "../utils.hpp"
#include <stdexcept> // std::out_of_range

using namespace torch::autograd;

// Inherit from Function
class SparseMaxFunction : public Function<SparseMaxFunction> {
public:

static torch::Tensor forward(AutogradContext *ctx, torch::Tensor input, int dim) {

auto input_dim = input.dim();
if (input_dim <= dim || dim < -input_dim)
{
throw std::out_of_range("Dimension out of range");
}

bool needs_reshaping = input_dim > 2;
auto original_size = input.sizes().vec();

if (needs_reshaping)
{
      // transpose batch and nth dim
      input = input.transpose(0, dim);

      // capture the transposed shape so the output can be restored after flattening
      original_size = input.sizes().vec();

      // Flatten all dimensions except nth dim
      input = input.reshape({input.size(0), -1});

// Transpose flattened dimensions to 0th dim, nth dim to last dim
input = input.transpose(0, -1);
}

// Translate by max for numerical stability
input = input - std::get<0>(input.max(-1, true)).expand_as(input);

auto zs = std::get<0>(input.sort(-1, true));
auto range = torch::arange(1, input.size(-1) + 1);
range = range.expand_as(input).to(input);

// Determine sparsity of projection
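      // With each row sorted as z_(1) >= ... >= z_(K), coordinate j is in the support
      // iff 1 + j * z_(j) > cumsum(z)_(j); k is the largest such j per row. The output
      // below is then max(0, z - tau), the Euclidean projection of z onto the
      // probability simplex, with tau = (sum of the k largest entries - 1) / k.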
auto bound = 1 + range * zs;
auto is_gt = bound.gt(zs.cumsum(-1)).to(input.dtype());
auto k = std::get<0>((is_gt * range).max(-1, true));

// Compute threshold
auto zs_sparse = is_gt * zs;

// Compute taus
auto taus = (zs_sparse.sum(-1, true) - 1) / k;
taus = taus.expand_as(input);

auto output = torch::max(torch::zeros_like(input), input - taus);

// Save context
ctx->save_for_backward({output});
ctx->saved_data["needs_reshaping"] = needs_reshaping;
ctx->saved_data["dim"] = dim;

if (needs_reshaping)
{
      // Transpose flattened dim to last dim, nth dim to 0th dim
output = output.transpose(0, 1);

// Reshape to original size
output = output.reshape(original_size);

// Swap batch dim and nth dim
output = output.transpose(0, dim);
}

return output;
}

static tensor_list backward(AutogradContext *ctx, tensor_list grad_outputs) {
auto saved = ctx->get_saved_variables();
auto output = saved[0];
auto grad_output = grad_outputs[0];

bool needs_reshaping = ctx->saved_data["needs_reshaping"].toBool();
int dim = ctx->saved_data["dim"].toInt();
auto original_size = grad_output.sizes().vec();

if (needs_reshaping)
{
      // transpose batch and nth dim
      grad_output = grad_output.transpose(0, dim);

      // capture the transposed shape so grad_input can be restored after flattening
      original_size = grad_output.sizes().vec();

      // Flatten all dimensions except nth dim
      grad_output = grad_output.reshape({grad_output.size(0), -1});

// Transpose flattened dimensions to 0th dim, nth dim to last dim
grad_output = grad_output.transpose(0, -1);
}

// Compute gradient
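      // Jacobian-vector product of sparsemax: inside the support (nonzero outputs) the
      // gradient is grad_output minus the mean of grad_output over the support;
      // outside the support it is zero.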
auto nonzeros = torch::ne(output, 0);
auto num_nonzeros = nonzeros.sum(-1, true);
auto sum = (grad_output * nonzeros).sum(-1, true) / num_nonzeros;
auto grad_input = nonzeros * (grad_output - sum.expand_as(grad_output));

if (needs_reshaping)
{
      // Transpose flattened dim to last dim, nth dim to 0th dim
grad_input = grad_input.transpose(0, 1);

// Reshape to original size
grad_input = grad_input.reshape(original_size);

// Swap batch dim and nth dim
grad_input = grad_input.transpose(0, dim);
}

auto o = torch::autograd::variable_list(2);
o[0] = grad_input;
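      // o[1] stays undefined: the second forward argument (`dim`) is an integer,
      // not a tensor, so no gradient flows to it.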

return o;
}
};

void * _lantern_contrib_torch_sparsemax (void * input, int dim)
{
LANTERN_FUNCTION_START
torch::Tensor t = reinterpret_cast<LanternObject<torch::Tensor> *>(input)->get();
torch::Tensor res = SparseMaxFunction::apply(t, dim);
return (void*) new LanternObject<torch::Tensor>(res);
LANTERN_FUNCTION_END
}
19 changes: 19 additions & 0 deletions man/nn_contrib_sparsemax.Rd

Some generated files are not rendered by default.

21 changes: 21 additions & 0 deletions man/nnf_contrib_sparsemax.Rd

Some generated files are not rendered by default.

13 changes: 13 additions & 0 deletions src/RcppExports.cpp
@@ -282,6 +282,18 @@ BEGIN_RCPP
return rcpp_result_gen;
END_RCPP
}
// cpp_contrib_torch_sparsemax
Rcpp::XPtr<XPtrTorchTensor> cpp_contrib_torch_sparsemax(Rcpp::XPtr<XPtrTorchTensor> input, int dim);
RcppExport SEXP _torch_cpp_contrib_torch_sparsemax(SEXP inputSEXP, SEXP dimSEXP) {
BEGIN_RCPP
Rcpp::RObject rcpp_result_gen;
Rcpp::RNGScope rcpp_rngScope_gen;
Rcpp::traits::input_parameter< Rcpp::XPtr<XPtrTorchTensor> >::type input(inputSEXP);
Rcpp::traits::input_parameter< int >::type dim(dimSEXP);
rcpp_result_gen = Rcpp::wrap(cpp_contrib_torch_sparsemax(input, dim));
return rcpp_result_gen;
END_RCPP
}
// cpp_cuda_is_available
bool cpp_cuda_is_available();
RcppExport SEXP _torch_cpp_cuda_is_available() {
@@ -23836,6 +23848,7 @@ static const R_CallMethodDef CallEntries[] = {
{"_torch_cpp_autograd_node_next_edges", (DL_FUNC) &_torch_cpp_autograd_node_next_edges, 1},
{"_torch_cpp_autograd_edge_function", (DL_FUNC) &_torch_cpp_autograd_edge_function, 1},
{"_torch_cpp_autograd_grad", (DL_FUNC) &_torch_cpp_autograd_grad, 6},
{"_torch_cpp_contrib_torch_sparsemax", (DL_FUNC) &_torch_cpp_contrib_torch_sparsemax, 2},
{"_torch_cpp_cuda_is_available", (DL_FUNC) &_torch_cpp_cuda_is_available, 0},
{"_torch_cpp_cuda_device_count", (DL_FUNC) &_torch_cpp_cuda_device_count, 0},
{"_torch_cpp_cuda_current_device", (DL_FUNC) &_torch_cpp_cuda_current_device, 0},
9 changes: 9 additions & 0 deletions src/contrib.cpp
@@ -0,0 +1,9 @@
#include "torch_types.h"
#include "utils.h"

// [[Rcpp::export]]
Rcpp::XPtr<XPtrTorchTensor> cpp_contrib_torch_sparsemax (Rcpp::XPtr<XPtrTorchTensor> input, int dim)
{
XPtrTorchTensor out = lantern_contrib_torch_sparsemax(input->get(), dim);
return make_xptr<XPtrTorchTensor>(out);
}
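// Call chain for the new activation: nnf_contrib_sparsemax() in R calls
// cpp_contrib_torch_sparsemax() (exported here via Rcpp), which calls
// lantern_contrib_torch_sparsemax() declared in lantern/include/lantern/lantern.h,
// which dispatches to _lantern_contrib_torch_sparsemax() implemented in
// lantern/src/Contrib/Sparsemax.cpp.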
