JIT-compiled CUDA backend for Enoki
wjakob committed Jan 23, 2019
1 parent 918453e commit 74fbfaa
Showing 17 changed files with 2,033 additions and 45 deletions.
3 changes: 3 additions & 0 deletions .gitignore
@@ -48,6 +48,9 @@ Release
*_avx512_skx
*_neon
*.ppm
libenoki-cuda.so
libenoki-cuda.dylib
enoki-cuda.dll

# Documentation
/html
3 changes: 3 additions & 0 deletions .gitmodules
@@ -0,0 +1,3 @@
[submodule "ext/cub"]
path = ext/cub
url = https://github.com/NVlabs/cub
18 changes: 17 additions & 1 deletion CMakeLists.txt
@@ -1,5 +1,12 @@
cmake_minimum_required (VERSION 2.8.12)
project(enoki CXX)

option(ENOKI_CUDA "Build Enoki CUDA library?" OFF)

if (ENOKI_CUDA)
project(enoki CXX CUDA)
else()
project(enoki CXX)
endif()

set(ENOKI_MASTER_PROJECT OFF)
if (${CMAKE_CURRENT_SOURCE_DIR} STREQUAL ${CMAKE_SOURCE_DIR})
@@ -224,6 +231,15 @@ if (ENOKI_TEST)
add_subdirectory(tests)
endif()

if (ENOKI_CUDA)
include_directories(ext/cub)
add_library(enoki-cuda SHARED
src/cuda/common.cuh
src/cuda/horiz.cu
src/cuda/trace.cu
)
endif()

# Build the documentation
if (ENOKI_MASTER_PROJECT)
find_package(Sphinx)
1 change: 1 addition & 0 deletions ext/cub
Submodule cub added at c3ccea
1 change: 0 additions & 1 deletion include/enoki/array_avx512.h
@@ -970,7 +970,6 @@ template <bool Approx_, RoundingMode Mode_, bool IsMask_, typename Derived_> str
}

template <size_t Stride, typename Index, typename Mask>

ENOKI_INLINE void scatter_(void *ptr, const Index &index, const Mask &mask) const {
if constexpr (sizeof(scalar_t<Index>) == 4)
_mm512_mask_i32scatter_pd(ptr, mask.k, index.m, m, Stride);
3 changes: 3 additions & 0 deletions include/enoki/array_base.h
@@ -56,6 +56,9 @@ template <typename Value_, typename Derived_> struct ArrayBase {
/// Does this array compute derivatives using automatic differentiation?
static constexpr bool IsDiff = is_diff_array_v<Value_>;

/// Does this array reside on the GPU? (via CUDA)
static constexpr bool IsCUDA = is_cuda_array_v<Value_>;

/// Does this array map operations onto native vector instructions?
static constexpr bool IsNative = false;

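The new IsCUDA flag sits alongside the existing IsDiff and IsNative flags; the matching is_cuda_array_v trait is added in include/enoki/array_traits.h further down. A standalone C++17 sketch (not Enoki code; the names are illustrative) of how such a flag/trait pair lets generic code pick a code path at compile time:

// Standalone C++17 sketch (not Enoki code): arrays advertise a static IsCUDA
// flag, and a trait turns it into something generic code can branch on.
#include <iostream>
#include <type_traits>

struct CPUArray { static constexpr bool IsCUDA = false; };
struct GPUArray { static constexpr bool IsCUDA = true; };

// Fallback: scalars and unrelated types are not CUDA arrays.
template <typename T, typename = void>
struct is_cuda_array : std::false_type { };

// Specialization for types that expose the IsCUDA flag.
template <typename T>
struct is_cuda_array<T, std::void_t<decltype(T::IsCUDA)>>
    : std::bool_constant<T::IsCUDA> { };

template <typename T>
constexpr bool is_cuda_array_v = is_cuda_array<T>::value;

template <typename Array> void process(const Array &) {
    if constexpr (is_cuda_array_v<Array>)
        std::cout << "GPU code path\n";  // e.g. hand the operation to the CUDA backend
    else
        std::cout << "CPU code path\n";  // e.g. loop over SIMD packets
}

int main() {
    process(CPUArray{});  // CPU code path
    process(GPUArray{});  // GPU code path
    process(1.0f);        // CPU code path (plain scalar)
}
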
84 changes: 54 additions & 30 deletions include/enoki/array_call.h
@@ -115,7 +115,7 @@ template <typename Storage_> struct call_support_base {
template <typename Func, typename InputMask,
typename Tuple, size_t ... Indices>
ENOKI_INLINE auto dispatch(Func func, InputMask mask_, Tuple tuple,
std::index_sequence<Indices...>) {
std::index_sequence<Indices...>) const {
Mask mask = Mask(mask_) & neq(self, nullptr);

using FuncResult = decltype(func(
@@ -126,23 +126,46 @@

if constexpr (!std::is_void_v<FuncResult>) {
using Result = typename vectorize_result<Mask, FuncResult>::type;

Result result = zero<Result>(self.size());

while (any(mask)) {
InstancePtr value = extract(self, mask);
Mask active = mask & eq(self, value);
mask = andnot(mask, active);
masked(result, active) = func(value, active, std::get<Indices>(tuple)...);
if constexpr (!is_cuda_array_v<Storage>) {
while (any(mask)) {
InstancePtr value = extract(self, mask);
Mask active = mask & eq(self, value);
mask = andnot(mask, active);
masked(result, active) = func(value, active, std::get<Indices>(tuple)...);
}
} else {
for (auto [value, perm] : partition(self & mask)) {
if (value == nullptr)
continue;

Result temp = func(value, true,
gather<std::decay_t<std::tuple_element_t<Indices, Tuple>>>(
std::get<Indices>(tuple), perm)...);

scatter(result, temp, perm);
}
}

return result;
} else {
while (any(mask)) {
InstancePtr value = extract(self, mask);
Mask active = mask & eq(self, value);
mask = andnot(mask, active);
func(value, active, std::get<Indices>(tuple)...);
if constexpr (!is_cuda_array_v<Storage>) {
while (any(mask)) {
InstancePtr value = extract(self, mask);
Mask active = mask & eq(self, value);
mask = andnot(mask, active);
func(value, active, std::get<Indices>(tuple)...);
}
} else {
for (auto [value, perm] : partition(self & mask)) {
if (value == nullptr)
continue;

func(value, true,
gather<std::decay_t<std::tuple_element_t<Indices, Tuple>>>(
std::get<Indices>(tuple), perm)...);
}
}
}
}
@@ -184,7 +207,7 @@ private: \
decltype(std::declval<InstancePtr>()->func(std::declval<Args>()...)); \
\
public: \
template <typename... Args> auto func(Args&&... args) { \
template <typename... Args> auto func(Args&&... args) const { \
auto lambda = [](InstancePtr instance, const Mask &mask, \
auto &&... a) ENOKI_INLINE_LAMBDA { \
ENOKI_MARK_USED(mask); \
@@ -208,29 +231,30 @@ public: \
} \
}

#define ENOKI_CALL_SUPPORT_GETTER(name, field) \
template < \
typename Field = decltype(Class::field), \
typename Return = replace_scalar_t<Storage, Field, false>> \
Return name(Mask mask = true) const { \
using IntType = replace_scalar_t<Storage, std::uintptr_t, false>; \
auto offset = \
IntType(self) + (std::uintptr_t) &(((Class *) nullptr)->field); \
mask &= neq(self, nullptr); \
return gather<Return, 1>(nullptr, offset, mask); \
}

#define ENOKI_CALL_SUPPORT_GETTER_TYPE(name, field, type) \
template < \
typename Field = decltype(Class::field), \
typename Return = replace_scalar_t<Storage, type, false>> \
Return name(Mask mask = true) const { \
using IntType = replace_scalar_t<Storage, std::uintptr_t, false>; \
auto offset = \
IntType(self) + (std::uintptr_t) &(((Class *) nullptr)->field); \
mask &= neq(self, nullptr); \
return gather<Return, 1>(nullptr, offset, mask); \
if constexpr (!is_cuda_array_v<Storage>) { \
using IntType = replace_scalar_t<Storage, std::uintptr_t, false>; \
auto offset = \
IntType(self) + (std::uintptr_t) &(((Class *) nullptr)->field); \
mask &= neq(self, nullptr); \
return gather<Return, 1>(nullptr, offset, mask); \
} else { \
auto l = [](InstancePtr inst, const Mask &) ENOKI_INLINE_LAMBDA { \
return inst->name(); \
}; \
return Base::dispatch( \
l, mask, std::tie(mask), \
std::make_index_sequence<0>()); \
} \
}

#define ENOKI_CALL_SUPPORT_GETTER(name, field) \
ENOKI_CALL_SUPPORT_GETTER_TYPE(name, field, Field)

#define ENOKI_CALL_SUPPORT_END(PacketType) \
}; \
}
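In the CUDA branches of dispatch() above, the extract/eq loop is replaced by a partition of the pointer array into (instance, permutation) pairs, with arguments gathered and results scattered per group. A standalone C++17 sketch (not Enoki code) of the same strategy on a plain std::vector of pointers:

// Standalone C++17 sketch (not Enoki code) of the partition-based dispatch
// strategy used in the CUDA branch above: instead of repeatedly extracting one
// unique pointer per loop iteration, the pointer array is grouped into
// (instance, index list) pairs and the method is invoked once per group.
#include <cstdio>
#include <map>
#include <vector>

struct Shape { virtual float area() const = 0; virtual ~Shape() = default; };
struct Square : Shape { float area() const override { return 1.f; } };
struct Circle : Shape { float area() const override { return 3.14f; } };

// Group equal pointers; each group carries the positions ("permutation")
// at which that pointer occurred in the original array.
std::map<const Shape *, std::vector<size_t>>
partition(const std::vector<const Shape *> &self) {
    std::map<const Shape *, std::vector<size_t>> groups;
    for (size_t i = 0; i < self.size(); ++i)
        groups[self[i]].push_back(i);
    return groups;
}

int main() {
    Square sq; Circle ci;
    std::vector<const Shape *> self { &sq, &ci, &sq, nullptr, &ci };
    std::vector<float> result(self.size(), 0.f);

    for (const auto &[instance, perm] : partition(self)) {
        if (instance == nullptr)            // masked-out entries contribute nothing
            continue;
        float value = instance->area();     // one virtual call per group
        for (size_t i : perm)               // scatter the result back
            result[i] = value;
    }

    for (float f : result)
        std::printf("%.2f ", f);            // 1.00 3.14 1.00 0.00 3.14
    std::printf("\n");
}
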
5 changes: 4 additions & 1 deletion include/enoki/array_math.h
@@ -127,7 +127,8 @@ ENOKI_INLINE T poly10(const T1 &x, const T2 &c0, const T2 &c1, const T2 &c2,
} else if constexpr (is_recursive_array_v<E>) { \
return E(name(low(x)), name(high(x))); \
} else if constexpr (is_dynamic_array_v<E> && \
!is_diff_array_v<E>) { \
!is_diff_array_v<E> && \
!is_cuda_array_v<E>) { \
E r = empty<E>(x.size()); \
auto pr = r.packet_ptr(); \
auto px = x.packet_ptr(); \
@@ -173,6 +174,7 @@ ENOKI_INLINE T poly10(const T1 &x, const T2 &c0, const T2 &c1, const T2 &c2,
return std::pair<E, E>(E(l.first, h.first), \
E(l.second, h.second)); \
} else if constexpr (is_dynamic_array_v<E> && \
!is_cuda_array_v<E> && \
!is_diff_array_v<E>) { \
std::pair<E, E> r(empty<E>(x.size()), empty<E>(x.size())); \
auto pr0 = r.first.packet_ptr(), \
@@ -225,6 +227,7 @@ ENOKI_INLINE T poly10(const T1 &x, const T2 &c0, const T2 &c1, const T2 &c2,
!std::is_same_v<T2, E>) { \
return name((const E& ) x, (const E &) y); \
} else if constexpr (is_dynamic_array_v<E> && \
!is_cuda_array_v<E> && \
!is_diff_array_v<E>) { \
E r; \
r.resize_like(x, y); \
6 changes: 5 additions & 1 deletion include/enoki/array_router.h
@@ -649,6 +649,10 @@ template <typename T> ENOKI_INLINE auto normalize(const T &v) {
return v * rsqrt<array_approx_v<T>>(squared_norm(v));
}

template <typename T> ENOKI_INLINE auto partition(const T &v) {
return v.partition_();
}

template <typename T1, typename T2,
enable_if_t<array_size_v<T1> == 3 &&
array_size_v<T2> == 3> = 0>
@@ -1172,7 +1176,7 @@
if constexpr (is_diff_array_v<Source>)
Source::set_scatter_gather_source_(source);

Array result = gather<Array, Stride, Packed>(source.data(), args...);
Array result = gather<Array, Stride, Packed>(source.data(), args...);

if constexpr (is_diff_array_v<Source>)
Source::clear_scatter_gather_source_();
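partition() follows the usual array_router.h pattern: the free function forwards to the array's partition_() member, so each backend supplies its own implementation. A minimal standalone C++17 sketch (not Enoki code; DemoArray and its return type are placeholders):

// Standalone C++17 sketch (not Enoki code) of the "router" idiom above: the
// free function partition(v) merely forwards to the array's own partition_()
// member, so each backend (including the new CUDA array) can provide its own
// implementation without changes to generic calling code.
#include <cstdio>
#include <string>

struct DemoArray {
    // Hypothetical backend hook; a real backend would return
    // (instance pointer, permutation) pairs as in array_call.h above.
    std::string partition_() const { return "partitioned by DemoArray"; }
};

template <typename T> auto partition(const T &v) {
    return v.partition_();   // defer to the backend-specific implementation
}

int main() {
    DemoArray a;
    std::printf("%s\n", partition(a).c_str());
}
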
30 changes: 25 additions & 5 deletions include/enoki/array_struct.h
@@ -158,7 +158,7 @@
if constexpr (!is_diff_array_v<T>)
return value;
else
return detach2(value, std::make_index_sequence<Size>());
return detach(value, std::make_index_sequence<Size>());
}

template <typename T2>
@@ -202,6 +202,11 @@
return gather(src, index, mask, std::make_index_sequence<Size>());
}

template <typename Dst, typename Index, typename Mask>
static ENOKI_INLINE void scatter(Dst &dst, const T &value, const Index &index, const Mask &mask) {
scatter(dst, value, index, mask, std::make_index_sequence<Size>());
}

private:
template <typename T2, size_t... Is>
static ENOKI_INLINE decltype(auto) packet(T2 &value, size_t i, std::index_sequence<Is...>) {
@@ -238,11 +243,18 @@
}

template <typename T2, size_t... Is>
static ENOKI_INLINE decltype(auto) detach2(T2 &a, std::index_sequence<Is...>) {
static ENOKI_INLINE decltype(auto) detach(T2 &a, std::index_sequence<Is...>) {
using Value = decltype(enoki::detach(a.coeff(0)));
using Return = typename T::template ReplaceValue<Value>;
return Return(enoki::detach(a.coeff(Is))...);
}

template <typename Dst, typename Index, typename Mask, size_t... Is>
static ENOKI_INLINE void scatter(Dst &src, const T &value, const Index &index,
const Mask &mask, std::index_sequence<Is...>) {
bool unused[] = { (enoki::scatter(src.coeff(Is), value.coeff(Is), index, mask), false) ... };
ENOKI_MARK_USED(unused);
}
};

template <typename T>
@@ -345,14 +357,22 @@ template <typename T> bool ragged(const T &a) {
template <
typename Array, typename Index,
typename Mask = mask_t<replace_scalar_t<Index, scalar_t<Array>>>,
typename Source,
enable_if_t<!std::is_pointer_v<Source> && !std::is_array_v<Source> &&
array_depth_v<Source> != 1> = 0>
typename Source, enable_if_t<(array_depth_v<Source> > 1)> = 0>
ENOKI_INLINE Array gather(const Source &source, const Index &index,
const identity_t<Mask> &mask = true) {
return struct_support_t<Array>::gather(source, index, mask);
}


template <
typename Array, typename Index,
typename Mask = mask_t<replace_scalar_t<Index, scalar_t<Array>>>,
typename Target, enable_if_t<(array_depth_v<Target> > 1)> = 0>
ENOKI_INLINE void scatter(Target &target, const Array &value, const Index &index,
const detail::identity_t<Mask> &mask = true) {
struct_support_t<Array>::scatter(target, value, index, mask);
}

//! @}
// -----------------------------------------------------------------------

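The new struct_support scatter mirrors the existing gather: for a nested array it forwards one scatter per coefficient, reusing the same index and mask. A standalone C++17 sketch (not Enoki code) of component-wise scatter on a structure-of-arrays 3D vector:

// Standalone C++17 sketch (not Enoki code) of the component-wise scatter added
// above: for a nested array (here a 3D vector whose components are dynamically
// sized arrays), scatter() is applied to each component with the same index
// vector, which is what struct_support<T>::scatter does coefficient by
// coefficient.
#include <array>
#include <cstdio>
#include <vector>

using FloatArray = std::vector<float>;
using Vector3fX  = std::array<FloatArray, 3>;   // structure-of-arrays 3D vector
using IndexArray = std::vector<size_t>;

// Flat scatter: dst[index[i]] = value[i]
void scatter(FloatArray &dst, const FloatArray &value, const IndexArray &index) {
    for (size_t i = 0; i < index.size(); ++i)
        dst[index[i]] = value[i];
}

// Nested scatter: forward to the flat scatter once per coefficient.
void scatter(Vector3fX &dst, const Vector3fX &value, const IndexArray &index) {
    for (size_t k = 0; k < 3; ++k)
        scatter(dst[k], value[k], index);
}

int main() {
    Vector3fX target { FloatArray(4, 0.f), FloatArray(4, 0.f), FloatArray(4, 0.f) };
    Vector3fX value  { FloatArray{ 1.f, 2.f }, FloatArray{ 3.f, 4.f }, FloatArray{ 5.f, 6.f } };
    IndexArray index { 3, 1 };

    scatter(target, value, index);   // writes entries 3 and 1 of each component

    for (size_t k = 0; k < 3; ++k)
        std::printf("component %zu: %g %g %g %g\n", k,
                    target[k][0], target[k][1], target[k][2], target[k][3]);
}
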
69 changes: 69 additions & 0 deletions include/enoki/array_traits.h
@@ -224,6 +224,18 @@
template <typename T> constexpr bool is_diff_array_v = is_diff_array<T>::value;
template <typename T> using enable_if_diff_t = enable_if_t<is_diff_array_v<T>>;

/// Does this array reside on the GPU (via CUDA)?
template <typename T, typename = int> struct is_cuda_array {
static constexpr bool value = false;
};

template <typename T> struct is_cuda_array<T, enable_if_array_t<T>> {
static constexpr bool value = std::decay_t<T>::Derived::IsCUDA;
};

template <typename T> constexpr bool is_cuda_array_v = is_cuda_array<T>::value;
template <typename T> using enable_if_cuda_t = enable_if_t<is_cuda_array_v<T>>;

/// Determine the depth of a nested Enoki array (scalars evaluate to zero)
template <typename T, typename = int> struct array_depth {
static constexpr size_t value = 0;
@@ -505,4 +517,61 @@ template <typename T, bool CopyFlags = true> using ssize_array_t = replace_sca

template <typename T> using struct_support_t = struct_support<std::decay_t<T>>;

// -----------------------------------------------------------------------
//! @{ \name Type enumeration
// -----------------------------------------------------------------------

enum EnokiType { Invalid = 0, Int8, UInt8, Int16, UInt16,
Int32, UInt32, Int64, UInt64, Float16,
Float32, Float64, Bool, Pointer };

template <typename T, typename = int> struct enoki_type {
static constexpr EnokiType value = EnokiType::Invalid;
};

template <typename T> struct enoki_type<T, enable_if_t<is_int8_v<T>>> {
static constexpr EnokiType value =
std::is_signed_v<T> ? EnokiType::Int8 : EnokiType::UInt8;
};

template <typename T> struct enoki_type<T, enable_if_t<is_int16_v<T>>> {
static constexpr EnokiType value =
std::is_signed_v<T> ? EnokiType::Int16 : EnokiType::UInt16;
};

template <typename T> struct enoki_type<T, enable_if_t<is_int32_v<T>>> {
static constexpr EnokiType value =
std::is_signed_v<T> ? EnokiType::Int32 : EnokiType::UInt32;
};

template <typename T> struct enoki_type<T, enable_if_t<is_int64_v<T>>> {
static constexpr EnokiType value =
std::is_signed_v<T> ? EnokiType::Int64 : EnokiType::UInt64;
};

template <> struct enoki_type<half> {
static constexpr EnokiType value = EnokiType::Float16;
};

template <> struct enoki_type<float> {
static constexpr EnokiType value = EnokiType::Float32;
};

template <> struct enoki_type<double> {
static constexpr EnokiType value = EnokiType::Float64;
};

template <> struct enoki_type<bool> {
static constexpr EnokiType value = EnokiType::Bool;
};

template <typename T> struct enoki_type<T *> {
static constexpr EnokiType value = EnokiType::Pointer;
};

template <typename T> constexpr EnokiType enoki_type_v = enoki_type<T>::value;

//! @}
// -----------------------------------------------------------------------

NAMESPACE_END(enoki)
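
The EnokiType enumeration and the enoki_type<T> trait assign a runtime tag to each supported scalar type, the kind of information a JIT backend needs when emitting code for dynamically typed arrays. A standalone C++17 sketch (not Enoki code) of such a type-to-tag mapping:

// Standalone C++17 sketch (not Enoki code) of a compile-time mapping from C++
// scalar types to runtime type tags, as provided by enoki_type<T> above.
#include <cstdint>
#include <cstdio>

enum class TypeTag { Invalid, Int32, UInt32, Float32, Float64, Pointer };

// Primary template: unknown types map to Invalid.
template <typename T> struct type_tag { static constexpr TypeTag value = TypeTag::Invalid; };

template <> struct type_tag<std::int32_t>  { static constexpr TypeTag value = TypeTag::Int32; };
template <> struct type_tag<std::uint32_t> { static constexpr TypeTag value = TypeTag::UInt32; };
template <> struct type_tag<float>         { static constexpr TypeTag value = TypeTag::Float32; };
template <> struct type_tag<double>        { static constexpr TypeTag value = TypeTag::Float64; };
template <typename T> struct type_tag<T *> { static constexpr TypeTag value = TypeTag::Pointer; };

template <typename T> constexpr TypeTag type_tag_v = type_tag<T>::value;

static_assert(type_tag_v<float> == TypeTag::Float32);
static_assert(type_tag_v<int *> == TypeTag::Pointer);
static_assert(type_tag_v<char>  == TypeTag::Invalid);

int main() {
    std::printf("tag of double: %d\n", static_cast<int>(type_tag_v<double>));
}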
