JIT-compiled CUDA backend for Enoki
wjakob committed Jan 23, 2019
1 parent 918453e commit 74fbfaa
Showing 17 changed files with 2,033 additions and 45 deletions.
3 changes: 3 additions & 0 deletions .gitignore
@@ -48,6 +48,9 @@ Release
*_avx512_skx
*_neon
*.ppm
libenoki-cuda.so
libenoki-cuda.dylib
enoki-cuda.dll

# Documentation
/html
3 changes: 3 additions & 0 deletions .gitmodules
@@ -0,0 +1,3 @@
[submodule "ext/cub"]
path = ext/cub
url = https://github.com/NVlabs/cub
18 changes: 17 additions & 1 deletion CMakeLists.txt
@@ -1,5 +1,12 @@
cmake_minimum_required (VERSION 2.8.12)
project(enoki CXX)

option(ENOKI_CUDA "Build Enoki CUDA library?" OFF)

if (ENOKI_CUDA)
project(enoki CXX CUDA)
else()
project(enoki CXX)
endif()

set(ENOKI_MASTER_PROJECT OFF)
if (${CMAKE_CURRENT_SOURCE_DIR} STREQUAL ${CMAKE_SOURCE_DIR})
@@ -224,6 +231,15 @@ if (ENOKI_TEST)
add_subdirectory(tests)
endif()

if (ENOKI_CUDA)
include_directories(ext/cub)
add_library(enoki-cuda SHARED
src/cuda/common.cuh
src/cuda/horiz.cu
src/cuda/trace.cu
)
endif()

# Build the documentation
if (ENOKI_MASTER_PROJECT)
find_package(Sphinx)
1 change: 1 addition & 0 deletions ext/cub
Submodule cub added at c3ccea
1 change: 0 additions & 1 deletion include/enoki/array_avx512.h
@@ -970,7 +970,6 @@ template <bool Approx_, RoundingMode Mode_, bool IsMask_, typename Derived_> str
}

template <size_t Stride, typename Index, typename Mask>

ENOKI_INLINE void scatter_(void *ptr, const Index &index, const Mask &mask) const {
if constexpr (sizeof(scalar_t<Index>) == 4)
_mm512_mask_i32scatter_pd(ptr, mask.k, index.m, m, Stride);
3 changes: 3 additions & 0 deletions include/enoki/array_base.h
@@ -56,6 +56,9 @@ template <typename Value_, typename Derived_> struct ArrayBase {
/// Does this array compute derivatives using automatic differentiation?
static constexpr bool IsDiff = is_diff_array_v<Value_>;

/// Does this array reside on the GPU? (via CUDA)
static constexpr bool IsCUDA = is_cuda_array_v<Value_>;

/// Does this array map operations onto native vector instructions?
static constexpr bool IsNative = false;

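The new IsCUDA flag sits alongside the existing IsDiff and IsNative flags; the matching is_cuda_array_v trait is added in include/enoki/array_traits.h further down. A standalone C++17 sketch (not Enoki code; the names are illustrative) of how such a flag/trait pair lets generic code pick a code path at compile time:

// Standalone C++17 sketch (not Enoki code): arrays advertise a static IsCUDA
// flag, and a trait turns it into something generic code can branch on.
#include <iostream>
#include <type_traits>

struct CPUArray { static constexpr bool IsCUDA = false; };
struct GPUArray { static constexpr bool IsCUDA = true; };

// Fallback: scalars and unrelated types are not CUDA arrays.
template <typename T, typename = void>
struct is_cuda_array : std::false_type { };

// Specialization for types that expose the IsCUDA flag.
template <typename T>
struct is_cuda_array<T, std::void_t<decltype(T::IsCUDA)>>
    : std::bool_constant<T::IsCUDA> { };

template <typename T>
constexpr bool is_cuda_array_v = is_cuda_array<T>::value;

template <typename Array> void process(const Array &) {
    if constexpr (is_cuda_array_v<Array>)
        std::cout << "GPU code path\n";  // e.g. hand the operation to the CUDA backend
    else
        std::cout << "CPU code path\n";  // e.g. loop over SIMD packets
}

int main() {
    process(CPUArray{});  // CPU code path
    process(GPUArray{});  // GPU code path
    process(1.0f);        // CPU code path (plain scalar)
}
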
84 changes: 54 additions & 30 deletions include/enoki/array_call.h
@@ -115,7 +115,7 @@ template <typename Storage_> struct call_support_base {
template <typename Func, typename InputMask,
typename Tuple, size_t ... Indices>
ENOKI_INLINE auto dispatch(Func func, InputMask mask_, Tuple tuple,
std::index_sequence<Indices...>) {
std::index_sequence<Indices...>) const {
Mask mask = Mask(mask_) & neq(self, nullptr);

using FuncResult = decltype(func(
@@ -126,23 +126,46 @@

if constexpr (!std::is_void_v<FuncResult>) {
using Result = typename vectorize_result<Mask, FuncResult>::type;

Result result = zero<Result>(self.size());

while (any(mask)) {
InstancePtr value = extract(self, mask);
Mask active = mask & eq(self, value);
mask = andnot(mask, active);
masked(result, active) = func(value, active, std::get<Indices>(tuple)...);
if constexpr (!is_cuda_array_v<Storage>) {
while (any(mask)) {
InstancePtr value = extract(self, mask);
Mask active = mask & eq(self, value);
mask = andnot(mask, active);
masked(result, active) = func(value, active, std::get<Indices>(tuple)...);
}
} else {
for (auto [value, perm] : partition(self & mask)) {
if (value == nullptr)
continue;

Result temp = func(value, true,
gather<std::decay_t<std::tuple_element_t<Indices, Tuple>>>(
std::get<Indices>(tuple), perm)...);

scatter(result, temp, perm);
}
}

return result;
} else {
while (any(mask)) {
InstancePtr value = extract(self, mask);
Mask active = mask & eq(self, value);
mask = andnot(mask, active);
func(value, active, std::get<Indices>(tuple)...);
if constexpr (!is_cuda_array_v<Storage>) {
while (any(mask)) {
InstancePtr value = extract(self, mask);
Mask active = mask & eq(self, value);
mask = andnot(mask, active);
func(value, active, std::get<Indices>(tuple)...);
}
} else {
for (auto [value, perm] : partition(self & mask)) {
if (value == nullptr)
continue;

func(value, true,
gather<std::decay_t<std::tuple_element_t<Indices, Tuple>>>(
std::get<Indices>(tuple), perm)...);
}
}
}
}
@@ -184,7 +207,7 @@ private: \
decltype(std::declval<InstancePtr>()->func(std::declval<Args>()...)); \
\
public: \
template <typename... Args> auto func(Args&&... args) { \
template <typename... Args> auto func(Args&&... args) const { \
auto lambda = [](InstancePtr instance, const Mask &mask, \
auto &&... a) ENOKI_INLINE_LAMBDA { \
ENOKI_MARK_USED(mask); \
@@ -208,29 +231,30 @@ public: \
} \
}

#define ENOKI_CALL_SUPPORT_GETTER(name, field) \
template < \
typename Field = decltype(Class::field), \
typename Return = replace_scalar_t<Storage, Field, false>> \
Return name(Mask mask = true) const { \
using IntType = replace_scalar_t<Storage, std::uintptr_t, false>; \
auto offset = \
IntType(self) + (std::uintptr_t) &(((Class *) nullptr)->field); \
mask &= neq(self, nullptr); \
return gather<Return, 1>(nullptr, offset, mask); \
}

#define ENOKI_CALL_SUPPORT_GETTER_TYPE(name, field, type) \
template < \
typename Field = decltype(Class::field), \
typename Return = replace_scalar_t<Storage, type, false>> \
Return name(Mask mask = true) const { \
using IntType = replace_scalar_t<Storage, std::uintptr_t, false>; \
auto offset = \
IntType(self) + (std::uintptr_t) &(((Class *) nullptr)->field); \
mask &= neq(self, nullptr); \
return gather<Return, 1>(nullptr, offset, mask); \
if constexpr (!is_cuda_array_v<Storage>) { \
using IntType = replace_scalar_t<Storage, std::uintptr_t, false>; \
auto offset = \
IntType(self) + (std::uintptr_t) &(((Class *) nullptr)->field); \
mask &= neq(self, nullptr); \
return gather<Return, 1>(nullptr, offset, mask); \
} else { \
auto l = [](InstancePtr inst, const Mask &) ENOKI_INLINE_LAMBDA { \
return inst->name(); \
}; \
return Base::dispatch( \
l, mask, std::tie(mask), \
std::make_index_sequence<0>()); \
} \
}

#define ENOKI_CALL_SUPPORT_GETTER(name, field) \
ENOKI_CALL_SUPPORT_GETTER_TYPE(name, field, Field)

#define ENOKI_CALL_SUPPORT_END(PacketType) \
}; \
}
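In the CUDA branches of dispatch() above, the extract/eq loop is replaced by a partition of the pointer array into (instance, permutation) pairs, with arguments gathered and results scattered per group. A standalone C++17 sketch (not Enoki code) of the same strategy on a plain std::vector of pointers:

// Standalone C++17 sketch (not Enoki code) of the partition-based dispatch
// strategy used in the CUDA branch above: instead of repeatedly extracting one
// unique pointer per loop iteration, the pointer array is grouped into
// (instance, index list) pairs and the method is invoked once per group.
#include <cstdio>
#include <map>
#include <vector>

struct Shape { virtual float area() const = 0; virtual ~Shape() = default; };
struct Square : Shape { float area() const override { return 1.f; } };
struct Circle : Shape { float area() const override { return 3.14f; } };

// Group equal pointers; each group carries the positions ("permutation")
// at which that pointer occurred in the original array.
std::map<const Shape *, std::vector<size_t>>
partition(const std::vector<const Shape *> &self) {
    std::map<const Shape *, std::vector<size_t>> groups;
    for (size_t i = 0; i < self.size(); ++i)
        groups[self[i]].push_back(i);
    return groups;
}

int main() {
    Square sq; Circle ci;
    std::vector<const Shape *> self { &sq, &ci, &sq, nullptr, &ci };
    std::vector<float> result(self.size(), 0.f);

    for (const auto &[instance, perm] : partition(self)) {
        if (instance == nullptr)            // masked-out entries contribute nothing
            continue;
        float value = instance->area();     // one virtual call per group
        for (size_t i : perm)               // scatter the result back
            result[i] = value;
    }

    for (float f : result)
        std::printf("%.2f ", f);            // 1.00 3.14 1.00 0.00 3.14
    std::printf("\n");
}
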
5 changes: 4 additions & 1 deletion include/enoki/array_math.h
@@ -127,7 +127,8 @@ ENOKI_INLINE T poly10(const T1 &x, const T2 &c0, const T2 &c1, const T2 &c2,
} else if constexpr (is_recursive_array_v<E>) { \
return E(name(low(x)), name(high(x))); \
} else if constexpr (is_dynamic_array_v<E> && \
!is_diff_array_v<E>) { \
!is_diff_array_v<E> && \
!is_cuda_array_v<E>) { \
E r = empty<E>(x.size()); \
auto pr = r.packet_ptr(); \
auto px = x.packet_ptr(); \
@@ -173,6 +174,7 @@ ENOKI_INLINE T poly10(const T1 &x, const T2 &c0, const T2 &c1, const T2 &c2,
return std::pair<E, E>(E(l.first, h.first), \
E(l.second, h.second)); \
} else if constexpr (is_dynamic_array_v<E> && \
!is_cuda_array_v<E> && \
!is_diff_array_v<E>) { \
std::pair<E, E> r(empty<E>(x.size()), empty<E>(x.size())); \
auto pr0 = r.first.packet_ptr(), \
@@ -225,6 +227,7 @@ ENOKI_INLINE T poly10(const T1 &x, const T2 &c0, const T2 &c1, const T2 &c2,
!std::is_same_v<T2, E>) { \
return name((const E& ) x, (const E &) y); \
} else if constexpr (is_dynamic_array_v<E> && \
!is_cuda_array_v<E> && \
!is_diff_array_v<E>) { \
E r; \
r.resize_like(x, y); \
6 changes: 5 additions & 1 deletion include/enoki/array_router.h
@@ -649,6 +649,10 @@ template <typename T> ENOKI_INLINE auto normalize(const T &v) {
return v * rsqrt<array_approx_v<T>>(squared_norm(v));
}

template <typename T> ENOKI_INLINE auto partition(const T &v) {
return v.partition_();
}

template <typename T1, typename T2,
enable_if_t<array_size_v<T1> == 3 &&
array_size_v<T2> == 3> = 0>
@@ -1172,7 +1176,7 @@
if constexpr (is_diff_array_v<Source>)
Source::set_scatter_gather_source_(source);

Array result = gather<Array, Stride, Packed>(source.data(), args...);
Array result = gather<Array, Stride, Packed>(source.data(), args...);

if constexpr (is_diff_array_v<Source>)
Source::clear_scatter_gather_source_();
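partition() follows the usual array_router.h pattern: the free function forwards to the array's partition_() member, so each backend supplies its own implementation. A minimal standalone C++17 sketch (not Enoki code; DemoArray and its return type are placeholders):

// Standalone C++17 sketch (not Enoki code) of the "router" idiom above: the
// free function partition(v) merely forwards to the array's own partition_()
// member, so each backend (including the new CUDA array) can provide its own
// implementation without changes to generic calling code.
#include <cstdio>
#include <string>

struct DemoArray {
    // Hypothetical backend hook; a real backend would return
    // (instance pointer, permutation) pairs as in array_call.h above.
    std::string partition_() const { return "partitioned by DemoArray"; }
};

template <typename T> auto partition(const T &v) {
    return v.partition_();   // defer to the backend-specific implementation
}

int main() {
    DemoArray a;
    std::printf("%s\n", partition(a).c_str());
}
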
30 changes: 25 additions & 5 deletions include/enoki/array_struct.h
@@ -158,7 +158,7 @@
if constexpr (!is_diff_array_v<T>)
return value;
else
return detach2(value, std::make_index_sequence<Size>());
return detach(value, std::make_index_sequence<Size>());
}

template <typename T2>
@@ -202,6 +202,11 @@
return gather(src, index, mask, std::make_index_sequence<Size>());
}

template <typename Dst, typename Index, typename Mask>
static ENOKI_INLINE void scatter(Dst &dst, const T &value, const Index &index, const Mask &mask) {
scatter(dst, value, index, mask, std::make_index_sequence<Size>());
}

private:
template <typename T2, size_t... Is>
static ENOKI_INLINE decltype(auto) packet(T2 &value, size_t i, std::index_sequence<Is...>) {
@@ -238,11 +243,18 @@
}

template <typename T2, size_t... Is>
static ENOKI_INLINE decltype(auto) detach2(T2 &a, std::index_sequence<Is...>) {
static ENOKI_INLINE decltype(auto) detach(T2 &a, std::index_sequence<Is...>) {
using Value = decltype(enoki::detach(a.coeff(0)));
using Return = typename T::template ReplaceValue<Value>;
return Return(enoki::detach(a.coeff(Is))...);
}

template <typename Dst, typename Index, typename Mask, size_t... Is>
static ENOKI_INLINE void scatter(Dst &src, const T &value, const Index &index,
const Mask &mask, std::index_sequence<Is...>) {
bool unused[] = { (enoki::scatter(src.coeff(Is), value.coeff(Is), index, mask), false) ... };
ENOKI_MARK_USED(unused);
}
};

template <typename T>
@@ -345,14 +357,22 @@ template <typename T> bool ragged(const T &a) {
template <
typename Array, typename Index,
typename Mask = mask_t<replace_scalar_t<Index, scalar_t<Array>>>,
typename Source,
enable_if_t<!std::is_pointer_v<Source> && !std::is_array_v<Source> &&
array_depth_v<Source> != 1> = 0>
typename Source, enable_if_t<(array_depth_v<Source> > 1)> = 0>
ENOKI_INLINE Array gather(const Source &source, const Index &index,
const identity_t<Mask> &mask = true) {
return struct_support_t<Array>::gather(source, index, mask);
}


template <
typename Array, typename Index,
typename Mask = mask_t<replace_scalar_t<Index, scalar_t<Array>>>,
typename Target, enable_if_t<(array_depth_v<Target> > 1)> = 0>
ENOKI_INLINE void scatter(Target &target, const Array &value, const Index &index,
const detail::identity_t<Mask> &mask = true) {
struct_support_t<Array>::scatter(target, value, index, mask);
}

//! @}
// -----------------------------------------------------------------------

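The new struct_support scatter mirrors the existing gather: for a nested array it forwards one scatter per coefficient, reusing the same index and mask. A standalone C++17 sketch (not Enoki code) of component-wise scatter on a structure-of-arrays 3D vector:

// Standalone C++17 sketch (not Enoki code) of the component-wise scatter added
// above: for a nested array (here a 3D vector whose components are dynamically
// sized arrays), scatter() is applied to each component with the same index
// vector, which is what struct_support<T>::scatter does coefficient by
// coefficient.
#include <array>
#include <cstdio>
#include <vector>

using FloatArray = std::vector<float>;
using Vector3fX  = std::array<FloatArray, 3>;   // structure-of-arrays 3D vector
using IndexArray = std::vector<size_t>;

// Flat scatter: dst[index[i]] = value[i]
void scatter(FloatArray &dst, const FloatArray &value, const IndexArray &index) {
    for (size_t i = 0; i < index.size(); ++i)
        dst[index[i]] = value[i];
}

// Nested scatter: forward to the flat scatter once per coefficient.
void scatter(Vector3fX &dst, const Vector3fX &value, const IndexArray &index) {
    for (size_t k = 0; k < 3; ++k)
        scatter(dst[k], value[k], index);
}

int main() {
    Vector3fX target { FloatArray(4, 0.f), FloatArray(4, 0.f), FloatArray(4, 0.f) };
    Vector3fX value  { FloatArray{ 1.f, 2.f }, FloatArray{ 3.f, 4.f }, FloatArray{ 5.f, 6.f } };
    IndexArray index { 3, 1 };

    scatter(target, value, index);   // writes entries 3 and 1 of each component

    for (size_t k = 0; k < 3; ++k)
        std::printf("component %zu: %g %g %g %g\n", k,
                    target[k][0], target[k][1], target[k][2], target[k][3]);
}
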
69 changes: 69 additions & 0 deletions include/enoki/array_traits.h
@@ -224,6 +224,18 @@
template <typename T> constexpr bool is_diff_array_v = is_diff_array<T>::value;
template <typename T> using enable_if_diff_t = enable_if_t<is_diff_array_v<T>>;

/// Does this array reside on the GPU (via CUDA)?
template <typename T, typename = int> struct is_cuda_array {
static constexpr bool value = false;
};

template <typename T> struct is_cuda_array<T, enable_if_array_t<T>> {
static constexpr bool value = std::decay_t<T>::Derived::IsCUDA;
};

template <typename T> constexpr bool is_cuda_array_v = is_cuda_array<T>::value;
template <typename T> using enable_if_cuda_t = enable_if_t<is_cuda_array_v<T>>;

/// Determine the depth of a nested Enoki array (scalars evaluate to zero)
template <typename T, typename = int> struct array_depth {
static constexpr size_t value = 0;
@@ -505,4 +517,61 @@ template <typename T, bool CopyFlags = true> using ssize_array_t = replace_sca

template <typename T> using struct_support_t = struct_support<std::decay_t<T>>;

// -----------------------------------------------------------------------
//! @{ \name Type enumeration
// -----------------------------------------------------------------------

enum EnokiType { Invalid = 0, Int8, UInt8, Int16, UInt16,
Int32, UInt32, Int64, UInt64, Float16,
Float32, Float64, Bool, Pointer };

template <typename T, typename = int> struct enoki_type {
static constexpr EnokiType value = EnokiType::Invalid;
};

template <typename T> struct enoki_type<T, enable_if_t<is_int8_v<T>>> {
static constexpr EnokiType value =
std::is_signed_v<T> ? EnokiType::Int8 : EnokiType::UInt8;
};

template <typename T> struct enoki_type<T, enable_if_t<is_int16_v<T>>> {
static constexpr EnokiType value =
std::is_signed_v<T> ? EnokiType::Int16 : EnokiType::UInt16;
};

template <typename T> struct enoki_type<T, enable_if_t<is_int32_v<T>>> {
static constexpr EnokiType value =
std::is_signed_v<T> ? EnokiType::Int32 : EnokiType::UInt32;
};

template <typename T> struct enoki_type<T, enable_if_t<is_int64_v<T>>> {
static constexpr EnokiType value =
std::is_signed_v<T> ? EnokiType::Int64 : EnokiType::UInt64;
};

template <> struct enoki_type<half> {
static constexpr EnokiType value = EnokiType::Float16;
};

template <> struct enoki_type<float> {
static constexpr EnokiType value = EnokiType::Float32;
};

template <> struct enoki_type<double> {
static constexpr EnokiType value = EnokiType::Float64;
};

template <> struct enoki_type<bool> {
static constexpr EnokiType value = EnokiType::Bool;
};

template <typename T> struct enoki_type<T *> {
static constexpr EnokiType value = EnokiType::Pointer;
};

template <typename T> constexpr EnokiType enoki_type_v = enoki_type<T>::value;

//! @}
// -----------------------------------------------------------------------

NAMESPACE_END(enoki)
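
The EnokiType enumeration and the enoki_type<T> trait assign a runtime tag to each supported scalar type, the kind of information a JIT backend needs when emitting code for dynamically typed arrays. A standalone C++17 sketch (not Enoki code) of such a type-to-tag mapping:

// Standalone C++17 sketch (not Enoki code) of a compile-time mapping from C++
// scalar types to runtime type tags, as provided by enoki_type<T> above.
#include <cstdint>
#include <cstdio>

enum class TypeTag { Invalid, Int32, UInt32, Float32, Float64, Pointer };

// Primary template: unknown types map to Invalid.
template <typename T> struct type_tag { static constexpr TypeTag value = TypeTag::Invalid; };

template <> struct type_tag<std::int32_t>  { static constexpr TypeTag value = TypeTag::Int32; };
template <> struct type_tag<std::uint32_t> { static constexpr TypeTag value = TypeTag::UInt32; };
template <> struct type_tag<float>         { static constexpr TypeTag value = TypeTag::Float32; };
template <> struct type_tag<double>        { static constexpr TypeTag value = TypeTag::Float64; };
template <typename T> struct type_tag<T *> { static constexpr TypeTag value = TypeTag::Pointer; };

template <typename T> constexpr TypeTag type_tag_v = type_tag<T>::value;

static_assert(type_tag_v<float> == TypeTag::Float32);
static_assert(type_tag_v<int *> == TypeTag::Pointer);
static_assert(type_tag_v<char>  == TypeTag::Invalid);

int main() {
    std::printf("tag of double: %d\n", static_cast<int>(type_tag_v<double>));
}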
