Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Added CUDA 12.4+ support #3744

Merged
merged 8 commits into from
May 30, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
18 changes: 2 additions & 16 deletions modules/cudaarithm/src/cuda/polar_cart.cu
Original file line number Diff line number Diff line change
Expand Up @@ -133,23 +133,9 @@ void cv::cuda::cartToPolar(InputArray _x, InputArray _y, OutputArray _mag, Outpu
GpuMat_<float> anglec(angle.reshape(1));

if (angleInDegrees)
{
gridTransformTuple(zipPtr(xc, yc),
tie(magc, anglec),
make_tuple(
binaryTupleAdapter<0, 1>(magnitude_func<float>()),
binaryTupleAdapter<0, 1>(direction_func<float, true>())),
stream);
}
gridTransformBinary(xc, yc, magc, anglec, magnitude_func<float>(), direction_func<float, true>(), stream);
else
{
gridTransformTuple(zipPtr(xc, yc),
tie(magc, anglec),
make_tuple(
binaryTupleAdapter<0, 1>(magnitude_func<float>()),
binaryTupleAdapter<0, 1>(direction_func<float, false>())),
stream);
}
gridTransformBinary(xc, yc, magc, anglec, magnitude_func<float>(), direction_func<float, false>(), stream);

syncOutput(mag, _mag, stream);
syncOutput(angle, _angle, stream);
Expand Down
9 changes: 6 additions & 3 deletions modules/cudaarithm/src/cuda/split_merge.cu
Original file line number Diff line number Diff line change
Expand Up @@ -67,7 +67,8 @@ namespace
{
static void call(const GpuMat* src, GpuMat& dst, Stream& stream)
{
gridMerge(zipPtr(globPtr<T>(src[0]), globPtr<T>(src[1])),
const std::array<GlobPtrSz<T>, 2> d_src = {globPtr<T>(src[0]), globPtr<T>(src[1])};
gridMerge(d_src,
globPtr<typename MakeVec<T, 2>::type>(dst),
stream);
}
Expand All @@ -77,7 +78,8 @@ namespace
{
static void call(const GpuMat* src, GpuMat& dst, Stream& stream)
{
gridMerge(zipPtr(globPtr<T>(src[0]), globPtr<T>(src[1]), globPtr<T>(src[2])),
const std::array<GlobPtrSz<T>, 3> d_src = {globPtr<T>(src[0]), globPtr<T>(src[1]), globPtr<T>(src[2])};
gridMerge(d_src,
globPtr<typename MakeVec<T, 3>::type>(dst),
stream);
}
Expand All @@ -87,7 +89,8 @@ namespace
{
static void call(const GpuMat* src, GpuMat& dst, Stream& stream)
{
gridMerge(zipPtr(globPtr<T>(src[0]), globPtr<T>(src[1]), globPtr<T>(src[2]), globPtr<T>(src[3])),
const std::array<GlobPtrSz<T>, 4 > d_src = {globPtr<T>(src[0]), globPtr<T>(src[1]), globPtr<T>(src[2]), globPtr<T>(src[3])};
gridMerge(d_src,
globPtr<typename MakeVec<T, 4>::type>(dst),
stream);
}
Expand Down
55 changes: 46 additions & 9 deletions modules/cudev/include/opencv2/cudev/block/detail/reduce.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -154,6 +154,17 @@ namespace block_reduce_detail
val = smem[tid];
}


// merge

template <typename T, class Op>
__device__ __forceinline__ void merge(volatile T* smem, T& val, uint tid, uint delta, const Op& op)
{
T reg = smem[tid + delta];
smem[tid] = val = op(val, reg);
}

#if (CUDART_VERSION < 12040)
template <typename P0, typename P1, typename P2, typename P3, typename P4, typename P5, typename P6, typename P7, typename P8, typename P9,
typename R0, typename R1, typename R2, typename R3, typename R4, typename R5, typename R6, typename R7, typename R8, typename R9>
__device__ __forceinline__ void loadToSmem(const tuple<P0, P1, P2, P3, P4, P5, P6, P7, P8, P9>& smem,
Expand All @@ -172,15 +183,6 @@ namespace block_reduce_detail
For<0, tuple_size<tuple<P0, P1, P2, P3, P4, P5, P6, P7, P8, P9> >::value>::loadFromSmem(smem, val, tid);
}

// merge

template <typename T, class Op>
__device__ __forceinline__ void merge(volatile T* smem, T& val, uint tid, uint delta, const Op& op)
{
T reg = smem[tid + delta];
smem[tid] = val = op(val, reg);
}

template <typename P0, typename P1, typename P2, typename P3, typename P4, typename P5, typename P6, typename P7, typename P8, typename P9,
typename R0, typename R1, typename R2, typename R3, typename R4, typename R5, typename R6, typename R7, typename R8, typename R9,
class Op0, class Op1, class Op2, class Op3, class Op4, class Op5, class Op6, class Op7, class Op8, class Op9>
Expand Down Expand Up @@ -214,6 +216,41 @@ namespace block_reduce_detail
}
#endif

#else
template <typename... P, typename... R>
__device__ __forceinline__ void loadToSmem(const tuple<P...>& smem, const tuple<R...>& val, uint tid)
{
For<0, tuple_size<tuple<P...> >::value>::loadToSmem(smem, val, tid);
}

template <typename... P, typename... R>
__device__ __forceinline__ void loadFromSmem(const tuple<P...>& smem, const tuple<R...>& val, uint tid)
{
For<0, tuple_size<tuple<P...> >::value>::loadFromSmem(smem, val, tid);
}

template <typename... P, typename... R, class... Op>
__device__ __forceinline__ void merge(const tuple<P...>& smem, const tuple<R...>& val, uint tid, uint delta, const tuple<Op...>& op)
{
For<0, tuple_size<tuple<P...> >::value>::merge(smem, val, tid, delta, op);
}

// mergeShfl

template <typename T, class Op>
__device__ __forceinline__ void mergeShfl(T& val, uint delta, uint width, const Op& op)
{
T reg = shfl_down(val, delta, width);
val = op(val, reg);
}

template <typename... R, class... Op>
__device__ __forceinline__ void mergeShfl(const tuple<R...>& val, uint delta, uint width, const tuple<Op...>& op)
{
For<0, tuple_size<tuple<R...> >::value>::mergeShfl(val, delta, width, op);
}
#endif

// Generic

template <int N> struct Generic
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -160,6 +160,7 @@ namespace block_reduce_key_val_detail
data = smem[tid];
}

#if (CUDART_VERSION < 12040)
template <typename VP0, typename VP1, typename VP2, typename VP3, typename VP4, typename VP5, typename VP6, typename VP7, typename VP8, typename VP9,
typename VR0, typename VR1, typename VR2, typename VR3, typename VR4, typename VR5, typename VR6, typename VR7, typename VR8, typename VR9>
__device__ __forceinline__ void loadToSmem(const tuple<VP0, VP1, VP2, VP3, VP4, VP5, VP6, VP7, VP8, VP9>& smem,
Expand Down Expand Up @@ -241,6 +242,67 @@ namespace block_reduce_key_val_detail
{
For<0, tuple_size<tuple<VP0, VP1, VP2, VP3, VP4, VP5, VP6, VP7, VP8, VP9> >::value>::merge(skeys, key, svals, val, cmp, tid, delta);
}
#else
template <typename... VP, typename... VR>
__device__ __forceinline__ void loadToSmem(const tuple<VP...>& smem, const tuple<VR...>& data, uint tid)
{
For<0, tuple_size<tuple<VP...> >::value>::loadToSmem(smem, data, tid);
}

template <typename... VP, typename... VR>
__device__ __forceinline__ void loadFromSmem(const tuple<VP...>& smem, const tuple<VR...>& data, uint tid)
{
For<0, tuple_size<tuple<VP...> >::value>::loadFromSmem(smem, data, tid);
}

// copyVals

template <typename V>
__device__ __forceinline__ void copyVals(volatile V* svals, V& val, uint tid, uint delta)
{
svals[tid] = val = svals[tid + delta];
}

template <typename... VP, typename... VR>
__device__ __forceinline__ void copyVals(const tuple<VP...>& svals, const tuple<VR...>& val, uint tid, uint delta)
{
For<0, tuple_size<tuple<VP...> >::value>::copy(svals, val, tid, delta);
}

// merge

template <typename K, typename V, class Cmp>
__device__ void merge(volatile K* skeys, K& key, volatile V* svals, V& val, const Cmp& cmp, uint tid, uint delta)
{
K reg = skeys[tid + delta];

if (cmp(reg, key))
{
skeys[tid] = key = reg;
copyVals(svals, val, tid, delta);
}
}

template <typename K, typename... VP, typename... VR, class Cmp>
__device__ void merge(volatile K* skeys, K& key, const tuple<VP...>& svals, const tuple<VR...>& val, const Cmp& cmp, uint tid, uint delta)
{
K reg = skeys[tid + delta];

if (cmp(reg, key))
{
skeys[tid] = key = reg;
copyVals(svals, val, tid, delta);
}
}

template <typename... KP, typename... KR, typename... VP, typename... VR, class... Cmp>
__device__ __forceinline__ void merge(const tuple<KP...>& skeys, const tuple<KR...>& key,
const tuple<VP...>& svals, const tuple<VR...>& val,
const tuple<Cmp...>& cmp, uint tid, uint delta)
{
For<0, tuple_size<tuple<VP...> >::value>::merge(skeys, key, svals, val, cmp, tid, delta);
}
#endif

// Generic

Expand Down
35 changes: 35 additions & 0 deletions modules/cudev/include/opencv2/cudev/block/reduce.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -51,6 +51,7 @@
#include "../warp/reduce.hpp"
#include "detail/reduce.hpp"
#include "detail/reduce_key_val.hpp"
#include <cuda_runtime_api.h>

namespace cv { namespace cudev {

Expand All @@ -65,6 +66,7 @@ __device__ __forceinline__ void blockReduce(volatile T* smem, T& val, uint tid,
block_reduce_detail::Dispatcher<N>::reductor::template reduce<volatile T*, T&, const Op&>(smem, val, tid, op);
}

#if (CUDART_VERSION < 12040)
template <int N,
typename P0, typename P1, typename P2, typename P3, typename P4, typename P5, typename P6, typename P7, typename P8, typename P9,
typename R0, typename R1, typename R2, typename R3, typename R4, typename R5, typename R6, typename R7, typename R8, typename R9,
Expand Down Expand Up @@ -126,6 +128,39 @@ __device__ __forceinline__ void blockReduceKeyVal(const tuple<KP0, KP1, KP2, KP3
>(skeys, key, svals, val, tid, cmp);
}

#else

template <int N, typename... P, typename... R, typename... Op>
__device__ __forceinline__ void blockReduce(const tuple<P...>& smem,
const tuple<R...>& val,
uint tid,
const tuple<Op...>& op)
{
block_reduce_detail::Dispatcher<N>::reductor::template reduce<const tuple<P...>&, const tuple<R...>&, const tuple<Op...>&>(smem, val, tid, op);
}

// blockReduceKeyVal

template <int N, typename K, typename V, class Cmp>
__device__ __forceinline__ void blockReduceKeyVal(volatile K* skeys, K& key, volatile V* svals, V& val, uint tid, const Cmp& cmp)
{
block_reduce_key_val_detail::Dispatcher<N>::reductor::template reduce<volatile K*, K&, volatile V*, V&, const Cmp&>(skeys, key, svals, val, tid, cmp);
}

template <int N, typename K, typename... VP, typename... VR, class Cmp>
__device__ __forceinline__ void blockReduceKeyVal(volatile K* skeys, K& key, const tuple<VP...>& svals, const tuple<VR...>& val, uint tid, const Cmp& cmp)
{
block_reduce_key_val_detail::Dispatcher<N>::reductor::template reduce<volatile K*, K&, const tuple<VP...>&, const tuple<VR...>&, const Cmp&>(skeys, key, svals, val, tid, cmp);
}

template <int N, typename... KP, typename... KR, typename... VP, typename... VR, class... Cmp>
__device__ __forceinline__ void blockReduceKeyVal(const tuple<KP...>& skeys, const tuple<KR...>& key, const tuple<VP...>& svals, const tuple<VR...>& val, uint tid, const tuple<Cmp...>& cmp)
{
block_reduce_key_val_detail::Dispatcher<N>::reductor::template reduce< const tuple<KP...>&, const tuple<KR...>&, const tuple<VP...>&, const tuple<VR...>&, const tuple<Cmp...>&>(skeys, key, svals, val, tid, cmp);
}

#endif

//! @}

}}
Expand Down
25 changes: 22 additions & 3 deletions modules/cudev/include/opencv2/cudev/grid/detail/split_merge.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -157,28 +157,47 @@ namespace grid_split_merge_detail
template <class Policy> struct MergeImpl<2, Policy>
{
template <class SrcPtrTuple, typename DstType, class MaskPtr>
__host__ static void merge(const SrcPtrTuple& src, const GlobPtr<DstType>& dst, const MaskPtr& mask, int rows, int cols, cudaStream_t stream)
__host__ static void mergeTuple(const SrcPtrTuple& src, const GlobPtr<DstType>& dst, const MaskPtr& mask, int rows, int cols, cudaStream_t stream)
{
mergeC2<Policy>(get<0>(src), get<1>(src), dst, mask, rows, cols, stream);
}

template <class SrcPtrArray, typename DstType, class MaskPtr>
__host__ static void mergeArray(const SrcPtrArray& src, const GlobPtr<DstType>& dst, const MaskPtr& mask, int rows, int cols, cudaStream_t stream)
{
mergeC2<Policy>(src[0], src[1], dst, mask, rows, cols, stream);
}

};

template <class Policy> struct MergeImpl<3, Policy>
{
template <class SrcPtrTuple, typename DstType, class MaskPtr>
__host__ static void merge(const SrcPtrTuple& src, const GlobPtr<DstType>& dst, const MaskPtr& mask, int rows, int cols, cudaStream_t stream)
__host__ static void mergeTuple(const SrcPtrTuple& src, const GlobPtr<DstType>& dst, const MaskPtr& mask, int rows, int cols, cudaStream_t stream)
{
mergeC3<Policy>(get<0>(src), get<1>(src), get<2>(src), dst, mask, rows, cols, stream);
}

template <class SrcPtrArray, typename DstType, class MaskPtr>
__host__ static void mergeArray(const SrcPtrArray& src, const GlobPtr<DstType>& dst, const MaskPtr& mask, int rows, int cols, cudaStream_t stream)
{
mergeC3<Policy>(src[0], src[1], src[2], dst, mask, rows, cols, stream);
}
};

template <class Policy> struct MergeImpl<4, Policy>
{
template <class SrcPtrTuple, typename DstType, class MaskPtr>
__host__ static void merge(const SrcPtrTuple& src, const GlobPtr<DstType>& dst, const MaskPtr& mask, int rows, int cols, cudaStream_t stream)
__host__ static void mergeTuple(const SrcPtrTuple& src, const GlobPtr<DstType>& dst, const MaskPtr& mask, int rows, int cols, cudaStream_t stream)
{
mergeC4<Policy>(get<0>(src), get<1>(src), get<2>(src), get<3>(src), dst, mask, rows, cols, stream);
}

template <class SrcPtrArray, typename DstType, class MaskPtr>
__host__ static void mergeArray(const SrcPtrArray& src, const GlobPtr<DstType>& dst, const MaskPtr& mask, int rows, int cols, cudaStream_t stream)
{
mergeC4<Policy>(src[0], src[1], src[2], src[3], dst, mask, rows, cols, stream);
}
};

// split
Expand Down
Loading
Loading