From c95324337b4bb459760f46eaa4f4167b457cde0f Mon Sep 17 00:00:00 2001 From: Modi Mo Date: Wed, 1 Oct 2025 20:04:16 -0700 Subject: [PATCH 1/3] enable full flang cuda build --- flang-rt/lib/runtime/CMakeLists.txt | 3 --- flang-rt/lib/runtime/extrema.cpp | 10 +++++++--- flang-rt/lib/runtime/findloc.cpp | 11 +++++++---- 3 files changed, 14 insertions(+), 10 deletions(-) diff --git a/flang-rt/lib/runtime/CMakeLists.txt b/flang-rt/lib/runtime/CMakeLists.txt index 6548ec955b2b8..e8f70bd544e0b 100644 --- a/flang-rt/lib/runtime/CMakeLists.txt +++ b/flang-rt/lib/runtime/CMakeLists.txt @@ -178,9 +178,6 @@ endif () if ("${LLVM_RUNTIMES_TARGET}" MATCHES "^amdgcn|^nvptx") set(sources ${gpu_sources}) elseif(FLANG_RT_EXPERIMENTAL_OFFLOAD_SUPPORT STREQUAL "CUDA") - # findloc.cpp has some issues with higher compute capability. Remove it - # from CUDA build until we can lower its memory footprint. - list(REMOVE_ITEM supported_sources findloc.cpp) set(sources ${supported_sources}) else () set(sources ${supported_sources} ${host_sources} ${f128_sources}) diff --git a/flang-rt/lib/runtime/extrema.cpp b/flang-rt/lib/runtime/extrema.cpp index 9846529665e8b..29f0e93e9631f 100644 --- a/flang-rt/lib/runtime/extrema.cpp +++ b/flang-rt/lib/runtime/extrema.cpp @@ -397,9 +397,13 @@ template class COMPARE> struct DoPartialMaxOrMinLocHelper { template struct Functor { - RT_API_ATTRS void operator()(const char *intrinsic, Descriptor &result, - const Descriptor &x, int kind, int dim, const Descriptor *mask, - bool back, Terminator &terminator) const { +#if defined(__CUDACC__) + __attribute__((noinline)) +#endif + RT_API_ATTRS void + operator()(const char *intrinsic, Descriptor &result, const Descriptor &x, + int kind, int dim, const Descriptor *mask, bool back, + Terminator &terminator) const { DoPartialMaxOrMinLoc( intrinsic, result, x, kind, dim, mask, back, terminator); } diff --git a/flang-rt/lib/runtime/findloc.cpp b/flang-rt/lib/runtime/findloc.cpp index 5485f4b97bd2f..fe11386988476 100644 --- a/flang-rt/lib/runtime/findloc.cpp +++ b/flang-rt/lib/runtime/findloc.cpp @@ -153,10 +153,13 @@ template struct NumericFindlocHelper { template struct Functor { - RT_API_ATTRS void operator()(TypeCategory targetCat, int targetKind, - Descriptor &result, const Descriptor &x, const Descriptor &target, - int kind, int dim, const Descriptor *mask, bool back, - Terminator &terminator) const { +#if defined(__CUDACC__) + __attribute__((noinline)) +#endif + RT_API_ATTRS void + operator()(TypeCategory targetCat, int targetKind, Descriptor &result, + const Descriptor &x, const Descriptor &target, int kind, int dim, + const Descriptor *mask, bool back, Terminator &terminator) const { switch (targetCat) { case TypeCategory::Integer: case TypeCategory::Unsigned: From 65c8c54335b1f3ef061744e9cca90c96c7358ec0 Mon Sep 17 00:00:00 2001 From: Modi Mo Date: Thu, 2 Oct 2025 18:28:38 -0700 Subject: [PATCH 2/3] add comments --- flang-rt/lib/runtime/extrema.cpp | 3 +++ flang-rt/lib/runtime/findloc.cpp | 3 +++ 2 files changed, 6 insertions(+) diff --git a/flang-rt/lib/runtime/extrema.cpp b/flang-rt/lib/runtime/extrema.cpp index 29f0e93e9631f..3c9af38a64ddc 100644 --- a/flang-rt/lib/runtime/extrema.cpp +++ b/flang-rt/lib/runtime/extrema.cpp @@ -397,6 +397,9 @@ template class COMPARE> struct DoPartialMaxOrMinLocHelper { template struct Functor { + // NVCC inlines more aggressively which causes too many specializations of + // this function to be inlined causing compiler timeouts. Set as + // noinline to allow compilation to complete. #if defined(__CUDACC__) __attribute__((noinline)) #endif diff --git a/flang-rt/lib/runtime/findloc.cpp b/flang-rt/lib/runtime/findloc.cpp index fe11386988476..e1060bf82c333 100644 --- a/flang-rt/lib/runtime/findloc.cpp +++ b/flang-rt/lib/runtime/findloc.cpp @@ -153,6 +153,9 @@ template struct NumericFindlocHelper { template struct Functor { + // NVCC inlines more aggressively which causes too many specializations of + // this function to be inlined causing compiler timeouts. Set as + // noinline to allow compilation to complete. #if defined(__CUDACC__) __attribute__((noinline)) #endif From 7c715317555227eccb48faf96ead72464e0adfc9 Mon Sep 17 00:00:00 2001 From: Modi Mo Date: Thu, 2 Oct 2025 21:07:56 -0700 Subject: [PATCH 3/3] use RT_DEVICE_NOINLINE and clang format --- flang-rt/lib/runtime/extrema.cpp | 10 +++------- flang-rt/lib/runtime/findloc.cpp | 11 ++++------- 2 files changed, 7 insertions(+), 14 deletions(-) diff --git a/flang-rt/lib/runtime/extrema.cpp b/flang-rt/lib/runtime/extrema.cpp index 3c9af38a64ddc..c4575cced9017 100644 --- a/flang-rt/lib/runtime/extrema.cpp +++ b/flang-rt/lib/runtime/extrema.cpp @@ -400,13 +400,9 @@ struct DoPartialMaxOrMinLocHelper { // NVCC inlines more aggressively which causes too many specializations of // this function to be inlined causing compiler timeouts. Set as // noinline to allow compilation to complete. -#if defined(__CUDACC__) - __attribute__((noinline)) -#endif - RT_API_ATTRS void - operator()(const char *intrinsic, Descriptor &result, const Descriptor &x, - int kind, int dim, const Descriptor *mask, bool back, - Terminator &terminator) const { + RT_API_ATTRS RT_DEVICE_NOINLINE void operator()(const char *intrinsic, + Descriptor &result, const Descriptor &x, int kind, int dim, + const Descriptor *mask, bool back, Terminator &terminator) const { DoPartialMaxOrMinLoc( intrinsic, result, x, kind, dim, mask, back, terminator); } diff --git a/flang-rt/lib/runtime/findloc.cpp b/flang-rt/lib/runtime/findloc.cpp index e1060bf82c333..b5031ec95508d 100644 --- a/flang-rt/lib/runtime/findloc.cpp +++ b/flang-rt/lib/runtime/findloc.cpp @@ -156,13 +156,10 @@ struct NumericFindlocHelper { // NVCC inlines more aggressively which causes too many specializations of // this function to be inlined causing compiler timeouts. Set as // noinline to allow compilation to complete. -#if defined(__CUDACC__) - __attribute__((noinline)) -#endif - RT_API_ATTRS void - operator()(TypeCategory targetCat, int targetKind, Descriptor &result, - const Descriptor &x, const Descriptor &target, int kind, int dim, - const Descriptor *mask, bool back, Terminator &terminator) const { + RT_API_ATTRS RT_DEVICE_NOINLINE void operator()(TypeCategory targetCat, + int targetKind, Descriptor &result, const Descriptor &x, + const Descriptor &target, int kind, int dim, const Descriptor *mask, + bool back, Terminator &terminator) const { switch (targetCat) { case TypeCategory::Integer: case TypeCategory::Unsigned: