diff --git a/flang-rt/lib/runtime/CMakeLists.txt b/flang-rt/lib/runtime/CMakeLists.txt index 6548ec955b2b8..e8f70bd544e0b 100644 --- a/flang-rt/lib/runtime/CMakeLists.txt +++ b/flang-rt/lib/runtime/CMakeLists.txt @@ -178,9 +178,6 @@ endif () if ("${LLVM_RUNTIMES_TARGET}" MATCHES "^amdgcn|^nvptx") set(sources ${gpu_sources}) elseif(FLANG_RT_EXPERIMENTAL_OFFLOAD_SUPPORT STREQUAL "CUDA") - # findloc.cpp has some issues with higher compute capability. Remove it - # from CUDA build until we can lower its memory footprint. - list(REMOVE_ITEM supported_sources findloc.cpp) set(sources ${supported_sources}) else () set(sources ${supported_sources} ${host_sources} ${f128_sources}) diff --git a/flang-rt/lib/runtime/extrema.cpp b/flang-rt/lib/runtime/extrema.cpp index 9846529665e8b..c4575cced9017 100644 --- a/flang-rt/lib/runtime/extrema.cpp +++ b/flang-rt/lib/runtime/extrema.cpp @@ -397,9 +397,12 @@ template class COMPARE> struct DoPartialMaxOrMinLocHelper { template struct Functor { - RT_API_ATTRS void operator()(const char *intrinsic, Descriptor &result, - const Descriptor &x, int kind, int dim, const Descriptor *mask, - bool back, Terminator &terminator) const { + // NVCC inlines more aggressively, which causes too many specializations + // of this function to be inlined, leading to compiler timeouts. Mark it + // noinline so that compilation can complete.
+ RT_API_ATTRS RT_DEVICE_NOINLINE void operator()(const char *intrinsic, + Descriptor &result, const Descriptor &x, int kind, int dim, + const Descriptor *mask, bool back, Terminator &terminator) const { DoPartialMaxOrMinLoc( intrinsic, result, x, kind, dim, mask, back, terminator); } diff --git a/flang-rt/lib/runtime/findloc.cpp b/flang-rt/lib/runtime/findloc.cpp index 5485f4b97bd2f..b5031ec95508d 100644 --- a/flang-rt/lib/runtime/findloc.cpp +++ b/flang-rt/lib/runtime/findloc.cpp @@ -153,10 +153,13 @@ template struct NumericFindlocHelper { template struct Functor { - RT_API_ATTRS void operator()(TypeCategory targetCat, int targetKind, - Descriptor &result, const Descriptor &x, const Descriptor &target, - int kind, int dim, const Descriptor *mask, bool back, - Terminator &terminator) const { + // NVCC inlines more aggressively, which causes too many specializations + // of this function to be inlined, leading to compiler timeouts. Mark it + // noinline so that compilation can complete. + RT_API_ATTRS RT_DEVICE_NOINLINE void operator()(TypeCategory targetCat, + int targetKind, Descriptor &result, const Descriptor &x, + const Descriptor &target, int kind, int dim, const Descriptor *mask, + bool back, Terminator &terminator) const { switch (targetCat) { case TypeCategory::Integer: case TypeCategory::Unsigned: