diff --git a/flang/include/flang/Optimizer/Builder/IntrinsicCall.h b/flang/include/flang/Optimizer/Builder/IntrinsicCall.h index bbdef481a2085..b64419f5ae6da 100644 --- a/flang/include/flang/Optimizer/Builder/IntrinsicCall.h +++ b/flang/include/flang/Optimizer/Builder/IntrinsicCall.h @@ -190,6 +190,7 @@ struct IntrinsicLibrary { mlir::Value genAtomicAdd(mlir::Type, llvm::ArrayRef); fir::ExtendedValue genAtomicAddR2(mlir::Type, llvm::ArrayRef); + template fir::ExtendedValue genAtomicAddVector(mlir::Type, llvm::ArrayRef); mlir::Value genAtomicAnd(mlir::Type, llvm::ArrayRef); diff --git a/flang/lib/Optimizer/Builder/IntrinsicCall.cpp b/flang/lib/Optimizer/Builder/IntrinsicCall.cpp index b9ea8b125b780..3156c8cb4332c 100644 --- a/flang/lib/Optimizer/Builder/IntrinsicCall.cpp +++ b/flang/lib/Optimizer/Builder/IntrinsicCall.cpp @@ -290,12 +290,12 @@ static constexpr IntrinsicHandler handlers[]{ {"atan2pi", &I::genAtanpi}, {"atand", &I::genAtand}, {"atanpi", &I::genAtanpi}, - {"atomicadd_r2x2", - &I::genAtomicAddVector, + {"atomicadd_r4x2", + &I::genAtomicAddVector<2>, {{{"a", asAddr}, {"v", asAddr}}}, false}, - {"atomicadd_r4x2", - &I::genAtomicAddVector, + {"atomicadd_r4x4", + &I::genAtomicAddVector<4>, {{{"a", asAddr}, {"v", asAddr}}}, false}, {"atomicaddd", &I::genAtomicAdd, {{{"a", asAddr}, {"v", asValue}}}, false}, @@ -306,6 +306,14 @@ static constexpr IntrinsicHandler handlers[]{ &I::genAtomicAddR2, {{{"a", asAddr}, {"v", asAddr}}}, false}, + {"atomicaddvector_r2x2", + &I::genAtomicAddVector<2>, + {{{"a", asAddr}, {"v", asAddr}}}, + false}, + {"atomicaddvector_r4x2", + &I::genAtomicAddVector<2>, + {{{"a", asAddr}, {"v", asAddr}}}, + false}, {"atomicandi", &I::genAtomicAnd, {{{"a", asAddr}, {"v", asValue}}}, false}, {"atomiccasd", &I::genAtomicCas, @@ -3176,44 +3184,51 @@ IntrinsicLibrary::genAtomicAddR2(mlir::Type resultType, mlir::ArrayRef{0}); } +template fir::ExtendedValue IntrinsicLibrary::genAtomicAddVector(mlir::Type resultType, llvm::ArrayRef args) { assert(args.size() == 2); mlir::Value res = fir::AllocaOp::create( - builder, loc, fir::SequenceType::get({2}, resultType)); + builder, loc, fir::SequenceType::get({extent}, resultType)); mlir::Value a = fir::getBase(args[0]); if (mlir::isa(a.getType())) { a = fir::BoxAddrOp::create(builder, loc, a); } - auto vecTy = mlir::VectorType::get({2}, resultType); + auto vecTy = mlir::VectorType::get({extent}, resultType); auto refTy = fir::ReferenceType::get(resultType); mlir::Type i32Ty = builder.getI32Type(); mlir::Type idxTy = builder.getIndexType(); - mlir::Value zero = builder.createIntegerConstant(loc, idxTy, 0); - mlir::Value one = builder.createIntegerConstant(loc, idxTy, 1); - mlir::Value v1Coord = fir::CoordinateOp::create(builder, loc, refTy, - fir::getBase(args[1]), zero); - mlir::Value v2Coord = fir::CoordinateOp::create(builder, loc, refTy, - fir::getBase(args[1]), one); - mlir::Value v1 = fir::LoadOp::create(builder, loc, v1Coord); - mlir::Value v2 = fir::LoadOp::create(builder, loc, v2Coord); + + // Extract the values from the array. + llvm::SmallVector values; + for (unsigned i = 0; i < extent; ++i) { + mlir::Value pos = builder.createIntegerConstant(loc, idxTy, i); + mlir::Value coord = fir::CoordinateOp::create(builder, loc, refTy, + fir::getBase(args[1]), pos); + mlir::Value value = fir::LoadOp::create(builder, loc, coord); + values.push_back(value); + } + // Pack extracted values into a vector to call the atomic add. mlir::Value undef = mlir::LLVM::UndefOp::create(builder, loc, vecTy); - mlir::Value vec1 = mlir::LLVM::InsertElementOp::create( - builder, loc, undef, v1, builder.createIntegerConstant(loc, i32Ty, 0)); - mlir::Value vec2 = mlir::LLVM::InsertElementOp::create( - builder, loc, vec1, v2, builder.createIntegerConstant(loc, i32Ty, 1)); + for (unsigned i = 0; i < extent; ++i) { + mlir::Value insert = mlir::LLVM::InsertElementOp::create( + builder, loc, undef, values[i], + builder.createIntegerConstant(loc, i32Ty, i)); + undef = insert; + } + // Atomic operation with a vector of values. mlir::Value add = - genAtomBinOp(builder, loc, mlir::LLVM::AtomicBinOp::fadd, a, vec2); - mlir::Value r1 = mlir::LLVM::ExtractElementOp::create( - builder, loc, add, builder.createIntegerConstant(loc, i32Ty, 0)); - mlir::Value r2 = mlir::LLVM::ExtractElementOp::create( - builder, loc, add, builder.createIntegerConstant(loc, i32Ty, 1)); - mlir::Value c1 = fir::CoordinateOp::create(builder, loc, refTy, res, zero); - mlir::Value c2 = fir::CoordinateOp::create(builder, loc, refTy, res, one); - fir::StoreOp::create(builder, loc, r1, c1); - fir::StoreOp::create(builder, loc, r2, c2); - mlir::Value ext = builder.createIntegerConstant(loc, idxTy, 2); + genAtomBinOp(builder, loc, mlir::LLVM::AtomicBinOp::fadd, a, undef); + // Store results in the result array. + for (unsigned i = 0; i < extent; ++i) { + mlir::Value r = mlir::LLVM::ExtractElementOp::create( + builder, loc, add, builder.createIntegerConstant(loc, i32Ty, i)); + mlir::Value c = fir::CoordinateOp::create( + builder, loc, refTy, res, builder.createIntegerConstant(loc, idxTy, i)); + fir::StoreOp::create(builder, loc, r, c); + } + mlir::Value ext = builder.createIntegerConstant(loc, idxTy, extent); return fir::ArrayBoxValue(res, {ext}); } diff --git a/flang/module/cudadevice.f90 b/flang/module/cudadevice.f90 index b1aef95cba8c9..27097193aaa9b 100644 --- a/flang/module/cudadevice.f90 +++ b/flang/module/cudadevice.f90 @@ -1179,13 +1179,22 @@ attributes(device) pure integer(4) function atomicaddr2(address, val) end interface interface atomicaddvector - attributes(device) pure function atomicadd_r2x2(address, val) result(z) + attributes(device) pure function atomicaddvector_r2x2(address, val) result(z) !dir$ ignore_tkr (rd) address, (d) val real(2), dimension(2), intent(inout) :: address real(2), dimension(2), intent(in) :: val real(2), dimension(2) :: z end function + attributes(device) pure function atomicaddvector_r4x2(address, val) result(z) + !dir$ ignore_tkr (rd) address, (d) val + real(4), dimension(2), intent(inout) :: address + real(4), dimension(2), intent(in) :: val + real(4), dimension(2) :: z + end function + end interface + + interface atomicaddreal4x2 attributes(device) pure function atomicadd_r4x2(address, val) result(z) !dir$ ignore_tkr (rd) address, (d) val real(4), dimension(2), intent(inout) :: address @@ -1194,6 +1203,15 @@ attributes(device) pure function atomicadd_r4x2(address, val) result(z) end function end interface + interface atomicaddreal4x4 + attributes(device) pure function atomicadd_r4x4(address, val) result(z) + !dir$ ignore_tkr (rd) address, (d) val + real(4), dimension(4), intent(inout) :: address + real(4), dimension(4), intent(in) :: val + real(4), dimension(4) :: z + end function + end interface + interface atomicsub attributes(device) pure integer function atomicsubi(address, val) !dir$ ignore_tkr (d) address, (d) val diff --git a/flang/test/Lower/CUDA/cuda-atomicadd.cuf b/flang/test/Lower/CUDA/cuda-atomicadd.cuf index baa6cdb3d5869..6669b4afa291d 100644 --- a/flang/test/Lower/CUDA/cuda-atomicadd.cuf +++ b/flang/test/Lower/CUDA/cuda-atomicadd.cuf @@ -2,18 +2,34 @@ ! Test CUDA Fortran atmoicadd functions available cudadevice module -attributes(global) subroutine atomicaddvector_r2() +attributes(global) subroutine test_atomicaddvector_r2() real(2), device :: a(2), tmp1(2), tmp2(2) tmp1 = atomicAddVector(a, tmp2) end subroutine -! CHECK-LABEL: func.func @_QPatomicaddvector_r2() attributes {cuf.proc_attr = #cuf.cuda_proc} +! CHECK-LABEL: func.func @_QPtest_atomicaddvector_r2() attributes {cuf.proc_attr = #cuf.cuda_proc} ! CHECK: llvm.atomicrmw fadd %{{.*}}, %{{.*}} seq_cst : !llvm.ptr, vector<2xf16> -attributes(global) subroutine atomicaddvector_r4() +attributes(global) subroutine test_atomicaddvector_r4() real(4), device :: a(2), tmp1(2), tmp2(2) tmp1 = atomicAddVector(a, tmp2) end subroutine -! CHECK-LABEL: func.func @_QPatomicaddvector_r4() attributes {cuf.proc_attr = #cuf.cuda_proc} +! CHECK-LABEL: func.func @_QPtest_atomicaddvector_r4() attributes {cuf.proc_attr = #cuf.cuda_proc} ! CHECK: llvm.atomicrmw fadd %{{.*}}, %{{.*}} seq_cst : !llvm.ptr, vector<2xf32> + +attributes(global) subroutine test_atomicadd_r2x4() + real(4), device :: a(2), tmp1(2), tmp2(2) + tmp1 = atomicaddreal4x2(a, tmp2) +end subroutine + +! CHECK-LABEL: func.func @_QPtest_atomicadd_r2x4() attributes {cuf.proc_attr = #cuf.cuda_proc} +! CHECK: llvm.atomicrmw fadd %{{.*}}, %{{.*}} seq_cst : !llvm.ptr, vector<2xf32> + +attributes(global) subroutine test_atomicadd_r4x4() + real(4), device :: a(4), tmp1(4), tmp2(4) + tmp1 = atomicaddreal4x4(a, tmp2) +end subroutine + +! CHECK-LABEL: func.func @_QPtest_atomicadd_r4x4() attributes {cuf.proc_attr = #cuf.cuda_proc} +! CHECK: llvm.atomicrmw fadd %{{.*}}, %{{.*}} seq_cst : !llvm.ptr, vector<4xf32>