Skip to content

Commit

Permalink
Reapply "[LowerMemIntrinsics] Use correct alignment in residual loop …
Browse files Browse the repository at this point in the history
…for variable llvm.memcpy" (#98482)

Reverts #98295, which reverted #97998

The failure in the "InOneWeekend" test of the HIP test suite on
clang-hip-vega20
(https://lab.llvm.org/buildbot/#/builders/123/builds/1498) seems to be
unrelated; I observed it (and a similar failure for the "TheNextWeek"
test in the same suite) intermittently on my system, with and without
the patch applied. (It occurred in 2 out of 50 repeated runs without the
patch and in 1 out of 50 runs with the patch.)
  • Loading branch information
ritter-x2a committed Jul 12, 2024
1 parent d601ca3 commit cbc96b9
Show file tree
Hide file tree
Showing 2 changed files with 18 additions and 15 deletions.
9 changes: 6 additions & 3 deletions llvm/lib/Transforms/Utils/LowerMemIntrinsics.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -262,6 +262,9 @@ void llvm::createMemCpyLoopUnknownSize(
assert((ResLoopOpSize == AtomicElementSize ? *AtomicElementSize : 1) &&
"Store size is expected to match type size");

Align ResSrcAlign(commonAlignment(PartSrcAlign, ResLoopOpSize));
Align ResDstAlign(commonAlignment(PartDstAlign, ResLoopOpSize));

Value *RuntimeResidual = getRuntimeLoopRemainder(DL, PLBuilder, CopyLen,
CILoopOpSize, LoopOpSize);
Value *RuntimeBytesCopied = PLBuilder.CreateSub(CopyLen, RuntimeResidual);
Expand Down Expand Up @@ -303,16 +306,16 @@ void llvm::createMemCpyLoopUnknownSize(
Value *SrcGEP =
ResBuilder.CreateInBoundsGEP(ResLoopOpType, SrcAddr, FullOffset);
LoadInst *Load = ResBuilder.CreateAlignedLoad(ResLoopOpType, SrcGEP,
PartSrcAlign, SrcIsVolatile);
ResSrcAlign, SrcIsVolatile);
if (!CanOverlap) {
// Set alias scope for loads.
Load->setMetadata(LLVMContext::MD_alias_scope,
MDNode::get(Ctx, NewScope));
}
Value *DstGEP =
ResBuilder.CreateInBoundsGEP(ResLoopOpType, DstAddr, FullOffset);
StoreInst *Store = ResBuilder.CreateAlignedStore(Load, DstGEP, PartDstAlign,
DstIsVolatile);
StoreInst *Store =
ResBuilder.CreateAlignedStore(Load, DstGEP, ResDstAlign, DstIsVolatile);
if (!CanOverlap) {
// Indicate that stores don't overlap loads.
Store->setMetadata(LLVMContext::MD_noalias, MDNode::get(Ctx, NewScope));
Expand Down
24 changes: 12 additions & 12 deletions llvm/test/CodeGen/AMDGPU/lower-mem-intrinsics.ll
Original file line number Diff line number Diff line change
Expand Up @@ -930,9 +930,9 @@ define amdgpu_kernel void @memcpy_global_align4_global_align4_variable(ptr addrs
; OPT-NEXT: [[RESIDUAL_LOOP_INDEX:%.*]] = phi i64 [ 0, [[LOOP_MEMCPY_RESIDUAL_HEADER]] ], [ [[TMP14:%.*]], [[LOOP_MEMCPY_RESIDUAL:%.*]] ]
; OPT-NEXT: [[TMP10:%.*]] = add i64 [[TMP3]], [[RESIDUAL_LOOP_INDEX]]
; OPT-NEXT: [[TMP11:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[SRC]], i64 [[TMP10]]
; OPT-NEXT: [[TMP12:%.*]] = load i8, ptr addrspace(1) [[TMP11]], align 4
; OPT-NEXT: [[TMP12:%.*]] = load i8, ptr addrspace(1) [[TMP11]], align 1
; OPT-NEXT: [[TMP13:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[DST]], i64 [[TMP10]]
; OPT-NEXT: store i8 [[TMP12]], ptr addrspace(1) [[TMP13]], align 4
; OPT-NEXT: store i8 [[TMP12]], ptr addrspace(1) [[TMP13]], align 1
; OPT-NEXT: [[TMP14]] = add i64 [[RESIDUAL_LOOP_INDEX]], 1
; OPT-NEXT: [[TMP15:%.*]] = icmp ult i64 [[TMP14]], [[TMP2]]
; OPT-NEXT: br i1 [[TMP15]], label [[LOOP_MEMCPY_RESIDUAL]], label [[POST_LOOP_MEMCPY_EXPANSION:%.*]]
Expand Down Expand Up @@ -966,9 +966,9 @@ define amdgpu_kernel void @memcpy_global_align2_global_align2_variable(ptr addrs
; OPT-NEXT: [[RESIDUAL_LOOP_INDEX:%.*]] = phi i64 [ 0, [[LOOP_MEMCPY_RESIDUAL_HEADER]] ], [ [[TMP14:%.*]], [[LOOP_MEMCPY_RESIDUAL:%.*]] ]
; OPT-NEXT: [[TMP10:%.*]] = add i64 [[TMP3]], [[RESIDUAL_LOOP_INDEX]]
; OPT-NEXT: [[TMP11:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[SRC]], i64 [[TMP10]]
; OPT-NEXT: [[TMP12:%.*]] = load i8, ptr addrspace(1) [[TMP11]], align 2
; OPT-NEXT: [[TMP12:%.*]] = load i8, ptr addrspace(1) [[TMP11]], align 1
; OPT-NEXT: [[TMP13:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[DST]], i64 [[TMP10]]
; OPT-NEXT: store i8 [[TMP12]], ptr addrspace(1) [[TMP13]], align 2
; OPT-NEXT: store i8 [[TMP12]], ptr addrspace(1) [[TMP13]], align 1
; OPT-NEXT: [[TMP14]] = add i64 [[RESIDUAL_LOOP_INDEX]], 1
; OPT-NEXT: [[TMP15:%.*]] = icmp ult i64 [[TMP14]], [[TMP2]]
; OPT-NEXT: br i1 [[TMP15]], label [[LOOP_MEMCPY_RESIDUAL]], label [[POST_LOOP_MEMCPY_EXPANSION:%.*]]
Expand Down Expand Up @@ -1038,9 +1038,9 @@ define amdgpu_kernel void @memcpy_local_align4_local_align4_variable(ptr addrspa
; OPT-NEXT: [[RESIDUAL_LOOP_INDEX:%.*]] = phi i32 [ 0, [[LOOP_MEMCPY_RESIDUAL_HEADER]] ], [ [[TMP14:%.*]], [[LOOP_MEMCPY_RESIDUAL:%.*]] ]
; OPT-NEXT: [[TMP10:%.*]] = add i32 [[TMP3]], [[RESIDUAL_LOOP_INDEX]]
; OPT-NEXT: [[TMP11:%.*]] = getelementptr inbounds i8, ptr addrspace(3) [[SRC]], i32 [[TMP10]]
; OPT-NEXT: [[TMP12:%.*]] = load i8, ptr addrspace(3) [[TMP11]], align 4
; OPT-NEXT: [[TMP12:%.*]] = load i8, ptr addrspace(3) [[TMP11]], align 1
; OPT-NEXT: [[TMP13:%.*]] = getelementptr inbounds i8, ptr addrspace(3) [[DST]], i32 [[TMP10]]
; OPT-NEXT: store i8 [[TMP12]], ptr addrspace(3) [[TMP13]], align 4
; OPT-NEXT: store i8 [[TMP12]], ptr addrspace(3) [[TMP13]], align 1
; OPT-NEXT: [[TMP14]] = add i32 [[RESIDUAL_LOOP_INDEX]], 1
; OPT-NEXT: [[TMP15:%.*]] = icmp ult i32 [[TMP14]], [[TMP2]]
; OPT-NEXT: br i1 [[TMP15]], label [[LOOP_MEMCPY_RESIDUAL]], label [[POST_LOOP_MEMCPY_EXPANSION:%.*]]
Expand Down Expand Up @@ -1074,9 +1074,9 @@ define amdgpu_kernel void @memcpy_local_align2_local_align2_variable(ptr addrspa
; OPT-NEXT: [[RESIDUAL_LOOP_INDEX:%.*]] = phi i32 [ 0, [[LOOP_MEMCPY_RESIDUAL_HEADER]] ], [ [[TMP14:%.*]], [[LOOP_MEMCPY_RESIDUAL:%.*]] ]
; OPT-NEXT: [[TMP10:%.*]] = add i32 [[TMP3]], [[RESIDUAL_LOOP_INDEX]]
; OPT-NEXT: [[TMP11:%.*]] = getelementptr inbounds i8, ptr addrspace(3) [[SRC]], i32 [[TMP10]]
; OPT-NEXT: [[TMP12:%.*]] = load i8, ptr addrspace(3) [[TMP11]], align 2
; OPT-NEXT: [[TMP12:%.*]] = load i8, ptr addrspace(3) [[TMP11]], align 1
; OPT-NEXT: [[TMP13:%.*]] = getelementptr inbounds i8, ptr addrspace(3) [[DST]], i32 [[TMP10]]
; OPT-NEXT: store i8 [[TMP12]], ptr addrspace(3) [[TMP13]], align 2
; OPT-NEXT: store i8 [[TMP12]], ptr addrspace(3) [[TMP13]], align 1
; OPT-NEXT: [[TMP14]] = add i32 [[RESIDUAL_LOOP_INDEX]], 1
; OPT-NEXT: [[TMP15:%.*]] = icmp ult i32 [[TMP14]], [[TMP2]]
; OPT-NEXT: br i1 [[TMP15]], label [[LOOP_MEMCPY_RESIDUAL]], label [[POST_LOOP_MEMCPY_EXPANSION:%.*]]
Expand Down Expand Up @@ -1146,9 +1146,9 @@ define amdgpu_kernel void @memcpy_local_align4_global_align4_variable(ptr addrsp
; OPT-NEXT: [[RESIDUAL_LOOP_INDEX:%.*]] = phi i32 [ 0, [[LOOP_MEMCPY_RESIDUAL_HEADER]] ], [ [[TMP14:%.*]], [[LOOP_MEMCPY_RESIDUAL:%.*]] ]
; OPT-NEXT: [[TMP10:%.*]] = add i32 [[TMP3]], [[RESIDUAL_LOOP_INDEX]]
; OPT-NEXT: [[TMP11:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[SRC]], i32 [[TMP10]]
; OPT-NEXT: [[TMP12:%.*]] = load i8, ptr addrspace(1) [[TMP11]], align 4
; OPT-NEXT: [[TMP12:%.*]] = load i8, ptr addrspace(1) [[TMP11]], align 1
; OPT-NEXT: [[TMP13:%.*]] = getelementptr inbounds i8, ptr addrspace(3) [[DST]], i32 [[TMP10]]
; OPT-NEXT: store i8 [[TMP12]], ptr addrspace(3) [[TMP13]], align 4
; OPT-NEXT: store i8 [[TMP12]], ptr addrspace(3) [[TMP13]], align 1
; OPT-NEXT: [[TMP14]] = add i32 [[RESIDUAL_LOOP_INDEX]], 1
; OPT-NEXT: [[TMP15:%.*]] = icmp ult i32 [[TMP14]], [[TMP2]]
; OPT-NEXT: br i1 [[TMP15]], label [[LOOP_MEMCPY_RESIDUAL]], label [[POST_LOOP_MEMCPY_EXPANSION:%.*]]
Expand Down Expand Up @@ -1182,9 +1182,9 @@ define amdgpu_kernel void @memcpy_global_align4_local_align4_variable(ptr addrsp
; OPT-NEXT: [[RESIDUAL_LOOP_INDEX:%.*]] = phi i32 [ 0, [[LOOP_MEMCPY_RESIDUAL_HEADER]] ], [ [[TMP14:%.*]], [[LOOP_MEMCPY_RESIDUAL:%.*]] ]
; OPT-NEXT: [[TMP10:%.*]] = add i32 [[TMP3]], [[RESIDUAL_LOOP_INDEX]]
; OPT-NEXT: [[TMP11:%.*]] = getelementptr inbounds i8, ptr addrspace(3) [[SRC]], i32 [[TMP10]]
; OPT-NEXT: [[TMP12:%.*]] = load i8, ptr addrspace(3) [[TMP11]], align 4
; OPT-NEXT: [[TMP12:%.*]] = load i8, ptr addrspace(3) [[TMP11]], align 1
; OPT-NEXT: [[TMP13:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[DST]], i32 [[TMP10]]
; OPT-NEXT: store i8 [[TMP12]], ptr addrspace(1) [[TMP13]], align 4
; OPT-NEXT: store i8 [[TMP12]], ptr addrspace(1) [[TMP13]], align 1
; OPT-NEXT: [[TMP14]] = add i32 [[RESIDUAL_LOOP_INDEX]], 1
; OPT-NEXT: [[TMP15:%.*]] = icmp ult i32 [[TMP14]], [[TMP2]]
; OPT-NEXT: br i1 [[TMP15]], label [[LOOP_MEMCPY_RESIDUAL]], label [[POST_LOOP_MEMCPY_EXPANSION:%.*]]
Expand Down

0 comments on commit cbc96b9

Please sign in to comment.