Skip to content

Commit

Permalink
[AMDGPU] Add alias.scope metadata to lowered LDS struct
Browse files Browse the repository at this point in the history
Alias analysis is unable to disambiguate accesses to the structure
fields without it, unlike accesses to distinct variables. As a result we
cannot combine ds_read and ds_write operations when there is any store in
between, because such a store is always considered clobbering.

Differential Revision: https://reviews.llvm.org/D108315
  • Loading branch information
rampitec committed Aug 19, 2021
1 parent 8cf5b69 commit 8d7d89b
Show file tree
Hide file tree
Showing 3 changed files with 174 additions and 9 deletions.
57 changes: 48 additions & 9 deletions llvm/lib/Target/AMDGPU/AMDGPULowerModuleLDSPass.cpp
Expand Up @@ -42,6 +42,7 @@
#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/InlineAsm.h"
#include "llvm/IR/Instructions.h"
#include "llvm/IR/MDBuilder.h"
#include "llvm/InitializePasses.h"
#include "llvm/Pass.h"
#include "llvm/Support/CommandLine.h"
Expand Down Expand Up @@ -282,6 +283,21 @@ class AMDGPULowerModuleLDS : public ModulePass {
// so remove the variables from these lists before replaceAllUsesWith
removeFromUsedLists(M, LocalVars);

// Create one alias scope per variable and the list of scopes used for
// noalias metadata. Each field in the new structure does not alias any of
// the other fields.
SmallVector<MDNode *> AliasScopes;
SmallVector<Metadata *> NoAliasList;
if (LocalVars.size() > 1) {
MDBuilder MDB(Ctx);
AliasScopes.reserve(LocalVars.size());
MDNode *Domain = MDB.createAnonymousAliasScopeDomain();
for (size_t I = 0; I < LocalVars.size(); I++) {
MDNode *Scope = MDB.createAnonymousAliasScope(Domain);
AliasScopes.push_back(Scope);
}
NoAliasList.append(&AliasScopes[1], AliasScopes.end());
}

// Replace uses of ith variable with a constantexpr to the ith field of the
// instance that will be allocated by AMDGPUMachineFunction
Type *I32 = Type::getInt32Ty(Ctx);
Expand Down Expand Up @@ -313,7 +329,15 @@ class AMDGPULowerModuleLDS : public ModulePass {

uint64_t Off = DL.getStructLayout(LDSTy)->getElementOffset(I);
Align A = commonAlignment(StructAlign, Off);
refineUsesAlignment(GEP, A, DL);

if (I)
NoAliasList[I - 1] = AliasScopes[I - 1];
MDNode *NoAlias =
NoAliasList.empty() ? nullptr : MDNode::get(Ctx, NoAliasList);
MDNode *AliasScope =
AliasScopes.empty() ? nullptr : MDNode::get(Ctx, {AliasScopes[I]});

refineUsesAlignmentAndAA(GEP, A, DL, AliasScope, NoAlias);
}

// Mark kernels with asm that reads the address of the allocated structure
Expand All @@ -334,12 +358,25 @@ class AMDGPULowerModuleLDS : public ModulePass {
return true;
}

void refineUsesAlignment(Value *Ptr, Align A, const DataLayout &DL,
unsigned MaxDepth = 5) {
if (!MaxDepth || A == 1)
void refineUsesAlignmentAndAA(Value *Ptr, Align A, const DataLayout &DL,
MDNode *AliasScope, MDNode *NoAlias,
unsigned MaxDepth = 5) {
if (!MaxDepth || (A == 1 && !AliasScope))
return;

for (User *U : Ptr->users()) {
if (auto *I = dyn_cast<Instruction>(U)) {
if (AliasScope && I->mayReadOrWriteMemory()) {
MDNode *AS = I->getMetadata(LLVMContext::MD_alias_scope);
AS = MDNode::concatenate(AS, AliasScope);
I->setMetadata(LLVMContext::MD_alias_scope, AS);

MDNode *NA = I->getMetadata(LLVMContext::MD_noalias);
NA = MDNode::concatenate(NA, NoAlias);
I->setMetadata(LLVMContext::MD_noalias, NA);
}
}

if (auto *LI = dyn_cast<LoadInst>(U)) {
LI->setAlignment(std::max(A, LI->getAlign()));
continue;
Expand All @@ -364,17 +401,19 @@ class AMDGPULowerModuleLDS : public ModulePass {
if (auto *GEP = dyn_cast<GetElementPtrInst>(U)) {
unsigned BitWidth = DL.getIndexTypeSizeInBits(GEP->getType());
APInt Off(BitWidth, 0);
if (GEP->getPointerOperand() == Ptr &&
GEP->accumulateConstantOffset(DL, Off)) {
Align GA = commonAlignment(A, Off.getLimitedValue());
refineUsesAlignment(GEP, GA, DL, MaxDepth - 1);
if (GEP->getPointerOperand() == Ptr) {
Align GA;
if (GEP->accumulateConstantOffset(DL, Off))
GA = commonAlignment(A, Off.getLimitedValue());
refineUsesAlignmentAndAA(GEP, GA, DL, AliasScope, NoAlias,
MaxDepth - 1);
}
continue;
}
if (auto *I = dyn_cast<Instruction>(U)) {
if (I->getOpcode() == Instruction::BitCast ||
I->getOpcode() == Instruction::AddrSpaceCast)
refineUsesAlignment(I, A, DL, MaxDepth - 1);
refineUsesAlignmentAndAA(I, A, DL, AliasScope, NoAlias, MaxDepth - 1);
}
}
}
Expand Down
49 changes: 49 additions & 0 deletions llvm/test/CodeGen/AMDGPU/lower-lds-struct-aa-merge.ll
@@ -0,0 +1,49 @@
; RUN: opt -S -mtriple=amdgcn-- -amdgpu-lower-module-lds < %s | FileCheck %s
; RUN: opt -S -mtriple=amdgcn-- -passes=amdgpu-lower-module-lds < %s | FileCheck %s

@a = internal unnamed_addr addrspace(3) global [64 x i32] undef, align 4
@b = internal unnamed_addr addrspace(3) global [64 x i32] undef, align 4

; CHECK-LABEL: @no_clobber_ds_load_stores_x2_preexisting_aa
; CHECK: store i32 1, i32 addrspace(3)* %0, align 16, !tbaa !0, !alias.scope !5, !noalias !10
; CHECK: %val.a = load i32, i32 addrspace(3)* %gep.a, align 4, !tbaa !0, !alias.scope !5, !noalias !10
; CHECK: store i32 2, i32 addrspace(3)* %1, align 16, !tbaa !0, !alias.scope !10, !noalias !5
; CHECK: %val.b = load i32, i32 addrspace(3)* %gep.b, align 4, !tbaa !0, !alias.scope !10, !noalias !5

; Test kernel whose accesses to the addrspace(3) arrays @a and @b already
; carry !alias.scope, !noalias and !tbaa metadata on input. It stores a
; constant to element 0 of each array, loads element %i of each array, and
; writes the sum of the two loaded values to %arg.
define amdgpu_kernel void @no_clobber_ds_load_stores_x2_preexisting_aa(i32 addrspace(1)* %arg, i32 %i) {
bb:
store i32 1, i32 addrspace(3)* getelementptr inbounds ([64 x i32], [64 x i32] addrspace(3)* @a, i32 0, i32 0), align 4, !alias.scope !0, !noalias !3, !tbaa !5
%gep.a = getelementptr inbounds [64 x i32], [64 x i32] addrspace(3)* @a, i32 0, i32 %i
%val.a = load i32, i32 addrspace(3)* %gep.a, align 4, !alias.scope !0, !noalias !3, !tbaa !5
store i32 2, i32 addrspace(3)* getelementptr inbounds ([64 x i32], [64 x i32] addrspace(3)* @b, i32 0, i32 0), align 4, !alias.scope !3, !noalias !0, !tbaa !5
%gep.b = getelementptr inbounds [64 x i32], [64 x i32] addrspace(3)* @b, i32 0, i32 %i
%val.b = load i32, i32 addrspace(3)* %gep.b, align 4, !alias.scope !3, !noalias !0, !tbaa !5
%val = add i32 %val.a, %val.b
; The combined result is written through the addrspace(1) kernel argument.
store i32 %val, i32 addrspace(1)* %arg, align 4
ret void
}

!0 = !{!1}
!1 = distinct !{!1, !2}
!2 = distinct !{!2}
!3 = !{!4}
!4 = distinct !{!4, !2}
!5 = !{!6, !7, i64 0}
!6 = !{!"no_clobber_ds_load_stores_x2_preexisting_aa", !7, i64 0}
!7 = !{!"int", !8, i64 0}
!8 = !{!"omnipotent char", !9, i64 0}
!9 = !{!"Simple C++ TBAA"}

; CHECK:!0 = !{!1, !2, i64 0}
; CHECK:!1 = !{!"no_clobber_ds_load_stores_x2_preexisting_aa", !2, i64 0}
; CHECK:!2 = !{!"int", !3, i64 0}
; CHECK:!3 = !{!"omnipotent char", !4, i64 0}
; CHECK:!4 = !{!"Simple C++ TBAA"}
; CHECK:!5 = !{!6, !8}
; CHECK:!6 = distinct !{!6, !7}
; CHECK:!7 = distinct !{!7}
; CHECK:!8 = distinct !{!8, !9}
; CHECK:!9 = distinct !{!9}
; CHECK:!10 = !{!11, !12}
; CHECK:!11 = distinct !{!11, !7}
; CHECK:!12 = distinct !{!12, !9}
77 changes: 77 additions & 0 deletions llvm/test/CodeGen/AMDGPU/lower-lds-struct-aa.ll
@@ -0,0 +1,77 @@
; RUN: llc -march=amdgcn -mcpu=gfx900 -O3 < %s | FileCheck -check-prefix=GCN %s
; RUN: opt -S -mtriple=amdgcn-- -amdgpu-lower-module-lds < %s | FileCheck %s
; RUN: opt -S -mtriple=amdgcn-- -passes=amdgpu-lower-module-lds < %s | FileCheck %s

@a = internal unnamed_addr addrspace(3) global [64 x i32] undef, align 4
@b = internal unnamed_addr addrspace(3) global [64 x i32] undef, align 4
@c = internal unnamed_addr addrspace(3) global [64 x i32] undef, align 4

; GCN-LABEL: {{^}}no_clobber_ds_load_stores_x2:
; GCN: ds_write2st64_b32
; GCN: ds_read2st64_b32

; CHECK-LABEL: @no_clobber_ds_load_stores_x2
; CHECK: store i32 1, i32 addrspace(3)* %0, align 16, !alias.scope !0, !noalias !3
; CHECK: %val.a = load i32, i32 addrspace(3)* %gep.a, align 4, !alias.scope !0, !noalias !3
; CHECK: store i32 2, i32 addrspace(3)* %1, align 16, !alias.scope !3, !noalias !0
; CHECK: %val.b = load i32, i32 addrspace(3)* %gep.b, align 4, !alias.scope !3, !noalias !0

; Test kernel that accesses the two addrspace(3) arrays @a and @b with no
; alias metadata on input. It stores a constant to element 0 of each array,
; loads element %i of each array, and writes the sum of the two loaded
; values to %arg.
define amdgpu_kernel void @no_clobber_ds_load_stores_x2(i32 addrspace(1)* %arg, i32 %i) {
bb:
store i32 1, i32 addrspace(3)* getelementptr inbounds ([64 x i32], [64 x i32] addrspace(3)* @a, i32 0, i32 0), align 4
%gep.a = getelementptr inbounds [64 x i32], [64 x i32] addrspace(3)* @a, i32 0, i32 %i
%val.a = load i32, i32 addrspace(3)* %gep.a, align 4
store i32 2, i32 addrspace(3)* getelementptr inbounds ([64 x i32], [64 x i32] addrspace(3)* @b, i32 0, i32 0), align 4
%gep.b = getelementptr inbounds [64 x i32], [64 x i32] addrspace(3)* @b, i32 0, i32 %i
%val.b = load i32, i32 addrspace(3)* %gep.b, align 4
%val = add i32 %val.a, %val.b
; The combined result is written through the addrspace(1) kernel argument.
store i32 %val, i32 addrspace(1)* %arg, align 4
ret void
}

; GCN-LABEL: {{^}}no_clobber_ds_load_stores_x3:
; GCN-DAG: ds_write2st64_b32
; GCN-DAG: ds_write_b32
; GCN-DAG: ds_read2st64_b32
; GCN-DAG: ds_read_b32

; CHECK-LABEL: @no_clobber_ds_load_stores_x3
; CHECK: store i32 1, i32 addrspace(3)* %0, align 16, !alias.scope !5, !noalias !8
; CHECK: %val.a = load i32, i32 addrspace(3)* %gep.a, align 4, !alias.scope !5, !noalias !8
; CHECK: store i32 2, i32 addrspace(3)* %1, align 16, !alias.scope !11, !noalias !12
; CHECK: %val.b = load i32, i32 addrspace(3)* %gep.b, align 4, !alias.scope !11, !noalias !12
; CHECK: store i32 3, i32 addrspace(3)* %2, align 16, !alias.scope !13, !noalias !14
; CHECK: %val.c = load i32, i32 addrspace(3)* %gep.c, align 4, !alias.scope !13, !noalias !14

; Test kernel that accesses the three addrspace(3) arrays @a, @b and @c with
; no alias metadata on input. It stores a constant to element 0 of each
; array, loads element %i of each array, and writes the sum of the three
; loaded values to %arg.
define amdgpu_kernel void @no_clobber_ds_load_stores_x3(i32 addrspace(1)* %arg, i32 %i) {
bb:
store i32 1, i32 addrspace(3)* getelementptr inbounds ([64 x i32], [64 x i32] addrspace(3)* @a, i32 0, i32 0), align 4
%gep.a = getelementptr inbounds [64 x i32], [64 x i32] addrspace(3)* @a, i32 0, i32 %i
%val.a = load i32, i32 addrspace(3)* %gep.a, align 4
store i32 2, i32 addrspace(3)* getelementptr inbounds ([64 x i32], [64 x i32] addrspace(3)* @b, i32 0, i32 0), align 4
%gep.b = getelementptr inbounds [64 x i32], [64 x i32] addrspace(3)* @b, i32 0, i32 %i
%val.b = load i32, i32 addrspace(3)* %gep.b, align 4
store i32 3, i32 addrspace(3)* getelementptr inbounds ([64 x i32], [64 x i32] addrspace(3)* @c, i32 0, i32 0), align 4
%gep.c = getelementptr inbounds [64 x i32], [64 x i32] addrspace(3)* @c, i32 0, i32 %i
%val.c = load i32, i32 addrspace(3)* %gep.c, align 4
%val.1 = add i32 %val.a, %val.b
%val = add i32 %val.1, %val.c
; The combined result is written through the addrspace(1) kernel argument.
store i32 %val, i32 addrspace(1)* %arg, align 4
ret void
}

; CHECK: !0 = !{!1}
; CHECK: !1 = distinct !{!1, !2}
; CHECK: !2 = distinct !{!2}
; CHECK: !3 = !{!4}
; CHECK: !4 = distinct !{!4, !2}
; CHECK: !5 = !{!6}
; CHECK: !6 = distinct !{!6, !7}
; CHECK: !7 = distinct !{!7}
; CHECK: !8 = !{!9, !10}
; CHECK: !9 = distinct !{!9, !7}
; CHECK: !10 = distinct !{!10, !7}
; CHECK: !11 = !{!9}
; CHECK: !12 = !{!6, !10}
; CHECK: !13 = !{!10}
; CHECK: !14 = !{!6, !9}

0 comments on commit 8d7d89b

Please sign in to comment.