Skip to content

Commit

Permalink
Fix invalid addrspacecast due to combining alloca with global var
Browse files Browse the repository at this point in the history
For function-scope variables with large initialisation list, FE usually generates a global variable to hold the initializer, then generates memcpy intrinsic to initialize the alloca. InstCombiner::visitAllocaInst identifies such allocas which are accessed only by reading and replaces them with the global variable. This is done by casting the global variable to the type of the alloca and replacing all references.

However, when the global variable is in a different address space which is disjoint with addr space 0 (e.g. for IR generated from OpenCL, global variable cannot be in private addr space i.e. addr space 0), casting the global variable to addr space 0 results in invalid IR for certain targets (e.g. amdgpu).

To fix this issue, when the global variable is not in addr space 0, instead of casting it to addr space 0, this patch chases down the uses of alloca until reaching the load instructions, then replaces load from alloca with load from the global variable. If during the chasing bitcast and GEP are encountered, new bitcast and GEP based on the global variable are generated and used in the load instructions.

A debug output is also added to amdgpu backend to facilitate debugging such issues.

Differential Revision: https://reviews.llvm.org/D27283
  • Loading branch information
yxsamliu committed Dec 5, 2016
1 parent 3efb661 commit f11fc4f
Show file tree
Hide file tree
Showing 4 changed files with 138 additions and 5 deletions.
5 changes: 5 additions & 0 deletions llvm/lib/Target/AMDGPU/SIISelLowering.cpp
Expand Up @@ -34,6 +34,9 @@
#include "llvm/CodeGen/Analysis.h"
#include "llvm/IR/DiagnosticInfo.h"
#include "llvm/IR/Function.h"
#include "llvm/Support/Debug.h"

#define DEBUG_TYPE "amdgpu-isel"

using namespace llvm;

Expand Down Expand Up @@ -2134,6 +2137,8 @@ SDValue SITargetLowering::lowerADDRSPACECAST(SDValue Op,
}

// global <-> flat are no-ops and never emitted.
DEBUG(dbgs() << "Invalid addrspacecast:\n";
ASC->dump());

const MachineFunction &MF = DAG.getMachineFunction();
DiagnosticInfoUnsupported InvalidAddrSpaceCast(
Expand Down
5 changes: 5 additions & 0 deletions llvm/lib/Transforms/InstCombine/InstCombineInternal.h
Expand Up @@ -314,6 +314,11 @@ class LLVM_LIBRARY_VISIBILITY InstCombiner
bool replacedSelectWithOperand(SelectInst *SI, const ICmpInst *Icmp,
const unsigned SIOpd);

/// Try to replace instruction \p I with value \p V which are pointers
/// in different address space.
/// \return true if successful.
bool replacePointer(Instruction &I, Value *V);

private:
bool ShouldChangeType(unsigned FromBitWidth, unsigned ToBitWidth) const;
bool ShouldChangeType(Type *From, Type *To) const;
Expand Down
68 changes: 63 additions & 5 deletions llvm/lib/Transforms/InstCombine/InstCombineLoadStoreAlloca.cpp
Expand Up @@ -223,6 +223,55 @@ static Instruction *simplifyAllocaArraySize(InstCombiner &IC, AllocaInst &AI) {
return nullptr;
}

// If I and V are pointers in different address space, it is not allowed to
// use replaceAllUsesWith since I and V have different types. A
// non-target-specific transformation should not use addrspacecast on V since
// the two address space may be disjoint depending on target.
//
// This function chases down uses of the old pointer until reaching the load
// instructions, then replaces the old pointer in the load instructions with
// the new pointer. If during the chasing it sees bitcast or GEP, it will
// create new bitcast or GEP with the new pointer and use them in the load
// instruction.
bool InstCombiner::replacePointer(Instruction &I,
Value *V) {
auto *PT = dyn_cast<PointerType>(I.getType());
auto *NT = dyn_cast<PointerType>(V->getType());
assert(PT && NT && PT != NT && PT->getElementType() == NT->getElementType() &&
"Invalid usage");

for (auto U : I.users()) {
DEBUG(dbgs() << "Found pointer user: " << *U << '\n');
if (auto *LT = dyn_cast<LoadInst>(&*U)) {
auto *NewI = new LoadInst(V);
NewI->takeName(LT);
InsertNewInstWith(NewI, *LT);
replaceInstUsesWith(*LT, NewI);
} else if (auto *GEP = dyn_cast<GetElementPtrInst>(&*U)) {
SmallVector<Value *, 8> Indices;
Indices.append(GEP->idx_begin(), GEP->idx_end());
auto *NewI = GetElementPtrInst::Create(NT->getElementType(), V, Indices);
InsertNewInstWith(NewI, *GEP);
if (!replacePointer(*GEP, NewI))
return false;
else
NewI->takeName(GEP);
} else if (auto *BC = dyn_cast<BitCastInst>(&*U)) {
auto *NewT = PointerType::get(BC->getType()->getPointerElementType(),
NT->getAddressSpace());
auto *NewI = new BitCastInst(V, NewT);
InsertNewInstWith(NewI, *BC);
if (!replacePointer(*BC, NewI))
return false;
else
NewI->takeName(BC);
} else {
return false;
}
}
return true;
}

Instruction *InstCombiner::visitAllocaInst(AllocaInst &AI) {
if (auto *I = simplifyAllocaArraySize(*this, AI))
return I;
Expand Down Expand Up @@ -293,12 +342,21 @@ Instruction *InstCombiner::visitAllocaInst(AllocaInst &AI) {
for (unsigned i = 0, e = ToDelete.size(); i != e; ++i)
eraseInstFromFunction(*ToDelete[i]);
Constant *TheSrc = cast<Constant>(Copy->getSource());
auto *SrcTy = TheSrc->getType();
auto *DestTy = PointerType::get(AI.getType()->getPointerElementType(),
SrcTy->getPointerAddressSpace());
Constant *Cast
= ConstantExpr::getPointerBitCastOrAddrSpaceCast(TheSrc, AI.getType());
Instruction *NewI = replaceInstUsesWith(AI, Cast);
eraseInstFromFunction(*Copy);
++NumGlobalCopies;
return NewI;
= ConstantExpr::getPointerBitCastOrAddrSpaceCast(TheSrc, DestTy);
if (AI.getType()->getPointerAddressSpace()
== SrcTy->getPointerAddressSpace()) {
Instruction *NewI = replaceInstUsesWith(AI, Cast);
eraseInstFromFunction(*Copy);
++NumGlobalCopies;
return NewI;
} else {
replacePointer(AI, Cast);
++NumGlobalCopies;
}
}
}
}
Expand Down
65 changes: 65 additions & 0 deletions llvm/test/Transforms/InstCombine/memcpy-addrspace.ll
@@ -0,0 +1,65 @@
; RUN: opt < %s -instcombine -S | FileCheck %s

@test.data = private unnamed_addr addrspace(2) constant [8 x i32] [i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7], align 4

; CHECK-LABEL: test_load
; CHECK: %[[GEP:.*]] = getelementptr [8 x i32], [8 x i32] addrspace(2)* @test.data, i64 0, i64 %x
; CHECK: %{{.*}} = load i32, i32 addrspace(2)* %[[GEP]]
; CHECK-NOT: alloca
; CHECK-NOT: call void @llvm.memcpy.p0i8.p2i8.i64
; CHECK-NOT: addrspacecast
; CHECK-NOT: load i32, i32*
define void @test_load(i32 addrspace(1)* %out, i64 %x) {
entry:
%data = alloca [8 x i32], align 4
%0 = bitcast [8 x i32]* %data to i8*
call void @llvm.memcpy.p0i8.p2i8.i64(i8* %0, i8 addrspace(2)* bitcast ([8 x i32] addrspace(2)* @test.data to i8 addrspace(2)*), i64 32, i32 4, i1 false)
%arrayidx = getelementptr inbounds [8 x i32], [8 x i32]* %data, i64 0, i64 %x
%1 = load i32, i32* %arrayidx, align 4
%arrayidx1 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 %x
store i32 %1, i32 addrspace(1)* %arrayidx1, align 4
ret void
}

; CHECK-LABEL: test_call
; CHECK: alloca
; CHECK: call void @llvm.memcpy.p0i8.p2i8.i64
; CHECK-NOT: addrspacecast
; CHECK: call i32 @foo(i32* %{{.*}})
define void @test_call(i32 addrspace(1)* %out, i64 %x) {
entry:
%data = alloca [8 x i32], align 4
%0 = bitcast [8 x i32]* %data to i8*
call void @llvm.memcpy.p0i8.p2i8.i64(i8* %0, i8 addrspace(2)* bitcast ([8 x i32] addrspace(2)* @test.data to i8 addrspace(2)*), i64 32, i32 4, i1 false)
%arrayidx = getelementptr inbounds [8 x i32], [8 x i32]* %data, i64 0, i64 %x
%1 = call i32 @foo(i32* %arrayidx)
%arrayidx1 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 %x
store i32 %1, i32 addrspace(1)* %arrayidx1, align 4
ret void
}

; CHECK-LABEL: test_load_and_call
; CHECK: alloca
; CHECK: call void @llvm.memcpy.p0i8.p2i8.i64
; CHECK: load i32, i32* %{{.*}}
; CHECK: call i32 @foo(i32* %{{.*}})
; CHECK-NOT: addrspacecast
; CHECK-NOT: load i32, i32 addrspace(2)*
define void @test_load_and_call(i32 addrspace(1)* %out, i64 %x, i64 %y) {
entry:
%data = alloca [8 x i32], align 4
%0 = bitcast [8 x i32]* %data to i8*
call void @llvm.memcpy.p0i8.p2i8.i64(i8* %0, i8 addrspace(2)* bitcast ([8 x i32] addrspace(2)* @test.data to i8 addrspace(2)*), i64 32, i32 4, i1 false)
%arrayidx = getelementptr inbounds [8 x i32], [8 x i32]* %data, i64 0, i64 %x
%1 = load i32, i32* %arrayidx, align 4
%arrayidx1 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 %x
store i32 %1, i32 addrspace(1)* %arrayidx1, align 4
%2 = call i32 @foo(i32* %arrayidx)
%arrayidx2 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 %y
store i32 %2, i32 addrspace(1)* %arrayidx2, align 4
ret void
}


declare void @llvm.memcpy.p0i8.p2i8.i64(i8* nocapture writeonly, i8 addrspace(2)* nocapture readonly, i64, i32, i1)
declare i32 @foo(i32* %x)

0 comments on commit f11fc4f

Please sign in to comment.