|
|
@@ -0,0 +1,365 @@ |
|
|
//===-- AMDGPUPromoteAlloca.cpp - Promote Allocas -------------------------===// |
|
|
// |
|
|
// The LLVM Compiler Infrastructure |
|
|
// |
|
|
// This file is distributed under the University of Illinois Open Source |
|
|
// License. See LICENSE.TXT for details. |
|
|
// |
|
|
//===----------------------------------------------------------------------===// |
|
|
// |
|
|
// This pass eliminates allocas by either converting them into vectors or |
|
|
// by migrating them to local address space. |
|
|
// |
|
|
//===----------------------------------------------------------------------===// |
|
|
|
|
|
#include "AMDGPU.h" |
|
|
#include "AMDGPUSubtarget.h" |
|
|
#include "llvm/Analysis/ValueTracking.h" |
|
|
#include "llvm/IR/IRBuilder.h" |
|
|
#include "llvm/IR/InstVisitor.h" |
|
|
#include "llvm/Support/Debug.h" |
|
|
|
|
|
#define DEBUG_TYPE "amdgpu-promote-alloca" |
|
|
|
|
|
using namespace llvm; |
|
|
|
|
|
namespace { |
|
|
|
|
|
class AMDGPUPromoteAlloca : public FunctionPass, |
|
|
public InstVisitor<AMDGPUPromoteAlloca> { |
|
|
|
|
|
static char ID; |
|
|
Module *Mod; |
|
|
const AMDGPUSubtarget &ST; |
|
|
int LocalMemAvailable; |
|
|
|
|
|
public: |
|
|
AMDGPUPromoteAlloca(const AMDGPUSubtarget &st) : FunctionPass(ID), ST(st), |
|
|
LocalMemAvailable(0) { } |
|
|
virtual bool doInitialization(Module &M); |
|
|
virtual bool runOnFunction(Function &F); |
|
|
virtual const char *getPassName() const { |
|
|
return "AMDGPU Promote Alloca"; |
|
|
} |
|
|
void visitAlloca(AllocaInst &I); |
|
|
}; |
|
|
|
|
|
} // End anonymous namespace |
|
|
|
|
|
char AMDGPUPromoteAlloca::ID = 0; |
|
|
|
|
|
/// Module-level setup hook.
///
/// Stores a pointer to \p M in the pass so that the per-function code can later
/// reach the module's globals, data layout and function declarations.
///
/// \returns false; nothing in the module is modified here.
bool AMDGPUPromoteAlloca::doInitialization(Module &M) {
  this->Mod = &M;
  return false;
}
|
|
|
|
|
/// Work out how much local memory is free for \p F and then visit all of its
/// instructions so that visitAlloca() gets a chance to promote each alloca.
///
/// The budget starts at the subtarget's local memory size. It is dropped to
/// zero when the function takes a pointer argument in the local address
/// space, and otherwise reduced by the allocation size of every local address
/// space global that is referenced by an instruction of \p F.
///
/// \returns false (see the FIXME at the end of the function).
bool AMDGPUPromoteAlloca::runOnFunction(Function &F) {

  const FunctionType *FTy = F.getFunctionType();

  LocalMemAvailable = ST.getLocalMemorySize();

  // If the function has any arguments in the local address space, then it's
  // possible these arguments require the entire local memory space, so
  // we cannot use local memory in the pass.
  for (unsigned i = 0, e = FTy->getNumParams(); i != e; ++i) {
    const Type *ParamTy = FTy->getParamType(i);
    if (ParamTy->isPointerTy() &&
        ParamTy->getPointerAddressSpace() == AMDGPUAS::LOCAL_ADDRESS) {
      LocalMemAvailable = 0;
      DEBUG(dbgs() << "Function has local memory argument. Promoting to "
                      "local memory disabled.\n");
      break;
    }
  }

  if (LocalMemAvailable > 0) {
    // Check how much local memory is being used by global objects
    for (Module::global_iterator I = Mod->global_begin(),
                                 E = Mod->global_end(); I != E; ++I) {
      GlobalVariable *GV = &*I;
      PointerType *GVTy = GV->getType();
      if (GVTy->getAddressSpace() != AMDGPUAS::LOCAL_ADDRESS)
        continue;

      // Walk the users (as the rest of this file does) so that the
      // instruction referencing the global is what gets inspected.
      for (User *GVUser : GV->users()) {
        Instruction *UseInst = dyn_cast<Instruction>(GVUser);
        if (!UseInst)
          continue;
        if (UseInst->getParent()->getParent() != &F)
          continue;

        LocalMemAvailable -=
            Mod->getDataLayout()->getTypeAllocSize(GVTy->getElementType());
        // A global occupies its storage once, no matter how many times this
        // function references it, so stop after the first matching user.
        break;
      }
    }
  }

  LocalMemAvailable = std::max(0, LocalMemAvailable);
  DEBUG(dbgs() << LocalMemAvailable << " bytes free in local memory.\n");

  visit(F);

  // FIXME: visitAlloca() can rewrite instructions of F, yet this function
  // always reports "not modified". Tracking whether anything changed needs
  // visitAlloca() to report it.
  return false;
}
|
|
|
|
|
/// Build the vector type that has the same element type and the same number
/// of elements as the array type \p ArrayTy.
static VectorType *arrayTypeToVecType(const Type *ArrayTy) {
  auto *ElemTy = ArrayTy->getArrayElementType();
  auto ElemCount = ArrayTy->getArrayNumElements();
  return VectorType::get(ElemTy, ElemCount);
}
|
|
|
|
|
/// Return the vector element index addressed by \p Ptr.
///
/// \param Ptr    Either the alloca itself (element 0) or a GEP; any other
///               kind of value trips the cast<> assertion.
/// \param GEPIdx Map from GEP to the index value previously computed for it.
///               Taken by const reference so the map is not copied on every
///               call.
/// \returns an i32 zero constant for an alloca, the mapped index for a known
///          GEP, or null when the GEP has no entry in \p GEPIdx.
static Value *calculateVectorIndex(
    Value *Ptr, const std::map<GetElementPtrInst *, Value *> &GEPIdx) {
  if (isa<AllocaInst>(Ptr))
    return Constant::getNullValue(Type::getInt32Ty(Ptr->getContext()));

  GetElementPtrInst *GEP = cast<GetElementPtrInst>(Ptr);

  // find() instead of operator[]: a lookup must not insert into the map.
  std::map<GetElementPtrInst *, Value *>::const_iterator Entry =
      GEPIdx.find(GEP);
  return Entry == GEPIdx.end() ? nullptr : Entry->second;
}
|
|
|
|
|
/// Extract the value that indexes into the array from \p GEP.
///
/// Only GEPs with exactly three operands (pointer plus two indices) whose
/// first index is the constant zero are understood; for those the second
/// index operand is returned. Every other GEP yields NULL.
static Value *GEPToVectorIndex(GetElementPtrInst *GEP) {
  // FIXME we only support simple cases
  if (GEP->getNumOperands() == 3) {
    ConstantInt *LeadingIdx = dyn_cast<ConstantInt>(GEP->getOperand(1));
    if (LeadingIdx && LeadingIdx->isZero())
      return GEP->getOperand(2);
  }

  return NULL;
}
|
|
|
|
|
static bool tryPromoteAllocaToVector(AllocaInst *Alloca) { |
|
|
Type *AllocaTy = Alloca->getAllocatedType(); |
|
|
|
|
|
DEBUG(dbgs() << "Alloca Candidate for vectorization \n"); |
|
|
|
|
|
// FIXME: There is no reason why we can't support larger arrays, we |
|
|
// are just being conservative for now. |
|
|
if (!AllocaTy->isArrayTy() || |
|
|
AllocaTy->getArrayElementType()->isVectorTy() || |
|
|
AllocaTy->getArrayNumElements() > 4) { |
|
|
|
|
|
DEBUG(dbgs() << " Cannot convert type to vector"); |
|
|
return false; |
|
|
} |
|
|
|
|
|
std::map<GetElementPtrInst*, Value*> GEPVectorIdx; |
|
|
std::vector<Value*> WorkList; |
|
|
for (User *AllocaUser : Alloca->users()) { |
|
|
GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(AllocaUser); |
|
|
if (!GEP) { |
|
|
WorkList.push_back(AllocaUser); |
|
|
continue; |
|
|
} |
|
|
|
|
|
Value *Index = GEPToVectorIndex(GEP); |
|
|
|
|
|
// If we can't compute a vector index from this GEP, then we can't |
|
|
// promote this alloca to vector. |
|
|
if (!Index) { |
|
|
DEBUG(dbgs() << " Cannot compute vector index for GEP " << *GEP << "\n"); |
|
|
return false; |
|
|
} |
|
|
|
|
|
GEPVectorIdx[GEP] = Index; |
|
|
for (User *GEPUser : AllocaUser->users()) { |
|
|
WorkList.push_back(GEPUser); |
|
|
} |
|
|
} |
|
|
|
|
|
VectorType *VectorTy = arrayTypeToVecType(AllocaTy); |
|
|
|
|
|
DEBUG(dbgs() << " Converting alloca to vector "; AllocaTy->dump(); |
|
|
dbgs() << " -> "; VectorTy->dump(); dbgs() << "\n"); |
|
|
|
|
|
for (std::vector<Value*>::iterator I = WorkList.begin(), |
|
|
E = WorkList.end(); I != E; ++I) { |
|
|
Instruction *Inst = cast<Instruction>(*I); |
|
|
IRBuilder<> Builder(Inst); |
|
|
switch (Inst->getOpcode()) { |
|
|
case Instruction::Load: { |
|
|
Value *Ptr = Inst->getOperand(0); |
|
|
Value *Index = calculateVectorIndex(Ptr, GEPVectorIdx); |
|
|
Value *BitCast = Builder.CreateBitCast(Alloca, VectorTy->getPointerTo(0)); |
|
|
Value *VecValue = Builder.CreateLoad(BitCast); |
|
|
Value *ExtractElement = Builder.CreateExtractElement(VecValue, Index); |
|
|
Inst->replaceAllUsesWith(ExtractElement); |
|
|
Inst->eraseFromParent(); |
|
|
break; |
|
|
} |
|
|
case Instruction::Store: { |
|
|
Value *Ptr = Inst->getOperand(1); |
|
|
Value *Index = calculateVectorIndex(Ptr, GEPVectorIdx); |
|
|
Value *BitCast = Builder.CreateBitCast(Alloca, VectorTy->getPointerTo(0)); |
|
|
Value *VecValue = Builder.CreateLoad(BitCast); |
|
|
Value *NewVecValue = Builder.CreateInsertElement(VecValue, |
|
|
Inst->getOperand(0), |
|
|
Index); |
|
|
Builder.CreateStore(NewVecValue, BitCast); |
|
|
Inst->eraseFromParent(); |
|
|
break; |
|
|
} |
|
|
case Instruction::BitCast: |
|
|
break; |
|
|
|
|
|
default: |
|
|
Inst->dump(); |
|
|
llvm_unreachable("Do not know how to replace this instruction " |
|
|
"with vector op"); |
|
|
} |
|
|
} |
|
|
return true; |
|
|
} |
|
|
|
|
|
/// Append to \p WorkList the users of \p Val that need fixing up after the
/// pointer changed: every call instruction, and every user that itself has
/// pointer type. Pointer-typed non-call users are followed recursively; calls
/// are recorded but not descended into. Users already present in \p WorkList
/// are skipped.
static void collectUsesWithPtrTypes(Value *Val, std::vector<Value*> &WorkList) {
  for (User *U : Val->users()) {
    const bool AlreadyQueued =
        std::find(WorkList.begin(), WorkList.end(), U) != WorkList.end();
    if (AlreadyQueued)
      continue;

    const bool IsCall = isa<CallInst>(U);
    if (!IsCall && !U->getType()->isPointerTy())
      continue;

    WorkList.push_back(U);

    // Only pointer-producing non-call users propagate the pointer further.
    if (!IsCall)
      collectUsesWithPtrTypes(U, WorkList);
  }
}
|
|
|
|
|
/// Try to eliminate the alloca \p I.
///
/// Vector promotion is attempted first. If that fails and enough local memory
/// is left, the alloca is replaced by a slot in a new local address space
/// global holding one element per work item (WorkGroupSize elements). The
/// slot is selected with
///   (local_size.y * local_size.z) * tidig.x + tidig.y * local_size.z + tidig.z
/// Afterwards every pointer-typed user and every call reached from the new
/// pointer is fixed up so that it uses the local address space.
void AMDGPUPromoteAlloca::visitAlloca(AllocaInst &I) {
  // getAllocatedType() describes a single element only; an alloca with an
  // array-size operand would be under-sized by the code below, so skip it.
  if (I.isArrayAllocation())
    return;

  IRBuilder<> Builder(&I);

  // First try to replace the alloca with a vector
  Type *AllocaTy = I.getAllocatedType();

  DEBUG(dbgs() << "Trying to promote " << I << '\n');

  if (tryPromoteAllocaToVector(&I))
    return;

  DEBUG(dbgs() << " alloca is not a candidate for vectorization.\n");

  // FIXME: This is the maximum work group size.  We should try to get
  // value from the reqd_work_group_size function attribute if it is
  // available.
  const unsigned WorkGroupSize = 256;

  // Keep the size in 64 bits: narrowing it to int could wrap for a large
  // type and make the comparison below succeed by accident.
  const uint64_t AllocaSize =
      WorkGroupSize * Mod->getDataLayout()->getTypeAllocSize(AllocaTy);

  if (LocalMemAvailable < 0 ||
      AllocaSize > static_cast<uint64_t>(LocalMemAvailable)) {
    DEBUG(dbgs() << " Not enough local memory to promote alloca.\n");
    return;
  }

  DEBUG(dbgs() << "Promoting alloca to local memory\n");
  // Safe to narrow: AllocaSize <= LocalMemAvailable was checked above.
  LocalMemAvailable -= static_cast<int>(AllocaSize);

  GlobalVariable *GV = new GlobalVariable(
      *Mod, ArrayType::get(AllocaTy, WorkGroupSize), false,
      GlobalValue::ExternalLinkage, nullptr, I.getName(), nullptr,
      GlobalVariable::NotThreadLocal, AMDGPUAS::LOCAL_ADDRESS);

  FunctionType *FTy = FunctionType::get(
      Type::getInt32Ty(Mod->getContext()), false);

  // addAttribute() returns a new set instead of modifying the receiver, so
  // the result has to be kept. ReadNone describes the function itself, hence
  // FunctionIndex.
  AttributeSet AttrSet;
  AttrSet = AttrSet.addAttribute(Mod->getContext(),
                                 AttributeSet::FunctionIndex,
                                 Attribute::ReadNone);

  Value *ReadLocalSizeY = Mod->getOrInsertFunction(
      "llvm.r600.read.local.size.y", FTy, AttrSet);
  Value *ReadLocalSizeZ = Mod->getOrInsertFunction(
      "llvm.r600.read.local.size.z", FTy, AttrSet);
  Value *ReadTIDIGX = Mod->getOrInsertFunction(
      "llvm.r600.read.tidig.x", FTy, AttrSet);
  Value *ReadTIDIGY = Mod->getOrInsertFunction(
      "llvm.r600.read.tidig.y", FTy, AttrSet);
  Value *ReadTIDIGZ = Mod->getOrInsertFunction(
      "llvm.r600.read.tidig.z", FTy, AttrSet);

  Value *TCntY = Builder.CreateCall(ReadLocalSizeY);
  Value *TCntZ = Builder.CreateCall(ReadLocalSizeZ);
  Value *TIdX = Builder.CreateCall(ReadTIDIGX);
  Value *TIdY = Builder.CreateCall(ReadTIDIGY);
  Value *TIdZ = Builder.CreateCall(ReadTIDIGZ);

  // Flatten the three work-item ids into one index into the global array.
  Value *Tmp0 = Builder.CreateMul(TCntY, TCntZ);
  Tmp0 = Builder.CreateMul(Tmp0, TIdX);
  Value *Tmp1 = Builder.CreateMul(TIdY, TCntZ);
  Value *TID = Builder.CreateAdd(Tmp0, Tmp1);
  TID = Builder.CreateAdd(TID, TIdZ);

  std::vector<Value*> Indices;
  Indices.push_back(Constant::getNullValue(Type::getInt32Ty(Mod->getContext())));
  Indices.push_back(TID);

  Value *Offset = Builder.CreateGEP(GV, Indices);
  // Give the alloca the new pointer type first so that replaceAllUsesWith()
  // sees matching types, then drop it.
  I.mutateType(Offset->getType());
  I.replaceAllUsesWith(Offset);
  I.eraseFromParent();

  std::vector<Value*> WorkList;

  collectUsesWithPtrTypes(Offset, WorkList);

  for (std::vector<Value*>::iterator i = WorkList.begin(),
                                     e = WorkList.end(); i != e; ++i) {
    Value *V = *i;
    CallInst *Call = dyn_cast<CallInst>(V);
    if (!Call) {
      // Plain pointer-typed user: retarget its type to the local address
      // space while keeping the pointee type.
      Type *EltTy = V->getType()->getPointerElementType();
      PointerType *NewTy = PointerType::get(EltTy, AMDGPUAS::LOCAL_ADDRESS);
      V->mutateType(NewTy);
      continue;
    }

    IntrinsicInst *Intr = dyn_cast<IntrinsicInst>(Call);
    if (!Intr) {
      // Ordinary call: redirect it to a "<name>.local" function whose
      // parameter types are the (already updated) argument types.
      std::vector<Type*> ArgTypes;
      for (unsigned ArgIdx = 0, ArgEnd = Call->getNumArgOperands();
                                ArgIdx != ArgEnd; ++ArgIdx) {
        ArgTypes.push_back(Call->getArgOperand(ArgIdx)->getType());
      }
      // NOTE(review): getCalledFunction() is dereferenced unconditionally;
      // confirm indirect calls cannot reach this point.
      Function *F = Call->getCalledFunction();
      FunctionType *NewType = FunctionType::get(Call->getType(), ArgTypes,
                                                F->isVarArg());
      Constant *C = Mod->getOrInsertFunction(StringRef(F->getName().str() + ".local"), NewType,
                                             F->getAttributes());
      Function *NewF = cast<Function>(C);
      Call->setCalledFunction(NewF);
      continue;
    }

    // The builder's previous insertion point (the alloca) has been erased;
    // re-anchor it at the intrinsic before creating anything.
    Builder.SetInsertPoint(Intr);
    switch (Intr->getIntrinsicID()) {
    case Intrinsic::lifetime_start:
    case Intrinsic::lifetime_end:
      // These intrinsics are for address space 0 only
      Intr->eraseFromParent();
      continue;
    case Intrinsic::memcpy: {
      // Re-create the call so that it is built from the current operand
      // types.
      MemCpyInst *MemCpy = cast<MemCpyInst>(Intr);
      Builder.CreateMemCpy(MemCpy->getRawDest(), MemCpy->getRawSource(),
                           MemCpy->getLength(), MemCpy->getAlignment(),
                           MemCpy->isVolatile());
      Intr->eraseFromParent();
      continue;
    }
    case Intrinsic::memset: {
      MemSetInst *MemSet = cast<MemSetInst>(Intr);
      Builder.CreateMemSet(MemSet->getRawDest(), MemSet->getValue(),
                           MemSet->getLength(), MemSet->getAlignment(),
                           MemSet->isVolatile());
      Intr->eraseFromParent();
      continue;
    }
    default:
      Intr->dump();
      llvm_unreachable("Don't know how to promote alloca intrinsic use.");
    }
  }
}
|
|
|
|
|
/// Factory for the alloca promotion pass.
///
/// \param ST Subtarget the new pass instance keeps a reference to.
/// \returns a heap-allocated AMDGPUPromoteAlloca, as a FunctionPass pointer.
FunctionPass *llvm::createAMDGPUPromoteAlloca(const AMDGPUSubtarget &ST) {
  FunctionPass *PromotePass = new AMDGPUPromoteAlloca(ST);
  return PromotePass;
}