[Polly][GPGPU] Added SPIR Code Generation and Corresponding Runtime Support for Intel

Summary:
Added SPIR code generation to the PPCG code generator. It can be invoked with the
polly-gpu-arch flag values 'spir32' or 'spir64' for 32-bit and 64-bit code, respectively.
In addition, runtime support has been added to execute the generated SPIR code on Intel
GPUs on systems equipped with Intel's open-source Beignet driver (development version).
This requires the CMake flag 'USE_INTEL_OCL' to be turned on and the polly-gpu-runtime
flag to be set to 'libopencl'.
The transformation of LLVM IR to SPIR is currently quite a hack, consisting in part of
regex-based string transformations.
Tested and working with Polybench 3.2 on an Intel i7-5500U (integrated graphics).
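
As a usage sketch, the new options might be combined as follows (assuming Polly is loaded
into opt and the PPCG code generator is enabled with -polly-codegen-ppcg; the exact
invocation may differ):

  # Build Polly with the Intel OpenCL runtime support enabled.
  cmake -DUSE_INTEL_OCL=ON <usual LLVM/Polly options> <path/to/llvm>

  # Emit SPIR kernels plus host code that launches them via the OpenCL runtime.
  opt -load LLVMPolly.so -polly-codegen-ppcg \
      -polly-gpu-arch=spir64 -polly-gpu-runtime=libopencl -S input.ll -o output.ll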

Reviewers: bollu, grosser, Meinersbur, singam-sanjay

Reviewed By: grosser, singam-sanjay

Subscribers: pollydev, nemanjai, mgorny, Anastasia, kbarton

Tags: #polly

Differential Revision: https://reviews.llvm.org/D35185

llvm-svn: 308751
phschaad committed Jul 21, 2017
1 parent 4403b2b commit 2f3073b
Showing 5 changed files with 353 additions and 43 deletions.
2 changes: 1 addition & 1 deletion polly/include/polly/CodeGen/PPCGCodeGeneration.h
@@ -16,7 +16,7 @@
#define POLLY_PPCGCODEGENERATION_H

/// The GPU Architecture to target.
enum GPUArch { NVPTX64 };
enum GPUArch { NVPTX64, SPIR32, SPIR64 };

/// The GPU Runtime implementation to use.
enum GPURuntime { CUDA, OpenCL };
169 changes: 158 additions & 11 deletions polly/lib/CodeGen/PPCGCodeGeneration.cpp
@@ -545,6 +545,11 @@ class GPUNodeBuilder : public IslNodeBuilder {
/// @param Kernel The kernel to generate the intrinsic functions for.
void insertKernelIntrinsics(ppcg_kernel *Kernel);

/// Insert function calls to retrieve the SPIR group/local ids.
///
/// @param Kernel The kernel to generate the function calls for.
void insertKernelCallsSPIR(ppcg_kernel *Kernel);

/// Setup the creation of functions referenced by the GPU kernel.
///
/// 1. Create new function declarations in GPUModule which are the same as
@@ -1254,10 +1259,24 @@ void GPUNodeBuilder::createScopStmt(isl_ast_expr *Expr,

void GPUNodeBuilder::createKernelSync() {
Module *M = Builder.GetInsertBlock()->getParent()->getParent();
const char *SpirName = "__gen_ocl_barrier_global";

Function *Sync;

switch (Arch) {
case GPUArch::SPIR64:
case GPUArch::SPIR32:
Sync = M->getFunction(SpirName);

// If Sync is not available, declare it.
if (!Sync) {
GlobalValue::LinkageTypes Linkage = Function::ExternalLinkage;
std::vector<Type *> Args;
FunctionType *Ty = FunctionType::get(Builder.getVoidTy(), Args, false);
Sync = Function::Create(Ty, Linkage, SpirName, M);
Sync->setCallingConv(CallingConv::SPIR_FUNC);
}
break;
case GPUArch::NVPTX64:
Sync = Intrinsic::getDeclaration(M, Intrinsic::nvvm_barrier0);
break;
@@ -1668,7 +1687,8 @@ void GPUNodeBuilder::createKernel(__isl_take isl_ast_node *KernelStmt) {

finalizeKernelArguments(Kernel);
Function *F = Builder.GetInsertBlock()->getParent();
addCUDAAnnotations(F->getParent(), BlockDimX, BlockDimY, BlockDimZ);
if (Arch == GPUArch::NVPTX64)
addCUDAAnnotations(F->getParent(), BlockDimX, BlockDimY, BlockDimZ);
clearDominators(F);
clearScalarEvolution(F);
clearLoops(F);
@@ -1725,12 +1745,35 @@ static std::string computeNVPTXDataLayout(bool is64Bit) {
return Ret;
}

/// Compute the DataLayout string for a SPIR kernel.
///
/// @param is64Bit Are we looking for a 64 bit architecture?
static std::string computeSPIRDataLayout(bool is64Bit) {
std::string Ret = "";

if (!is64Bit) {
Ret += "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:"
"64-f32:32:32-f64:64:64-v16:16:16-v24:32:32-v32:32:"
"32-v48:64:64-v64:64:64-v96:128:128-v128:128:128-v192:"
"256:256-v256:256:256-v512:512:512-v1024:1024:1024";
} else {
Ret += "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:"
"64-f32:32:32-f64:64:64-v16:16:16-v24:32:32-v32:32:"
"32-v48:64:64-v64:64:64-v96:128:128-v128:128:128-v192:"
"256:256-v256:256:256-v512:512:512-v1024:1024:1024";
}

return Ret;
}

Function *
GPUNodeBuilder::createKernelFunctionDecl(ppcg_kernel *Kernel,
SetVector<Value *> &SubtreeValues) {
std::vector<Type *> Args;
std::string Identifier = getKernelFuncName(Kernel->id);

std::vector<Metadata *> MemoryType;

for (long i = 0; i < Prog->n_array; i++) {
if (!ppcg_kernel_requires_array_argument(Kernel, i))
continue;
@@ -1739,16 +1782,23 @@ GPUNodeBuilder::createKernelFunctionDecl(ppcg_kernel *Kernel,
isl_id *Id = isl_space_get_tuple_id(Prog->array[i].space, isl_dim_set);
const ScopArrayInfo *SAI = ScopArrayInfo::getFromId(Id);
Args.push_back(SAI->getElementType());
MemoryType.push_back(
ConstantAsMetadata::get(ConstantInt::get(Builder.getInt32Ty(), 0)));
} else {
static const int UseGlobalMemory = 1;
Args.push_back(Builder.getInt8PtrTy(UseGlobalMemory));
MemoryType.push_back(
ConstantAsMetadata::get(ConstantInt::get(Builder.getInt32Ty(), 1)));
}
}

int NumHostIters = isl_space_dim(Kernel->space, isl_dim_set);

for (long i = 0; i < NumHostIters; i++)
for (long i = 0; i < NumHostIters; i++) {
Args.push_back(Builder.getInt64Ty());
MemoryType.push_back(
ConstantAsMetadata::get(ConstantInt::get(Builder.getInt32Ty(), 0)));
}

int NumVars = isl_space_dim(Kernel->space, isl_dim_param);

@@ -1757,19 +1807,49 @@ GPUNodeBuilder::createKernelFunctionDecl(ppcg_kernel *Kernel,
Value *Val = IDToValue[Id];
isl_id_free(Id);
Args.push_back(Val->getType());
MemoryType.push_back(
ConstantAsMetadata::get(ConstantInt::get(Builder.getInt32Ty(), 0)));
}

for (auto *V : SubtreeValues)
for (auto *V : SubtreeValues) {
Args.push_back(V->getType());
MemoryType.push_back(
ConstantAsMetadata::get(ConstantInt::get(Builder.getInt32Ty(), 0)));
}

auto *FT = FunctionType::get(Builder.getVoidTy(), Args, false);
auto *FN = Function::Create(FT, Function::ExternalLinkage, Identifier,
GPUModule.get());

std::vector<Metadata *> EmptyStrings;

for (unsigned int i = 0; i < MemoryType.size(); i++) {
EmptyStrings.push_back(MDString::get(FN->getContext(), ""));
}

if (Arch == GPUArch::SPIR32 || Arch == GPUArch::SPIR64) {
FN->setMetadata("kernel_arg_addr_space",
MDNode::get(FN->getContext(), MemoryType));
FN->setMetadata("kernel_arg_name",
MDNode::get(FN->getContext(), EmptyStrings));
FN->setMetadata("kernel_arg_access_qual",
MDNode::get(FN->getContext(), EmptyStrings));
FN->setMetadata("kernel_arg_type",
MDNode::get(FN->getContext(), EmptyStrings));
FN->setMetadata("kernel_arg_type_qual",
MDNode::get(FN->getContext(), EmptyStrings));
FN->setMetadata("kernel_arg_base_type",
MDNode::get(FN->getContext(), EmptyStrings));
}

switch (Arch) {
case GPUArch::NVPTX64:
FN->setCallingConv(CallingConv::PTX_Kernel);
break;
case GPUArch::SPIR32:
case GPUArch::SPIR64:
FN->setCallingConv(CallingConv::SPIR_KERNEL);
break;
}

auto Arg = FN->arg_begin();
@@ -1835,6 +1915,9 @@ void GPUNodeBuilder::insertKernelIntrinsics(ppcg_kernel *Kernel) {
Intrinsic::ID IntrinsicsTID[3];

switch (Arch) {
case GPUArch::SPIR64:
case GPUArch::SPIR32:
llvm_unreachable("Cannot generate NVVM intrinsics for SPIR");
case GPUArch::NVPTX64:
IntrinsicsBID[0] = Intrinsic::nvvm_read_ptx_sreg_ctaid_x;
IntrinsicsBID[1] = Intrinsic::nvvm_read_ptx_sreg_ctaid_y;
@@ -1866,6 +1949,41 @@ void GPUNodeBuilder::insertKernelIntrinsics(ppcg_kernel *Kernel) {
}
}

void GPUNodeBuilder::insertKernelCallsSPIR(ppcg_kernel *Kernel) {
const char *GroupName[3] = {"__gen_ocl_get_group_id0",
"__gen_ocl_get_group_id1",
"__gen_ocl_get_group_id2"};

const char *LocalName[3] = {"__gen_ocl_get_local_id0",
"__gen_ocl_get_local_id1",
"__gen_ocl_get_local_id2"};

auto createFunc = [this](const char *Name, __isl_take isl_id *Id) mutable {
Module *M = Builder.GetInsertBlock()->getParent()->getParent();
Function *FN = M->getFunction(Name);

// If FN is not available, declare it.
if (!FN) {
GlobalValue::LinkageTypes Linkage = Function::ExternalLinkage;
std::vector<Type *> Args;
FunctionType *Ty = FunctionType::get(Builder.getInt32Ty(), Args, false);
FN = Function::Create(Ty, Linkage, Name, M);
FN->setCallingConv(CallingConv::SPIR_FUNC);
}

Value *Val = Builder.CreateCall(FN, {});
Val = Builder.CreateIntCast(Val, Builder.getInt64Ty(), false, Name);
IDToValue[Id] = Val;
KernelIDs.insert(std::unique_ptr<isl_id, IslIdDeleter>(Id));
};

for (int i = 0; i < Kernel->n_grid; ++i)
createFunc(GroupName[i], isl_id_list_get_id(Kernel->block_ids, i));

for (int i = 0; i < Kernel->n_block; ++i)
createFunc(LocalName[i], isl_id_list_get_id(Kernel->thread_ids, i));
}

void GPUNodeBuilder::prepareKernelArguments(ppcg_kernel *Kernel, Function *FN) {
auto Arg = FN->arg_begin();
for (long i = 0; i < Kernel->n_array; i++) {
@@ -2004,6 +2122,14 @@ void GPUNodeBuilder::createKernelFunction(
GPUModule->setTargetTriple(Triple::normalize("nvptx64-nvidia-nvcl"));
GPUModule->setDataLayout(computeNVPTXDataLayout(true /* is64Bit */));
break;
case GPUArch::SPIR32:
GPUModule->setTargetTriple(Triple::normalize("spir-unknown-unknown"));
GPUModule->setDataLayout(computeSPIRDataLayout(false /* is64Bit */));
break;
case GPUArch::SPIR64:
GPUModule->setTargetTriple(Triple::normalize("spir64-unknown-unknown"));
GPUModule->setDataLayout(computeSPIRDataLayout(true /* is64Bit */));
break;
}

Function *FN = createKernelFunctionDecl(Kernel, SubtreeValues);
@@ -2021,7 +2147,16 @@

prepareKernelArguments(Kernel, FN);
createKernelVariables(Kernel, FN);
insertKernelIntrinsics(Kernel);

switch (Arch) {
case GPUArch::NVPTX64:
insertKernelIntrinsics(Kernel);
break;
case GPUArch::SPIR32:
case GPUArch::SPIR64:
insertKernelCallsSPIR(Kernel);
break;
}
}

std::string GPUNodeBuilder::createKernelASM() {
@@ -2038,6 +2173,13 @@ std::string GPUNodeBuilder::createKernelASM() {
break;
}
break;
case GPUArch::SPIR64:
case GPUArch::SPIR32:
std::string SPIRAssembly;
raw_string_ostream IROstream(SPIRAssembly);
IROstream << *GPUModule;
IROstream.flush();
return SPIRAssembly;
}

std::string ErrMsg;
@@ -2057,6 +2199,9 @@
case GPUArch::NVPTX64:
subtarget = CudaVersion;
break;
case GPUArch::SPIR32:
case GPUArch::SPIR64:
llvm_unreachable("No subtarget for SPIR architecture");
}

std::unique_ptr<TargetMachine> TargetM(GPUTarget->createTargetMachine(
@@ -2097,13 +2242,15 @@ std::string GPUNodeBuilder::finalizeKernelFunction() {
if (DumpKernelIR)
outs() << *GPUModule << "\n";

// Optimize module.
llvm::legacy::PassManager OptPasses;
PassManagerBuilder PassBuilder;
PassBuilder.OptLevel = 3;
PassBuilder.SizeLevel = 0;
PassBuilder.populateModulePassManager(OptPasses);
OptPasses.run(*GPUModule);
if (Arch != GPUArch::SPIR32 && Arch != GPUArch::SPIR64) {
// Optimize module.
llvm::legacy::PassManager OptPasses;
PassManagerBuilder PassBuilder;
PassBuilder.OptLevel = 3;
PassBuilder.SizeLevel = 0;
PassBuilder.populateModulePassManager(OptPasses);
OptPasses.run(*GPUModule);
}

std::string Assembly = createKernelASM();

6 changes: 5 additions & 1 deletion polly/lib/Support/RegisterPasses.cpp
@@ -117,7 +117,7 @@ static cl::opt<GPURuntime> GPURuntimeChoice(
static cl::opt<GPUArch>
GPUArchChoice("polly-gpu-arch", cl::desc("The GPU Architecture to target"),
cl::values(clEnumValN(GPUArch::NVPTX64, "nvptx64",
"target NVIDIA 64-bit architecture")),
"target NVIDIA 64-bit architecture"),
clEnumValN(GPUArch::SPIR32, "spir32",
"target SPIR 32-bit architecture"),
clEnumValN(GPUArch::SPIR64, "spir64",
"target SPIR 64-bit architecture")),
cl::init(GPUArch::NVPTX64), cl::ZeroOrMore,
cl::cat(PollyCategory));
#endif