diff --git a/llvm/CODE_OWNERS.TXT b/llvm/CODE_OWNERS.TXT index eb17f5cb597d39..c6ac488c4c5460 100644 --- a/llvm/CODE_OWNERS.TXT +++ b/llvm/CODE_OWNERS.TXT @@ -254,5 +254,5 @@ E: zixuan.wu@linux.alibaba.com D: C-SKY backend (lib/Target/CSKY/*) N: Ilia Diachkov -E: iliya.diyachkov@intel.com +E: ilia.diachkov@gmail.com D: SPIR-V backend (lib/Target/SPIRV/*) diff --git a/llvm/lib/Target/SPIRV/SPIRVBuiltins.cpp b/llvm/lib/Target/SPIRV/SPIRVBuiltins.cpp index ce2eaaf5b4d9c3..e9418fd023ac12 100644 --- a/llvm/lib/Target/SPIRV/SPIRVBuiltins.cpp +++ b/llvm/lib/Target/SPIRV/SPIRVBuiltins.cpp @@ -14,6 +14,7 @@ #include "SPIRVBuiltins.h" #include "SPIRV.h" #include "SPIRVUtils.h" +#include "llvm/Analysis/ValueTracking.h" #include "llvm/IR/IntrinsicsSPIRV.h" #include <string> #include <tuple> @@ -1361,6 +1362,156 @@ static bool generateSpecConstantInst(const SPIRV::IncomingCall *Call, } } +static MachineInstr *getBlockStructInstr(Register ParamReg, + MachineRegisterInfo *MRI) { + // We expect the following sequence of instructions: + // %0:_(pN) = G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.spv.alloca) + // or = G_GLOBAL_VALUE @block_literal_global + // %1:_(pN) = G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.spv.bitcast), %0 + // %2:_(p4) = G_ADDRSPACE_CAST %1:_(pN) + MachineInstr *MI = MRI->getUniqueVRegDef(ParamReg); + assert(MI->getOpcode() == TargetOpcode::G_ADDRSPACE_CAST && + MI->getOperand(1).isReg()); + Register BitcastReg = MI->getOperand(1).getReg(); + MachineInstr *BitcastMI = MRI->getUniqueVRegDef(BitcastReg); + assert(isSpvIntrinsic(*BitcastMI, Intrinsic::spv_bitcast) && + BitcastMI->getOperand(2).isReg()); + Register ValueReg = BitcastMI->getOperand(2).getReg(); + MachineInstr *ValueMI = MRI->getUniqueVRegDef(ValueReg); + return ValueMI; +} + +// Return the type of the instruction result, as recorded by the spv_assign_type intrinsic. +// TODO: maybe unify with prelegalizer pass. +static const Type *getMachineInstrType(MachineInstr *MI) { + MachineInstr *NextMI = MI->getNextNode(); + if (isSpvIntrinsic(*NextMI, Intrinsic::spv_assign_name)) + NextMI = NextMI->getNextNode(); + Register ValueReg = MI->getOperand(0).getReg(); + if (!isSpvIntrinsic(*NextMI, Intrinsic::spv_assign_type) || + NextMI->getOperand(1).getReg() != ValueReg) + return nullptr; + Type *Ty = getMDOperandAsType(NextMI->getOperand(2).getMetadata(), 0); + assert(Ty && "Type is expected"); + return getTypedPtrEltType(Ty); +} + +static const Type *getBlockStructType(Register ParamReg, + MachineRegisterInfo *MRI) { + // In principle, this information should be passed to us from Clang via + // an elementtype attribute. However, said attribute requires that + // the function call be an intrinsic, which it is not. Instead, we rely on being + // able to trace this to the declaration of a variable: OpenCL C specification + // section 6.12.5 should guarantee that we can do this. + MachineInstr *MI = getBlockStructInstr(ParamReg, MRI); + if (MI->getOpcode() == TargetOpcode::G_GLOBAL_VALUE) + return getTypedPtrEltType(MI->getOperand(1).getGlobal()->getType()); + assert(isSpvIntrinsic(*MI, Intrinsic::spv_alloca) && + "Blocks in OpenCL C must be traceable to allocation site"); + return getMachineInstrType(MI); +} + +// TODO: maybe move to the global registry.
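+// Note: the LLVM type built below is a Generic-storage pointer to a +// Function-storage pointer to the opaque event struct. In this backend, a +// pointer to such a special opaque struct lowers to the builtin type itself, +// so the resulting SPIR-V type is an OpTypePointer Generic to +// OpTypeDeviceEvent, which is the type used for the Wait Events and Ret Event +// operands of OpEnqueueKernel (see the OpConstantNull checks in the new tests).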
+static SPIRVType * +getOrCreateSPIRVDeviceEventPointer(MachineIRBuilder &MIRBuilder, + SPIRVGlobalRegistry *GR) { + LLVMContext &Context = MIRBuilder.getMF().getFunction().getContext(); + Type *OpaqueType = StructType::getTypeByName(Context, "spirv.DeviceEvent"); + if (!OpaqueType) + OpaqueType = StructType::getTypeByName(Context, "opencl.clk_event_t"); + if (!OpaqueType) + OpaqueType = StructType::create(Context, "spirv.DeviceEvent"); + unsigned SC0 = storageClassToAddressSpace(SPIRV::StorageClass::Function); + unsigned SC1 = storageClassToAddressSpace(SPIRV::StorageClass::Generic); + Type *PtrType = PointerType::get(PointerType::get(OpaqueType, SC0), SC1); + return GR->getOrCreateSPIRVType(PtrType, MIRBuilder); +} + +static bool buildEnqueueKernel(const SPIRV::IncomingCall *Call, + MachineIRBuilder &MIRBuilder, + SPIRVGlobalRegistry *GR) { + MachineRegisterInfo *MRI = MIRBuilder.getMRI(); + const DataLayout &DL = MIRBuilder.getDataLayout(); + bool HasEvents = Call->Builtin->Name.find("events") != StringRef::npos; + const SPIRVType *Int32Ty = GR->getOrCreateSPIRVIntegerType(32, MIRBuilder); + + // Build the vararg instructions before OpEnqueueKernel. + // Local size arguments: sizes of the block invoke arguments. Clang generates + // the local size operands as an array, so we need to unpack them. + SmallVector<Register> LocalSizes; + if (Call->Builtin->Name.find("_varargs") != StringRef::npos) { + const unsigned LocalSizeArrayIdx = HasEvents ? 9 : 6; + Register GepReg = Call->Arguments[LocalSizeArrayIdx]; + MachineInstr *GepMI = MRI->getUniqueVRegDef(GepReg); + assert(isSpvIntrinsic(*GepMI, Intrinsic::spv_gep) && + GepMI->getOperand(3).isReg()); + Register ArrayReg = GepMI->getOperand(3).getReg(); + MachineInstr *ArrayMI = MRI->getUniqueVRegDef(ArrayReg); + const Type *LocalSizeTy = getMachineInstrType(ArrayMI); + assert(LocalSizeTy && "Local size type is expected"); + const uint64_t LocalSizeNum = + cast<ArrayType>(LocalSizeTy)->getNumElements(); + unsigned SC = storageClassToAddressSpace(SPIRV::StorageClass::Generic); + const LLT LLType = LLT::pointer(SC, GR->getPointerSize()); + const SPIRVType *PointerSizeTy = GR->getOrCreateSPIRVPointerType( + Int32Ty, MIRBuilder, SPIRV::StorageClass::Function); + for (unsigned I = 0; I < LocalSizeNum; ++I) { + Register Reg = + MIRBuilder.getMRI()->createVirtualRegister(&SPIRV::IDRegClass); + MIRBuilder.getMRI()->setType(Reg, LLType); + GR->assignSPIRVTypeToVReg(PointerSizeTy, Reg, MIRBuilder.getMF()); + auto GEPInst = MIRBuilder.buildIntrinsic(Intrinsic::spv_gep, + ArrayRef<Register>{Reg}, true); + GEPInst + .addImm(GepMI->getOperand(2).getImm()) // In-bounds flag. + .addUse(ArrayMI->getOperand(0).getReg()) // Alloca. + .addUse(buildConstantIntReg(0, MIRBuilder, GR)) // Indices. + .addUse(buildConstantIntReg(I, MIRBuilder, GR)); + LocalSizes.push_back(Reg); + } + } + + // The SPIR-V OpEnqueueKernel instruction has 10+ arguments. + auto MIB = MIRBuilder.buildInstr(SPIRV::OpEnqueueKernel) + .addDef(Call->ReturnRegister) + .addUse(GR->getSPIRVTypeID(Int32Ty)); + + // Copy all arguments before the block invoke function pointer. + const unsigned BlockFIdx = HasEvents ? 6 : 3; + for (unsigned i = 0; i < BlockFIdx; i++) + MIB.addUse(Call->Arguments[i]); + + // If there are no event arguments in the original call, add dummy ones. + if (!HasEvents) { + MIB.addUse(buildConstantIntReg(0, MIRBuilder, GR)); // Dummy num events. + Register NullPtr = GR->getOrCreateConstNullPtr( + MIRBuilder, getOrCreateSPIRVDeviceEventPointer(MIRBuilder, GR)); + MIB.addUse(NullPtr); // Dummy wait events.
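+ // The same OpConstantNull of the device-event pointer type doubles as the + // dummy ret event operand added below.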
+ MIB.addUse(NullPtr); // Dummy ret event. + } + + MachineInstr *BlockMI = getBlockStructInstr(Call->Arguments[BlockFIdx], MRI); + assert(BlockMI->getOpcode() == TargetOpcode::G_GLOBAL_VALUE); + // Invoke: Pointer to invoke function. + MIB.addGlobalAddress(BlockMI->getOperand(1).getGlobal()); + + Register BlockLiteralReg = Call->Arguments[BlockFIdx + 1]; + // Param: Pointer to block literal. + MIB.addUse(BlockLiteralReg); + + Type *PType = const_cast<Type *>(getBlockStructType(BlockLiteralReg, MRI)); + // TODO: these numbers should be obtained from the block literal structure. + // Param Size: Size of block literal structure. + MIB.addUse(buildConstantIntReg(DL.getTypeStoreSize(PType), MIRBuilder, GR)); + // Param Alignment: Alignment of block literal structure. + MIB.addUse( + buildConstantIntReg(DL.getPrefTypeAlignment(PType), MIRBuilder, GR)); + + for (unsigned i = 0; i < LocalSizes.size(); i++) + MIB.addUse(LocalSizes[i]); + return true; +} + static bool generateEnqueueInst(const SPIRV::IncomingCall *Call, MachineIRBuilder &MIRBuilder, SPIRVGlobalRegistry *GR) { @@ -1450,6 +1601,8 @@ static bool generateEnqueueInst(const SPIRV::IncomingCall *Call, .addUse(Call->Arguments[0]) .addUse(TmpReg); } + case SPIRV::OpEnqueueKernel: + return buildEnqueueKernel(Call, MIRBuilder, GR); default: return false; } @@ -1856,6 +2009,9 @@ SPIRVType *lowerBuiltinType(const StructType *OpaqueType, case SPIRV::OpTypePipe: TargetType = getPipeType(OpaqueType, MIRBuilder, GR); break; + case SPIRV::OpTypeDeviceEvent: + TargetType = GR->getOrCreateOpTypeDeviceEvent(MIRBuilder); + break; case SPIRV::OpTypeSampler: TargetType = getSamplerType(MIRBuilder, GR); break; diff --git a/llvm/lib/Target/SPIRV/SPIRVBuiltins.td b/llvm/lib/Target/SPIRV/SPIRVBuiltins.td index f9b7e55e29f959..c82354bb39f431 100644 --- a/llvm/lib/Target/SPIRV/SPIRVBuiltins.td +++ b/llvm/lib/Target/SPIRV/SPIRVBuiltins.td @@ -533,6 +533,10 @@ defm : DemangledNativeBuiltin<"barrier", OpenCL_std, Barrier, 1, 3, OpControlBar defm : DemangledNativeBuiltin<"work_group_barrier", OpenCL_std, Barrier, 1, 3, OpControlBarrier>; // Kernel enqueue builtin records: +defm : DemangledNativeBuiltin<"__enqueue_kernel_basic", OpenCL_std, Enqueue, 5, 5, OpEnqueueKernel>; +defm : DemangledNativeBuiltin<"__enqueue_kernel_basic_events", OpenCL_std, Enqueue, 8, 8, OpEnqueueKernel>; +defm : DemangledNativeBuiltin<"__enqueue_kernel_varargs", OpenCL_std, Enqueue, 7, 7, OpEnqueueKernel>; +defm : DemangledNativeBuiltin<"__enqueue_kernel_events_varargs", OpenCL_std, Enqueue, 10, 10, OpEnqueueKernel>; defm : DemangledNativeBuiltin<"retain_event", OpenCL_std, Enqueue, 1, 1, OpRetainEvent>; defm : DemangledNativeBuiltin<"release_event", OpenCL_std, Enqueue, 1, 1, OpReleaseEvent>; defm : DemangledNativeBuiltin<"create_user_event", OpenCL_std, Enqueue, 0, 0, OpCreateUserEvent>; diff --git a/llvm/lib/Target/SPIRV/SPIRVDuplicatesTracker.h b/llvm/lib/Target/SPIRV/SPIRVDuplicatesTracker.h index 64df5064793aa6..00553d9710b69c 100644 --- a/llvm/lib/Target/SPIRV/SPIRVDuplicatesTracker.h +++ b/llvm/lib/Target/SPIRV/SPIRVDuplicatesTracker.h @@ -58,6 +58,7 @@ struct SpecialTypeDescriptor { STK_SampledImage, STK_Sampler, STK_Pipe, + STK_DeviceEvent, STK_Last = -1 }; SpecialTypeKind Kind; @@ -147,6 +148,18 @@ struct PipeTypeDescriptor : public SpecialTypeDescriptor { return TD->Kind == SpecialTypeKind::STK_Pipe; } }; + +struct DeviceEventTypeDescriptor : public SpecialTypeDescriptor { + + DeviceEventTypeDescriptor() + : SpecialTypeDescriptor(SpecialTypeKind::STK_DeviceEvent) { + Hash = Kind; + } + +
static bool classof(const SpecialTypeDescriptor *TD) { + return TD->Kind == SpecialTypeKind::STK_DeviceEvent; + } +}; } // namespace SPIRV template <> struct DenseMapInfo<SPIRV::SpecialTypeDescriptor> { diff --git a/llvm/lib/Target/SPIRV/SPIRVGlobalRegistry.cpp b/llvm/lib/Target/SPIRV/SPIRVGlobalRegistry.cpp index bbb86ce5595a53..0f85c4839e107c 100644 --- a/llvm/lib/Target/SPIRV/SPIRVGlobalRegistry.cpp +++ b/llvm/lib/Target/SPIRV/SPIRVGlobalRegistry.cpp @@ -390,6 +390,26 @@ SPIRVGlobalRegistry::getOrCreateConsIntArray(uint64_t Val, LLVMArrTy->getNumElements()); } +Register +SPIRVGlobalRegistry::getOrCreateConstNullPtr(MachineIRBuilder &MIRBuilder, + SPIRVType *SpvType) { + const Type *LLVMTy = getTypeForSPIRVType(SpvType); + const PointerType *LLVMPtrTy = cast<PointerType>(LLVMTy); + // Find a constant in DT or build a new one. + Constant *CP = ConstantPointerNull::get(const_cast<PointerType *>(LLVMPtrTy)); + Register Res = DT.find(CP, CurMF); + if (!Res.isValid()) { + LLT LLTy = LLT::pointer(LLVMPtrTy->getAddressSpace(), PointerSize); + Res = CurMF->getRegInfo().createGenericVirtualRegister(LLTy); + assignSPIRVTypeToVReg(SpvType, Res, *CurMF); + MIRBuilder.buildInstr(SPIRV::OpConstantNull) + .addDef(Res) + .addUse(getSPIRVTypeID(SpvType)); + DT.add(CP, CurMF, Res); + } + return Res; +} + Register SPIRVGlobalRegistry::buildConstantSampler( Register ResReg, unsigned AddrMode, unsigned Param, unsigned FilerMode, MachineIRBuilder &MIRBuilder, SPIRVType *SpvType) { @@ -847,6 +867,16 @@ SPIRVType *SPIRVGlobalRegistry::getOrCreateOpTypePipe( .addImm(AccessQual); } +SPIRVType *SPIRVGlobalRegistry::getOrCreateOpTypeDeviceEvent( + MachineIRBuilder &MIRBuilder) { + SPIRV::DeviceEventTypeDescriptor TD; + if (auto *Res = checkSpecialInstr(TD, MIRBuilder)) + return Res; + Register ResVReg = createTypeVReg(MIRBuilder); + DT.add(TD, &MIRBuilder.getMF(), ResVReg); + return MIRBuilder.buildInstr(SPIRV::OpTypeDeviceEvent).addDef(ResVReg); +} + SPIRVType *SPIRVGlobalRegistry::getOrCreateOpTypeSampledImage( SPIRVType *ImageType, MachineIRBuilder &MIRBuilder) { SPIRV::SampledImageTypeDescriptor TD( diff --git a/llvm/lib/Target/SPIRV/SPIRVGlobalRegistry.h b/llvm/lib/Target/SPIRV/SPIRVGlobalRegistry.h index 667802a84ee475..88769f84b3e504 100644 --- a/llvm/lib/Target/SPIRV/SPIRVGlobalRegistry.h +++ b/llvm/lib/Target/SPIRV/SPIRVGlobalRegistry.h @@ -245,6 +245,8 @@ class SPIRVGlobalRegistry { SPIRVType *SpvType, bool EmitIR = true); Register getOrCreateConsIntArray(uint64_t Val, MachineIRBuilder &MIRBuilder, SPIRVType *SpvType, bool EmitIR = true); + Register getOrCreateConstNullPtr(MachineIRBuilder &MIRBuilder, + SPIRVType *SpvType); Register buildConstantSampler(Register Res, unsigned AddrMode, unsigned Param, unsigned FilerMode, MachineIRBuilder &MIRBuilder, @@ -300,6 +302,7 @@ class SPIRVGlobalRegistry { SPIRVType * getOrCreateOpTypePipe(MachineIRBuilder &MIRBuilder, SPIRV::AccessQualifier::AccessQualifier AccQual); + SPIRVType *getOrCreateOpTypeDeviceEvent(MachineIRBuilder &MIRBuilder); SPIRVType *getOrCreateOpTypeFunctionWithArgs( const Type *Ty, SPIRVType *RetType, const SmallVectorImpl<SPIRVType *> &ArgTypes, diff --git a/llvm/lib/Target/SPIRV/SPIRVInstrInfo.td b/llvm/lib/Target/SPIRV/SPIRVInstrInfo.td index c58a3ba0403be6..e1521d44e4e53e 100644 --- a/llvm/lib/Target/SPIRV/SPIRVInstrInfo.td +++ b/llvm/lib/Target/SPIRV/SPIRVInstrInfo.td @@ -688,6 +688,9 @@ def OpGroupUMax: OpGroup<"UMax", 270>; def OpGroupSMax: OpGroup<"SMax", 271>; // TODO: 3.42.22.
Device-Side Enqueue Instructions +def OpEnqueueKernel: Op<292, (outs ID:$res), (ins TYPE:$type, ID:$queue, ID:$flags, ID:$NDR, ID:$nevents, ID:$wevents, + ID:$revent, ID:$invoke, ID:$param, ID:$psize, ID:$palign, variable_ops), + "$res = OpEnqueueKernel $type $queue $flags $NDR $nevents $wevents $revent $invoke $param $psize $palign">; def OpRetainEvent: Op<297, (outs), (ins ID:$event), "OpRetainEvent $event">; def OpReleaseEvent: Op<298, (outs), (ins ID:$event), "OpReleaseEvent $event">; def OpCreateUserEvent: Op<299, (outs ID:$res), (ins TYPE:$type), diff --git a/llvm/lib/Target/SPIRV/SPIRVUtils.cpp b/llvm/lib/Target/SPIRV/SPIRVUtils.cpp index 07bcdbdc05da26..0f024efdc329d7 100644 --- a/llvm/lib/Target/SPIRV/SPIRVUtils.cpp +++ b/llvm/lib/Target/SPIRV/SPIRVUtils.cpp @@ -342,12 +342,15 @@ static bool isSPIRVBuiltinType(const StructType *SType) { SType->getName().startswith("spirv."); } +const Type *getTypedPtrEltType(const Type *Ty) { + auto PType = dyn_cast<PointerType>(Ty); + if (!PType || PType->isOpaque()) + return Ty; + return PType->getNonOpaquePointerElementType(); +} + bool isSpecialOpaqueType(const Type *Ty) { - if (auto PType = dyn_cast<PointerType>(Ty)) { - if (!PType->isOpaque()) - Ty = PType->getNonOpaquePointerElementType(); - } - if (auto SType = dyn_cast<StructType>(Ty)) + if (auto SType = dyn_cast<StructType>(getTypedPtrEltType(Ty))) return isOpenCLBuiltinType(SType) || isSPIRVBuiltinType(SType); return false; } diff --git a/llvm/lib/Target/SPIRV/SPIRVUtils.h b/llvm/lib/Target/SPIRV/SPIRVUtils.h index e4e07dc68a37d8..09e14a0f84a3a8 100644 --- a/llvm/lib/Target/SPIRV/SPIRVUtils.h +++ b/llvm/lib/Target/SPIRV/SPIRVUtils.h @@ -88,6 +88,10 @@ Type *getMDOperandAsType(const MDNode *N, unsigned I); // name, otherwise return an empty string. std::string getOclOrSpirvBuiltinDemangledName(StringRef Name); +// If Type is a pointer type and not an opaque pointer, return its +// element type; otherwise return Type. +const Type *getTypedPtrEltType(const Type *Type); + // Check if given LLVM type is a special opaque builtin type. bool isSpecialOpaqueType(const Type *Ty); diff --git a/llvm/test/CodeGen/SPIRV/EnqueueEmptyKernel.ll b/llvm/test/CodeGen/SPIRV/EnqueueEmptyKernel.ll new file mode 100644 index 00000000000000..679f8ff7a0017e --- /dev/null +++ b/llvm/test/CodeGen/SPIRV/EnqueueEmptyKernel.ll @@ -0,0 +1,64 @@ +; RUN: llc -O0 -mtriple=spirv64-unknown-unknown %s -o - | FileCheck %s --check-prefix=CHECK-SPIRV + +;; This test checks that the Invoke parameter of the OpEnqueueKernel instruction meets the +;; following specification requirements in the case of enqueueing an empty block: +;; "Invoke must be an OpFunction whose OpTypeFunction operand has: +;; - Result Type must be OpTypeVoid. +;; - The first parameter must have a type of OpTypePointer to an 8-bit OpTypeInt. +;; - An optional list of parameters, each of which must have a type of OpTypePointer to the Workgroup Storage Class. +;; ...
" +;; __kernel void test_enqueue_empty() { +;; enqueue_kernel(get_default_queue(), +;; CLK_ENQUEUE_FLAGS_WAIT_KERNEL, +;; ndrange_1D(1), +;; 0, NULL, NULL, +;; ^(){}); +;; } + +%struct.ndrange_t = type { i32, [3 x i64], [3 x i64], [3 x i64] } +%opencl.queue_t = type opaque +%opencl.clk_event_t = type opaque + +@__block_literal_global = internal addrspace(1) constant { i32, i32 } { i32 8, i32 4 }, align 4 + +; CHECK-SPIRV: OpName %[[#Block:]] "__block_literal_global" +; CHECK-SPIRV: %[[#Void:]] = OpTypeVoid +; CHECK-SPIRV: %[[#Int8:]] = OpTypeInt 8 +; CHECK-SPIRV: %[[#Int8PtrGen:]] = OpTypePointer Generic %[[#Int8]] +; CHECK-SPIRV: %[[#Int8Ptr:]] = OpTypePointer CrossWorkgroup %[[#Int8]] +; CHECK-SPIRV: %[[#Block]] = OpVariable %[[#]] + +define spir_kernel void @test_enqueue_empty() { +entry: + %tmp = alloca %struct.ndrange_t, align 8 + %call = call spir_func %opencl.queue_t* @_Z17get_default_queuev() + call spir_func void @_Z10ndrange_1Dm(%struct.ndrange_t* sret(%struct.ndrange_t*) %tmp, i64 1) + %0 = call i32 @__enqueue_kernel_basic_events(%opencl.queue_t* %call, i32 1, %struct.ndrange_t* %tmp, i32 0, %opencl.clk_event_t* addrspace(4)* null, %opencl.clk_event_t* addrspace(4)* null, i8 addrspace(4)* addrspacecast (i8* bitcast (void (i8 addrspace(4)*)* @__test_enqueue_empty_block_invoke_kernel to i8*) to i8 addrspace(4)*), i8 addrspace(4)* addrspacecast (i8 addrspace(1)* bitcast ({ i32, i32 } addrspace(1)* @__block_literal_global to i8 addrspace(1)*) to i8 addrspace(4)*)) + ret void +; CHECK-SPIRV: %[[#Int8PtrBlock:]] = OpBitcast %[[#Int8Ptr]] %[[#Block]] +; CHECK-SPIRV: %[[#Int8PtrGenBlock:]] = OpPtrCastToGeneric %[[#Int8PtrGen]] %[[#Int8PtrBlock]] +; CHECK-SPIRV: %[[#]] = OpEnqueueKernel %[[#]] %[[#]] %[[#]] %[[#]] %[[#]] %[[#]] %[[#]] %[[#Invoke:]] %[[#Int8PtrGenBlock]] %[[#]] %[[#]] +} + +declare spir_func %opencl.queue_t* @_Z17get_default_queuev() + +declare spir_func void @_Z10ndrange_1Dm(%struct.ndrange_t* sret(%struct.ndrange_t*), i64) + +define internal spir_func void @__test_enqueue_empty_block_invoke(i8 addrspace(4)* %.block_descriptor) { +entry: + %.block_descriptor.addr = alloca i8 addrspace(4)*, align 8 + store i8 addrspace(4)* %.block_descriptor, i8 addrspace(4)** %.block_descriptor.addr, align 8 + %block = bitcast i8 addrspace(4)* %.block_descriptor to <{ i32, i32 }> addrspace(4)* + ret void +} + +define internal spir_kernel void @__test_enqueue_empty_block_invoke_kernel(i8 addrspace(4)*) { +entry: + call void @__test_enqueue_empty_block_invoke(i8 addrspace(4)* %0) + ret void +} + +declare i32 @__enqueue_kernel_basic_events(%opencl.queue_t*, i32, %struct.ndrange_t*, i32, %opencl.clk_event_t* addrspace(4)*, %opencl.clk_event_t* addrspace(4)*, i8 addrspace(4)*, i8 addrspace(4)*) + +; CHECK-SPIRV: %[[#Invoke]] = OpFunction %[[#Void]] None %[[#]] +; CHECK-SPIRV-NEXT: %[[#]] = OpFunctionParameter %[[#Int8PtrGen]] diff --git a/llvm/test/CodeGen/SPIRV/transcoding/enqueue_kernel.ll b/llvm/test/CodeGen/SPIRV/transcoding/enqueue_kernel.ll new file mode 100644 index 00000000000000..6de03dd1651879 --- /dev/null +++ b/llvm/test/CodeGen/SPIRV/transcoding/enqueue_kernel.ll @@ -0,0 +1,385 @@ +; RUN: llc -O0 -mtriple=spirv32-unknown-unknown %s -o - | FileCheck %s --check-prefix=CHECK-SPIRV + +; CHECK-SPIRV: OpEntryPoint Kernel %[[#BlockKer1:]] "__device_side_enqueue_block_invoke_kernel" +; CHECK-SPIRV: OpEntryPoint Kernel %[[#BlockKer2:]] "__device_side_enqueue_block_invoke_2_kernel" +; CHECK-SPIRV: OpEntryPoint Kernel %[[#BlockKer3:]] "__device_side_enqueue_block_invoke_3_kernel" +; 
CHECK-SPIRV: OpEntryPoint Kernel %[[#BlockKer4:]] "__device_side_enqueue_block_invoke_4_kernel" +; CHECK-SPIRV: OpEntryPoint Kernel %[[#BlockKer5:]] "__device_side_enqueue_block_invoke_5_kernel" +; CHECK-SPIRV: OpName %[[#BlockGlb1:]] "__block_literal_global" +; CHECK-SPIRV: OpName %[[#BlockGlb2:]] "__block_literal_global.1" + +; CHECK-SPIRV: %[[#Int32Ty:]] = OpTypeInt 32 +; CHECK-SPIRV: %[[#Int8Ty:]] = OpTypeInt 8 +; CHECK-SPIRV: %[[#VoidTy:]] = OpTypeVoid +; CHECK-SPIRV: %[[#Int8PtrGenTy:]] = OpTypePointer Generic %[[#Int8Ty]] +; CHECK-SPIRV: %[[#EventTy:]] = OpTypeDeviceEvent +; CHECK-SPIRV: %[[#EventPtrTy:]] = OpTypePointer Generic %[[#EventTy]] +; CHECK-SPIRV: %[[#Int32LocPtrTy:]] = OpTypePointer Function %[[#Int32Ty]] +; CHECK-SPIRV: %[[#BlockStructTy:]] = OpTypeStruct +; CHECK-SPIRV: %[[#BlockStructLocPtrTy:]] = OpTypePointer Function %[[#BlockStructTy]] +; CHECK-SPIRV: %[[#BlockTy1:]] = OpTypeFunction %[[#VoidTy]] %[[#Int8PtrGenTy]] +; CHECK-SPIRV: %[[#BlockTy2:]] = OpTypeFunction %[[#VoidTy]] %[[#Int8PtrGenTy]] +; CHECK-SPIRV: %[[#BlockTy3:]] = OpTypeFunction %[[#VoidTy]] %[[#Int8PtrGenTy]] + +; CHECK-SPIRV: %[[#ConstInt0:]] = OpConstant %[[#Int32Ty]] 0 +; CHECK-SPIRV: %[[#EventNull:]] = OpConstantNull %[[#EventPtrTy]] +; CHECK-SPIRV: %[[#ConstInt21:]] = OpConstant %[[#Int32Ty]] 21 +; CHECK-SPIRV: %[[#ConstInt8:]] = OpConstant %[[#Int32Ty]] 8 +; CHECK-SPIRV: %[[#ConstInt24:]] = OpConstant %[[#Int32Ty]] 24 +; CHECK-SPIRV: %[[#ConstInt12:]] = OpConstant %[[#Int32Ty]] 12 +; CHECK-SPIRV: %[[#ConstInt2:]] = OpConstant %[[#Int32Ty]] 2 + +;; typedef struct {int a;} ndrange_t; +;; #define NULL ((void*)0) + +;; kernel void device_side_enqueue(global int *a, global int *b, int i, char c0) { +;; queue_t default_queue; +;; unsigned flags = 0; +;; ndrange_t ndrange; +;; clk_event_t clk_event; +;; clk_event_t event_wait_list; +;; clk_event_t event_wait_list2[] = {clk_event}; + +;; Emits block literal on stack and block kernel. + +; CHECK-SPIRV: %[[#BlockLitPtr1:]] = OpBitcast %[[#BlockStructLocPtrTy]] +; CHECK-SPIRV-NEXT: %[[#BlockLit1:]] = OpPtrCastToGeneric %[[#Int8PtrGenTy]] %[[#BlockLitPtr1]] +; CHECK-SPIRV-NEXT: %[[#]] = OpEnqueueKernel %[[#Int32Ty]] %[[#]] %[[#]] %[[#]] %[[#ConstInt0]] %[[#EventNull]] %[[#EventNull]] %[[#BlockKer1]] %[[#BlockLit1]] %[[#ConstInt21]] %[[#ConstInt8]] + +;; enqueue_kernel(default_queue, flags, ndrange, +;; ^(void) { +;; a[i] = c0; +;; }); + +;; Emits block literal on stack and block kernel. + +; CHECK-SPIRV: %[[#Event1:]] = OpPtrCastToGeneric %[[#EventPtrTy]] +; CHECK-SPIRV: %[[#Event2:]] = OpPtrCastToGeneric %[[#EventPtrTy]] +; CHECK-SPIRV: %[[#BlockLitPtr2:]] = OpBitcast %[[#BlockStructLocPtrTy]] +; CHECK-SPIRV-NEXT: %[[#BlockLit2:]] = OpPtrCastToGeneric %[[#Int8PtrGenTy]] %[[#BlockLitPtr2]] +; CHECK-SPIRV-NEXT: %[[#]] = OpEnqueueKernel %[[#Int32Ty]] %[[#]] %[[#]] %[[#]] %[[#ConstInt2]] %[[#Event1]] %[[#Event2]] %[[#BlockKer2]] %[[#BlockLit2]] %[[#ConstInt24]] %[[#ConstInt8]] + +;; enqueue_kernel(default_queue, flags, ndrange, 2, &event_wait_list, &clk_event, +;; ^(void) { +;; a[i] = b[i]; +;; }); + +;; char c; +;; Emits global block literal and block kernel. 
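+;; Note: for the _varargs enqueue forms checked below, each local size is +;; materialized with OpPtrAccessChain as a Function-storage pointer to a +;; 32-bit integer (the LocalBuf operands), matching the pointer-to-i32 Local +;; Size operands that OpEnqueueKernel takes.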
+ +; CHECK-SPIRV: %[[#Event1:]] = OpPtrCastToGeneric %[[#EventPtrTy]] +; CHECK-SPIRV: %[[#Event2:]] = OpPtrCastToGeneric %[[#EventPtrTy]] +; CHECK-SPIRV: %[[#BlockLit3Tmp:]] = OpBitcast %[[#]] %[[#BlockGlb1]] +; CHECK-SPIRV: %[[#BlockLit3:]] = OpPtrCastToGeneric %[[#Int8PtrGenTy]] %[[#BlockLit3Tmp]] +; CHECK-SPIRV: %[[#LocalBuf31:]] = OpPtrAccessChain %[[#Int32LocPtrTy]] +; CHECK-SPIRV: %[[#]] = OpEnqueueKernel %[[#Int32Ty]] %[[#]] %[[#]] %[[#]] %[[#ConstInt2]] %[[#Event1]] %[[#Event2]] %[[#BlockKer3]] %[[#BlockLit3]] %[[#ConstInt12]] %[[#ConstInt8]] %[[#LocalBuf31]] + +;; enqueue_kernel(default_queue, flags, ndrange, 2, event_wait_list2, &clk_event, +;; ^(local void *p) { +;; return; +;; }, +;; c); + +;; Emits global block literal and block kernel. + +; CHECK-SPIRV: %[[#BlockLit4Tmp:]] = OpBitcast %[[#]] %[[#BlockGlb2]] +; CHECK-SPIRV: %[[#BlockLit4:]] = OpPtrCastToGeneric %[[#Int8PtrGenTy]] %[[#BlockLit4Tmp]] +; CHECK-SPIRV: %[[#LocalBuf41:]] = OpPtrAccessChain %[[#Int32LocPtrTy]] +; CHECK-SPIRV-NEXT: %[[#LocalBuf42:]] = OpPtrAccessChain %[[#Int32LocPtrTy]] +; CHECK-SPIRV-NEXT: %[[#LocalBuf43:]] = OpPtrAccessChain %[[#Int32LocPtrTy]] +; CHECK-SPIRV-NEXT: %[[#]] = OpEnqueueKernel %[[#Int32Ty]] %[[#]] %[[#]] %[[#]] %[[#ConstInt0]] %[[#EventNull]] %[[#EventNull]] %[[#BlockKer4]] %[[#BlockLit4]] %[[#ConstInt12]] %[[#ConstInt8]] %[[#LocalBuf41]] %[[#LocalBuf42]] %[[#LocalBuf43]] + +;; enqueue_kernel(default_queue, flags, ndrange, +;; ^(local void *p1, local void *p2, local void *p3) { +;; return; +;; }, +;; 1, 2, 4); + +;; Emits block literal on stack and block kernel. + +; CHECK-SPIRV: %[[#Event1:]] = OpPtrCastToGeneric %[[#EventPtrTy]] +; CHECK-SPIRV: %[[#BlockLit5Tmp:]] = OpBitcast %[[#BlockStructLocPtrTy]] +; CHECK-SPIRV-NEXT: %[[#BlockLit5:]] = OpPtrCastToGeneric %[[#Int8PtrGenTy]] %[[#BlockLit5Tmp]] +; CHECK-SPIRV-NEXT: %[[#]] = OpEnqueueKernel %[[#Int32Ty]] %[[#]] %[[#]] %[[#]] %[[#ConstInt0]] %[[#EventNull]] %[[#Event1]] %[[#BlockKer5]] %[[#BlockLit5]] %[[#ConstInt24]] %[[#ConstInt8]] + +;; enqueue_kernel(default_queue, flags, ndrange, 0, NULL, &clk_event, +;; ^(void) { +;; a[i] = b[i]; +;; }); +;; } + +; CHECK-SPIRV-DAG: %[[#BlockKer1]] = OpFunction %[[#VoidTy]] None %[[#BlockTy1]] +; CHECK-SPIRV-DAG: %[[#BlockKer2]] = OpFunction %[[#VoidTy]] None %[[#BlockTy1]] +; CHECK-SPIRV-DAG: %[[#BlockKer3]] = OpFunction %[[#VoidTy]] None %[[#BlockTy3]] +; CHECK-SPIRV-DAG: %[[#BlockKer4]] = OpFunction %[[#VoidTy]] None %[[#BlockTy2]] +; CHECK-SPIRV-DAG: %[[#BlockKer5]] = OpFunction %[[#VoidTy]] None %[[#BlockTy1]] + +%opencl.queue_t = type opaque +%struct.ndrange_t = type { i32 } +%opencl.clk_event_t = type opaque +%struct.__opencl_block_literal_generic = type { i32, i32, i8 addrspace(4)* } + +@__block_literal_global = internal addrspace(1) constant { i32, i32, i8 addrspace(4)* } { i32 12, i32 4, i8 addrspace(4)* addrspacecast (i8* bitcast (void (i8 addrspace(4)*, i8 addrspace(3)*)* @__device_side_enqueue_block_invoke_3 to i8*) to i8 addrspace(4)*) }, align 4 +@__block_literal_global.1 = internal addrspace(1) constant { i32, i32, i8 addrspace(4)* } { i32 12, i32 4, i8 addrspace(4)* addrspacecast (i8* bitcast (void (i8 addrspace(4)*, i8 addrspace(3)*, i8 addrspace(3)*, i8 addrspace(3)*)* @__device_side_enqueue_block_invoke_4 to i8*) to i8 addrspace(4)*) }, align 4 + +define dso_local spir_kernel void @device_side_enqueue(i32 addrspace(1)* noundef %a, i32 addrspace(1)* noundef %b, i32 noundef %i, i8 noundef signext %c0) { +entry: + %a.addr = alloca i32 addrspace(1)*, align 4 + %b.addr = alloca 
i32 addrspace(1)*, align 4 + %i.addr = alloca i32, align 4 + %c0.addr = alloca i8, align 1 + %default_queue = alloca %opencl.queue_t*, align 4 + %flags = alloca i32, align 4 + %ndrange = alloca %struct.ndrange_t, align 4 + %clk_event = alloca %opencl.clk_event_t*, align 4 + %event_wait_list = alloca %opencl.clk_event_t*, align 4 + %event_wait_list2 = alloca [1 x %opencl.clk_event_t*], align 4 + %tmp = alloca %struct.ndrange_t, align 4 + %block = alloca <{ i32, i32, i8 addrspace(4)*, i32 addrspace(1)*, i32, i8 }>, align 4 + %tmp3 = alloca %struct.ndrange_t, align 4 + %block4 = alloca <{ i32, i32, i8 addrspace(4)*, i32 addrspace(1)*, i32, i32 addrspace(1)* }>, align 4 + %c = alloca i8, align 1 + %tmp11 = alloca %struct.ndrange_t, align 4 + %block_sizes = alloca [1 x i32], align 4 + %tmp12 = alloca %struct.ndrange_t, align 4 + %block_sizes13 = alloca [3 x i32], align 4 + %tmp14 = alloca %struct.ndrange_t, align 4 + %block15 = alloca <{ i32, i32, i8 addrspace(4)*, i32 addrspace(1)*, i32, i32 addrspace(1)* }>, align 4 + store i32 addrspace(1)* %a, i32 addrspace(1)** %a.addr, align 4 + store i32 addrspace(1)* %b, i32 addrspace(1)** %b.addr, align 4 + store i32 %i, i32* %i.addr, align 4 + store i8 %c0, i8* %c0.addr, align 1 + store i32 0, i32* %flags, align 4 + %arrayinit.begin = getelementptr inbounds [1 x %opencl.clk_event_t*], [1 x %opencl.clk_event_t*]* %event_wait_list2, i32 0, i32 0 + %0 = load %opencl.clk_event_t*, %opencl.clk_event_t** %clk_event, align 4 + store %opencl.clk_event_t* %0, %opencl.clk_event_t** %arrayinit.begin, align 4 + %1 = load %opencl.queue_t*, %opencl.queue_t** %default_queue, align 4 + %2 = load i32, i32* %flags, align 4 + %3 = bitcast %struct.ndrange_t* %tmp to i8* + %4 = bitcast %struct.ndrange_t* %ndrange to i8* + call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 4 %3, i8* align 4 %4, i32 4, i1 false) + %block.size = getelementptr inbounds <{ i32, i32, i8 addrspace(4)*, i32 addrspace(1)*, i32, i8 }>, <{ i32, i32, i8 addrspace(4)*, i32 addrspace(1)*, i32, i8 }>* %block, i32 0, i32 0 + store i32 21, i32* %block.size, align 4 + %block.align = getelementptr inbounds <{ i32, i32, i8 addrspace(4)*, i32 addrspace(1)*, i32, i8 }>, <{ i32, i32, i8 addrspace(4)*, i32 addrspace(1)*, i32, i8 }>* %block, i32 0, i32 1 + store i32 4, i32* %block.align, align 4 + %block.invoke = getelementptr inbounds <{ i32, i32, i8 addrspace(4)*, i32 addrspace(1)*, i32, i8 }>, <{ i32, i32, i8 addrspace(4)*, i32 addrspace(1)*, i32, i8 }>* %block, i32 0, i32 2 + store i8 addrspace(4)* addrspacecast (i8* bitcast (void (i8 addrspace(4)*)* @__device_side_enqueue_block_invoke to i8*) to i8 addrspace(4)*), i8 addrspace(4)** %block.invoke, align 4 + %block.captured = getelementptr inbounds <{ i32, i32, i8 addrspace(4)*, i32 addrspace(1)*, i32, i8 }>, <{ i32, i32, i8 addrspace(4)*, i32 addrspace(1)*, i32, i8 }>* %block, i32 0, i32 3 + %5 = load i32 addrspace(1)*, i32 addrspace(1)** %a.addr, align 4 + store i32 addrspace(1)* %5, i32 addrspace(1)** %block.captured, align 4 + %block.captured1 = getelementptr inbounds <{ i32, i32, i8 addrspace(4)*, i32 addrspace(1)*, i32, i8 }>, <{ i32, i32, i8 addrspace(4)*, i32 addrspace(1)*, i32, i8 }>* %block, i32 0, i32 4 + %6 = load i32, i32* %i.addr, align 4 + store i32 %6, i32* %block.captured1, align 4 + %block.captured2 = getelementptr inbounds <{ i32, i32, i8 addrspace(4)*, i32 addrspace(1)*, i32, i8 }>, <{ i32, i32, i8 addrspace(4)*, i32 addrspace(1)*, i32, i8 }>* %block, i32 0, i32 5 + %7 = load i8, i8* %c0.addr, align 1 + store i8 %7, i8* %block.captured2, align 
4 + %8 = bitcast <{ i32, i32, i8 addrspace(4)*, i32 addrspace(1)*, i32, i8 }>* %block to %struct.__opencl_block_literal_generic* + %9 = addrspacecast %struct.__opencl_block_literal_generic* %8 to i8 addrspace(4)* + %10 = call spir_func i32 @__enqueue_kernel_basic(%opencl.queue_t* %1, i32 %2, %struct.ndrange_t* byval(%struct.ndrange_t) %tmp, i8 addrspace(4)* addrspacecast (i8* bitcast (void (i8 addrspace(4)*)* @__device_side_enqueue_block_invoke_kernel to i8*) to i8 addrspace(4)*), i8 addrspace(4)* %9) + %11 = load %opencl.queue_t*, %opencl.queue_t** %default_queue, align 4 + %12 = load i32, i32* %flags, align 4 + %13 = bitcast %struct.ndrange_t* %tmp3 to i8* + %14 = bitcast %struct.ndrange_t* %ndrange to i8* + call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 4 %13, i8* align 4 %14, i32 4, i1 false) + %15 = addrspacecast %opencl.clk_event_t** %event_wait_list to %opencl.clk_event_t* addrspace(4)* + %16 = addrspacecast %opencl.clk_event_t** %clk_event to %opencl.clk_event_t* addrspace(4)* + %block.size5 = getelementptr inbounds <{ i32, i32, i8 addrspace(4)*, i32 addrspace(1)*, i32, i32 addrspace(1)* }>, <{ i32, i32, i8 addrspace(4)*, i32 addrspace(1)*, i32, i32 addrspace(1)* }>* %block4, i32 0, i32 0 + store i32 24, i32* %block.size5, align 4 + %block.align6 = getelementptr inbounds <{ i32, i32, i8 addrspace(4)*, i32 addrspace(1)*, i32, i32 addrspace(1)* }>, <{ i32, i32, i8 addrspace(4)*, i32 addrspace(1)*, i32, i32 addrspace(1)* }>* %block4, i32 0, i32 1 + store i32 4, i32* %block.align6, align 4 + %block.invoke7 = getelementptr inbounds <{ i32, i32, i8 addrspace(4)*, i32 addrspace(1)*, i32, i32 addrspace(1)* }>, <{ i32, i32, i8 addrspace(4)*, i32 addrspace(1)*, i32, i32 addrspace(1)* }>* %block4, i32 0, i32 2 + store i8 addrspace(4)* addrspacecast (i8* bitcast (void (i8 addrspace(4)*)* @__device_side_enqueue_block_invoke_2 to i8*) to i8 addrspace(4)*), i8 addrspace(4)** %block.invoke7, align 4 + %block.captured8 = getelementptr inbounds <{ i32, i32, i8 addrspace(4)*, i32 addrspace(1)*, i32, i32 addrspace(1)* }>, <{ i32, i32, i8 addrspace(4)*, i32 addrspace(1)*, i32, i32 addrspace(1)* }>* %block4, i32 0, i32 3 + %17 = load i32 addrspace(1)*, i32 addrspace(1)** %a.addr, align 4 + store i32 addrspace(1)* %17, i32 addrspace(1)** %block.captured8, align 4 + %block.captured9 = getelementptr inbounds <{ i32, i32, i8 addrspace(4)*, i32 addrspace(1)*, i32, i32 addrspace(1)* }>, <{ i32, i32, i8 addrspace(4)*, i32 addrspace(1)*, i32, i32 addrspace(1)* }>* %block4, i32 0, i32 4 + %18 = load i32, i32* %i.addr, align 4 + store i32 %18, i32* %block.captured9, align 4 + %block.captured10 = getelementptr inbounds <{ i32, i32, i8 addrspace(4)*, i32 addrspace(1)*, i32, i32 addrspace(1)* }>, <{ i32, i32, i8 addrspace(4)*, i32 addrspace(1)*, i32, i32 addrspace(1)* }>* %block4, i32 0, i32 5 + %19 = load i32 addrspace(1)*, i32 addrspace(1)** %b.addr, align 4 + store i32 addrspace(1)* %19, i32 addrspace(1)** %block.captured10, align 4 + %20 = bitcast <{ i32, i32, i8 addrspace(4)*, i32 addrspace(1)*, i32, i32 addrspace(1)* }>* %block4 to %struct.__opencl_block_literal_generic* + %21 = addrspacecast %struct.__opencl_block_literal_generic* %20 to i8 addrspace(4)* + %22 = call spir_func i32 @__enqueue_kernel_basic_events(%opencl.queue_t* %11, i32 %12, %struct.ndrange_t* %tmp3, i32 2, %opencl.clk_event_t* addrspace(4)* %15, %opencl.clk_event_t* addrspace(4)* %16, i8 addrspace(4)* addrspacecast (i8* bitcast (void (i8 addrspace(4)*)* @__device_side_enqueue_block_invoke_2_kernel to i8*) to i8 addrspace(4)*), i8 
addrspace(4)* %21) + %23 = load %opencl.queue_t*, %opencl.queue_t** %default_queue, align 4 + %24 = load i32, i32* %flags, align 4 + %25 = bitcast %struct.ndrange_t* %tmp11 to i8* + %26 = bitcast %struct.ndrange_t* %ndrange to i8* + call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 4 %25, i8* align 4 %26, i32 4, i1 false) + %arraydecay = getelementptr inbounds [1 x %opencl.clk_event_t*], [1 x %opencl.clk_event_t*]* %event_wait_list2, i32 0, i32 0 + %27 = addrspacecast %opencl.clk_event_t** %arraydecay to %opencl.clk_event_t* addrspace(4)* + %28 = addrspacecast %opencl.clk_event_t** %clk_event to %opencl.clk_event_t* addrspace(4)* + %29 = getelementptr [1 x i32], [1 x i32]* %block_sizes, i32 0, i32 0 + %30 = load i8, i8* %c, align 1 + %31 = zext i8 %30 to i32 + store i32 %31, i32* %29, align 4 + %32 = call spir_func i32 @__enqueue_kernel_events_varargs(%opencl.queue_t* %23, i32 %24, %struct.ndrange_t* %tmp11, i32 2, %opencl.clk_event_t* addrspace(4)* %27, %opencl.clk_event_t* addrspace(4)* %28, i8 addrspace(4)* addrspacecast (i8* bitcast (void (i8 addrspace(4)*, i8 addrspace(3)*)* @__device_side_enqueue_block_invoke_3_kernel to i8*) to i8 addrspace(4)*), i8 addrspace(4)* addrspacecast (i8 addrspace(1)* bitcast ({ i32, i32, i8 addrspace(4)* } addrspace(1)* @__block_literal_global to i8 addrspace(1)*) to i8 addrspace(4)*), i32 1, i32* %29) + %33 = load %opencl.queue_t*, %opencl.queue_t** %default_queue, align 4 + %34 = load i32, i32* %flags, align 4 + %35 = bitcast %struct.ndrange_t* %tmp12 to i8* + %36 = bitcast %struct.ndrange_t* %ndrange to i8* + call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 4 %35, i8* align 4 %36, i32 4, i1 false) + %37 = getelementptr [3 x i32], [3 x i32]* %block_sizes13, i32 0, i32 0 + store i32 1, i32* %37, align 4 + %38 = getelementptr [3 x i32], [3 x i32]* %block_sizes13, i32 0, i32 1 + store i32 2, i32* %38, align 4 + %39 = getelementptr [3 x i32], [3 x i32]* %block_sizes13, i32 0, i32 2 + store i32 4, i32* %39, align 4 + %40 = call spir_func i32 @__enqueue_kernel_varargs(%opencl.queue_t* %33, i32 %34, %struct.ndrange_t* %tmp12, i8 addrspace(4)* addrspacecast (i8* bitcast (void (i8 addrspace(4)*, i8 addrspace(3)*, i8 addrspace(3)*, i8 addrspace(3)*)* @__device_side_enqueue_block_invoke_4_kernel to i8*) to i8 addrspace(4)*), i8 addrspace(4)* addrspacecast (i8 addrspace(1)* bitcast ({ i32, i32, i8 addrspace(4)* } addrspace(1)* @__block_literal_global.1 to i8 addrspace(1)*) to i8 addrspace(4)*), i32 3, i32* %37) + %41 = load %opencl.queue_t*, %opencl.queue_t** %default_queue, align 4 + %42 = load i32, i32* %flags, align 4 + %43 = bitcast %struct.ndrange_t* %tmp14 to i8* + %44 = bitcast %struct.ndrange_t* %ndrange to i8* + call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 4 %43, i8* align 4 %44, i32 4, i1 false) + %45 = addrspacecast %opencl.clk_event_t** %clk_event to %opencl.clk_event_t* addrspace(4)* + %block.size16 = getelementptr inbounds <{ i32, i32, i8 addrspace(4)*, i32 addrspace(1)*, i32, i32 addrspace(1)* }>, <{ i32, i32, i8 addrspace(4)*, i32 addrspace(1)*, i32, i32 addrspace(1)* }>* %block15, i32 0, i32 0 + store i32 24, i32* %block.size16, align 4 + %block.align17 = getelementptr inbounds <{ i32, i32, i8 addrspace(4)*, i32 addrspace(1)*, i32, i32 addrspace(1)* }>, <{ i32, i32, i8 addrspace(4)*, i32 addrspace(1)*, i32, i32 addrspace(1)* }>* %block15, i32 0, i32 1 + store i32 4, i32* %block.align17, align 4 + %block.invoke18 = getelementptr inbounds <{ i32, i32, i8 addrspace(4)*, i32 addrspace(1)*, i32, i32 addrspace(1)* }>, <{ i32, i32, i8 addrspace(4)*, i32 
addrspace(1)*, i32, i32 addrspace(1)* }>* %block15, i32 0, i32 2 + store i8 addrspace(4)* addrspacecast (i8* bitcast (void (i8 addrspace(4)*)* @__device_side_enqueue_block_invoke_5 to i8*) to i8 addrspace(4)*), i8 addrspace(4)** %block.invoke18, align 4 + %block.captured19 = getelementptr inbounds <{ i32, i32, i8 addrspace(4)*, i32 addrspace(1)*, i32, i32 addrspace(1)* }>, <{ i32, i32, i8 addrspace(4)*, i32 addrspace(1)*, i32, i32 addrspace(1)* }>* %block15, i32 0, i32 3 + %46 = load i32 addrspace(1)*, i32 addrspace(1)** %a.addr, align 4 + store i32 addrspace(1)* %46, i32 addrspace(1)** %block.captured19, align 4 + %block.captured20 = getelementptr inbounds <{ i32, i32, i8 addrspace(4)*, i32 addrspace(1)*, i32, i32 addrspace(1)* }>, <{ i32, i32, i8 addrspace(4)*, i32 addrspace(1)*, i32, i32 addrspace(1)* }>* %block15, i32 0, i32 4 + %47 = load i32, i32* %i.addr, align 4 + store i32 %47, i32* %block.captured20, align 4 + %block.captured21 = getelementptr inbounds <{ i32, i32, i8 addrspace(4)*, i32 addrspace(1)*, i32, i32 addrspace(1)* }>, <{ i32, i32, i8 addrspace(4)*, i32 addrspace(1)*, i32, i32 addrspace(1)* }>* %block15, i32 0, i32 5 + %48 = load i32 addrspace(1)*, i32 addrspace(1)** %b.addr, align 4 + store i32 addrspace(1)* %48, i32 addrspace(1)** %block.captured21, align 4 + %49 = bitcast <{ i32, i32, i8 addrspace(4)*, i32 addrspace(1)*, i32, i32 addrspace(1)* }>* %block15 to %struct.__opencl_block_literal_generic* + %50 = addrspacecast %struct.__opencl_block_literal_generic* %49 to i8 addrspace(4)* + %51 = call spir_func i32 @__enqueue_kernel_basic_events(%opencl.queue_t* %41, i32 %42, %struct.ndrange_t* %tmp14, i32 0, %opencl.clk_event_t* addrspace(4)* null, %opencl.clk_event_t* addrspace(4)* %45, i8 addrspace(4)* addrspacecast (i8* bitcast (void (i8 addrspace(4)*)* @__device_side_enqueue_block_invoke_5_kernel to i8*) to i8 addrspace(4)*), i8 addrspace(4)* %50) + ret void +} + +declare void @llvm.memcpy.p0i8.p0i8.i32(i8* noalias nocapture writeonly, i8* noalias nocapture readonly, i32, i1 immarg) + +define internal spir_func void @__device_side_enqueue_block_invoke(i8 addrspace(4)* noundef %.block_descriptor) { +entry: + %.block_descriptor.addr = alloca i8 addrspace(4)*, align 4 + %block.addr = alloca <{ i32, i32, i8 addrspace(4)*, i32 addrspace(1)*, i32, i8 }> addrspace(4)*, align 4 + store i8 addrspace(4)* %.block_descriptor, i8 addrspace(4)** %.block_descriptor.addr, align 4 + %block = bitcast i8 addrspace(4)* %.block_descriptor to <{ i32, i32, i8 addrspace(4)*, i32 addrspace(1)*, i32, i8 }> addrspace(4)* + store <{ i32, i32, i8 addrspace(4)*, i32 addrspace(1)*, i32, i8 }> addrspace(4)* %block, <{ i32, i32, i8 addrspace(4)*, i32 addrspace(1)*, i32, i8 }> addrspace(4)** %block.addr, align 4 + %block.capture.addr = getelementptr inbounds <{ i32, i32, i8 addrspace(4)*, i32 addrspace(1)*, i32, i8 }>, <{ i32, i32, i8 addrspace(4)*, i32 addrspace(1)*, i32, i8 }> addrspace(4)* %block, i32 0, i32 5 + %0 = load i8, i8 addrspace(4)* %block.capture.addr, align 4 + %conv = sext i8 %0 to i32 + %block.capture.addr1 = getelementptr inbounds <{ i32, i32, i8 addrspace(4)*, i32 addrspace(1)*, i32, i8 }>, <{ i32, i32, i8 addrspace(4)*, i32 addrspace(1)*, i32, i8 }> addrspace(4)* %block, i32 0, i32 3 + %1 = load i32 addrspace(1)*, i32 addrspace(1)* addrspace(4)* %block.capture.addr1, align 4 + %block.capture.addr2 = getelementptr inbounds <{ i32, i32, i8 addrspace(4)*, i32 addrspace(1)*, i32, i8 }>, <{ i32, i32, i8 addrspace(4)*, i32 addrspace(1)*, i32, i8 }> addrspace(4)* %block, i32 0, i32 4 + %2 
= load i32, i32 addrspace(4)* %block.capture.addr2, align 4 + %arrayidx = getelementptr inbounds i32, i32 addrspace(1)* %1, i32 %2 + store i32 %conv, i32 addrspace(1)* %arrayidx, align 4 + ret void +} + +define spir_kernel void @__device_side_enqueue_block_invoke_kernel(i8 addrspace(4)* %0) { +entry: + call spir_func void @__device_side_enqueue_block_invoke(i8 addrspace(4)* %0) + ret void +} + +declare spir_func i32 @__enqueue_kernel_basic(%opencl.queue_t*, i32, %struct.ndrange_t*, i8 addrspace(4)*, i8 addrspace(4)*) + +define internal spir_func void @__device_side_enqueue_block_invoke_2(i8 addrspace(4)* noundef %.block_descriptor) { +entry: + %.block_descriptor.addr = alloca i8 addrspace(4)*, align 4 + %block.addr = alloca <{ i32, i32, i8 addrspace(4)*, i32 addrspace(1)*, i32, i32 addrspace(1)* }> addrspace(4)*, align 4 + store i8 addrspace(4)* %.block_descriptor, i8 addrspace(4)** %.block_descriptor.addr, align 4 + %block = bitcast i8 addrspace(4)* %.block_descriptor to <{ i32, i32, i8 addrspace(4)*, i32 addrspace(1)*, i32, i32 addrspace(1)* }> addrspace(4)* + store <{ i32, i32, i8 addrspace(4)*, i32 addrspace(1)*, i32, i32 addrspace(1)* }> addrspace(4)* %block, <{ i32, i32, i8 addrspace(4)*, i32 addrspace(1)*, i32, i32 addrspace(1)* }> addrspace(4)** %block.addr, align 4 + %block.capture.addr = getelementptr inbounds <{ i32, i32, i8 addrspace(4)*, i32 addrspace(1)*, i32, i32 addrspace(1)* }>, <{ i32, i32, i8 addrspace(4)*, i32 addrspace(1)*, i32, i32 addrspace(1)* }> addrspace(4)* %block, i32 0, i32 5 + %0 = load i32 addrspace(1)*, i32 addrspace(1)* addrspace(4)* %block.capture.addr, align 4 + %block.capture.addr1 = getelementptr inbounds <{ i32, i32, i8 addrspace(4)*, i32 addrspace(1)*, i32, i32 addrspace(1)* }>, <{ i32, i32, i8 addrspace(4)*, i32 addrspace(1)*, i32, i32 addrspace(1)* }> addrspace(4)* %block, i32 0, i32 4 + %1 = load i32, i32 addrspace(4)* %block.capture.addr1, align 4 + %arrayidx = getelementptr inbounds i32, i32 addrspace(1)* %0, i32 %1 + %2 = load i32, i32 addrspace(1)* %arrayidx, align 4 + %block.capture.addr2 = getelementptr inbounds <{ i32, i32, i8 addrspace(4)*, i32 addrspace(1)*, i32, i32 addrspace(1)* }>, <{ i32, i32, i8 addrspace(4)*, i32 addrspace(1)*, i32, i32 addrspace(1)* }> addrspace(4)* %block, i32 0, i32 3 + %3 = load i32 addrspace(1)*, i32 addrspace(1)* addrspace(4)* %block.capture.addr2, align 4 + %block.capture.addr3 = getelementptr inbounds <{ i32, i32, i8 addrspace(4)*, i32 addrspace(1)*, i32, i32 addrspace(1)* }>, <{ i32, i32, i8 addrspace(4)*, i32 addrspace(1)*, i32, i32 addrspace(1)* }> addrspace(4)* %block, i32 0, i32 4 + %4 = load i32, i32 addrspace(4)* %block.capture.addr3, align 4 + %arrayidx4 = getelementptr inbounds i32, i32 addrspace(1)* %3, i32 %4 + store i32 %2, i32 addrspace(1)* %arrayidx4, align 4 + ret void +} + +define spir_kernel void @__device_side_enqueue_block_invoke_2_kernel(i8 addrspace(4)* %0) { +entry: + call spir_func void @__device_side_enqueue_block_invoke_2(i8 addrspace(4)* %0) + ret void +} + +declare spir_func i32 @__enqueue_kernel_basic_events(%opencl.queue_t*, i32, %struct.ndrange_t*, i32, %opencl.clk_event_t* addrspace(4)*, %opencl.clk_event_t* addrspace(4)*, i8 addrspace(4)*, i8 addrspace(4)*) + +define internal spir_func void @__device_side_enqueue_block_invoke_3(i8 addrspace(4)* noundef %.block_descriptor, i8 addrspace(3)* noundef %p) { +entry: + %.block_descriptor.addr = alloca i8 addrspace(4)*, align 4 + %p.addr = alloca i8 addrspace(3)*, align 4 + %block.addr = alloca <{ i32, i32, i8 addrspace(4)* }> 
addrspace(4)*, align 4 + store i8 addrspace(4)* %.block_descriptor, i8 addrspace(4)** %.block_descriptor.addr, align 4 + %block = bitcast i8 addrspace(4)* %.block_descriptor to <{ i32, i32, i8 addrspace(4)* }> addrspace(4)* + store i8 addrspace(3)* %p, i8 addrspace(3)** %p.addr, align 4 + store <{ i32, i32, i8 addrspace(4)* }> addrspace(4)* %block, <{ i32, i32, i8 addrspace(4)* }> addrspace(4)** %block.addr, align 4 + ret void +} + +define spir_kernel void @__device_side_enqueue_block_invoke_3_kernel(i8 addrspace(4)* %0, i8 addrspace(3)* %1) { +entry: + call spir_func void @__device_side_enqueue_block_invoke_3(i8 addrspace(4)* %0, i8 addrspace(3)* %1) + ret void +} + +declare spir_func i32 @__enqueue_kernel_events_varargs(%opencl.queue_t*, i32, %struct.ndrange_t*, i32, %opencl.clk_event_t* addrspace(4)*, %opencl.clk_event_t* addrspace(4)*, i8 addrspace(4)*, i8 addrspace(4)*, i32, i32*) + +define internal spir_func void @__device_side_enqueue_block_invoke_4(i8 addrspace(4)* noundef %.block_descriptor, i8 addrspace(3)* noundef %p1, i8 addrspace(3)* noundef %p2, i8 addrspace(3)* noundef %p3) { +entry: + %.block_descriptor.addr = alloca i8 addrspace(4)*, align 4 + %p1.addr = alloca i8 addrspace(3)*, align 4 + %p2.addr = alloca i8 addrspace(3)*, align 4 + %p3.addr = alloca i8 addrspace(3)*, align 4 + %block.addr = alloca <{ i32, i32, i8 addrspace(4)* }> addrspace(4)*, align 4 + store i8 addrspace(4)* %.block_descriptor, i8 addrspace(4)** %.block_descriptor.addr, align 4 + %block = bitcast i8 addrspace(4)* %.block_descriptor to <{ i32, i32, i8 addrspace(4)* }> addrspace(4)* + store i8 addrspace(3)* %p1, i8 addrspace(3)** %p1.addr, align 4 + store i8 addrspace(3)* %p2, i8 addrspace(3)** %p2.addr, align 4 + store i8 addrspace(3)* %p3, i8 addrspace(3)** %p3.addr, align 4 + store <{ i32, i32, i8 addrspace(4)* }> addrspace(4)* %block, <{ i32, i32, i8 addrspace(4)* }> addrspace(4)** %block.addr, align 4 + ret void +} + +define spir_kernel void @__device_side_enqueue_block_invoke_4_kernel(i8 addrspace(4)* %0, i8 addrspace(3)* %1, i8 addrspace(3)* %2, i8 addrspace(3)* %3) { +entry: + call spir_func void @__device_side_enqueue_block_invoke_4(i8 addrspace(4)* %0, i8 addrspace(3)* %1, i8 addrspace(3)* %2, i8 addrspace(3)* %3) + ret void +} + +declare spir_func i32 @__enqueue_kernel_varargs(%opencl.queue_t*, i32, %struct.ndrange_t*, i8 addrspace(4)*, i8 addrspace(4)*, i32, i32*) + +define internal spir_func void @__device_side_enqueue_block_invoke_5(i8 addrspace(4)* noundef %.block_descriptor) { +entry: + %.block_descriptor.addr = alloca i8 addrspace(4)*, align 4 + %block.addr = alloca <{ i32, i32, i8 addrspace(4)*, i32 addrspace(1)*, i32, i32 addrspace(1)* }> addrspace(4)*, align 4 + store i8 addrspace(4)* %.block_descriptor, i8 addrspace(4)** %.block_descriptor.addr, align 4 + %block = bitcast i8 addrspace(4)* %.block_descriptor to <{ i32, i32, i8 addrspace(4)*, i32 addrspace(1)*, i32, i32 addrspace(1)* }> addrspace(4)* + store <{ i32, i32, i8 addrspace(4)*, i32 addrspace(1)*, i32, i32 addrspace(1)* }> addrspace(4)* %block, <{ i32, i32, i8 addrspace(4)*, i32 addrspace(1)*, i32, i32 addrspace(1)* }> addrspace(4)** %block.addr, align 4 + %block.capture.addr = getelementptr inbounds <{ i32, i32, i8 addrspace(4)*, i32 addrspace(1)*, i32, i32 addrspace(1)* }>, <{ i32, i32, i8 addrspace(4)*, i32 addrspace(1)*, i32, i32 addrspace(1)* }> addrspace(4)* %block, i32 0, i32 5 + %0 = load i32 addrspace(1)*, i32 addrspace(1)* addrspace(4)* %block.capture.addr, align 4 + %block.capture.addr1 = getelementptr inbounds <{ 
i32, i32, i8 addrspace(4)*, i32 addrspace(1)*, i32, i32 addrspace(1)* }>, <{ i32, i32, i8 addrspace(4)*, i32 addrspace(1)*, i32, i32 addrspace(1)* }> addrspace(4)* %block, i32 0, i32 4 + %1 = load i32, i32 addrspace(4)* %block.capture.addr1, align 4 + %arrayidx = getelementptr inbounds i32, i32 addrspace(1)* %0, i32 %1 + %2 = load i32, i32 addrspace(1)* %arrayidx, align 4 + %block.capture.addr2 = getelementptr inbounds <{ i32, i32, i8 addrspace(4)*, i32 addrspace(1)*, i32, i32 addrspace(1)* }>, <{ i32, i32, i8 addrspace(4)*, i32 addrspace(1)*, i32, i32 addrspace(1)* }> addrspace(4)* %block, i32 0, i32 3 + %3 = load i32 addrspace(1)*, i32 addrspace(1)* addrspace(4)* %block.capture.addr2, align 4 + %block.capture.addr3 = getelementptr inbounds <{ i32, i32, i8 addrspace(4)*, i32 addrspace(1)*, i32, i32 addrspace(1)* }>, <{ i32, i32, i8 addrspace(4)*, i32 addrspace(1)*, i32, i32 addrspace(1)* }> addrspace(4)* %block, i32 0, i32 4 + %4 = load i32, i32 addrspace(4)* %block.capture.addr3, align 4 + %arrayidx4 = getelementptr inbounds i32, i32 addrspace(1)* %3, i32 %4 + store i32 %2, i32 addrspace(1)* %arrayidx4, align 4 + ret void +} + +define spir_kernel void @__device_side_enqueue_block_invoke_5_kernel(i8 addrspace(4)* %0) { +entry: + call spir_func void @__device_side_enqueue_block_invoke_5(i8 addrspace(4)* %0) + ret void +}