llvm · fabianmcg · Dec 5, 2023 · Nov 6, 2023 · Nov 13, 2023 · Nov 13, 2023
diff --git a/mlir/include/mlir/Target/LLVMIR/Dialect/All.h b/mlir/include/mlir/Target/LLVMIR/Dialect/All.h
@@ -26,6 +26,7 @@
 #include "mlir/Target/LLVMIR/Dialect/OpenACC/OpenACCToLLVMIRTranslation.h"
 #include "mlir/Target/LLVMIR/Dialect/OpenMP/OpenMPToLLVMIRTranslation.h"
 #include "mlir/Target/LLVMIR/Dialect/ROCDL/ROCDLToLLVMIRTranslation.h"
+#include "mlir/Target/LLVMIR/Dialect/SPIRV/SPIRVToLLVMIRTranslation.h"
 #include "mlir/Target/LLVMIR/Dialect/X86Vector/X86VectorToLLVMIRTranslation.h"
 
 namespace mlir {
@@ -45,6 +46,7 @@ static inline void registerAllToLLVMIRTranslations(DialectRegistry &registry) {
   registerOpenACCDialectTranslation(registry);
   registerOpenMPDialectTranslation(registry);
   registerROCDLDialectTranslation(registry);
+  registerSPIRVDialectTranslation(registry);
   registerX86VectorDialectTranslation(registry);
 
   // Extension required for translating GPU offloading Ops.
@@ -61,6 +63,7 @@ registerAllGPUToLLVMIRTranslations(DialectRegistry &registry) {
   registerLLVMDialectTranslation(registry);
   registerNVVMDialectTranslation(registry);
   registerROCDLDialectTranslation(registry);
+  registerSPIRVDialectTranslation(registry);
 
   // Extension required for translating GPU offloading Ops.
   gpu::registerOffloadingLLVMTranslationInterfaceExternalModels(registry);

@@ -0,0 +1,31 @@
+//===- SPIRVToLLVMIRTranslation.h - SPIRV to LLVM IR ------------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This provides registration calls for SPIRV dialect to LLVM IR translation.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef MLIR_TARGET_LLVMIR_DIALECT_SPIRV_SPIRVTOLLVMIRTRANSLATION_H
+#define MLIR_TARGET_LLVMIR_DIALECT_SPIRV_SPIRVTOLLVMIRTRANSLATION_H
+
+namespace mlir {
+
+class DialectRegistry;
+class MLIRContext;
+
+/// Register the SPIRV dialect and the translation from it to the LLVM IR in the
+/// given registry;
+void registerSPIRVDialectTranslation(DialectRegistry &registry);
+
+/// Register the SPIRV dialect and the translation from it in the registry
+/// associated with the given context.
+void registerSPIRVDialectTranslation(MLIRContext &context);
+
+} // namespace mlir
+
+#endif // MLIR_TARGET_LLVMIR_DIALECT_SPIRV_SPIRVTOLLVMIRTRANSLATION_H
diff --git a/mlir/lib/Target/LLVMIR/CMakeLists.txt b/mlir/lib/Target/LLVMIR/CMakeLists.txt
@@ -58,6 +58,7 @@ add_mlir_translation_library(MLIRToLLVMIRTranslationRegistration
   MLIROpenACCToLLVMIRTranslation
   MLIROpenMPToLLVMIRTranslation
   MLIRROCDLToLLVMIRTranslation
+  MLIRSPIRVToLLVMIRTranslation
   )
 
 add_mlir_translation_library(MLIRTargetLLVMIRImport

diff --git a/mlir/lib/Target/LLVMIR/Dialect/CMakeLists.txt b/mlir/lib/Target/LLVMIR/Dialect/CMakeLists.txt
@@ -9,4 +9,5 @@ add_subdirectory(NVVM)
 add_subdirectory(OpenACC)
 add_subdirectory(OpenMP)
 add_subdirectory(ROCDL)
+add_subdirectory(SPIRV)
 add_subdirectory(X86Vector)
diff --git a/mlir/lib/Target/LLVMIR/Dialect/GPU/SelectObjectAttr.cpp b/mlir/lib/Target/LLVMIR/Dialect/GPU/SelectObjectAttr.cpp
@@ -52,6 +52,10 @@ class SelectObjectAttrImpl
 std::string getBinaryIdentifier(StringRef binaryName) {
   return binaryName.str() + "_bin_cst";
 }
+// Returns an identifier for the global int64 holding the binary size.
+std::string getBinarySizeIdentifier(StringRef binaryName) {
+  return binaryName.str() + "_bin_size_cst";
+}
 } // namespace
 
 void mlir::gpu::registerOffloadingLLVMTranslationInterfaceExternalModels(
@@ -124,6 +128,17 @@ LogicalResult SelectObjectAttrImpl::embedBinary(
   serializedObj->setLinkage(llvm::GlobalValue::LinkageTypes::InternalLinkage);
   serializedObj->setAlignment(llvm::MaybeAlign(8));
   serializedObj->setUnnamedAddr(llvm::GlobalValue::UnnamedAddr::None);
+
+  // Embed the object size as a global constant.
+  llvm::Constant *binarySize =
+      llvm::ConstantInt::get(builder.getInt64Ty(), object.getObject().size());
+  llvm::GlobalVariable *serializedSize = new llvm::GlobalVariable(
+      *module, binarySize->getType(), true,
+      llvm::GlobalValue::LinkageTypes::InternalLinkage, binarySize,
+      getBinarySizeIdentifier(op.getName()));
+  serializedSize->setLinkage(llvm::GlobalValue::LinkageTypes::InternalLinkage);
+  serializedSize->setAlignment(llvm::MaybeAlign(8));
+  serializedSize->setUnnamedAddr(llvm::GlobalValue::UnnamedAddr::None);
   return success();
 }
 
@@ -172,6 +187,7 @@ class LaunchKernel {
   IRBuilderBase &builder;
   mlir::LLVM::ModuleTranslation &moduleTranslation;
   Type *i32Ty{};
+  Type *i64Ty{};
   Type *voidTy{};
   Type *intPtrTy{};
   PointerType *ptrTy{};
@@ -213,6 +229,7 @@ llvm::LaunchKernel::LaunchKernel(
     mlir::LLVM::ModuleTranslation &moduleTranslation)
     : module(module), builder(builder), moduleTranslation(moduleTranslation) {
   i32Ty = builder.getInt32Ty();
+  i64Ty = builder.getInt64Ty();
   ptrTy = builder.getPtrTy(0);
   voidTy = builder.getVoidTy();
   intPtrTy = builder.getIntPtrTy(module.getDataLayout());
@@ -221,11 +238,11 @@ llvm::LaunchKernel::LaunchKernel(
 llvm::FunctionCallee llvm::LaunchKernel::getKernelLaunchFn() {
   return module.getOrInsertFunction(
       "mgpuLaunchKernel",
-      FunctionType::get(
-          voidTy,
-          ArrayRef<Type *>({ptrTy, intPtrTy, intPtrTy, intPtrTy, intPtrTy,
-                            intPtrTy, intPtrTy, i32Ty, ptrTy, ptrTy, ptrTy}),
-          false));
+      FunctionType::get(voidTy,
+                        ArrayRef<Type *>({ptrTy, intPtrTy, intPtrTy, intPtrTy,
+                                          intPtrTy, intPtrTy, intPtrTy, i32Ty,
+                                          ptrTy, ptrTy, ptrTy, i64Ty}),
+                        false));
 }
 
 llvm::FunctionCallee llvm::LaunchKernel::getModuleFunctionFn() {
@@ -237,7 +254,7 @@ llvm::FunctionCallee llvm::LaunchKernel::getModuleFunctionFn() {
 llvm::FunctionCallee llvm::LaunchKernel::getModuleLoadFn() {
   return module.getOrInsertFunction(
       "mgpuModuleLoad",
-      FunctionType::get(ptrTy, ArrayRef<Type *>({ptrTy}), false));
+      FunctionType::get(ptrTy, ArrayRef<Type *>({ptrTy, i64Ty}), false));
 }
 
 llvm::FunctionCallee llvm::LaunchKernel::getModuleLoadJITFn() {
@@ -377,10 +394,21 @@ llvm::LaunchKernel::createKernelLaunch(mlir::gpu::LaunchFuncOp op,
   if (!binary)
     return op.emitError() << "Couldn't find the binary: " << binaryIdentifier;
 
+  llvm::Constant *paramsCount =
+      llvm::ConstantInt::get(i64Ty, op.getNumKernelOperands());
+
+  std::string binarySizeIdentifier = getBinarySizeIdentifier(moduleName);
+  Value *binarySizeVar = module.getGlobalVariable(binarySizeIdentifier, true);
+  if (!binarySizeVar)
+    return op.emitError() << "Couldn't find the binary size: "
+                          << binarySizeIdentifier;
+  Value *binarySize =
+      dyn_cast<llvm::GlobalVariable>(binarySizeVar)->getInitializer();
+
   Value *moduleObject =
       object.getFormat() == gpu::CompilationTarget::Assembly
           ? builder.CreateCall(getModuleLoadJITFn(), {binary, optV})
-          : builder.CreateCall(getModuleLoadFn(), {binary});
+          : builder.CreateCall(getModuleLoadFn(), {binary, binarySize});
 
   // Load the kernel function.
   Value *moduleFunction = builder.CreateCall(
@@ -401,10 +429,10 @@ llvm::LaunchKernel::createKernelLaunch(mlir::gpu::LaunchFuncOp op,
 
   // Create the launch call.
   Value *nullPtr = ConstantPointerNull::get(ptrTy);
-  builder.CreateCall(
-      getKernelLaunchFn(),
-      ArrayRef<Value *>({moduleFunction, gx, gy, gz, bx, by, bz,
-                         dynamicMemorySize, stream, argArray, nullPtr}));
+  builder.CreateCall(getKernelLaunchFn(),
+                     ArrayRef<Value *>({moduleFunction, gx, gy, gz, bx, by, bz,
+                                        dynamicMemorySize, stream, argArray,
+                                        nullPtr, paramsCount}));
 
   // Sync & destroy the stream, for synchronous launches.
   if (handleStream) {

@@ -0,0 +1,13 @@
+add_mlir_translation_library(MLIRSPIRVToLLVMIRTranslation
+  SPIRVToLLVMIRTranslation.cpp
+
+  LINK_COMPONENTS
+  Core
+
+  LINK_LIBS PUBLIC
+  MLIRIR
+  MLIRLLVMDialect
+  MLIRSPIRVDialect
+  MLIRSupport
+  MLIRTargetLLVMIRExport
+  )
@@ -0,0 +1,31 @@
+//===- SPIRVToLLVMIRTranslation.cpp - Translate SPIRV to LLVM IR ----------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements a translation between the MLIR SPIRV dialect and
+// LLVM IR.
+//
+//===----------------------------------------------------------------------===//
+
+#include "mlir/Target/LLVMIR/Dialect/SPIRV/SPIRVToLLVMIRTranslation.h"
+#include "mlir/Dialect/SPIRV/IR/SPIRVDialect.h"
+#include "mlir/IR/BuiltinAttributes.h"
+#include "mlir/IR/Operation.h"
+#include "mlir/Target/LLVMIR/ModuleTranslation.h"
+
+using namespace mlir;
+using namespace mlir::LLVM;
+
+void mlir::registerSPIRVDialectTranslation(DialectRegistry &registry) {
+  registry.insert<spirv::SPIRVDialect>();
+}
+
+void mlir::registerSPIRVDialectTranslation(MLIRContext &context) {
+  DialectRegistry registry;
+  registerSPIRVDialectTranslation(registry);
+  context.appendDialectRegistry(registry);
+}
diff --git a/mlir/test/CMakeLists.txt b/mlir/test/CMakeLists.txt
@@ -139,6 +139,10 @@ if(MLIR_ENABLE_ROCM_RUNNER)
   list(APPEND MLIR_TEST_DEPENDS mlir_rocm_runtime)
 endif()
 
+if(MLIR_ENABLE_SYCL_RUNNER)
+  list(APPEND MLIR_TEST_DEPENDS mlir_sycl_runtime)
+endif()
+
 list(APPEND MLIR_TEST_DEPENDS MLIRUnitTests)
 
 if(LLVM_BUILD_EXAMPLES)

diff --git a/mlir/test/Integration/GPU/SYCL/gpu-to-spirv.mlir b/mlir/test/Integration/GPU/SYCL/gpu-to-spirv.mlir
@@ -0,0 +1,54 @@
+// RUN: mlir-opt %s -pass-pipeline='builtin.module(spirv-attach-target{ver=v1.0 caps=Addresses,Int64,Kernel},convert-gpu-to-spirv{use-64bit-index=true},gpu.module(spirv.module(spirv-lower-abi-attrs,spirv-update-vce)),func.func(llvm-request-c-wrappers),convert-scf-to-cf,convert-cf-to-llvm,convert-arith-to-llvm,convert-math-to-llvm,convert-func-to-llvm,gpu-to-llvm{use-bare-pointers-for-kernels=true},gpu-module-to-binary,expand-strided-metadata,lower-affine,finalize-memref-to-llvm,reconcile-unrealized-casts)' \
+// RUN: | mlir-cpu-runner \
+// RUN:   --shared-libs=%mlir_sycl_runtime \
+// RUN:   --shared-libs=%mlir_runner_utils \
+// RUN:   --entry-point-result=void \
+// RUN: | FileCheck %s
+
+module @add attributes {gpu.container_module} {
+  memref.global "private" constant @__constant_3x3xi64_0 : memref<3x3xi64> = dense<[[1, 4098, 3], [16777220, 5, 4294967302], [7, 1099511627784, 9]]>
+  memref.global "private" constant @__constant_3x3xi64 : memref<3x3xi64> = dense<[[1, 2, 3], [4, 5, 4102], [16777223, 4294967304, 1099511627785]]>
+  func.func @main() {
+    %0 = memref.get_global @__constant_3x3xi64 : memref<3x3xi64>
+    %1 = memref.get_global @__constant_3x3xi64_0 : memref<3x3xi64>
+    %2 = call @test(%0, %1) : (memref<3x3xi64>, memref<3x3xi64>) -> memref<3x3xi64>
+    %cast = memref.cast %2 : memref<3x3xi64> to memref<*xi64>
+    call @printMemrefI64(%cast) : (memref<*xi64>) -> ()
+    return
+  }
+  func.func private @printMemrefI64(memref<*xi64>)
+  func.func @test(%arg0: memref<3x3xi64>, %arg1: memref<3x3xi64>) -> memref<3x3xi64> {
+  %c3 = arith.constant 3 : index
+  %c1 = arith.constant 1 : index
+  %mem = gpu.alloc host_shared () : memref<3x3xi64>
+  memref.copy %arg1, %mem : memref<3x3xi64> to memref<3x3xi64>
+  %memref_0 = gpu.alloc host_shared () : memref<3x3xi64>
+  memref.copy %arg0, %memref_0 : memref<3x3xi64> to memref<3x3xi64>
+  %memref_2 = gpu.alloc host_shared () : memref<3x3xi64>
+  %2 = gpu.wait async
+  %3 = gpu.launch_func async [%2] @test_kernel::@test_kernel blocks in (%c3, %c3, %c1) threads in (%c1, %c1, %c1) args(%memref_0 : memref<3x3xi64>, %mem : memref<3x3xi64>, %memref_2 : memref<3x3xi64>)
+  gpu.wait [%3]
+  %alloc = memref.alloc() : memref<3x3xi64>
+  memref.copy %memref_2, %alloc : memref<3x3xi64> to memref<3x3xi64>
+  %4 = gpu.wait async
+  %5 = gpu.dealloc async [%4] %memref_2 : memref<3x3xi64>
+  %6 = gpu.dealloc async [%5] %memref_0 : memref<3x3xi64>
+  %7 = gpu.dealloc async [%6] %mem : memref<3x3xi64>
+  gpu.wait [%7]
+  return %alloc : memref<3x3xi64>
+  }
+  gpu.module @test_kernel attributes {spirv.target_env = #spirv.target_env<#spirv.vce<v1.0, [Addresses, Int64, Kernel], []>, api=OpenCL, #spirv.resource_limits<>>} {
+    gpu.func @test_kernel(%arg0: memref<3x3xi64>, %arg1: memref<3x3xi64>, %arg2: memref<3x3xi64>) kernel attributes {gpu.known_block_size = array<i32: 1, 1, 1>, gpu.known_grid_size = array<i32: 3, 3, 1>, spirv.entry_point_abi = #spirv.entry_point_abi<>} {
+      %0 = gpu.block_id  x
+      %1 = gpu.block_id  y
+      %2 = memref.load %arg0[%0, %1] : memref<3x3xi64>
+      %3 = memref.load %arg1[%0, %1] : memref<3x3xi64>
+      %4 = arith.addi %2, %3 : i64
+      memref.store %4, %arg2[%0, %1] : memref<3x3xi64>
+      gpu.return
+    }
+  }
+  // CHECK: [2,   4100,   6],
+  // CHECK: [16777224,   10,   4294971404],
+  // CHECK: [16777230,   1103806595088,   1099511627794]
+}
diff --git a/mlir/test/Integration/GPU/SYCL/lit.local.cfg b/mlir/test/Integration/GPU/SYCL/lit.local.cfg
@@ -0,0 +1,2 @@
+if not config.enable_sycl_runner:
+    config.unsupported = True
diff --git a/mlir/test/Target/LLVMIR/gpu.mlir b/mlir/test/Target/LLVMIR/gpu.mlir
@@ -4,6 +4,7 @@
 module attributes {gpu.container_module} {
   // CHECK: [[ARGS_TY:%.*]] = type { i32, i32 }
   // CHECK: @kernel_module_bin_cst = internal constant [4 x i8] c"BLOB", align 8
+  // CHECK:  @kernel_module_bin_size_cst = internal constant i64 4, align 8
   // CHECK: @kernel_module_kernel_kernel_name = private unnamed_addr constant [7 x i8] c"kernel\00", align 1
   gpu.binary @kernel_module  [#gpu.object<#nvvm.target, "BLOB">]
   llvm.func @foo() {
@@ -17,10 +18,10 @@ module attributes {gpu.container_module} {
     // CHECK: store i32 32, ptr [[ARG1]], align 4
     // CHECK: %{{.*}} = getelementptr ptr, ptr [[ARGS_ARRAY]], i32 1
     // CHECK: store ptr [[ARG1]], ptr %{{.*}}, align 8
-    // CHECK: [[MODULE:%.*]] = call ptr @mgpuModuleLoad(ptr @kernel_module_bin_cst)
+    // CHECK: [[MODULE:%.*]] = call ptr @mgpuModuleLoad(ptr @kernel_module_bin_cst, i64 4)
     // CHECK: [[FUNC:%.*]] = call ptr @mgpuModuleGetFunction(ptr [[MODULE]], ptr @kernel_module_kernel_kernel_name)
     // CHECK: [[STREAM:%.*]] = call ptr @mgpuStreamCreate()
-    // CHECK: call void @mgpuLaunchKernel(ptr [[FUNC]], i64 8, i64 8, i64 8, i64 8, i64 8, i64 8, i32 256, ptr [[STREAM]], ptr [[ARGS_ARRAY]], ptr null)
+    // CHECK: call void @mgpuLaunchKernel(ptr [[FUNC]], i64 8, i64 8, i64 8, i64 8, i64 8, i64 8, i32 256, ptr [[STREAM]], ptr [[ARGS_ARRAY]], ptr null, i64 2)
     // CHECK: call void @mgpuStreamSynchronize(ptr [[STREAM]])
     // CHECK: call void @mgpuStreamDestroy(ptr [[STREAM]])
     // CHECK: call void @mgpuModuleUnload(ptr [[MODULE]])
@@ -59,9 +60,9 @@ module attributes {gpu.container_module} {
     // CHECK: = call ptr @mgpuStreamCreate()
     // CHECK-NEXT: = alloca {{.*}}, align 8
     // CHECK-NEXT: [[ARGS:%.*]] = alloca ptr, i64 0, align 8
-    // CHECK-NEXT: [[MODULE:%.*]] = call ptr @mgpuModuleLoad(ptr @kernel_module_bin_cst)
+    // CHECK-NEXT: [[MODULE:%.*]] = call ptr @mgpuModuleLoad(ptr @kernel_module_bin_cst, i64 4)
     // CHECK-NEXT: [[FUNC:%.*]] = call ptr @mgpuModuleGetFunction(ptr [[MODULE]], ptr @kernel_module_kernel_kernel_name)
-    // CHECK-NEXT: call void @mgpuLaunchKernel(ptr [[FUNC]], i64 8, i64 8, i64 8, i64 8, i64 8, i64 8, i32 0, ptr {{.*}}, ptr [[ARGS]], ptr null)
+    // CHECK-NEXT: call void @mgpuLaunchKernel(ptr [[FUNC]], i64 8, i64 8, i64 8, i64 8, i64 8, i64 8, i32 0, ptr {{.*}}, ptr [[ARGS]], ptr null, i64 0)
     // CHECK-NEXT: call void @mgpuModuleUnload(ptr [[MODULE]])
     // CHECK-NEXT: call void @mgpuStreamSynchronize(ptr %{{.*}})
     // CHECK-NEXT: call void @mgpuStreamDestroy(ptr %{{.*}})

diff --git a/mlir/test/lit.cfg.py b/mlir/test/lit.cfg.py
@@ -126,6 +126,9 @@ def add_runtime(name):
 if config.enable_cuda_runner:
     tools.extend([add_runtime("mlir_cuda_runtime")])
 
+if config.enable_sycl_runner:
+    tools.extend([add_runtime("mlir_sycl_runtime")])
+
 # The following tools are optional
 tools.extend(
     [

diff --git a/mlir/test/lit.site.cfg.py.in b/mlir/test/lit.site.cfg.py.in
@@ -31,6 +31,7 @@ config.run_rocm_tests = @MLIR_ENABLE_ROCM_CONVERSIONS@
 config.enable_rocm_runner = @MLIR_ENABLE_ROCM_RUNNER@
 config.gpu_compilation_format = "@MLIR_GPU_COMPILATION_TEST_FORMAT@"
 config.rocm_test_chipset = "@ROCM_TEST_CHIPSET@"
+config.enable_sycl_runner = @MLIR_ENABLE_SYCL_RUNNER@
 config.enable_spirv_cpu_runner = @MLIR_ENABLE_SPIRV_CPU_RUNNER@
 config.enable_vulkan_runner = @MLIR_ENABLE_VULKAN_RUNNER@
 config.enable_bindings_python = @MLIR_ENABLE_BINDINGS_PYTHON@