Skip to content

Commit

Permalink
[spirv] Push GPU target conversion to before SPIR-V conversion (iree-…
Browse files Browse the repository at this point in the history
…org#17816)

This commit moves the `SPIRVConvertGPUTargetPass` to right before the
`ConvertToSPIRVPass` in the pipeline. This makes sure we use the same
`#iree_gpu.target` in the majority of the configuration and lowering
passes in the CodeGen flow, and scopes the SPIR-V target environment to
only the final SPIR-V conversion. With this, we are able to unify and
simplify lots of SPIR-V tests.

Progress towards iree-org#16341

ci-extra:
test_nvidia_gpu,test_nvidia_a100,test_amd_mi250,test_amd_w7900,build_test_all_macos_arm64,build_and_test_android

---------

Signed-off-by: Lei Zhang <antiagainst@gmail.com>
  • Loading branch information
antiagainst committed Jul 13, 2024
1 parent 2ed3f92 commit 9d6b425
Show file tree
Hide file tree
Showing 71 changed files with 1,102 additions and 1,732 deletions.
36 changes: 34 additions & 2 deletions compiler/src/iree/compiler/Codegen/Dialect/GPU/IR/IREEGPUAttrs.td
Original file line number Diff line number Diff line change
Expand Up @@ -362,15 +362,47 @@ def IREEGPU_TargetAttr : AttrDef<IREEGPU_Dialect, "Target"> {
let assemblyFormat = "`<` struct(params) `>`";

let extraClassDeclaration = [{
int getPreferredSubgroupSize() const {
return getWgp().getSubgroupSizeChoices().asArrayRef().front();
// Subgroup size related APIs

int getMinSubgroupSize() const {
return *llvm::min_element(getWgp().getSubgroupSizeChoices().asArrayRef());
}
int getMaxSubgroupSize() const {
return *llvm::max_element(getWgp().getSubgroupSizeChoices().asArrayRef());
}
// Returns the preferred subgroup size. If the target supports multiple
// subgroup sizes, pickLargest controls whether to return the largest one.
//
// AMD RDNA GPUs support multiple subgroup sizes and the preferred one
// differs depending on the API--HIP prefers 32 while Vulkan prefers 64.
// TODO: We should be able to force Vulkan side to use 32 consistently
// too with subgroup size control; it might have perf implications though.
int getPreferredSubgroupSize(bool pickLargest=false) const {
if (pickLargest) {
return getMaxSubgroupSize();
}
return getMinSubgroupSize();
}

// Hardware feature related APIs

bool supportsSubgroupShuffle() const {
return bitEnumContainsAll(getWgp().getSubgroup().getValue(),
SubgroupOps::Shuffle);
}

// Vendor querying APIs

bool isAMD() const {
return getArch().starts_with("gfx") || getArch().starts_with("rdna");
}
bool isApple() const { return getArch().starts_with("apple"); }
bool isARM() const { return getArch().starts_with("valhall"); }
bool isNVIDIA() const { return getArch().starts_with("sm_"); }
bool isQualcomm() const { return getArch().starts_with("adreno"); }

// CUDA specific querying APIs

std::optional<int> getCUDAComputeCapability() const;
// Returns true if this target supports TensorCore MMA ops with TF32
// input types.
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -563,7 +563,7 @@ std::optional<TargetDetails> getAndroidProfileDetails(StringRef target) {
//===----------------------------------------------------------------------===//

TargetAttr getMetalTargetDetails(MLIRContext *context) {
return createTargetAttr(*getAppleTargetDetails(), /*arch=*/"",
return createTargetAttr(*getAppleTargetDetails(), /*arch=*/"apple",
/*features=*/"spirv:v1.3,cap:Shader", context);
}

Expand Down Expand Up @@ -603,6 +603,8 @@ TargetAttr getVulkanTargetDetails(llvm::StringRef target,
// SPIR-V 1.4. For non-mobile GPUs we target Vulkan 1.3, which accepts
// SPIR-V 1.6 as the maximum.

// TODO: Add feature bits for physical storage buffer.

if (std::optional<TargetDetails> details = getAMDGPUTargetDetails(target)) {
return createTargetAttr(*details, normalizeAMDGPUTarget(target),
/*features=*/"spirv:v1.6,cap:Shader", context);
Expand Down Expand Up @@ -654,7 +656,8 @@ TargetAttr getFullTarget(StringRef targetAPI, StringRef aliasTarget,
StringRef features, MLIRContext *context) {
return llvm::StringSwitch<TargetAttr>(targetAPI)
.Case("cuda", getCUDATargetDetails(aliasTarget, features, context))
.Case("rocm", getHIPTargetDetails(aliasTarget, features, context))
.Case("hip", getHIPTargetDetails(aliasTarget, features, context))
.Case("vulkan", getVulkanTargetDetails(aliasTarget, context))
.Default(nullptr);
}

Expand Down
21 changes: 7 additions & 14 deletions compiler/src/iree/compiler/Codegen/SPIRV/AMDConfig.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -10,15 +10,10 @@
//
//===----------------------------------------------------------------------===//

#include "iree/compiler/Codegen/Dialect/Codegen/IR/IREECodegenAttrs.h"
#include "iree/compiler/Codegen/SPIRV/KernelConfig.h"
#include "iree/compiler/Codegen/Utils/Utils.h"
#include "iree/compiler/Dialect/Util/IR/UtilTypes.h"
#include "llvm/Support/Debug.h"
#include "mlir/Dialect/Linalg/IR/Linalg.h"
#include "mlir/Dialect/SPIRV/IR/SPIRVAttributes.h"
#include "mlir/Dialect/Tensor/IR/Tensor.h"
#include "mlir/IR/BuiltinOps.h"

#define DEBUG_TYPE "iree-spirv-amd-config"

Expand All @@ -35,15 +30,14 @@ constexpr unsigned AMDNumSubgroupsPerWorkgroup = 4;
constexpr unsigned AMDNumMNTilesPerSubgroup = 8;

static LogicalResult setAMDMatmulConfig(linalg::LinalgOp op,
const spirv::TargetEnv &targetEnv) {
IREE::GPU::TargetAttr target) {
if (succeeded(setCooperativeMatrixConfig(
targetEnv, op, AMDNumSubgroupsPerWorkgroup, AMDNumMNTilesPerSubgroup,
target, op, AMDNumSubgroupsPerWorkgroup, AMDNumMNTilesPerSubgroup,
AMDCoopMatrixSoftwarePipelineDepth,
AMDCoopMatrixSoftwarePipelineStoreStage)))
return success();

spirv::ResourceLimitsAttr limits = targetEnv.getResourceLimits();
const int subgroupSize = limits.getSubgroupSize();
int subgroupSize = target.getPreferredSubgroupSize(/*pickLargest=*/true);
const std::array<int64_t, 2> workgroupXY = {subgroupSize / 2, 8};
std::array<int64_t, 3> threadMNK;
auto inputType =
Expand All @@ -53,7 +47,7 @@ static LogicalResult setAMDMatmulConfig(linalg::LinalgOp op,
} else {
threadMNK = {8, 4, 16};
}
return setMatmulOpConfig(limits, op, workgroupXY, threadMNK,
return setMatmulOpConfig(target, op, workgroupXY, threadMNK,
/*enablePromotion=*/true,
AMDSimtSoftwarePipelineDepth,
AMDSimtSoftwarePipelineStoreStage);
Expand All @@ -71,14 +65,13 @@ static LogicalResult setAMDMatmulConfig(linalg::LinalgOp op,
// * Max 20 waves per SIMD32
// * Max 64KB LDS per workgroup

LogicalResult setAMDCodeGenConfig(const spirv::TargetEnv &targetEnv,
LogicalResult setAMDCodeGenConfig(IREE::GPU::TargetAttr target,
Operation *rootOp) {
spirv::ResourceLimitsAttr limits = targetEnv.getResourceLimits();
int subgroupSize = limits.getSubgroupSize();
int subgroupSize = target.getPreferredSubgroupSize(/*pickLargest=*/true);

if (auto linalgOp = dyn_cast<linalg::LinalgOp>(rootOp)) {
if (isMatmulOrBatchMatmul(linalgOp))
return setAMDMatmulConfig(linalgOp, targetEnv);
return setAMDMatmulConfig(linalgOp, target);
}

if (auto convOp = dyn_cast<linalg::ConvolutionOpInterface>(rootOp)) {
Expand Down
15 changes: 6 additions & 9 deletions compiler/src/iree/compiler/Codegen/SPIRV/AdrenoConfig.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -14,15 +14,13 @@

#include "iree/compiler/Codegen/SPIRV/KernelConfig.h"
#include "iree/compiler/Dialect/Util/IR/UtilTypes.h"
#include "llvm/ADT/TypeSwitch.h"
#include "mlir/Dialect/Linalg/IR/Linalg.h"
#include "mlir/IR/BuiltinOps.h"

namespace mlir::iree_compiler::detail {

static LogicalResult setAdrenoMatmulConfig(linalg::LinalgOp op,
spirv::ResourceLimitsAttr limits) {
const int subgroupSize = limits.getSubgroupSize();
IREE::GPU::TargetAttr target) {
const int subgroupSize = target.getPreferredSubgroupSize();
const std::array<int64_t, 2> workgroupXY = {subgroupSize / 2, 2};
std::array<int64_t, 3> threadMNK;
auto inputType =
Expand All @@ -32,24 +30,23 @@ static LogicalResult setAdrenoMatmulConfig(linalg::LinalgOp op,
} else {
threadMNK = {16, 4, 4};
}
return setMatmulOpConfig(limits, op, workgroupXY, threadMNK);
return setMatmulOpConfig(target, op, workgroupXY, threadMNK);
}

//===----------------------------------------------------------------------===//
// Entry Point
//===----------------------------------------------------------------------===//

LogicalResult setAdrenoCodeGenConfig(const spirv::TargetEnv &targetEnv,
LogicalResult setAdrenoCodeGenConfig(IREE::GPU::TargetAttr target,
Operation *rootOp) {
spirv::ResourceLimitsAttr limits = targetEnv.getResourceLimits();
int subgroupSize = limits.getSubgroupSize();
int subgroupSize = target.getPreferredSubgroupSize();

if (!isa<linalg::LinalgOp>(rootOp))
return failure();

auto linalgOp = cast<linalg::LinalgOp>(rootOp);
if (isMatmulOrBatchMatmul(linalgOp))
return setAdrenoMatmulConfig(linalgOp, limits);
return setAdrenoMatmulConfig(linalgOp, target);

if (auto convOp = dyn_cast<linalg::ConvolutionOpInterface>(rootOp)) {
// Use the result type in case of larger bitwidth for accumulators.
Expand Down
14 changes: 5 additions & 9 deletions compiler/src/iree/compiler/Codegen/SPIRV/AppleConfig.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -14,15 +14,12 @@

#include "iree/compiler/Codegen/SPIRV/KernelConfig.h"
#include "iree/compiler/Dialect/Util/IR/UtilTypes.h"
#include "llvm/ADT/TypeSwitch.h"
#include "mlir/Dialect/Linalg/IR/Linalg.h"
#include "mlir/Dialect/SPIRV/IR/SPIRVAttributes.h"
#include "mlir/IR/BuiltinOps.h"

namespace mlir::iree_compiler::detail {

static LogicalResult setAppleMatmulConfig(linalg::LinalgOp op,
spirv::ResourceLimitsAttr limits) {
IREE::GPU::TargetAttr target) {
const std::array<int64_t, 2> workgroupXY = {256, 1};
std::array<int64_t, 3> threadMNK;
auto inputType =
Expand All @@ -32,21 +29,20 @@ static LogicalResult setAppleMatmulConfig(linalg::LinalgOp op,
} else {
threadMNK = {4, 4, 4};
}
return setMatmulOpConfig(limits, op, workgroupXY, threadMNK);
return setMatmulOpConfig(target, op, workgroupXY, threadMNK);
}

//===----------------------------------------------------------------------===//
// Entry Point
//===----------------------------------------------------------------------===//

LogicalResult setAppleCodeGenConfig(const spirv::TargetEnv &targetEnv,
LogicalResult setAppleCodeGenConfig(IREE::GPU::TargetAttr target,
Operation *rootOp) {
spirv::ResourceLimitsAttr limits = targetEnv.getResourceLimits();
int subgroupSize = limits.getSubgroupSize();
int subgroupSize = target.getPreferredSubgroupSize();

if (auto linalgOp = dyn_cast<linalg::LinalgOp>(rootOp)) {
if (isMatmulOrBatchMatmul(linalgOp))
return setAppleMatmulConfig(linalgOp, limits);
return setAppleMatmulConfig(linalgOp, target);
}

if (auto convOp = dyn_cast<linalg::ConvolutionOpInterface>(rootOp)) {
Expand Down
1 change: 1 addition & 0 deletions compiler/src/iree/compiler/Codegen/SPIRV/BUILD.bazel
Original file line number Diff line number Diff line change
Expand Up @@ -91,6 +91,7 @@ iree_compiler_cc_library(
"//compiler/src/iree/compiler/Codegen/Common/GPU:CommonGPUPasses",
"//compiler/src/iree/compiler/Codegen/Common/GPU:GPUHeuristics",
"//compiler/src/iree/compiler/Codegen/Dialect/Codegen/IR:IREECodegenDialect",
"//compiler/src/iree/compiler/Codegen/Dialect/GPU/IR:IREEGPUDialect",
"//compiler/src/iree/compiler/Codegen/Interfaces:PartitionableLoopsInterface",
"//compiler/src/iree/compiler/Codegen/TransformStrategies/GPU",
"//compiler/src/iree/compiler/Codegen/Transforms",
Expand Down
1 change: 1 addition & 0 deletions compiler/src/iree/compiler/Codegen/SPIRV/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -140,6 +140,7 @@ iree_cc_library(
iree::compiler::Codegen::Common::GPU::GPUHeuristics
iree::compiler::Codegen::Common::TransformDialectInterpreterPass
iree::compiler::Codegen::Dialect::Codegen::IR::IREECodegenDialect
iree::compiler::Codegen::Dialect::GPU::IR::IREEGPUDialect
iree::compiler::Codegen::Interfaces::PartitionableLoopsInterface
iree::compiler::Codegen::TransformStrategies::GPU
iree::compiler::Codegen::Transforms
Expand Down
22 changes: 9 additions & 13 deletions compiler/src/iree/compiler/Codegen/SPIRV/ConvertToSPIRVPass.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -17,16 +17,12 @@
#include <cstdint>
#include <tuple>

#include "iree/compiler/Codegen/Dialect/Codegen/IR/IREECodegenAttrs.h"
#include "iree/compiler/Codegen/SPIRV/PassDetail.h"
#include "iree/compiler/Codegen/SPIRV/Passes.h"
#include "iree/compiler/Codegen/SPIRV/Utils.h"
#include "iree/compiler/Codegen/Utils/MarkerUtils.h"
#include "iree/compiler/Codegen/Utils/Utils.h"
#include "iree/compiler/Dialect/HAL/IR/HALOps.h"
#include "iree/compiler/Dialect/HAL/IR/HALTypes.h"
#include "iree/compiler/Dialect/Util/IR/UtilOps.h"
#include "llvm/ADT/DenseMapInfo.h"
#include "llvm/ADT/DenseSet.h"
#include "llvm/ADT/STLExtras.h"
#include "llvm/Support/FormatVariadic.h"
Expand All @@ -41,15 +37,11 @@
#include "mlir/Conversion/TensorToSPIRV/TensorToSPIRV.h"
#include "mlir/Conversion/VectorToSPIRV/VectorToSPIRV.h"
#include "mlir/Dialect/Arith/Transforms/Passes.h"
#include "mlir/Dialect/Arith/Utils/Utils.h"
#include "mlir/Dialect/Bufferization/IR/Bufferization.h"
#include "mlir/Dialect/Func/IR/FuncOps.h"
#include "mlir/Dialect/GPU/IR/GPUDialect.h"
#include "mlir/Dialect/Linalg/IR/Linalg.h"
#include "mlir/Dialect/Math/Transforms/Passes.h"
#include "mlir/Dialect/MemRef/IR/MemRef.h"
#include "mlir/Dialect/SPIRV/IR/SPIRVDialect.h"
#include "mlir/Dialect/SPIRV/IR/SPIRVEnums.h"
#include "mlir/Dialect/SPIRV/IR/SPIRVOps.h"
#include "mlir/Dialect/SPIRV/IR/SPIRVTypes.h"
#include "mlir/Dialect/SPIRV/Transforms/SPIRVConversion.h"
Expand Down Expand Up @@ -596,17 +588,21 @@ void ConvertToSPIRVPass::runOnOperation() {
}
}

spirv::TargetEnvAttr targetAttr = getSPIRVTargetEnvAttr(moduleOp);
moduleOp->setAttr(spirv::getTargetEnvAttrName(), targetAttr);

if (indexBits != 32 && indexBits != 64) {
moduleOp.emitOpError(
"Only 32-bit or 64-bit indices are supported for SPIR-V");
"only 32-bit or 64-bit indices are supported for SPIR-V");
return signalPassFailure();
}

bool use64bitIndex = indexBits == 64;

auto targetAttr = moduleOp->getAttrOfType<spirv::TargetEnvAttr>(
spirv::getTargetEnvAttrName());
if (!targetAttr) {
moduleOp.emitOpError("should contain a spirv.target_env attribute");
return signalPassFailure();
}
spirv::TargetEnv targetEnv(targetAttr);

if (use64bitIndex && !targetEnv.allows(spirv::Capability::Int64)) {
moduleOp.emitOpError(
"64-bit indices are not supported for the specified target "
Expand Down
Loading

0 comments on commit 9d6b425

Please sign in to comment.