-
Notifications
You must be signed in to change notification settings - Fork 10.8k
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
[mlir] Lower math dialect later in gpu-lower-to-nvvm-pipeline #78556
Conversation
@llvm/pr-subscribers-mlir @llvm/pr-subscribers-mlir-gpu Author: Guray Ozen (grypp) Changes: This PR moves lowering of the math dialect later in the pipeline, because the math dialect is lowered correctly by `createConvertGpuOpsToNVVMOps` for the GPU target, which needs to run first. Patch is 89.79 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/78556.diff 3 Files Affected:
diff --git a/mlir/lib/Dialect/GPU/Pipelines/GPUToNVVMPipeline.cpp b/mlir/lib/Dialect/GPU/Pipelines/GPUToNVVMPipeline.cpp
index 0b4739214bf2f1..935f0deaf9c8a6 100644
--- a/mlir/lib/Dialect/GPU/Pipelines/GPUToNVVMPipeline.cpp
+++ b/mlir/lib/Dialect/GPU/Pipelines/GPUToNVVMPipeline.cpp
@@ -51,7 +51,6 @@ void buildCommonPassPipeline(
pm.addPass(createConvertVectorToSCFPass());
pm.addPass(createConvertSCFToCFPass());
pm.addPass(createConvertNVVMToLLVMPass());
- pm.addPass(createConvertMathToLLVMPass());
pm.addPass(createConvertFuncToLLVMPass());
pm.addPass(memref::createExpandStridedMetadataPass());
@@ -98,6 +97,7 @@ void buildHostPostPipeline(OpPassManager &pm,
GpuModuleToBinaryPassOptions gpuModuleToBinaryPassOptions;
gpuModuleToBinaryPassOptions.compilationTarget = options.cubinFormat;
pm.addPass(createGpuModuleToBinaryPass(gpuModuleToBinaryPassOptions));
+ pm.addPass(createConvertMathToLLVMPass());
pm.addPass(createCanonicalizerPass());
pm.addPass(createCSEPass());
pm.addPass(createReconcileUnrealizedCastsPass());
diff --git a/mlir/test/Dialect/GPU/test-nvvm-pipeline.mlir b/mlir/test/Dialect/GPU/test-nvvm-pipeline.mlir
new file mode 100644
index 00000000000000..7c55059f4a84dc
--- /dev/null
+++ b/mlir/test/Dialect/GPU/test-nvvm-pipeline.mlir
@@ -0,0 +1,18 @@
+// RUN: mlir-opt -gpu-lower-to-nvvm-pipeline="cubin-format=isa" -split-input-file %s | FileCheck %s
+
+// CHECK-LABEL: llvm.func @test_math(%arg0: f32) {
+func.func @test_math(%arg0 : f32) {
+ %c2 = arith.constant 2 : index
+ %c1 = arith.constant 1 : index
+ // CHECK: gpu.launch_func @test_math_kernel::@test_math_kernel
+ // CHECK: gpu.binary @test_math_kernel [#gpu.object<#nvvm.target
+ gpu.launch
+ blocks(%0, %1, %2) in (%3 = %c1, %4 = %c1, %5 = %c1)
+ threads(%6, %7, %8) in (%9 = %c2, %10 = %c1, %11 = %c1) {
+ %s1 = math.exp %arg0 : f32
+ gpu.printf "%f" %s1 : f32
+ gpu.printf "Hello from %d\n" %6 : index
+ gpu.terminator
+ }
+ return
+}
\ No newline at end of file
diff --git a/stmatrix/everything-casted.mlir b/stmatrix/everything-casted.mlir
new file mode 100644
index 00000000000000..28ebad10c6a3ce
--- /dev/null
+++ b/stmatrix/everything-casted.mlir
@@ -0,0 +1,1267 @@
+// -----// IR Dump After ConvertNVGPUToNVVMPass (convert-nvgpu-to-nvvm) //----- //
+module {
+ gpu.module @asd {
+ gpu.func @foo(%arg0: memref<64x32xf16, 3>, %arg1: i64, %arg2: i64) {
+ %0 = builtin.unrealized_conversion_cast %arg0 : memref<64x32xf16, 3> to !llvm.struct<(ptr<3>, ptr<3>, i64, array<2 x i64>, array<2 x i64>)>
+ memref.assume_alignment %arg0, 32 : memref<64x32xf16, 3>
+ %1 = llvm.mlir.constant(0 : i32) : i32
+ %2 = llvm.mlir.undef : !llvm.struct<(struct<(i32, i32, i32, i32, i32, i32, i32, i32)>)>
+ %3 = llvm.extractvalue %2[0] : !llvm.struct<(struct<(i32, i32, i32, i32, i32, i32, i32, i32)>)>
+ %4 = llvm.insertvalue %1, %3[0] : !llvm.struct<(i32, i32, i32, i32, i32, i32, i32, i32)>
+ %5 = llvm.insertvalue %1, %4[1] : !llvm.struct<(i32, i32, i32, i32, i32, i32, i32, i32)>
+ %6 = llvm.insertvalue %1, %5[2] : !llvm.struct<(i32, i32, i32, i32, i32, i32, i32, i32)>
+ %7 = llvm.insertvalue %1, %6[3] : !llvm.struct<(i32, i32, i32, i32, i32, i32, i32, i32)>
+ %8 = llvm.insertvalue %1, %7[4] : !llvm.struct<(i32, i32, i32, i32, i32, i32, i32, i32)>
+ %9 = llvm.insertvalue %1, %8[5] : !llvm.struct<(i32, i32, i32, i32, i32, i32, i32, i32)>
+ %10 = llvm.insertvalue %1, %9[6] : !llvm.struct<(i32, i32, i32, i32, i32, i32, i32, i32)>
+ %11 = llvm.insertvalue %1, %10[7] : !llvm.struct<(i32, i32, i32, i32, i32, i32, i32, i32)>
+ %12 = llvm.insertvalue %11, %2[0] : !llvm.struct<(struct<(i32, i32, i32, i32, i32, i32, i32, i32)>)>
+ %13 = builtin.unrealized_conversion_cast %arg1 : i64 to !nvgpu.warpgroup.descriptor<tensor = memref<64x16xf16, 3>>
+ %14 = builtin.unrealized_conversion_cast %arg2 : i64 to !nvgpu.warpgroup.descriptor<tensor = memref<16x32xf16, 3>>
+ nvvm.wgmma.fence.aligned
+ %15 = llvm.mlir.undef : !llvm.struct<(struct<(i32, i32, i32, i32, i32, i32, i32, i32)>)>
+ %16 = llvm.extractvalue %12[0] : !llvm.struct<(struct<(i32, i32, i32, i32, i32, i32, i32, i32)>)>
+ %17 = nvvm.wgmma.mma_async %arg1, %arg2, %16, <m = 64, n = 32, k = 16>, D[<f16>, <one>, <wrapped>], A[<f16>, <one>, <row>], B[<f16>, <one>, <row>] : !llvm.struct<(i32, i32, i32, i32, i32, i32, i32, i32)> -> !llvm.struct<(i32, i32, i32, i32, i32, i32, i32, i32)>
+ %18 = llvm.insertvalue %17, %15[0] : !llvm.struct<(struct<(i32, i32, i32, i32, i32, i32, i32, i32)>)>
+ nvvm.wgmma.commit.group.sync.aligned
+ nvvm.wgmma.wait.group.sync.aligned 1
+ %19 = llvm.extractvalue %18[0] : !llvm.struct<(struct<(i32, i32, i32, i32, i32, i32, i32, i32)>)>
+ %20 = llvm.mlir.constant(1 : i32) : i32
+ %21 = llvm.mlir.constant(2 : i32) : i32
+ %22 = llvm.mlir.constant(4 : i32) : i32
+ %23 = llvm.mlir.constant(8 : i32) : i32
+ %24 = llvm.mlir.constant(16 : i32) : i32
+ %25 = llvm.mlir.constant(32 : i32) : i32
+ %26 = nvvm.read.ptx.sreg.tid.x : i32
+ %27 = llvm.urem %26, %25 : i32
+ %28 = llvm.udiv %26, %25 : i32
+ %29 = llvm.udiv %27, %22 : i32
+ %30 = llvm.urem %27, %22 : i32
+ %31 = llvm.mul %30, %21 : i32
+ %32 = llvm.mul %28, %24 : i32
+ %33 = llvm.add %29, %32 : i32
+ %34 = llvm.mlir.constant(0 : i32) : i32
+ %35 = llvm.mul %34, %23 : i32
+ %36 = llvm.add %33, %35 : i32
+ %37 = llvm.mlir.constant(0 : i32) : i32
+ %38 = llvm.mul %37, %23 : i32
+ %39 = llvm.add %31, %38 : i32
+ %40 = arith.index_cast %36 : i32 to index
+ %41 = arith.index_cast %39 : i32 to index
+ %42 = llvm.extractvalue %19[0] : !llvm.struct<(i32, i32, i32, i32, i32, i32, i32, i32)>
+ %43 = llvm.bitcast %42 : i32 to vector<2xf16>
+ vector.store %43, %arg0[%40, %41] : memref<64x32xf16, 3>, vector<2xf16>
+ %44 = llvm.mlir.constant(1 : i32) : i32
+ %45 = llvm.mul %44, %23 : i32
+ %46 = llvm.add %33, %45 : i32
+ %47 = llvm.mlir.constant(0 : i32) : i32
+ %48 = llvm.mul %47, %23 : i32
+ %49 = llvm.add %31, %48 : i32
+ %50 = arith.index_cast %46 : i32 to index
+ %51 = arith.index_cast %49 : i32 to index
+ %52 = llvm.extractvalue %19[2] : !llvm.struct<(i32, i32, i32, i32, i32, i32, i32, i32)>
+ %53 = llvm.bitcast %52 : i32 to vector<2xf16>
+ vector.store %53, %arg0[%50, %51] : memref<64x32xf16, 3>, vector<2xf16>
+ gpu.return
+ }
+ }
+}
+
+
+// -----// IR Dump After GpuKernelOutlining (gpu-kernel-outlining) //----- //
+module {
+ gpu.module @asd {
+ gpu.func @foo(%arg0: memref<64x32xf16, 3>, %arg1: i64, %arg2: i64) {
+ %0 = builtin.unrealized_conversion_cast %arg0 : memref<64x32xf16, 3> to !llvm.struct<(ptr<3>, ptr<3>, i64, array<2 x i64>, array<2 x i64>)>
+ memref.assume_alignment %arg0, 32 : memref<64x32xf16, 3>
+ %1 = llvm.mlir.constant(0 : i32) : i32
+ %2 = llvm.mlir.undef : !llvm.struct<(struct<(i32, i32, i32, i32, i32, i32, i32, i32)>)>
+ %3 = llvm.extractvalue %2[0] : !llvm.struct<(struct<(i32, i32, i32, i32, i32, i32, i32, i32)>)>
+ %4 = llvm.insertvalue %1, %3[0] : !llvm.struct<(i32, i32, i32, i32, i32, i32, i32, i32)>
+ %5 = llvm.insertvalue %1, %4[1] : !llvm.struct<(i32, i32, i32, i32, i32, i32, i32, i32)>
+ %6 = llvm.insertvalue %1, %5[2] : !llvm.struct<(i32, i32, i32, i32, i32, i32, i32, i32)>
+ %7 = llvm.insertvalue %1, %6[3] : !llvm.struct<(i32, i32, i32, i32, i32, i32, i32, i32)>
+ %8 = llvm.insertvalue %1, %7[4] : !llvm.struct<(i32, i32, i32, i32, i32, i32, i32, i32)>
+ %9 = llvm.insertvalue %1, %8[5] : !llvm.struct<(i32, i32, i32, i32, i32, i32, i32, i32)>
+ %10 = llvm.insertvalue %1, %9[6] : !llvm.struct<(i32, i32, i32, i32, i32, i32, i32, i32)>
+ %11 = llvm.insertvalue %1, %10[7] : !llvm.struct<(i32, i32, i32, i32, i32, i32, i32, i32)>
+ %12 = llvm.insertvalue %11, %2[0] : !llvm.struct<(struct<(i32, i32, i32, i32, i32, i32, i32, i32)>)>
+ %13 = builtin.unrealized_conversion_cast %arg1 : i64 to !nvgpu.warpgroup.descriptor<tensor = memref<64x16xf16, 3>>
+ %14 = builtin.unrealized_conversion_cast %arg2 : i64 to !nvgpu.warpgroup.descriptor<tensor = memref<16x32xf16, 3>>
+ nvvm.wgmma.fence.aligned
+ %15 = llvm.mlir.undef : !llvm.struct<(struct<(i32, i32, i32, i32, i32, i32, i32, i32)>)>
+ %16 = llvm.extractvalue %12[0] : !llvm.struct<(struct<(i32, i32, i32, i32, i32, i32, i32, i32)>)>
+ %17 = nvvm.wgmma.mma_async %arg1, %arg2, %16, <m = 64, n = 32, k = 16>, D[<f16>, <one>, <wrapped>], A[<f16>, <one>, <row>], B[<f16>, <one>, <row>] : !llvm.struct<(i32, i32, i32, i32, i32, i32, i32, i32)> -> !llvm.struct<(i32, i32, i32, i32, i32, i32, i32, i32)>
+ %18 = llvm.insertvalue %17, %15[0] : !llvm.struct<(struct<(i32, i32, i32, i32, i32, i32, i32, i32)>)>
+ nvvm.wgmma.commit.group.sync.aligned
+ nvvm.wgmma.wait.group.sync.aligned 1
+ %19 = llvm.extractvalue %18[0] : !llvm.struct<(struct<(i32, i32, i32, i32, i32, i32, i32, i32)>)>
+ %20 = llvm.mlir.constant(1 : i32) : i32
+ %21 = llvm.mlir.constant(2 : i32) : i32
+ %22 = llvm.mlir.constant(4 : i32) : i32
+ %23 = llvm.mlir.constant(8 : i32) : i32
+ %24 = llvm.mlir.constant(16 : i32) : i32
+ %25 = llvm.mlir.constant(32 : i32) : i32
+ %26 = nvvm.read.ptx.sreg.tid.x : i32
+ %27 = llvm.urem %26, %25 : i32
+ %28 = llvm.udiv %26, %25 : i32
+ %29 = llvm.udiv %27, %22 : i32
+ %30 = llvm.urem %27, %22 : i32
+ %31 = llvm.mul %30, %21 : i32
+ %32 = llvm.mul %28, %24 : i32
+ %33 = llvm.add %29, %32 : i32
+ %34 = llvm.mlir.constant(0 : i32) : i32
+ %35 = llvm.mul %34, %23 : i32
+ %36 = llvm.add %33, %35 : i32
+ %37 = llvm.mlir.constant(0 : i32) : i32
+ %38 = llvm.mul %37, %23 : i32
+ %39 = llvm.add %31, %38 : i32
+ %40 = arith.index_cast %36 : i32 to index
+ %41 = arith.index_cast %39 : i32 to index
+ %42 = llvm.extractvalue %19[0] : !llvm.struct<(i32, i32, i32, i32, i32, i32, i32, i32)>
+ %43 = llvm.bitcast %42 : i32 to vector<2xf16>
+ vector.store %43, %arg0[%40, %41] : memref<64x32xf16, 3>, vector<2xf16>
+ %44 = llvm.mlir.constant(1 : i32) : i32
+ %45 = llvm.mul %44, %23 : i32
+ %46 = llvm.add %33, %45 : i32
+ %47 = llvm.mlir.constant(0 : i32) : i32
+ %48 = llvm.mul %47, %23 : i32
+ %49 = llvm.add %31, %48 : i32
+ %50 = arith.index_cast %46 : i32 to index
+ %51 = arith.index_cast %49 : i32 to index
+ %52 = llvm.extractvalue %19[2] : !llvm.struct<(i32, i32, i32, i32, i32, i32, i32, i32)>
+ %53 = llvm.bitcast %52 : i32 to vector<2xf16>
+ vector.store %53, %arg0[%50, %51] : memref<64x32xf16, 3>, vector<2xf16>
+ gpu.return
+ }
+ }
+}
+
+
+// -----// IR Dump After ConvertVectorToSCF (convert-vector-to-scf) //----- //
+module {
+ gpu.module @asd {
+ gpu.func @foo(%arg0: memref<64x32xf16, 3>, %arg1: i64, %arg2: i64) {
+ %0 = llvm.mlir.constant(32 : i32) : i32
+ %1 = llvm.mlir.constant(16 : i32) : i32
+ %2 = llvm.mlir.constant(8 : i32) : i32
+ %3 = llvm.mlir.constant(4 : i32) : i32
+ %4 = llvm.mlir.constant(2 : i32) : i32
+ %5 = llvm.mlir.constant(1 : i32) : i32
+ %6 = llvm.mlir.constant(0 : i32) : i32
+ memref.assume_alignment %arg0, 32 : memref<64x32xf16, 3>
+ %7 = llvm.mlir.undef : !llvm.struct<(struct<(i32, i32, i32, i32, i32, i32, i32, i32)>)>
+ %8 = llvm.extractvalue %7[0] : !llvm.struct<(struct<(i32, i32, i32, i32, i32, i32, i32, i32)>)>
+ %9 = llvm.insertvalue %6, %8[0] : !llvm.struct<(i32, i32, i32, i32, i32, i32, i32, i32)>
+ %10 = llvm.insertvalue %6, %9[1] : !llvm.struct<(i32, i32, i32, i32, i32, i32, i32, i32)>
+ %11 = llvm.insertvalue %6, %10[2] : !llvm.struct<(i32, i32, i32, i32, i32, i32, i32, i32)>
+ %12 = llvm.insertvalue %6, %11[3] : !llvm.struct<(i32, i32, i32, i32, i32, i32, i32, i32)>
+ %13 = llvm.insertvalue %6, %12[4] : !llvm.struct<(i32, i32, i32, i32, i32, i32, i32, i32)>
+ %14 = llvm.insertvalue %6, %13[5] : !llvm.struct<(i32, i32, i32, i32, i32, i32, i32, i32)>
+ %15 = llvm.insertvalue %6, %14[6] : !llvm.struct<(i32, i32, i32, i32, i32, i32, i32, i32)>
+ %16 = llvm.insertvalue %6, %15[7] : !llvm.struct<(i32, i32, i32, i32, i32, i32, i32, i32)>
+ nvvm.wgmma.fence.aligned
+ %17 = nvvm.wgmma.mma_async %arg1, %arg2, %16, <m = 64, n = 32, k = 16>, D[<f16>, <one>, <wrapped>], A[<f16>, <one>, <row>], B[<f16>, <one>, <row>] : !llvm.struct<(i32, i32, i32, i32, i32, i32, i32, i32)> -> !llvm.struct<(i32, i32, i32, i32, i32, i32, i32, i32)>
+ nvvm.wgmma.commit.group.sync.aligned
+ nvvm.wgmma.wait.group.sync.aligned 1
+ %18 = nvvm.read.ptx.sreg.tid.x : i32
+ %19 = llvm.urem %18, %0 : i32
+ %20 = llvm.udiv %18, %0 : i32
+ %21 = llvm.udiv %19, %3 : i32
+ %22 = llvm.urem %19, %3 : i32
+ %23 = llvm.mul %22, %4 : i32
+ %24 = llvm.mul %20, %1 : i32
+ %25 = llvm.add %21, %24 : i32
+ %26 = llvm.mul %6, %2 : i32
+ %27 = llvm.add %25, %26 : i32
+ %28 = llvm.mul %6, %2 : i32
+ %29 = llvm.add %23, %28 : i32
+ %30 = arith.index_cast %27 : i32 to index
+ %31 = arith.index_cast %29 : i32 to index
+ %32 = llvm.extractvalue %17[0] : !llvm.struct<(i32, i32, i32, i32, i32, i32, i32, i32)>
+ %33 = llvm.bitcast %32 : i32 to vector<2xf16>
+ vector.store %33, %arg0[%30, %31] : memref<64x32xf16, 3>, vector<2xf16>
+ %34 = llvm.mul %5, %2 : i32
+ %35 = llvm.add %25, %34 : i32
+ %36 = llvm.mul %6, %2 : i32
+ %37 = llvm.add %23, %36 : i32
+ %38 = arith.index_cast %35 : i32 to index
+ %39 = arith.index_cast %37 : i32 to index
+ %40 = llvm.extractvalue %17[2] : !llvm.struct<(i32, i32, i32, i32, i32, i32, i32, i32)>
+ %41 = llvm.bitcast %40 : i32 to vector<2xf16>
+ vector.store %41, %arg0[%38, %39] : memref<64x32xf16, 3>, vector<2xf16>
+ gpu.return
+ }
+ }
+}
+
+
+// -----// IR Dump After SCFToControlFlow (convert-scf-to-cf) //----- //
+module {
+ gpu.module @asd {
+ gpu.func @foo(%arg0: memref<64x32xf16, 3>, %arg1: i64, %arg2: i64) {
+ %0 = llvm.mlir.constant(32 : i32) : i32
+ %1 = llvm.mlir.constant(16 : i32) : i32
+ %2 = llvm.mlir.constant(8 : i32) : i32
+ %3 = llvm.mlir.constant(4 : i32) : i32
+ %4 = llvm.mlir.constant(2 : i32) : i32
+ %5 = llvm.mlir.constant(1 : i32) : i32
+ %6 = llvm.mlir.constant(0 : i32) : i32
+ memref.assume_alignment %arg0, 32 : memref<64x32xf16, 3>
+ %7 = llvm.mlir.undef : !llvm.struct<(struct<(i32, i32, i32, i32, i32, i32, i32, i32)>)>
+ %8 = llvm.extractvalue %7[0] : !llvm.struct<(struct<(i32, i32, i32, i32, i32, i32, i32, i32)>)>
+ %9 = llvm.insertvalue %6, %8[0] : !llvm.struct<(i32, i32, i32, i32, i32, i32, i32, i32)>
+ %10 = llvm.insertvalue %6, %9[1] : !llvm.struct<(i32, i32, i32, i32, i32, i32, i32, i32)>
+ %11 = llvm.insertvalue %6, %10[2] : !llvm.struct<(i32, i32, i32, i32, i32, i32, i32, i32)>
+ %12 = llvm.insertvalue %6, %11[3] : !llvm.struct<(i32, i32, i32, i32, i32, i32, i32, i32)>
+ %13 = llvm.insertvalue %6, %12[4] : !llvm.struct<(i32, i32, i32, i32, i32, i32, i32, i32)>
+ %14 = llvm.insertvalue %6, %13[5] : !llvm.struct<(i32, i32, i32, i32, i32, i32, i32, i32)>
+ %15 = llvm.insertvalue %6, %14[6] : !llvm.struct<(i32, i32, i32, i32, i32, i32, i32, i32)>
+ %16 = llvm.insertvalue %6, %15[7] : !llvm.struct<(i32, i32, i32, i32, i32, i32, i32, i32)>
+ nvvm.wgmma.fence.aligned
+ %17 = nvvm.wgmma.mma_async %arg1, %arg2, %16, <m = 64, n = 32, k = 16>, D[<f16>, <one>, <wrapped>], A[<f16>, <one>, <row>], B[<f16>, <one>, <row>] : !llvm.struct<(i32, i32, i32, i32, i32, i32, i32, i32)> -> !llvm.struct<(i32, i32, i32, i32, i32, i32, i32, i32)>
+ nvvm.wgmma.commit.group.sync.aligned
+ nvvm.wgmma.wait.group.sync.aligned 1
+ %18 = nvvm.read.ptx.sreg.tid.x : i32
+ %19 = llvm.urem %18, %0 : i32
+ %20 = llvm.udiv %18, %0 : i32
+ %21 = llvm.udiv %19, %3 : i32
+ %22 = llvm.urem %19, %3 : i32
+ %23 = llvm.mul %22, %4 : i32
+ %24 = llvm.mul %20, %1 : i32
+ %25 = llvm.add %21, %24 : i32
+ %26 = llvm.mul %6, %2 : i32
+ %27 = llvm.add %25, %26 : i32
+ %28 = llvm.mul %6, %2 : i32
+ %29 = llvm.add %23, %28 : i32
+ %30 = arith.index_cast %27 : i32 to index
+ %31 = arith.index_cast %29 : i32 to index
+ %32 = llvm.extractvalue %17[0] : !llvm.struct<(i32, i32, i32, i32, i32, i32, i32, i32)>
+ %33 = llvm.bitcast %32 : i32 to vector<2xf16>
+ vector.store %33, %arg0[%30, %31] : memref<64x32xf16, 3>, vector<2xf16>
+ %34 = llvm.mul %5, %2 : i32
+ %35 = llvm.add %25, %34 : i32
+ %36 = llvm.mul %6, %2 : i32
+ %37 = llvm.add %23, %36 : i32
+ %38 = arith.index_cast %35 : i32 to index
+ %39 = arith.index_cast %37 : i32 to index
+ %40 = llvm.extractvalue %17[2] : !llvm.struct<(i32, i32, i32, i32, i32, i32, i32, i32)>
+ %41 = llvm.bitcast %40 : i32 to vector<2xf16>
+ vector.store %41, %arg0[%38, %39] : memref<64x32xf16, 3>, vector<2xf16>
+ gpu.return
+ }
+ }
+}
+
+
+// -----// IR Dump After ConvertNVVMToLLVMPass (convert-nvvm-to-llvm) //----- //
+module {
+ gpu.module @asd {
+ gpu.func @foo(%arg0: memref<64x32xf16, 3>, %arg1: i64, %arg2: i64) {
+ %0 = llvm.mlir.constant(32 : i32) : i32
+ %1 = llvm.mlir.constant(16 : i32) : i32
+ %2 = llvm.mlir.constant(8 : i32) : i32
+ %3 = llvm.mlir.constant(4 : i32) : i32
+ %4 = llvm.mlir.constant(2 : i32) : i32
+ %5 = llvm.mlir.constant(1 : i32) : i32
+ %6 = llvm.mlir.constant(0 : i32) : i32
+ memref.assume_alignment %arg0, 32 : memref<64x32xf16, 3>
+ %7 = llvm.mlir.undef : !llvm.struct<(struct<(i32, i32, i32, i32, i32, i32, i32, i32)>)>
+ %8 = llvm.extractvalue %7[0] : !llvm.struct<(struct<(i32, i32, i32, i32, i32, i32, i32, i32)>)>
+ %9 = llvm.insertvalue %6, %8[0] : !llvm.struct<(i32, i32, i32, i32, i32, i32, i32, i32)>
+ %10 = llvm.insertvalue %6, %9[1] : !llvm.struct<(i32, i32, i32, i32, i32, i32, i32, i32)>
+ %11 = llvm.insertvalue %6, %10[2] : !llvm.struct<(i32, i32, i32, i32, i32, i32, i32, i32)>
+ %12 = llvm.insertvalue %6, %11[3] : !llvm.struct<(i32, i32, i32, i32, i32, i32, i32, i32)>
+ %13 = llvm.insertvalue %6, %12[4] : !llvm.struct<(i32, i32, i32, i32, i32, i32, i32, i32)>
+ %14 = llvm.insertvalue %6, %13[5] : !llvm.struct<(i32, i32, i32, i32, i32, i32, i32, i32)>
+ %15 = llvm.insertvalue %6, %14[6] : !llvm.struct<(i32, i32, i32, i32, i32, i32, i32, i32)>
+ %16 = llvm.insertvalue %6, %15[7] : !llvm.struct<(i32, i32, i32, i32, i32, i32, i32, i32)>
+ llvm.inline_asm has_side_effects asm_dialect = att "wgmma.fence.sync.aligned;", "" : () -> ()
+ %17 = llvm.mlir.constant(1 : i32) : i32
+ %18 = llvm.mlir.constant(1 : i32) : i32
+ %19 = llvm.mlir.constant(1 : i32) : i32
+ %20 = llvm.mlir.constant(0 : i32) : i32
+ %21 = llvm.mlir.constant(1 : i32) : i32
+ %22 = llvm.extractvalue %16[0] : !llvm.struct<(i32, i32, i32, i32, i32, i32, i32, i32)>
+ %23 = llvm.extractvalue %16[1] : !llvm.struct<(i32, i32, i32, i32, i32, i32, i32, i32)>
+ %24 = llvm.extractvalue %16[2] : !llvm.struct<(i32, i32, i32, i32, i32, i32, i32, i32)>
+ %25 = llvm.extractvalue %16[3] : !llvm.struct<(i32, i32, i32, i32, i32, i32, i32, i32)>
+ %26 = llvm.extractvalue %16[4] : !llvm.struct<(i32, i32, i32, i32, i32, i32, i32, i32)>
+ %27 = llvm.extractvalue %16[5] : !llvm.struct<(i32, i32, i32, i32, i32, i32, i32, i32)>
+ %28 = llvm.extractvalue %16[6] : !llvm.struct<(i32, i32, i32, i32, i32, i32, i32, i32)>
+ %29 = llvm.extractvalue %16[7] : !llvm.struct<(i32, i32, i32, i32, i32, i32, i32, i32)>
+ %30 = llvm.inline_asm has_side_effects asm_dialect = att "{\0A.reg .pred p;\0Asetp.ne.b32 p, $18, 0;\0Awgmma.mma_async.sync.aligned.m64n32k16.f16.f16.f16 {$0, $1, $2, $3, $4, $5, $6, $7}, $16, $17, p, $19, $20, $21, $22;\0A}\0A", "=r,=r,=r,=r,=r,=r,=r,=r,0,1,2,3,4,5,6,7,l,l,n,n,n,n,n" %22, %23, %24, %25, %26, %27, %28, %29, %arg1, %...
[truncated]
|
This PR moves lowering of the math dialect later in the pipeline, because the math dialect is lowered correctly by `createConvertGpuOpsToNVVMOps` for the GPU target, which needs to run first.
Buildbot did not like the way I checked the PTX. I'll fix this quickly. If it's blocking someone, feel free to revert the PR. |
PR llvm#78556 added a new MLIR test with gpu-lower-to-nvvm-pipeline that checks the generated PTX. However, it causes a problem on hosts without CUDA support. This PR adds `REQUIRES: host-supports-nvptx`.
…#78556)" This reverts commit 74bf0b1. The test always fails. | mlir/test/Dialect/GPU/test-nvvm-pipeline.mlir:23:16: error: CHECK-PTX: expected string not found in input | // CHECK-PTX: __nv_expf https://lab.llvm.org/buildbot/#/builders/61/builds/53789
This PR moves lowering of math dialect later in the pipeline. Because math dialect is lowered correctly by createConvertGpuOpsToNVVMOps for GPU target, and it needs to run it first. Reland llvm#78556
This PR moves lowering of math dialect later in the pipeline. Because math dialect is lowered correctly by createConvertGpuOpsToNVVMOps for GPU target, and it needs to run it first. Reland #78556
This PR moves lowering of the math dialect later in the pipeline, because the math dialect is lowered correctly by
createConvertGpuOpsToNVVMOps
for the GPU target, which needs to run first.