diff --git a/mlir/test/Integration/GPU/CUDA/sm90/gemm_f32_f16_f16_128x128x128.mlir b/mlir/test/Integration/GPU/CUDA/sm90/gemm_f32_f16_f16_128x128x128.mlir
index a5653f395a2c4..37564de7442cf 100644
--- a/mlir/test/Integration/GPU/CUDA/sm90/gemm_f32_f16_f16_128x128x128.mlir
+++ b/mlir/test/Integration/GPU/CUDA/sm90/gemm_f32_f16_f16_128x128x128.mlir
@@ -57,7 +57,6 @@ func.func private @printMemrefF32(memref<*xf32>)
 
-memref.global "private" @dynamicShmem : memref<0xf16, 3> {alignment = 16 : i64}
 memref.global "private" @accShmem : memref<0xf32, 3> {alignment = 16 : i64}
 
 func.func @main() {
@@ -148,12 +147,11 @@ func.func @main() {
     %c57344 = arith.constant 57344 : index
     %c40960 = arith.constant 40960 : index
 
-    %tidx = gpu.thread_id x
-    %dynamicMem = memref.get_global @dynamicShmem : memref<0xf16, 3>
-    %lhsShmem = memref.reinterpret_cast %dynamicMem to offset: [0], sizes: [2, 128, 64], strides: [8192, 64, 1] : memref<0xf16, 3> to memref<2x128x64xf16, 3>
-    %rhsShmem2 = memref.reinterpret_cast %dynamicMem to offset: [0], sizes: [4, 64, 128], strides: [8192,128,1] : memref<0xf16, 3> to memref<4x64x128xf16,3>
-    %rhsShmem = memref.subview %rhsShmem2[2, 0, 0][2, 64, 128][1, 1, 1] : memref<4x64x128xf16,3> to memref<2x64x128xf16, strided<[8192, 128, 1], offset: 16384>, 3>
+    %tidx = gpu.thread_id x
+    %dynsmem = gpu.dynamic_shared_memory : memref<?xi8, #gpu.address_space<workgroup>>
+    %lhsShmem = memref.view %dynsmem[%c0][] : memref<?xi8, #gpu.address_space<workgroup>> to memref<2x128x64xf16, #gpu.address_space<workgroup>>
+    %rhsShmem = memref.view %dynsmem[%c32768][] : memref<?xi8, #gpu.address_space<workgroup>> to memref<2x64x128xf16, #gpu.address_space<workgroup>>
+
     // Step 1. [GPU] Create Async Transactional Barriers (mbarriers)
     %barrier = nvgpu.mbarrier.create -> !barrierType
     %cnd = arith.cmpi eq, %tidx, %c0 : index
@@ -202,11 +200,11 @@ func.func @main() {
       // TMA wait
       %phase_c0 = arith.constant 0 : i1
       nvgpu.mbarrier.try_wait.parity %barrier[%i], %phase_c0, %ticks : !barrierType
-      %lhsSlice = memref.subview %lhsShmem [%i, 0, 0][1, 128, 64][1, 1, 1] : memref<2x128x64xf16, 3> to memref<128x64xf16, strided<[64, 1], offset: ?>, 3>
-      %rhsSlice = memref.subview %rhsShmem [%i, 0, 0][1, 64, 128][1, 1, 1] : memref<2x64x128xf16, strided<[8192, 128, 1], offset: 16384>, 3> to memref<64x128xf16, strided<[128, 1], offset: ?>, 3>
+      %lhsSlice = memref.subview %lhsShmem [%i, 0, 0][1, 128, 64][1, 1, 1] : memref<2x128x64xf16, #gpu.address_space<workgroup>> to memref<128x64xf16, strided<[64, 1], offset: ?>, #gpu.address_space<workgroup>>
+      %rhsSlice = memref.subview %rhsShmem [%i, 0, 0][1, 64, 128][1, 1, 1] : memref<2x64x128xf16, #gpu.address_space<workgroup>> to memref<64x128xf16, strided<[128, 1], offset: ?>, #gpu.address_space<workgroup>>
       // Descriptor WGMMA
-      %dA = nvgpu.warpgroup.generate.descriptor %lhsSlice, %descA : memref<128x64xf16, strided<[64, 1], offset: ?>, 3>, !lhsTensorMap -> !nvgpu.warpgroup.descriptor<tensor = memref<128x64xf16, 3>>
-      %dB = nvgpu.warpgroup.generate.descriptor %rhsSlice, %descB : memref<64x128xf16, strided<[128, 1], offset: ?>, 3>, !rhsTensorMap -> !nvgpu.warpgroup.descriptor<tensor = memref<64x128xf16, 3>>
+      %dA = nvgpu.warpgroup.generate.descriptor %lhsSlice, %descA : memref<128x64xf16, strided<[64, 1], offset: ?>, #gpu.address_space<workgroup>>, !lhsTensorMap -> !nvgpu.warpgroup.descriptor<tensor = memref<128x64xf16, 3>>
+      %dB = nvgpu.warpgroup.generate.descriptor %rhsSlice, %descB : memref<64x128xf16, strided<[128, 1], offset: ?>, #gpu.address_space<workgroup>>, !rhsTensorMap -> !nvgpu.warpgroup.descriptor<tensor = memref<64x128xf16, 3>>
       // Perform WGMMA 128x128x64
       %md = nvgpu.warpgroup.mma %dA, %dB, %mc {transposeB} : <tensor = memref<128x64xf16, 3>>, <tensor = memref<64x128xf16, 3>>, <fragmented = vector<128x128xf32>> -> <fragmented = vector<128x128xf32>>
       scf.yield %md : !nvgpu.warpgroup.accumulator<fragmented = vector<128x128xf32>>
@@ -271,7 +269,7 @@ func.func @main() {
   vector.print str "Correct Results :"
   vector.print %correctCount : i32
   vector.print str "Incorrect Results :"
-  vector.print %errorCount : i32
+  vector.print %errorCount : i32
   return
 }
diff --git a/mlir/test/Integration/GPU/CUDA/sm90/gemm_pred_f32_f16_f16_128x128x128.mlir b/mlir/test/Integration/GPU/CUDA/sm90/gemm_pred_f32_f16_f16_128x128x128.mlir
index 197351f1921e7..db7754c89dcac 100644
--- a/mlir/test/Integration/GPU/CUDA/sm90/gemm_pred_f32_f16_f16_128x128x128.mlir
+++ b/mlir/test/Integration/GPU/CUDA/sm90/gemm_pred_f32_f16_f16_128x128x128.mlir
@@ -57,7 +57,6 @@ func.func private @printMemrefF32(memref<*xf32>)
 
-memref.global "private" @dynamicShmem : memref<0xf16, 3> {alignment = 16 : i64}
 memref.global "private" @accShmem : memref<0xf32, 3> {alignment = 16 : i64}
 
 func.func @main() {
@@ -149,11 +148,10 @@ func.func @main() {
     %c40960 = arith.constant 40960 : index
 
     %tidx = gpu.thread_id x
-    %dynamicMem = memref.get_global @dynamicShmem : memref<0xf16, 3>
-    %lhsShmem = memref.reinterpret_cast %dynamicMem to offset: [0], sizes: [2, 128, 64], strides: [8192, 64, 1] : memref<0xf16, 3> to memref<2x128x64xf16, 3>
-    %rhsShmem2 = memref.reinterpret_cast %dynamicMem to offset: [0], sizes: [4, 64, 128], strides: [8192,128,1] : memref<0xf16, 3> to memref<4x64x128xf16,3>
-    %rhsShmem = memref.subview %rhsShmem2[2, 0, 0][2, 64, 128][1, 1, 1] : memref<4x64x128xf16,3> to memref<2x64x128xf16, strided<[8192, 128, 1], offset: 16384>, 3>
+    %dynsmem = gpu.dynamic_shared_memory : memref<?xi8, #gpu.address_space<workgroup>>
+    %lhsShmem = memref.view %dynsmem[%c0][] : memref<?xi8, #gpu.address_space<workgroup>> to memref<2x128x64xf16, #gpu.address_space<workgroup>>
+    %rhsShmem = memref.view %dynsmem[%c32768][] : memref<?xi8, #gpu.address_space<workgroup>> to memref<2x64x128xf16, #gpu.address_space<workgroup>>
+
     // Step 1. [GPU] Create Async Transactional Barriers (mbarriers)
     %barrier = nvgpu.mbarrier.create -> !barrierType
@@ -210,11 +208,11 @@ func.func @main() {
      // TMA wait
      %phase_c0 = arith.constant 0 : i1
      nvgpu.mbarrier.try_wait.parity %barrier[%i], %phase_c0, %ticks : !barrierType
-      %lhsSlice = memref.subview %lhsShmem [%i, 0, 0][1, 128, 64][1, 1, 1] : memref<2x128x64xf16, 3> to memref<128x64xf16, strided<[64, 1], offset: ?>, 3>
-      %rhsSlice = memref.subview %rhsShmem [%i, 0, 0][1, 64, 128][1, 1, 1] : memref<2x64x128xf16, strided<[8192, 128, 1], offset: 16384>, 3> to memref<64x128xf16, strided<[128, 1], offset: ?>, 3>
+      %lhsSlice = memref.subview %lhsShmem [%i, 0, 0][1, 128, 64][1, 1, 1] : memref<2x128x64xf16, #gpu.address_space<workgroup>> to memref<128x64xf16, strided<[64, 1], offset: ?>, #gpu.address_space<workgroup>>
+      %rhsSlice = memref.subview %rhsShmem [%i, 0, 0][1, 64, 128][1, 1, 1] : memref<2x64x128xf16, #gpu.address_space<workgroup>> to memref<64x128xf16, strided<[128, 1], offset: ?>, #gpu.address_space<workgroup>>
       // Descriptor WGMMA
-      %dA = nvgpu.warpgroup.generate.descriptor %lhsSlice, %descA : memref<128x64xf16, strided<[64, 1], offset: ?>, 3>, !lhsTensorMap -> !nvgpu.warpgroup.descriptor<tensor = memref<128x64xf16, 3>>
-      %dB = nvgpu.warpgroup.generate.descriptor %rhsSlice, %descB : memref<64x128xf16, strided<[128, 1], offset: ?>, 3>, !rhsTensorMap -> !nvgpu.warpgroup.descriptor<tensor = memref<64x128xf16, 3>>
+      %dA = nvgpu.warpgroup.generate.descriptor %lhsSlice, %descA : memref<128x64xf16, strided<[64, 1], offset: ?>, #gpu.address_space<workgroup>>, !lhsTensorMap -> !nvgpu.warpgroup.descriptor<tensor = memref<128x64xf16, 3>>
+      %dB = nvgpu.warpgroup.generate.descriptor %rhsSlice, %descB : memref<64x128xf16, strided<[128, 1], offset: ?>, #gpu.address_space<workgroup>>, !rhsTensorMap -> !nvgpu.warpgroup.descriptor<tensor = memref<64x128xf16, 3>>
       // Perform WGMMA 128x128x64
       %md = nvgpu.warpgroup.mma %dA, %dB, %mc {transposeB} : <tensor = memref<128x64xf16, 3>>, <tensor = memref<64x128xf16, 3>>, <fragmented = vector<128x128xf32>> -> <fragmented = vector<128x128xf32>>
       scf.yield %md : !nvgpu.warpgroup.accumulator<fragmented = vector<128x128xf32>>