diff --git a/mlir/test/Conversion/AMDGPUToROCDL/amdgpu-to-rocdl.mlir b/mlir/test/Conversion/AMDGPUToROCDL/amdgpu-to-rocdl.mlir index e55bca4bad42f..e43ecfd01cb50 100644 --- a/mlir/test/Conversion/AMDGPUToROCDL/amdgpu-to-rocdl.mlir +++ b/mlir/test/Conversion/AMDGPUToROCDL/amdgpu-to-rocdl.mlir @@ -6,16 +6,11 @@ // RUN: mlir-opt %s -convert-amdgpu-to-rocdl=chipset=gfx1201 | FileCheck %s --check-prefixes=CHECK,GFX12,RDNA // RUN: mlir-opt %s -convert-amdgpu-to-rocdl=chipset=gfx1250 | FileCheck %s --check-prefixes=CHECK,GFX1250 -// Note: #gpu.address_space is hardcoded to `1` here because the -// test pass doesn't set up the GPU address space conversions. - // CHECK: #[[$MMRA_TAG:.+]] = #llvm.mmra_tag<"amdgpu-synchronize-as":"local"> -#gpu_global_addrspace = 1 - // CHECK-LABEL: func @fat_raw_buffer_cast -func.func @fat_raw_buffer_cast(%buf: memref<8xi32, #gpu_global_addrspace>) -> memref<8xi32, #amdgpu.address_space> { - // CHECK: %[[desc:.*]] = builtin.unrealized_conversion_cast %{{.*}} : memref<8xi32, 1> to !llvm.struct<(ptr<1>, ptr<1>, i64, array<1 x i64>, array<1 x i64>)> +func.func @fat_raw_buffer_cast(%buf: memref<8xi32, #gpu.address_space>) -> memref<8xi32, #amdgpu.address_space> { + // CHECK: %[[desc:.*]] = builtin.unrealized_conversion_cast %{{.*}} : memref<8xi32, #gpu.address_space> to !llvm.struct<(ptr<1>, ptr<1>, i64, array<1 x i64>, array<1 x i64>)> // CHECK-DAG: %[[base:.*]] = llvm.extractvalue %[[desc]][1] // CHECK-DAG: %[[offset:.*]] = llvm.extractvalue %[[desc]][2] // CHECK-DAG: %[[sizes:.*]] = llvm.extractvalue %[[desc]][3] @@ -33,13 +28,13 @@ func.func @fat_raw_buffer_cast(%buf: memref<8xi32, #gpu_global_addrspace>) -> me // CHECK: %[[ret4:.*]] = llvm.insertvalue %[[sizes]], %[[ret3]][3] // CHECK: %[[ret5:.*]] = llvm.insertvalue %[[strides]], %[[ret4]][4] // CHECK: builtin.unrealized_conversion_cast %[[ret5]] - %ret = amdgpu.fat_raw_buffer_cast %buf : memref<8xi32, #gpu_global_addrspace> to memref<8xi32, #amdgpu.address_space> + %ret = amdgpu.fat_raw_buffer_cast %buf : memref<8xi32, #gpu.address_space> to memref<8xi32, #amdgpu.address_space> return %ret : memref<8xi32, #amdgpu.address_space> } // CHECK-LABEL: func @fat_raw_buffer_cast_0d -func.func @fat_raw_buffer_cast_0d(%buf: memref) -> memref> { - // CHECK: %[[desc:.*]] = builtin.unrealized_conversion_cast %{{.*}} : memref to !llvm.struct<(ptr<1>, ptr<1>, i64)> +func.func @fat_raw_buffer_cast_0d(%buf: memref>) -> memref> { + // CHECK: %[[desc:.*]] = builtin.unrealized_conversion_cast %{{.*}} : memref> to !llvm.struct<(ptr<1>, ptr<1>, i64)> // CHECK-DAG: %[[base:.*]] = llvm.extractvalue %[[desc]][1] // CHECK-DAG: %[[offset:.*]] = llvm.extractvalue %[[desc]][2] // CHECK-DAG: %[[numRecords:.*]] = llvm.mlir.constant(4 : i64) : i64 @@ -53,12 +48,12 @@ func.func @fat_raw_buffer_cast_0d(%buf: memref) -> m // CHECK: %[[ret2:.*]] = llvm.insertvalue %[[fatBuf]], %[[ret1]][1] // CHECK: %[[ret3:.*]] = llvm.insertvalue %[[offset]], %[[ret2]][2] // CHECK: builtin.unrealized_conversion_cast %[[ret3]] - %ret = amdgpu.fat_raw_buffer_cast %buf : memref to memref> + %ret = amdgpu.fat_raw_buffer_cast %buf : memref> to memref> return %ret : memref> } // CHECK-LABEL: func @fat_raw_buffer_cast_dyn_size_offset -func.func @fat_raw_buffer_cast_dyn_size_offset(%buf: memref, #gpu_global_addrspace>) -> memref, #amdgpu.address_space> { +func.func @fat_raw_buffer_cast_dyn_size_offset(%buf: memref, #gpu.address_space>) -> memref, #amdgpu.address_space> { // CHECK: %[[size0:.*]] = llvm.extractvalue %{{.*}}[3, 0] // CHECK: %[[stride0:.*]] = llvm.extractvalue %{{.*}}[4, 0] // CHECK: %[[maxVals:.*]] = llvm.mul %[[size0]], %[[stride0]] @@ -67,13 +62,13 @@ func.func @fat_raw_buffer_cast_dyn_size_offset(%buf: memref, #gpu_global_addrspace> to memref, #amdgpu.address_space> + %ret = amdgpu.fat_raw_buffer_cast %buf : memref, #gpu.address_space> to memref, #amdgpu.address_space> return %ret : memref, #amdgpu.address_space> } // CHECK-LABEL: func @fat_raw_buffer_cast_reset_offset -func.func @fat_raw_buffer_cast_reset_offset(%buf: memref, #gpu_global_addrspace>) -> memref> { - // CHECK: %[[desc:.*]] = builtin.unrealized_conversion_cast %{{.*}} : memref, 1> to !llvm.struct<(ptr<1>, ptr<1>, i64, array<1 x i64>, array<1 x i64>)> +func.func @fat_raw_buffer_cast_reset_offset(%buf: memref, #gpu.address_space>) -> memref> { + // CHECK: %[[desc:.*]] = builtin.unrealized_conversion_cast %{{.*}} : memref, #gpu.address_space> to !llvm.struct<(ptr<1>, ptr<1>, i64, array<1 x i64>, array<1 x i64>)> // CHECK-DAG: %[[memRefPtr:.*]] = llvm.extractvalue %[[desc]][1] // CHECK-DAG: %[[memRefOff:.*]] = llvm.extractvalue %[[desc]][2] // CHECK-DAG: %[[basePtr:.*]] = llvm.getelementptr %[[memRefPtr]][%[[memRefOff]]] @@ -81,21 +76,21 @@ func.func @fat_raw_buffer_cast_reset_offset(%buf: memref, #gpu_global_addrspace> to memref> + %ret = amdgpu.fat_raw_buffer_cast %buf resetOffset : memref, #gpu.address_space> to memref> return %ret : memref> } // CHECK-LABEL: func @fat_raw_buffer_cast_valid_bytes -func.func @fat_raw_buffer_cast_valid_bytes(%buf: memref<8xi32, #gpu_global_addrspace>) -> memref<8xi32, #amdgpu.address_space> { +func.func @fat_raw_buffer_cast_valid_bytes(%buf: memref<8xi32, #gpu.address_space>) -> memref<8xi32, #amdgpu.address_space> { // CHECK: %[[numRecords:.*]] = arith.constant -1 : i64 // CHECK: rocdl.make.buffer.rsrc %{{.*}}, %{{.*}}, %[[numRecords]], %{{.*}} %cu64_max = arith.constant -1 : i64 - %ret = amdgpu.fat_raw_buffer_cast %buf validBytes(%cu64_max) : memref<8xi32, #gpu_global_addrspace> to memref<8xi32, #amdgpu.address_space> + %ret = amdgpu.fat_raw_buffer_cast %buf validBytes(%cu64_max) : memref<8xi32, #gpu.address_space> to memref<8xi32, #amdgpu.address_space> return %ret : memref<8xi32, #amdgpu.address_space> } // CHECK-LABEL: func @fat_raw_buffer_cast_bounds_check -func.func @fat_raw_buffer_cast_bounds_check(%buf: memref<8xi32, #gpu_global_addrspace>) -> memref<8xi32, #amdgpu.address_space> { +func.func @fat_raw_buffer_cast_bounds_check(%buf: memref<8xi32, #gpu.address_space>) -> memref<8xi32, #amdgpu.address_space> { // GFX9: %[[numRecords:.*]] = llvm.mlir.constant({{.*}} : i64) // GFX9: %[[flags:.*]] = llvm.mlir.constant(159744 : i32) // GFX1250: %[[numRecords:.*]] = llvm.mlir.constant(35184372088831 : i64) @@ -103,13 +98,13 @@ func.func @fat_raw_buffer_cast_bounds_check(%buf: memref<8xi32, #gpu_global_addr // RDNA: %[[numRecords:.*]] = llvm.mlir.constant({{.*}} : i64) // RDNA: %[[flags:.*]] = llvm.mlir.constant(553807872 : i32) // CHECK: %[[rsrc:.*]] = rocdl.make.buffer.rsrc %{{.*}}, %{{.*}}, %[[numRecords]], %[[flags]] - %ret = amdgpu.fat_raw_buffer_cast %buf boundsCheck(false) : memref<8xi32, #gpu_global_addrspace> to memref<8xi32, #amdgpu.address_space> + %ret = amdgpu.fat_raw_buffer_cast %buf boundsCheck(false) : memref<8xi32, #gpu.address_space> to memref<8xi32, #amdgpu.address_space> return %ret : memref<8xi32, #amdgpu.address_space> } // CHECK-LABEL: func @fat_raw_buffer_cast_cache_swizzle -// CHECK-SAME: (%{{.*}}: memref<64x64xi32, 1>, %[[stride:.*]]: i14) -func.func @fat_raw_buffer_cast_cache_swizzle(%buf: memref<64x64xi32, #gpu_global_addrspace>, %stride: i14) -> memref<64x64xi32, #amdgpu.address_space> { +// CHECK-SAME: (%{{.*}}: memref<64x64xi32, #gpu.address_space>, %[[stride:.*]]: i14) +func.func @fat_raw_buffer_cast_cache_swizzle(%buf: memref<64x64xi32, #gpu.address_space>, %stride: i14) -> memref<64x64xi32, #amdgpu.address_space> { // GFX908: %[[stride:.*]] = llvm.mlir.constant(0 : i16) : i16 // GFX908: %[[flags:.*]] = llvm.mlir.constant // GFX90A: %[[stride:.*]] = llvm.mlir.constant(0 : i16) : i16 @@ -123,7 +118,7 @@ func.func @fat_raw_buffer_cast_cache_swizzle(%buf: memref<64x64xi32, #gpu_global // GFX1250: %[[stride:.*]] = llvm.mlir.constant(0 : i16) : i16 // GFX1250: %[[flags:.*]] = llvm.mlir.constant(0 : i32) // CHECK: rocdl.make.buffer.rsrc %{{.*}}, %[[stride]], %{{.*}}, %[[flags]] - %ret = amdgpu.fat_raw_buffer_cast %buf cacheSwizzleStride(%stride) : memref<64x64xi32, #gpu_global_addrspace> to memref<64x64xi32, #amdgpu.address_space> + %ret = amdgpu.fat_raw_buffer_cast %buf cacheSwizzleStride(%stride) : memref<64x64xi32, #gpu.address_space> to memref<64x64xi32, #amdgpu.address_space> return %ret : memref<64x64xi32, #amdgpu.address_space> } diff --git a/mlir/test/Conversion/AMDGPUToROCDL/gfx1250.mlir b/mlir/test/Conversion/AMDGPUToROCDL/gfx1250.mlir index 86b96ca2b4b86..986ea8d80729d 100644 --- a/mlir/test/Conversion/AMDGPUToROCDL/gfx1250.mlir +++ b/mlir/test/Conversion/AMDGPUToROCDL/gfx1250.mlir @@ -165,40 +165,28 @@ func.func @amdgpu.scaled_ext_packed_matrix_invalid_dst_elem_type(%v: vector<16xf // ----- -#gpu_global_addrspace = 1 -#gpu_lds_addrspace = 3 -#amdgpu_fat_buffer_addrspace = 7 - -func.func @amdgpu.make_dma_base.invalid_element_types(%idx: index, %mem: memref<8xi32, #gpu_global_addrspace>, %smem: memref<8xf32,#gpu_lds_addrspace>) -> (!amdgpu.tdm_base) { +func.func @amdgpu.make_dma_base.invalid_element_types(%idx: index, %mem: memref<8xi32, #gpu.address_space>, %smem: memref<8xf32,#gpu.address_space>) -> (!amdgpu.tdm_base) { // expected-error@+1 {{'amdgpu.make_dma_base' op failed to verify that all of {global, lds} have same element type}} - %0 = amdgpu.make_dma_base %mem[%idx], %smem[%idx] : memref<8xi32, #gpu_global_addrspace>, memref<8xf32, #gpu_lds_addrspace> -> !amdgpu.tdm_base + %0 = amdgpu.make_dma_base %mem[%idx], %smem[%idx] : memref<8xi32, #gpu.address_space>, memref<8xf32, #gpu.address_space> -> !amdgpu.tdm_base return %0 : !amdgpu.tdm_base } // ----- -#gpu_global_addrspace = 1 -#gpu_lds_addrspace = 3 -#amdgpu_fat_buffer_addrspace = 7 - -func.func @amdgpu.make_dma_base.invalid_element_types(%idx: index, %mem: memref<8xi7, #gpu_global_addrspace>, %smem: memref<8xi7,#gpu_lds_addrspace>) -> (!amdgpu.tdm_base) { +func.func @amdgpu.make_dma_base.invalid_element_types(%idx: index, %mem: memref<8xi7, #gpu.address_space>, %smem: memref<8xi7,#gpu.address_space>) -> (!amdgpu.tdm_base) { // expected-error@+1 {{'amdgpu.make_dma_base' op element type must be 1, 2, 4, or 8 bytes long but type was 7 bits long.}} - %0 = amdgpu.make_dma_base %mem[%idx], %smem[%idx] : memref<8xi7, #gpu_global_addrspace>, memref<8xi7, #gpu_lds_addrspace> -> !amdgpu.tdm_base + %0 = amdgpu.make_dma_base %mem[%idx], %smem[%idx] : memref<8xi7, #gpu.address_space>, memref<8xi7, #gpu.address_space> -> !amdgpu.tdm_base return %0 : !amdgpu.tdm_base } // ----- -#gpu_global_addrspace = 1 -#gpu_lds_addrspace = 3 -#amdgpu_fat_buffer_addrspace = 7 - // CHECK-LABEL: func @make_dma_base -// CHECK-SAME: (%[[IDX:.+]]: index, %[[MEM:.+]]: memref<8xi32, 1>, %[[SMEM:.+]]: memref<8xi32, 3>) -func.func @make_dma_base(%idx: index, %mem: memref<8xi32, #gpu_global_addrspace>, %smem: memref<8xi32,#gpu_lds_addrspace>) -> (!amdgpu.tdm_base) { +// CHECK-SAME: (%[[IDX:.+]]: index, %[[MEM:.+]]: memref<8xi32, #gpu.address_space>, %[[SMEM:.+]]: memref<8xi32, #gpu.address_space>) +func.func @make_dma_base(%idx: index, %mem: memref<8xi32, #gpu.address_space>, %smem: memref<8xi32,#gpu.address_space>) -> (!amdgpu.tdm_base) { // CHECK-DAG: %[[INT:.+]] = builtin.unrealized_conversion_cast %[[IDX]] : index to i64 - // CHECK-DAG: %[[MEMREF_DESC_MEM:.+]] = builtin.unrealized_conversion_cast %[[MEM]] : memref<8xi32, 1> - // CHECK-DAG: %[[MEMREF_DESC_SMEM:.+]] = builtin.unrealized_conversion_cast %[[SMEM]] : memref<8xi32, 3> + // CHECK-DAG: %[[MEMREF_DESC_MEM:.+]] = builtin.unrealized_conversion_cast %[[MEM]] : memref<8xi32, #gpu.address_space> + // CHECK-DAG: %[[MEMREF_DESC_SMEM:.+]] = builtin.unrealized_conversion_cast %[[SMEM]] : memref<8xi32, #gpu.address_space> // CHECK-DAG: %[[C0:.+]] = llvm.mlir.constant(0 : i32) : i32 // CHECK-DAG: %[[C1:.+]] = llvm.mlir.constant(1 : i32) : i32 @@ -231,19 +219,16 @@ func.func @make_dma_base(%idx: index, %mem: memref<8xi32, #gpu_global_addrspace> // CHECK: %[[V4I32_0_3:.+]] = llvm.insertelement %[[MEM_INT_LOW]], %[[V4I32_0_2]][%[[C2]] : i32] // CHECK: %[[V4I32_0_4:.+]] = llvm.insertelement %[[MEM_INT_HIGH_TYPE]], %[[V4I32_0_3]][%[[C3]] : i32] - %0 = amdgpu.make_dma_base %mem[%idx], %smem[%idx] : memref<8xi32, #gpu_global_addrspace>, memref<8xi32, #gpu_lds_addrspace> -> !amdgpu.tdm_base + %0 = amdgpu.make_dma_base %mem[%idx], %smem[%idx] : memref<8xi32, #gpu.address_space>, memref<8xi32, #gpu.address_space> -> !amdgpu.tdm_base func.return %0 : !amdgpu.tdm_base } // ----- -#gpu_global_addrspace = 1 -#gpu_lds_addrspace = 3 - // CHECK-LABEL: func @make_gather_dma_base -// CHECK-SAME: (%[[IDX:.+]]: index, %[[MEM:.+]]: memref<8xi32, 1>, %[[SMEM:.+]]: memref<8xi32, 3>) -func.func @make_gather_dma_base(%idx: index, %mem: memref<8xi32, #gpu_global_addrspace>, %smem: memref<8xi32,#gpu_lds_addrspace>) -> (!amdgpu.tdm_gather_base, !amdgpu.tdm_gather_base) { +// CHECK-SAME: (%[[IDX:.+]]: index, %[[MEM:.+]]: memref<8xi32, #gpu.address_space>, %[[SMEM:.+]]: memref<8xi32, #gpu.address_space>) +func.func @make_gather_dma_base(%idx: index, %mem: memref<8xi32, #gpu.address_space>, %smem: memref<8xi32,#gpu.address_space>) -> (!amdgpu.tdm_gather_base, !amdgpu.tdm_gather_base) { // CHECK-DAG: %[[C0:.+]] = llvm.mlir.constant(0 : i32) : i32 // CHECK-DAG: %[[C1:.+]] = llvm.mlir.constant(1 : i32) : i32 @@ -257,7 +242,7 @@ func.func @make_gather_dma_base(%idx: index, %mem: memref<8xi32, #gpu_global_add // CHECK: %[[V4I32_0_0:.+]] = llvm.mlir.poison : vector<4xi32> // CHECK: %[[V4I32_0_1:.+]] = llvm.insertelement %[[SGPR0]], %[[V4I32_0_0]][%[[C0]] : i32] - %0 = amdgpu.make_gather_dma_base %mem[%idx], %smem[%idx] : memref<8xi32, #gpu_global_addrspace>, memref<8xi32, #gpu_lds_addrspace> -> !amdgpu.tdm_gather_base + %0 = amdgpu.make_gather_dma_base %mem[%idx], %smem[%idx] : memref<8xi32, #gpu.address_space>, memref<8xi32, #gpu.address_space> -> !amdgpu.tdm_gather_base // CHECK-DAG: %[[C0:.+]] = llvm.mlir.constant(0 : i32) : i32 // CHECK-DAG: %[[C1:.+]] = llvm.mlir.constant(1 : i32) : i32 @@ -276,7 +261,7 @@ func.func @make_gather_dma_base(%idx: index, %mem: memref<8xi32, #gpu_global_add // CHECK: %[[V4I32_0_1:.+]] = llvm.insertelement %[[SGPR0]], %[[V4I32_0_0]][%[[C0]] : i32] - %1 = amdgpu.make_gather_dma_base %mem[%idx], %smem[%idx] : memref<8xi32, #gpu_global_addrspace>, memref<8xi32, #gpu_lds_addrspace> -> !amdgpu.tdm_gather_base + %1 = amdgpu.make_gather_dma_base %mem[%idx], %smem[%idx] : memref<8xi32, #gpu.address_space>, memref<8xi32, #gpu.address_space> -> !amdgpu.tdm_gather_base func.return %0, %1 : !amdgpu.tdm_gather_base, !amdgpu.tdm_gather_base } @@ -354,13 +339,9 @@ func.func @make_dma_descriptor(%base: !amdgpu.tdm_base) -> !amdgpu.tdm_desc // ----- -#gpu_global_addrspace = 1 -#gpu_lds_addrspace = 3 -#amdgpu_fat_buffer_addrspace = 7 - // CHECK-LABEL: func @make_dma_descriptor_atomic_barrier // CHECK-SAME: (%[[BASE:.+]]: !amdgpu.tdm_base, %[[BARRIER:.+]]: {{.*}}, %[[IDX:.+]]: index) -func.func @make_dma_descriptor_atomic_barrier(%base: !amdgpu.tdm_base, %barrier : memref<2x!amdgpu.ds_barrier_state, #gpu_lds_addrspace>, %idx: index) -> !amdgpu.tdm_descriptor { +func.func @make_dma_descriptor_atomic_barrier(%base: !amdgpu.tdm_base, %barrier : memref<2x!amdgpu.ds_barrier_state, #gpu.address_space>, %idx: index) -> !amdgpu.tdm_descriptor { // CHECK-DAG: %[[INDEX:.+]] = builtin.unrealized_conversion_cast %[[IDX]] : index to i64 // CHECK-DAG: %[[BARRIER_MEMREF_DESC:.+]] = builtin.unrealized_conversion_cast %[[BARRIER]] // CHECK-DAG: %[[DGROUP0:.+]] = builtin.unrealized_conversion_cast %[[BASE]] @@ -403,7 +384,7 @@ func.func @make_dma_descriptor_atomic_barrier(%base: !amdgpu.tdm_base, %bar %descriptor = amdgpu.make_dma_descriptor %base globalSize [128, 64] globalStride [64, 1] sharedSize [128, 64] - atomicBarrier(%barrier[%idx] : memref<2x!amdgpu.ds_barrier_state, #gpu_lds_addrspace>) + atomicBarrier(%barrier[%idx] : memref<2x!amdgpu.ds_barrier_state, #gpu.address_space>) : !amdgpu.tdm_base -> !amdgpu.tdm_descriptor func.return %descriptor : !amdgpu.tdm_descriptor } diff --git a/mlir/test/Conversion/AMDGPUToROCDL/load_lds-gfx950.mlir b/mlir/test/Conversion/AMDGPUToROCDL/load_lds-gfx950.mlir index 42fb18006bea4..5bbbf8405105e 100644 --- a/mlir/test/Conversion/AMDGPUToROCDL/load_lds-gfx950.mlir +++ b/mlir/test/Conversion/AMDGPUToROCDL/load_lds-gfx950.mlir @@ -1,17 +1,13 @@ // RUN: not mlir-opt %s --split-input-file -convert-amdgpu-to-rocdl=chipset=gfx942 2>&1 | FileCheck %s --check-prefix=GFX942 // RUN: mlir-opt %s --split-input-file -convert-amdgpu-to-rocdl=chipset=gfx950 | FileCheck %s --check-prefix=GFX950 -#gpu_global_addrspace = 1 -#gpu_lds_addrspace = 3 -#amdgpu_fat_buffer_addrspace = 7 - // GFX950-LABEL: func @fat_buffer_load_to_rocdl_f96 -// GFX950-SAME: (%[[ARG0:.*]]: memref<128x72xf32, 7>) -func.func @fat_buffer_load_to_rocdl_f96(%global : memref<128x72xf32, #amdgpu_fat_buffer_addrspace>) { +// GFX950-SAME: (%[[ARG0:.*]]: memref<128x72xf32, #amdgpu.address_space>) +func.func @fat_buffer_load_to_rocdl_f96(%global : memref<128x72xf32, #amdgpu.address_space>) { %c0 = arith.constant 0 : index %c12 = arith.constant 12 : index %c32 = arith.constant 32 : index - %alloc = memref.alloc() : memref<64x64xf32, #gpu_lds_addrspace> + %alloc = memref.alloc() : memref<64x64xf32, #gpu.address_space> // GFX950: %[[BUFFER_DESC:.*]] = builtin.unrealized_conversion_cast %[[ARG0]] // GFX950: %[[C0:.*]] = arith.constant 0 : index @@ -40,23 +36,19 @@ func.func @fat_buffer_load_to_rocdl_f96(%global : memref<128x72xf32, #amdgpu_fat // GFX950: rocdl.load.to.lds %[[GLOBAL_PTR]], %[[LDS_PTR]], 12 // GFX942: error: 'amdgpu.gather_to_lds' op Gather to LDS instructions with 12-byte and 16-byte load widths are only supported on gfx950 amdgpu.gather_to_lds %global[%c12, %c0], %alloc[%c32, %c0] - : vector<16xf6E3M2FN>, memref<128x72xf32, #amdgpu_fat_buffer_addrspace>, memref<64x64xf32, #gpu_lds_addrspace> + : vector<16xf6E3M2FN>, memref<128x72xf32, #amdgpu.address_space>, memref<64x64xf32, #gpu.address_space> func.return } // ----- -#gpu_global_addrspace = 1 -#gpu_lds_addrspace = 3 -#amdgpu_fat_buffer_addrspace = 7 - // GFX950-LABEL: func @fat_buffer_load_to_rocdl_f128 -// GFX950-SAME: (%[[ARG0:.*]]: memref<128x72xf32, 7>) -func.func @fat_buffer_load_to_rocdl_f128(%global : memref<128x72xf32, #amdgpu_fat_buffer_addrspace>) { +// GFX950-SAME: (%[[ARG0:.*]]: memref<128x72xf32, #amdgpu.address_space>) +func.func @fat_buffer_load_to_rocdl_f128(%global : memref<128x72xf32, #amdgpu.address_space>) { %c0 = arith.constant 0 : index %c12 = arith.constant 12 : index %c32 = arith.constant 32 : index - %alloc = memref.alloc() : memref<64x64xf32, #gpu_lds_addrspace> + %alloc = memref.alloc() : memref<64x64xf32, #gpu.address_space> // GFX950: %[[BUFFER_DESC:.*]] = builtin.unrealized_conversion_cast %[[ARG0]] // GFX950: %[[C0:.*]] = arith.constant 0 : index @@ -85,6 +77,6 @@ func.func @fat_buffer_load_to_rocdl_f128(%global : memref<128x72xf32, #amdgpu_fa // GFX950: rocdl.load.to.lds %[[GLOBAL_PTR]], %[[LDS_PTR]], 16 // GFX942: error: 'amdgpu.gather_to_lds' op Gather to LDS instructions with 12-byte and 16-byte load widths are only supported on gfx950 amdgpu.gather_to_lds %global[%c12, %c0], %alloc[%c32, %c0] - : f128, memref<128x72xf32, #amdgpu_fat_buffer_addrspace>, memref<64x64xf32, #gpu_lds_addrspace> + : f128, memref<128x72xf32, #amdgpu.address_space>, memref<64x64xf32, #gpu.address_space> func.return } diff --git a/mlir/test/Conversion/AMDGPUToROCDL/load_lds.mlir b/mlir/test/Conversion/AMDGPUToROCDL/load_lds.mlir index a24430c5b86cc..5502ef49a1f7a 100644 --- a/mlir/test/Conversion/AMDGPUToROCDL/load_lds.mlir +++ b/mlir/test/Conversion/AMDGPUToROCDL/load_lds.mlir @@ -1,17 +1,13 @@ // RUN: mlir-opt %s --convert-amdgpu-to-rocdl=chipset=gfx942 | FileCheck %s // RUN: mlir-opt %s --convert-amdgpu-to-rocdl=chipset=gfx950 | FileCheck %s -#gpu_global_addrspace = 1 -#gpu_lds_addrspace = 3 -#amdgpu_fat_buffer_addrspace = 7 - // CHECK-LABEL: func @global_load_to_rocdl_f32 -// CHECK-SAME: (%[[ARG0:.*]]: memref<128x72xf32, 1>) -func.func @global_load_to_rocdl_f32(%global : memref<128x72xf32, #gpu_global_addrspace>) { +// CHECK-SAME: (%[[ARG0:.*]]: memref<128x72xf32, #gpu.address_space>) +func.func @global_load_to_rocdl_f32(%global : memref<128x72xf32, #gpu.address_space>) { %c0 = arith.constant 0 : index %c12 = arith.constant 12 : index %c32 = arith.constant 32 : index - %alloc = memref.alloc() : memref<64x64xf32, #gpu_lds_addrspace> + %alloc = memref.alloc() : memref<64x64xf32, #gpu.address_space> // CHECK: %[[GLOBAL_DESC:.*]] = builtin.unrealized_conversion_cast %[[ARG0]] // CHECK: %[[C0:.*]] = arith.constant 0 : index @@ -39,7 +35,7 @@ func.func @global_load_to_rocdl_f32(%global : memref<128x72xf32, #gpu_global_add // CHECK: %[[LDS_PTR:.*]] = llvm.getelementptr %[[LDS_BASE]][%[[DST_OFFSET]]] // CHECK: rocdl.load.to.lds %[[GLOBAL_PTR]], %[[LDS_PTR]], 4 amdgpu.gather_to_lds %global[%c12, %c0], %alloc[%c32, %c0] - : f32, memref<128x72xf32, #gpu_global_addrspace>, memref<64x64xf32, #gpu_lds_addrspace> + : f32, memref<128x72xf32, #gpu.address_space>, memref<64x64xf32, #gpu.address_space> func.return } @@ -100,8 +96,8 @@ func.func @global_load_to_rocdl_0d(%global : memref) { } // CHECK-LABEL: func @global_load_to_rocdl_i8 -// CHECK-SAME: (%[[ARG0:.*]]: memref<128x72xi8, 1>) -func.func @global_load_to_rocdl_i8(%global : memref<128x72xi8, #gpu_global_addrspace>) { +// CHECK-SAME: (%[[ARG0:.*]]: memref<128x72xi8, #gpu.address_space>) +func.func @global_load_to_rocdl_i8(%global : memref<128x72xi8, #gpu.address_space>) { // CHECK: %[[GLOBAL_DESC:.*]] = builtin.unrealized_conversion_cast %[[ARG0]] // CHECK: %[[C0:.*]] = arith.constant 0 : index @@ -127,19 +123,19 @@ func.func @global_load_to_rocdl_i8(%global : memref<128x72xi8, #gpu_global_addrs // CHECK: %[[DST_OFFSET:.*]] = llvm.add %[[MUL_2]], %[[IC0]] : i64 // CHECK: %[[LDS_PTR:.*]] = llvm.getelementptr %[[LDS_BASE]][%[[DST_OFFSET]]] - // CHECK: rocdl.load.to.lds %[[GLOBAL_PTR]], %[[LDS_PTR]], 1 + // CHECK: rocdl.load.to.lds %[[GLOBAL_PTR]], %[[LDS_PTR]], 1, 0, 0 : <1> %c0 = arith.constant 0 : index %c12 = arith.constant 12 : index %c32 = arith.constant 32 : index - %alloc = memref.alloc() : memref<64x64xi8, #gpu_lds_addrspace> + %alloc = memref.alloc() : memref<64x64xi8, #gpu.address_space> amdgpu.gather_to_lds %global[%c12, %c0], %alloc[%c32, %c0] - : i8, memref<128x72xi8, #gpu_global_addrspace>, memref<64x64xi8, #gpu_lds_addrspace> + : i8, memref<128x72xi8, #gpu.address_space>, memref<64x64xi8, #gpu.address_space> func.return } // CHECK-LABEL: func @global_load_to_rocdl_vec -// CHECK-SAME: (%[[ARG0:.*]]: memref<128x72xi16, 1>) -func.func @global_load_to_rocdl_vec(%global : memref<128x72xi16, #gpu_global_addrspace>) { +// CHECK-SAME: (%[[ARG0:.*]]: memref<128x72xi16, #gpu.address_space>) +func.func @global_load_to_rocdl_vec(%global : memref<128x72xi16, #gpu.address_space>) { // CHECK: %[[GLOBAL_DESC:.*]] = builtin.unrealized_conversion_cast %[[ARG0]] // CHECK: %[[C0:.*]] = arith.constant 0 : index @@ -169,15 +165,15 @@ func.func @global_load_to_rocdl_vec(%global : memref<128x72xi16, #gpu_global_add %c0 = arith.constant 0 : index %c12 = arith.constant 12 : index %c32 = arith.constant 32 : index - %alloc = memref.alloc() : memref<64x128xi16, #gpu_lds_addrspace> + %alloc = memref.alloc() : memref<64x128xi16, #gpu.address_space> amdgpu.gather_to_lds %global[%c12, %c0], %alloc[%c32, %c0] - : vector<2 x i16>, memref<128x72xi16, #gpu_global_addrspace>, memref<64x128xi16, #gpu_lds_addrspace> + : vector<2 x i16>, memref<128x72xi16, #gpu.address_space>, memref<64x128xi16, #gpu.address_space> func.return } // CHECK-LABEL: func @global_load_to_rocdl_dynamic_indices -// CHECK-SAME: (%[[ARG0:.*]]: memref<512xi32, 1>, %[[SRC_IDX:.*]]: index, %[[DST_IDX:.*]]: index) -func.func @global_load_to_rocdl_dynamic_indices(%global : memref<512xi32, #gpu_global_addrspace>, %src_idx : index, %dst_idx : index) { +// CHECK-SAME: (%[[ARG0:.*]]: memref<512xi32, #gpu.address_space>, %[[SRC_IDX:.*]]: index, %[[DST_IDX:.*]]: index) +func.func @global_load_to_rocdl_dynamic_indices(%global : memref<512xi32, #gpu.address_space>, %src_idx : index, %dst_idx : index) { // CHECK: %[[DSTIDX_CAST:.*]] = builtin.unrealized_conversion_cast %[[DST_IDX]] // CHECK: %[[SRCIDX_CAST:.*]] = builtin.unrealized_conversion_cast %[[SRC_IDX]] // CHECK: %[[GLOBAL_DESC:.*]] = builtin.unrealized_conversion_cast %[[ARG0]] @@ -193,20 +189,20 @@ func.func @global_load_to_rocdl_dynamic_indices(%global : memref<512xi32, #gpu_g // CHECK: %[[DSTIDX1:.*]] = llvm.add %[[DSTIDX]], %[[C0_I64]] : i64 // CHECK: %[[LDS_PTR:.*]] = llvm.getelementptr %[[LDS_BASE]][%[[DSTIDX1]]] // CHECK: rocdl.load.to.lds %[[GLOBAL_PTR]], %[[LDS_PTR]], 4 - %alloc = memref.alloc() : memref<4x64xi32, #gpu_lds_addrspace> + %alloc = memref.alloc() : memref<4x64xi32, #gpu.address_space> %c0 = arith.constant 0 : index amdgpu.gather_to_lds %global[%src_idx], %alloc[%dst_idx, %c0] - : i32, memref<512xi32, #gpu_global_addrspace>, memref<4x64xi32, #gpu_lds_addrspace> + : i32, memref<512xi32, #gpu.address_space>, memref<4x64xi32, #gpu.address_space> func.return } // CHECK-LABEL: func @fat_buffer_load_to_rocdl_f32 -// CHECK-SAME: (%[[ARG0:.*]]: memref<128x72xf32, 7>) -func.func @fat_buffer_load_to_rocdl_f32(%global : memref<128x72xf32, #amdgpu_fat_buffer_addrspace>) { +// CHECK-SAME: (%[[ARG0:.*]]: memref<128x72xf32, #amdgpu.address_space>) +func.func @fat_buffer_load_to_rocdl_f32(%global : memref<128x72xf32, #amdgpu.address_space>) { %c0 = arith.constant 0 : index %c12 = arith.constant 12 : index %c32 = arith.constant 32 : index - %alloc = memref.alloc() : memref<64x64xf32, #gpu_lds_addrspace> + %alloc = memref.alloc() : memref<64x64xf32, #gpu.address_space> // CHECK: %[[BUFFER_DESC:.*]] = builtin.unrealized_conversion_cast %[[ARG0]] // CHECK: %[[C0:.*]] = arith.constant 0 : index @@ -234,6 +230,6 @@ func.func @fat_buffer_load_to_rocdl_f32(%global : memref<128x72xf32, #amdgpu_fat // CHECK: %[[LDS_PTR:.*]] = llvm.getelementptr %[[LDS_BASE]][%[[DST_OFFSET]]] // CHECK: rocdl.load.to.lds %[[GLOBAL_PTR]], %[[LDS_PTR]], 4 amdgpu.gather_to_lds %global[%c12, %c0], %alloc[%c32, %c0] - : f32, memref<128x72xf32, #amdgpu_fat_buffer_addrspace>, memref<64x64xf32, #gpu_lds_addrspace> + : f32, memref<128x72xf32, #amdgpu.address_space>, memref<64x64xf32, #gpu.address_space> func.return }