From 97f6ccd8622d2dc7dac6573796c4815b72fb5a8e Mon Sep 17 00:00:00 2001 From: Changpeng Fang Date: Tue, 11 Nov 2025 17:10:56 -0800 Subject: [PATCH 1/8] [AMDGPU] update LDS block size for gfx1250 Should be 2048 bytes (512 dwords) based on current spec. --- llvm/docs/AMDGPUUsage.rst | 2 +- llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp | 9 +++------ llvm/test/CodeGen/AMDGPU/extra-lds-size.ll | 4 ++-- llvm/test/CodeGen/AMDGPU/lds-size-hsa-gfx1250.ll | 6 +++--- llvm/test/CodeGen/AMDGPU/pal-metadata-3.0.gfx1250.ll | 2 +- 5 files changed, 10 insertions(+), 13 deletions(-) diff --git a/llvm/docs/AMDGPUUsage.rst b/llvm/docs/AMDGPUUsage.rst index ba0e53bceade8..98aaead96b766 100644 --- a/llvm/docs/AMDGPUUsage.rst +++ b/llvm/docs/AMDGPUUsage.rst @@ -5855,7 +5855,7 @@ The fields used by CP for code objects before V3 also match those specified in GFX950 roundup(lds-size / (320 * 4)) GFX125* - roundup(lds-size / (256 * 4)) + roundup(lds-size / (512 * 4)) 24 1 bit ENABLE_EXCEPTION_IEEE_754_FP Wavefront starts execution _INVALID_OPERATION with specified exceptions diff --git a/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp b/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp index 29f8f9bc8b54c..f47eaea9b136c 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp @@ -1161,12 +1161,9 @@ void AMDGPUAsmPrinter::getSIProgramInfo(SIProgramInfo &ProgInfo, ProgInfo.DX10Clamp = Mode.DX10Clamp; unsigned LDSAlignShift; - if (STM.getFeatureBits().test(FeatureAddressableLocalMemorySize327680)) { - // LDS is allocated in 256 dword blocks. - LDSAlignShift = 10; - } else if (STM.getFeatureBits().test( - FeatureAddressableLocalMemorySize163840)) { - // LDS is allocated in 320 dword blocks. + if (STM.getFeatureBits().test(FeatureAddressableLocalMemorySize327680) || + STM.getFeatureBits().test(FeatureAddressableLocalMemorySize163840)) { + // LDS is allocated in 512 or 320 dword blocks. 
LDSAlignShift = 11; } else if (STM.getFeatureBits().test( FeatureAddressableLocalMemorySize65536)) { diff --git a/llvm/test/CodeGen/AMDGPU/extra-lds-size.ll b/llvm/test/CodeGen/AMDGPU/extra-lds-size.ll index 4349b18fd394c..b31e87c54b563 100644 --- a/llvm/test/CodeGen/AMDGPU/extra-lds-size.ll +++ b/llvm/test/CodeGen/AMDGPU/extra-lds-size.ll @@ -31,10 +31,10 @@ ; GFX1200-MESA: .long 45100 ; GFX1200-MESA-NEXT: .long 1024 -; GFX1250-PAL: '0x2c0b (SPI_SHADER_PGM_RSRC2_PS)': 0x200 +; GFX1250-PAL: '0x2c0b (SPI_SHADER_PGM_RSRC2_PS)': 0x100 ; GFX1250-MESA: .long 45100 -; GFX1250-MESA-NEXT: .long 512 +; GFX1250-MESA-NEXT: .long 256 @lds = internal addrspace(3) global [4096 x i8] poison diff --git a/llvm/test/CodeGen/AMDGPU/lds-size-hsa-gfx1250.ll b/llvm/test/CodeGen/AMDGPU/lds-size-hsa-gfx1250.ll index 3db0fa8f21759..7e8d5e0f30b9e 100644 --- a/llvm/test/CodeGen/AMDGPU/lds-size-hsa-gfx1250.ll +++ b/llvm/test/CodeGen/AMDGPU/lds-size-hsa-gfx1250.ll @@ -41,7 +41,7 @@ define amdgpu_kernel void @test_lds_i32(i32 %val) { ; GCN-LABEL: test_lds_array_i8: ; GCN: .amdhsa_group_segment_fixed_size 327680 ; GCN: ; LDSByteSize: 327680 bytes/workgroup -; MESA: granulated_lds_size = 320 +; MESA: granulated_lds_size = 160 define amdgpu_kernel void @test_lds_array_i8() { %gep = getelementptr inbounds [327679 x i8], ptr addrspace(3) @lds.array.i8, i32 0, i32 5 %val = load i8, ptr addrspace(3) %gep @@ -52,7 +52,7 @@ define amdgpu_kernel void @test_lds_array_i8() { ; GCN-LABEL: test_lds_array_i16: ; GCN: .amdhsa_group_segment_fixed_size 327680 ; GCN: ; LDSByteSize: 327680 bytes/workgroup -; MESA: granulated_lds_size = 320 +; MESA: granulated_lds_size = 160 define amdgpu_kernel void @test_lds_array_i16() { %gep = getelementptr inbounds [163839 x i16], ptr addrspace(3) @lds.array.i16, i32 0, i32 10 %val = load i16, ptr addrspace(3) %gep @@ -63,7 +63,7 @@ define amdgpu_kernel void @test_lds_array_i16() { ; GCN-LABEL: test_lds_array_i32: ; GCN: .amdhsa_group_segment_fixed_size 327680 ; GCN: ; 
LDSByteSize: 327680 bytes/workgroup -; MESA: granulated_lds_size = 320 +; MESA: granulated_lds_size = 160 define amdgpu_kernel void @test_lds_array_i32() { %gep = getelementptr inbounds [81919 x i32], ptr addrspace(3) @lds.array.i32, i32 0, i32 20 %val = load i32, ptr addrspace(3) %gep diff --git a/llvm/test/CodeGen/AMDGPU/pal-metadata-3.0.gfx1250.ll b/llvm/test/CodeGen/AMDGPU/pal-metadata-3.0.gfx1250.ll index f934c85f68e0f..0df20f6d349f7 100644 --- a/llvm/test/CodeGen/AMDGPU/pal-metadata-3.0.gfx1250.ll +++ b/llvm/test/CodeGen/AMDGPU/pal-metadata-3.0.gfx1250.ll @@ -126,7 +126,7 @@ ; CHECK-NEXT: .entry_point: _amdgpu_hs ; CHECK-NEXT: .entry_point_symbol: hs_shader ; CHECK-NEXT: .forward_progress: true -; CHECK-NEXT: .lds_size: 0x1000 +; CHECK-NEXT: .lds_size: 0x800 ; CHECK-NEXT: .mem_ordered: true ; CHECK-NEXT: .scratch_en: false ; CHECK-NEXT: .scratch_memory_size: 0 From b9844d9e0c73b0167df1c35380ddb19e5db284f5 Mon Sep 17 00:00:00 2001 From: Changpeng Fang Date: Wed, 12 Nov 2025 15:55:30 -0800 Subject: [PATCH 2/8] Update AMDGPU::getLdsDwGranularity --- llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp | 2 +- llvm/test/CodeGen/AMDGPU/pal-metadata-3.0.gfx1250.ll | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp index 37bf2d2463ae2..c6e985ab84ac5 100644 --- a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp +++ b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp @@ -3546,7 +3546,7 @@ bool isDPALU_DPP(const MCInstrDesc &OpDesc, const MCInstrInfo &MII, } unsigned getLdsDwGranularity(const MCSubtargetInfo &ST) { - return ST.hasFeature(AMDGPU::FeatureAddressableLocalMemorySize327680) ? 256 + return ST.hasFeature(AMDGPU::FeatureAddressableLocalMemorySize327680) ? 
512 : 128; } diff --git a/llvm/test/CodeGen/AMDGPU/pal-metadata-3.0.gfx1250.ll b/llvm/test/CodeGen/AMDGPU/pal-metadata-3.0.gfx1250.ll index 0df20f6d349f7..68694faf833e9 100644 --- a/llvm/test/CodeGen/AMDGPU/pal-metadata-3.0.gfx1250.ll +++ b/llvm/test/CodeGen/AMDGPU/pal-metadata-3.0.gfx1250.ll @@ -114,7 +114,7 @@ ; CHECK-NEXT: .entry_point: _amdgpu_gs ; CHECK-NEXT: .entry_point_symbol: gs_shader ; CHECK-NEXT: .forward_progress: true -; CHECK-NEXT: .lds_size: 0x400 +; CHECK-NEXT: .lds_size: 0x800 ; CHECK-NEXT: .mem_ordered: true ; CHECK-NEXT: .scratch_en: false ; CHECK-NEXT: .scratch_memory_size: 0 @@ -126,7 +126,7 @@ ; CHECK-NEXT: .entry_point: _amdgpu_hs ; CHECK-NEXT: .entry_point_symbol: hs_shader ; CHECK-NEXT: .forward_progress: true -; CHECK-NEXT: .lds_size: 0x800 +; CHECK-NEXT: .lds_size: 0x1000 ; CHECK-NEXT: .mem_ordered: true ; CHECK-NEXT: .scratch_en: false ; CHECK-NEXT: .scratch_memory_size: 0 From c17c39ca72b324c9e8bd2fe7934ce0bb11467125 Mon Sep 17 00:00:00 2001 From: Changpeng Fang Date: Wed, 12 Nov 2025 16:56:23 -0800 Subject: [PATCH 3/8] Update maximum lds size for hsa-gfx125*-v4.s --- llvm/test/MC/AMDGPU/hsa-gfx1250-v4.s | 6 +++--- llvm/test/MC/AMDGPU/hsa-gfx1251-v4.s | 6 +++--- 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/llvm/test/MC/AMDGPU/hsa-gfx1250-v4.s b/llvm/test/MC/AMDGPU/hsa-gfx1250-v4.s index 80a340c1f6261..566e8554765ca 100644 --- a/llvm/test/MC/AMDGPU/hsa-gfx1250-v4.s +++ b/llvm/test/MC/AMDGPU/hsa-gfx1250-v4.s @@ -52,7 +52,7 @@ // OBJDUMP-NEXT: 00e0 00000000 00000000 00000000 00000000 // OBJDUMP-NEXT: 00f0 00000cc0 80000000 00040000 00000000 // max_lds_size -// OBJDUMP-NEXT: 0100 00000600 00000000 00000000 00000000 +// OBJDUMP-NEXT: 0100 00000500 00000000 00000000 00000000 // OBJDUMP-NEXT: 0110 00000000 00000000 00000000 00000000 // OBJDUMP-NEXT: 0120 00000000 00000000 00000000 00000000 // OBJDUMP-NEXT: 0130 00000cc0 80000000 00040000 00000000 @@ -231,13 +231,13 @@ max_vgprs: .p2align 6 .amdhsa_kernel max_lds_size - 
.amdhsa_group_segment_fixed_size 393216 + .amdhsa_group_segment_fixed_size 327680 .amdhsa_next_free_vgpr 1 .amdhsa_next_free_sgpr 1 .end_amdhsa_kernel // ASM: .amdhsa_kernel max_lds_size -// ASM: .amdhsa_group_segment_fixed_size 393216 +// ASM: .amdhsa_group_segment_fixed_size 327680 // ASM: .end_amdhsa_kernel // Test maximum VGPR allocation diff --git a/llvm/test/MC/AMDGPU/hsa-gfx1251-v4.s b/llvm/test/MC/AMDGPU/hsa-gfx1251-v4.s index 642e62df0437a..0d6bc61ac7753 100644 --- a/llvm/test/MC/AMDGPU/hsa-gfx1251-v4.s +++ b/llvm/test/MC/AMDGPU/hsa-gfx1251-v4.s @@ -52,7 +52,7 @@ // OBJDUMP-NEXT: 00e0 00000000 00000000 00000000 00000000 // OBJDUMP-NEXT: 00f0 00000cc0 80000000 00040000 00000000 // max_lds_size -// OBJDUMP-NEXT: 0100 00000600 00000000 00000000 00000000 +// OBJDUMP-NEXT: 0100 00000500 00000000 00000000 00000000 // OBJDUMP-NEXT: 0110 00000000 00000000 00000000 00000000 // OBJDUMP-NEXT: 0120 00000000 00000000 00000000 00000000 // OBJDUMP-NEXT: 0130 00000cc0 80000000 00040000 00000000 @@ -231,13 +231,13 @@ max_vgprs: .p2align 6 .amdhsa_kernel max_lds_size - .amdhsa_group_segment_fixed_size 393216 + .amdhsa_group_segment_fixed_size 327680 .amdhsa_next_free_vgpr 1 .amdhsa_next_free_sgpr 1 .end_amdhsa_kernel // ASM: .amdhsa_kernel max_lds_size -// ASM: .amdhsa_group_segment_fixed_size 393216 +// ASM: .amdhsa_group_segment_fixed_size 327680 // ASM: .end_amdhsa_kernel // Test maximum VGPR allocation From 4316519bd7eed5a34b407cc71f801c5d471ccea9 Mon Sep 17 00:00:00 2001 From: Changpeng Fang Date: Fri, 14 Nov 2025 15:22:15 -0800 Subject: [PATCH 4/8] [AMDGPU] Add the missing LdsDwGranularity for GFX950 and R600 Also call getLdsDwGranularity to compute the number of LDSBlocks. 
--- llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp | 20 +- .../Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp | 11 +- .../CodeGen/AMDGPU/pal-metadata-3.0.gfx950.ll | 215 ++++++++++++++++++ 3 files changed, 235 insertions(+), 11 deletions(-) create mode 100644 llvm/test/CodeGen/AMDGPU/pal-metadata-3.0.gfx950.ll diff --git a/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp b/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp index f47eaea9b136c..e84fc0ceadd50 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp @@ -1160,18 +1160,20 @@ void AMDGPUAsmPrinter::getSIProgramInfo(SIProgramInfo &ProgInfo, // Make clamp modifier on NaN input returns 0. ProgInfo.DX10Clamp = Mode.DX10Clamp; - unsigned LDSAlignShift; - if (STM.getFeatureBits().test(FeatureAddressableLocalMemorySize327680) || - STM.getFeatureBits().test(FeatureAddressableLocalMemorySize163840)) { - // LDS is allocated in 512 or 320 dword blocks. + unsigned LDSAlignShift = 8; + switch(getLdsDwGranularity(STM)) { + case 512: + case 320: LDSAlignShift = 11; - } else if (STM.getFeatureBits().test( - FeatureAddressableLocalMemorySize65536)) { - // LDS is allocated in 128 dword blocks. + break; + case 128: LDSAlignShift = 9; - } else { - // LDS is allocated in 64 dword blocks. + break; + case 64: LDSAlignShift = 8; + break; + default: + llvm_unreachable("invald LDS block size"); } ProgInfo.SGPRSpill = MFI->getNumSpilledSGPRs(); diff --git a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp index c6e985ab84ac5..29f26f6ce4cd2 100644 --- a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp +++ b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp @@ -3546,8 +3546,15 @@ bool isDPALU_DPP(const MCInstrDesc &OpDesc, const MCInstrInfo &MII, } unsigned getLdsDwGranularity(const MCSubtargetInfo &ST) { - return ST.hasFeature(AMDGPU::FeatureAddressableLocalMemorySize327680) ? 
512 - : 128; + if (ST.getFeatureBits().test(FeatureAddressableLocalMemorySize32768)) + return 64; + if (ST.getFeatureBits().test(FeatureAddressableLocalMemorySize65536)) + return 128; + if (ST.getFeatureBits().test(FeatureAddressableLocalMemorySize163840)) + return 320; + if (ST.getFeatureBits().test(FeatureAddressableLocalMemorySize327680)) + return 512; + return 64; } bool isPackedFP32Inst(unsigned Opc) { diff --git a/llvm/test/CodeGen/AMDGPU/pal-metadata-3.0.gfx950.ll b/llvm/test/CodeGen/AMDGPU/pal-metadata-3.0.gfx950.ll new file mode 100644 index 0000000000000..6aee8097d14f5 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/pal-metadata-3.0.gfx950.ll @@ -0,0 +1,215 @@ +; RUN: llc -mtriple=amdgcn--amdpal -mcpu=gfx950 <%s | FileCheck %s --check-prefixes=CHECK + +; CHECK-LABEL: {{^}}_amdgpu_cs_main: +; CHECK: ; TotalNumSgprs: 6 +; CHECK: ; NumVgprs: 1 +; CHECK: .amdgpu_pal_metadata +; CHECK-NEXT: --- +; CHECK-NEXT: amdpal.pipelines: +; CHECK-NEXT: - .api: Vulkan +; CHECK-NEXT: .compute_registers: +; CHECK-NEXT: .tg_size_en: true +; CHECK-NEXT: .tgid_x_en: false +; CHECK-NEXT: .tgid_y_en: false +; CHECK-NEXT: .tgid_z_en: false +; CHECK-NEXT: .tidig_comp_cnt: 0x1 +; CHECK-NEXT: .graphics_registers: +; CHECK-NEXT: .ps_extra_lds_size: 0 +; CHECK-NEXT: .spi_ps_input_addr: +; CHECK-NEXT: .ancillary_ena: false +; CHECK-NEXT: .front_face_ena: true +; CHECK-NEXT: .line_stipple_tex_ena: false +; CHECK-NEXT: .linear_center_ena: true +; CHECK-NEXT: .linear_centroid_ena: true +; CHECK-NEXT: .linear_sample_ena: true +; CHECK-NEXT: .persp_center_ena: true +; CHECK-NEXT: .persp_centroid_ena: true +; CHECK-NEXT: .persp_pull_model_ena: false +; CHECK-NEXT: .persp_sample_ena: true +; CHECK-NEXT: .pos_fixed_pt_ena: true +; CHECK-NEXT: .pos_w_float_ena: false +; CHECK-NEXT: .pos_x_float_ena: false +; CHECK-NEXT: .pos_y_float_ena: false +; CHECK-NEXT: .pos_z_float_ena: false +; CHECK-NEXT: .sample_coverage_ena: false +; CHECK-NEXT: .spi_ps_input_ena: +; CHECK-NEXT: .ancillary_ena: false +; 
CHECK-NEXT: .front_face_ena: false +; CHECK-NEXT: .line_stipple_tex_ena: false +; CHECK-NEXT: .linear_center_ena: false +; CHECK-NEXT: .linear_centroid_ena: false +; CHECK-NEXT: .linear_sample_ena: false +; CHECK-NEXT: .persp_center_ena: false +; CHECK-NEXT: .persp_centroid_ena: false +; CHECK-NEXT: .persp_pull_model_ena: false +; CHECK-NEXT: .persp_sample_ena: true +; CHECK-NEXT: .pos_fixed_pt_ena: false +; CHECK-NEXT: .pos_w_float_ena: false +; CHECK-NEXT: .pos_x_float_ena: false +; CHECK-NEXT: .pos_y_float_ena: false +; CHECK-NEXT: .pos_z_float_ena: false +; CHECK-NEXT: .sample_coverage_ena: false +; CHECK-NEXT: .hardware_stages: +; CHECK-NEXT: .cs: +; CHECK-NEXT: .agpr_count: 0 +; CHECK-NEXT: .checksum_value: 0x9444d7d0 +; CHECK-NEXT: .debug_mode: false +; CHECK-NEXT: .entry_point: _amdgpu_cs +; CHECK-NEXT: .entry_point_symbol: _amdgpu_cs_main +; CHECK-NEXT: .excp_en: 0 +; CHECK-NEXT: .float_mode: 0xc0 +; CHECK-NEXT: .forward_progress: false +; CHECK-NEXT: .ieee_mode: false +; CHECK-NEXT: .image_op: false +; CHECK-NEXT: .lds_size: 0 +; CHECK-NEXT: .mem_ordered: false +; CHECK-NEXT: .scratch_en: false +; CHECK-NEXT: .scratch_memory_size: 0 +; CHECK-NEXT: .sgpr_count: 0xa +; CHECK-NEXT: .sgpr_limit: 0x6a +; CHECK-NEXT: .threadgroup_dimensions: +; CHECK-NEXT: - 0x1 +; CHECK-NEXT: - 0x400 +; CHECK-NEXT: - 0x1 +; CHECK-NEXT: .trap_present: false +; CHECK-NEXT: .user_data_reg_map: +; CHECK-NEXT: - 0x10000000 +; CHECK-NEXT: - 0xffffffff +; CHECK-NEXT: - 0 +; CHECK-NEXT: - 0xffffffff +; CHECK-NEXT: - 0xffffffff +; CHECK-NEXT: - 0xffffffff +; CHECK-NEXT: - 0xffffffff +; CHECK-NEXT: - 0xffffffff +; CHECK-NEXT: - 0xffffffff +; CHECK-NEXT: - 0xffffffff +; CHECK-NEXT: - 0xffffffff +; CHECK-NEXT: - 0xffffffff +; CHECK-NEXT: - 0xffffffff +; CHECK-NEXT: - 0xffffffff +; CHECK-NEXT: - 0xffffffff +; CHECK-NEXT: - 0xffffffff +; CHECK-NEXT: - 0xffffffff +; CHECK-NEXT: - 0xffffffff +; CHECK-NEXT: - 0xffffffff +; CHECK-NEXT: - 0xffffffff +; CHECK-NEXT: - 0xffffffff +; CHECK-NEXT: - 
0xffffffff +; CHECK-NEXT: - 0xffffffff +; CHECK-NEXT: - 0xffffffff +; CHECK-NEXT: - 0xffffffff +; CHECK-NEXT: - 0xffffffff +; CHECK-NEXT: - 0xffffffff +; CHECK-NEXT: - 0xffffffff +; CHECK-NEXT: - 0xffffffff +; CHECK-NEXT: - 0xffffffff +; CHECK-NEXT: - 0xffffffff +; CHECK-NEXT: - 0xffffffff +; CHECK-NEXT: .user_sgprs: 0x3 +; CHECK-NEXT: .vgpr_count: 0x2 +; CHECK-NEXT: .vgpr_limit: 0x100 +; CHECK-NEXT: .wavefront_size: 0x20 +; CHECK-NEXT: .wgp_mode: false +; CHECK-NEXT: .gs: +; CHECK-NEXT: .agpr_count: 0 +; CHECK-NEXT: .debug_mode: false +; CHECK-NEXT: .entry_point: _amdgpu_gs +; CHECK-NEXT: .entry_point_symbol: gs_shader +; CHECK-NEXT: .forward_progress: false +; CHECK-NEXT: .ieee_mode: false +; CHECK-NEXT: .lds_size: 0x500 +; CHECK-NEXT: .mem_ordered: false +; CHECK-NEXT: .scratch_en: false +; CHECK-NEXT: .scratch_memory_size: 0 +; CHECK-NEXT: .sgpr_count: 0x6 +; CHECK-NEXT: .vgpr_count: 0x1 +; CHECK-NEXT: .wgp_mode: false +; CHECK-NEXT: .hs: +; CHECK-NEXT: .agpr_count: 0 +; CHECK-NEXT: .debug_mode: false +; CHECK-NEXT: .entry_point: _amdgpu_hs +; CHECK-NEXT: .entry_point_symbol: hs_shader +; CHECK-NEXT: .forward_progress: false +; CHECK-NEXT: .ieee_mode: false +; CHECK-NEXT: .lds_size: 0xa00 +; CHECK-NEXT: .mem_ordered: false +; CHECK-NEXT: .scratch_en: false +; CHECK-NEXT: .scratch_memory_size: 0 +; CHECK-NEXT: .sgpr_count: 0x6 +; CHECK-NEXT: .vgpr_count: 0x1 +; CHECK-NEXT: .wgp_mode: false +; CHECK-NEXT: .ps: +; CHECK-NEXT: .agpr_count: 0 +; CHECK-NEXT: .debug_mode: false +; CHECK-NEXT: .entry_point: _amdgpu_ps +; CHECK-NEXT: .entry_point_symbol: ps_shader +; CHECK-NEXT: .forward_progress: false +; CHECK-NEXT: .ieee_mode: false +; CHECK-NEXT: .lds_size: 0 +; CHECK-NEXT: .mem_ordered: false +; CHECK-NEXT: .scratch_en: false +; CHECK-NEXT: .scratch_memory_size: 0 +; CHECK-NEXT: .sgpr_count: 0x6 +; CHECK-NEXT: .vgpr_count: 0x1 +; CHECK-NEXT: .wgp_mode: false +; CHECK: .registers: {} +; CHECK:amdpal.version: +; CHECK-NEXT: - 0x3 +; CHECK-NEXT: - 0 +; CHECK-NEXT:... 
+; CHECK-NEXT: .end_amdgpu_pal_metadata + +define dllexport amdgpu_cs void @_amdgpu_cs_main(i32 inreg %arg1, i32 %arg2) #0 !lgc.shaderstage !1 { +.entry: + %i = call i64 @llvm.amdgcn.s.getpc() + %i1 = and i64 %i, -4294967296 + %i2 = zext i32 %arg1 to i64 + %i3 = or i64 %i1, %i2 + %i4 = inttoptr i64 %i3 to ptr addrspace(4) + %i5 = and i32 %arg2, 1023 + %i6 = lshr i32 %arg2, 10 + %i7 = and i32 %i6, 1023 + %i8 = add nuw nsw i32 %i7, %i5 + %i9 = load <4 x i32>, ptr addrspace(4) %i4, align 16 + %.idx = shl nuw nsw i32 %i8, 2 + call void @llvm.amdgcn.raw.buffer.store.i32(i32 1, <4 x i32> %i9, i32 %.idx, i32 0, i32 0) + ret void +} + +define dllexport amdgpu_ps void @ps_shader() #1 { + ret void +} + +@LDS.GS = external addrspace(3) global [1 x i32], align 4 + +define dllexport amdgpu_gs void @gs_shader() #2 { + %ptr = getelementptr i32, ptr addrspace(3) @LDS.GS, i32 0 + store i32 0, ptr addrspace(3) %ptr, align 4 + ret void +} + +@LDS.HS = external addrspace(3) global [1024 x i32], align 4 + +define dllexport amdgpu_hs void @hs_shader() #2 { + %ptr = getelementptr i32, ptr addrspace(3) @LDS.HS, i32 0 + store i32 0, ptr addrspace(3) %ptr, align 4 + ret void +} + +!amdgpu.pal.metadata.msgpack = !{!0} + +; Function Attrs: nounwind willreturn memory(none) +declare ptr addrspace(7) @lgc.buffer.desc.to.ptr(<4 x i32>) #1 + +; Function Attrs: nocallback nofree nosync nounwind speculatable willreturn memory(none) +declare i64 @llvm.amdgcn.s.getpc() #2 + +; Function Attrs: nocallback nofree nosync nounwind willreturn memory(write) +declare void @llvm.amdgcn.raw.buffer.store.i32(i32, <4 x i32>, i32, i32, i32 immarg) #3 + +attributes #0 = { nounwind memory(readwrite) "amdgpu-flat-work-group-size"="1024,1024" "amdgpu-memory-bound"="false" "amdgpu-unroll-threshold"="700" "amdgpu-wave-limiter"="false" "amdgpu-work-group-info-arg-no"="4" "denormal-fp-math-f32"="preserve-sign" } + +attributes #1 = { nounwind memory(readwrite) "InitialPSInputAddr"="36983" } + +!0 = 
!{!"\82\B0amdpal.pipelines\91\8A\A4.api\A6Vulkan\B2.compute_registers\85\AB.tg_size_en\C3\AA.tgid_x_en\C2\AA.tgid_y_en\C2\AA.tgid_z_en\C2\AF.tidig_comp_cnt\01\B0.hardware_stages\81\A3.cs\8C\AF.checksum_value\CE\94D\D7\D0\AB.debug_mode\00\AB.float_mode\CC\C0\A9.image_op\C2\AC.mem_ordered\C3\AB.sgpr_limitj\B7.threadgroup_dimensions\93\01\CD\04\00\01\AD.trap_present\00\B2.user_data_reg_map\DC\00 \CE\10\00\00\00\CE\FF\FF\FF\FF\00\CE\FF\FF\FF\FF\CE\FF\FF\FF\FF\CE\FF\FF\FF\FF\CE\FF\FF\FF\FF\CE\FF\FF\FF\FF\CE\FF\FF\FF\FF\CE\FF\FF\FF\FF\CE\FF\FF\FF\FF\CE\FF\FF\FF\FF\CE\FF\FF\FF\FF\CE\FF\FF\FF\FF\CE\FF\FF\FF\FF\CE\FF\FF\FF\FF\CE\FF\FF\FF\FF\CE\FF\FF\FF\FF\CE\FF\FF\FF\FF\CE\FF\FF\FF\FF\CE\FF\FF\FF\FF\CE\FF\FF\FF\FF\CE\FF\FF\FF\FF\CE\FF\FF\FF\FF\CE\FF\FF\FF\FF\CE\FF\FF\FF\FF\CE\FF\FF\FF\FF\CE\FF\FF\FF\FF\CE\FF\FF\FF\FF\CE\FF\FF\FF\FF\CE\FF\FF\FF\FF\CE\FF\FF\FF\FF\AB.user_sgprs\03\AB.vgpr_limit\CD\01\00\AF.wavefront_size \B7.internal_pipeline_hash\92\CF\E7\10k\A6:\A6%\F7\CF\B2\1F\1A\D4{\DA\E1T\AA.registers\80\A8.shaders\81\A8.compute\82\B0.api_shader_hash\92\CF\E9Zn7}\1E\B9\E7\00\B1.hardware_mapping\91\A3.cs\B0.spill_threshold\CE\FF\FF\FF\FF\A5.type\A2Cs\B0.user_data_limit\01\AF.xgl_cache_info\82\B3.128_bit_cache_hash\92\CF\B4X\B8\11[\A4\88P\CF\A0;\B0\AF\FF\B4\BE\C0\AD.llpc_version\A461.1\AEamdpal.version\92\03\00"} +!1 = !{i32 7} From 28d13dd3d560cccf4186d0b5d1f595ab52c3c7b9 Mon Sep 17 00:00:00 2001 From: Changpeng Fang Date: Fri, 14 Nov 2025 15:35:17 -0800 Subject: [PATCH 5/8] Fix clang format --- llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp b/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp index e84fc0ceadd50..d3eea50fd514b 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp @@ -1161,7 +1161,7 @@ void AMDGPUAsmPrinter::getSIProgramInfo(SIProgramInfo &ProgInfo, ProgInfo.DX10Clamp = Mode.DX10Clamp; unsigned 
LDSAlignShift = 8; - switch(getLdsDwGranularity(STM)) { + switch (getLdsDwGranularity(STM)) { case 512: case 320: LDSAlignShift = 11; From 60681ffb1b5376042d806f834ff181b3b233b8ff Mon Sep 17 00:00:00 2001 From: Changpeng Fang Date: Sun, 16 Nov 2025 22:16:36 -0800 Subject: [PATCH 6/8] [AMDGPU] Use llvm_unreachable when target is unknown Also simplified pal-metadata-3.0.gfx950.ll --- llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp | 2 +- llvm/test/CodeGen/AMDGPU/pal-metadata-3.0.gfx950.ll | 8 ++++---- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp index 29f26f6ce4cd2..539e5c86b38f7 100644 --- a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp +++ b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp @@ -3554,7 +3554,7 @@ unsigned getLdsDwGranularity(const MCSubtargetInfo &ST) { return 320; if (ST.getFeatureBits().test(FeatureAddressableLocalMemorySize327680)) return 512; - return 64; + llvm_unreachable("Unknown Subtarget"); } bool isPackedFP32Inst(unsigned Opc) { diff --git a/llvm/test/CodeGen/AMDGPU/pal-metadata-3.0.gfx950.ll b/llvm/test/CodeGen/AMDGPU/pal-metadata-3.0.gfx950.ll index 6aee8097d14f5..b3575c68b892f 100644 --- a/llvm/test/CodeGen/AMDGPU/pal-metadata-3.0.gfx950.ll +++ b/llvm/test/CodeGen/AMDGPU/pal-metadata-3.0.gfx950.ll @@ -159,7 +159,7 @@ ; CHECK-NEXT:... 
; CHECK-NEXT: .end_amdgpu_pal_metadata -define dllexport amdgpu_cs void @_amdgpu_cs_main(i32 inreg %arg1, i32 %arg2) #0 !lgc.shaderstage !1 { +define amdgpu_cs void @_amdgpu_cs_main(i32 inreg %arg1, i32 %arg2) #0 !lgc.shaderstage !1 { .entry: %i = call i64 @llvm.amdgcn.s.getpc() %i1 = and i64 %i, -4294967296 @@ -176,13 +176,13 @@ define dllexport amdgpu_cs void @_amdgpu_cs_main(i32 inreg %arg1, i32 %arg2) #0 ret void } -define dllexport amdgpu_ps void @ps_shader() #1 { +define amdgpu_ps void @ps_shader() #1 { ret void } @LDS.GS = external addrspace(3) global [1 x i32], align 4 -define dllexport amdgpu_gs void @gs_shader() #2 { +define amdgpu_gs void @gs_shader() { %ptr = getelementptr i32, ptr addrspace(3) @LDS.GS, i32 0 store i32 0, ptr addrspace(3) %ptr, align 4 ret void @@ -190,7 +190,7 @@ define dllexport amdgpu_gs void @gs_shader() #2 { @LDS.HS = external addrspace(3) global [1024 x i32], align 4 -define dllexport amdgpu_hs void @hs_shader() #2 { +define amdgpu_hs void @hs_shader() { %ptr = getelementptr i32, ptr addrspace(3) @LDS.HS, i32 0 store i32 0, ptr addrspace(3) %ptr, align 4 ret void From 9befbe2e58d6c9b478263960523a6cb5e858ffa0 Mon Sep 17 00:00:00 2001 From: Changpeng Fang Date: Mon, 17 Nov 2025 13:25:21 -0800 Subject: [PATCH 7/8] [AMDGPU] Should return a value instead of llvm_unreachable in getLdsDwGranularity We may compile with no target, and do not expect an unreachable failure. 
--- llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp index 539e5c86b38f7..73b46c65d0bc8 100644 --- a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp +++ b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp @@ -3554,7 +3554,7 @@ unsigned getLdsDwGranularity(const MCSubtargetInfo &ST) { return 320; if (ST.getFeatureBits().test(FeatureAddressableLocalMemorySize327680)) return 512; - llvm_unreachable("Unknown Subtarget"); + return 64; //In sync with getAddressableLocalMemorySize } bool isPackedFP32Inst(unsigned Opc) { From 8ffd4cece77c8317b20608679b3ff3bd34bf28d5 Mon Sep 17 00:00:00 2001 From: Changpeng Fang Date: Mon, 17 Nov 2025 14:53:55 -0800 Subject: [PATCH 8/8] Fix a clang format issue --- llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp index 73b46c65d0bc8..4167d09ec2b12 100644 --- a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp +++ b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp @@ -3554,7 +3554,7 @@ unsigned getLdsDwGranularity(const MCSubtargetInfo &ST) { return 320; if (ST.getFeatureBits().test(FeatureAddressableLocalMemorySize327680)) return 512; - return 64; //In sync with getAddressableLocalMemorySize + return 64; // In sync with getAddressableLocalMemorySize } bool isPackedFP32Inst(unsigned Opc) {