diff --git a/llvm/docs/AMDGPUUsage.rst b/llvm/docs/AMDGPUUsage.rst index f7a847ec7f38f..b6d61a62f50ff 100644 --- a/llvm/docs/AMDGPUUsage.rst +++ b/llvm/docs/AMDGPUUsage.rst @@ -5405,7 +5405,21 @@ The fields used by CP for code objects before V3 also match those specified in Used by CP to set up ``COMPUTE_PGM_RSRC1.FP16_OVFL``. - 28:27 2 bits Reserved, must be 0. + 27 1 bit RESERVED GFX6-GFX120* + Reserved, must be 0. + FLAT_SCRATCH_IS_NV GFX125* + 0 - Use the NV ISA as indication + that scratch is NV. 1 - Force + scratch to NV = 1, even if + ISA.NV == 0 if the address falls + into scratch space (not global). + This allows global.NV = 0 and + scratch.NV = 1 for flat ops. Other + threads use the ISA bit value. + + Used by CP to set up + ``COMPUTE_PGM_RSRC1.FLAT_SCRATCH_IS_NV``. + 28 1 bit RESERVED Reserved, must be 0. 29 1 bit WGP_MODE GFX6-GFX9 Reserved, must be 0. GFX10-GFX12 @@ -5487,15 +5501,16 @@ The fields used by CP for code objects before V3 also match those specified in Used by CP to set up ``COMPUTE_PGM_RSRC2.SCRATCH_EN``. - 5:1 5 bits USER_SGPR_COUNT The total number of SGPR - user data - registers requested. This - number must be greater than - or equal to the number of user - data registers enabled. + 5:1 5 bits USER_SGPR_COUNT GFX6-GFX120* + The total number of SGPR + user data + registers requested. This + number must be greater than + or equal to the number of user + data registers enabled. - Used by CP to set up - ``COMPUTE_PGM_RSRC2.USER_SGPR``. + Used by CP to set up + ``COMPUTE_PGM_RSRC2.USER_SGPR``. 6 1 bit ENABLE_TRAP_HANDLER GFX6-GFX11 Must be 0. @@ -5504,8 +5519,25 @@ The fields used by CP for code objects before V3 also match those specified in which is set by the CP if the runtime has installed a trap handler. - GFX12 - Reserved, must be 0. + ENABLE_DYNAMIC_VGPR GFX120* + Enables dynamic VGPR mode, where + each wave allocates one VGPR chunk + at launch and can request for + additional space to use during + execution in SQ. + + Used by CP to set up + ``COMPUTE_PGM_RSRC2.DYNAMIC_VGPR``. + 6:1 6 bits USER_SGPR_COUNT GFX125* + The total number of SGPR + user data + registers requested. This + number must be greater than + or equal to the number of user + data registers enabled. + + Used by CP to set up + ``COMPUTE_PGM_RSRC2.USER_SGPR``. 7 1 bit ENABLE_SGPR_WORKGROUP_ID_X Enable the setup of the system SGPR register for the work-group id in the X @@ -5598,7 +5630,7 @@ The fields used by CP for code objects before V3 also match those specified in GFX6 roundup(lds-size / (64 * 4)) - GFX7-GFX11 + GFX7-GFX12 roundup(lds-size / (128 * 4)) GFX950 roundup(lds-size / (320 * 4)) @@ -5722,7 +5754,30 @@ The fields used by CP for code objects before V3 also match those specified in with a granularity of 128 bytes. 12 1 bit RESERVED Reserved, must be 0. 13 1 bit GLG_EN If 1, group launch guarantee will be enabled for this dispatch - 30:14 17 bits RESERVED Reserved, must be 0. + 16:14 3 bits RESERVED GFX120* + Reserved, must be 0. + NAMED_BAR_CNT GFX125* + Number of named barriers to alloc for each workgroup, in granularity of + 4. Range is from 0-4 allocating 0, 4, 8, 12, 16. + 17 1 bit RESERVED GFX120* + Reserved, must be 0. + ENABLE_DYNAMIC_VGPR GFX125* + Enables dynamic VGPR mode, where each wave allocates one VGPR chunk + at launch and can request for additional space to use during + execution in SQ. + + Used by CP to set up ``COMPUTE_PGM_RSRC3.DYNAMIC_VGPR``. + 20:18 3 bits RESERVED GFX120* + Reserved, must be 0. + TCP_SPLIT GFX125* + Desired LDS/VC split of TCP. 0: no preference 1: LDS=0, VC=448kB + 2: LDS=64kB, VC=384kB 3: LDS=128kB, VC=320kB 4: LDS=192kB, VC=256kB + 5: LDS=256kB, VC=192kB 6: LDS=320kB, VC=128kB 7: LDS=384kB, VC=64kB + 21 1 bit RESERVED GFX120* + Reserved, must be 0. + ENABLE_DIDT_THROTTLE GFX125* + Enable DIDT throttling for all ACE pipes + 30:22 9 bits RESERVED Reserved, must be 0. 31 1 bit IMAGE_OP If 1, the kernel execution contains image instructions. If executed as part of a graphics pipeline, image read instructions will stall waiting for any necessary ``WAIT_SYNC`` fence to be performed in order to diff --git a/llvm/include/llvm/Support/AMDHSAKernelDescriptor.h b/llvm/include/llvm/Support/AMDHSAKernelDescriptor.h index 78f38ed5a9d4b..fb9d68428cf18 100644 --- a/llvm/include/llvm/Support/AMDHSAKernelDescriptor.h +++ b/llvm/include/llvm/Support/AMDHSAKernelDescriptor.h @@ -83,26 +83,32 @@ enum : uint8_t { // Compute program resource register 1. Must match hardware definition. // GFX6+. -#define COMPUTE_PGM_RSRC1(NAME, SHIFT, WIDTH) \ - AMDHSA_BITS_ENUM_ENTRY(COMPUTE_PGM_RSRC1_ ## NAME, SHIFT, WIDTH) +#define COMPUTE_PGM_RSRC1(NAME, SHIFT, WIDTH) \ + AMDHSA_BITS_ENUM_ENTRY(COMPUTE_PGM_RSRC1_##NAME, SHIFT, WIDTH) // [GFX6-GFX8]. -#define COMPUTE_PGM_RSRC1_GFX6_GFX8(NAME, SHIFT, WIDTH) \ - AMDHSA_BITS_ENUM_ENTRY(COMPUTE_PGM_RSRC1_GFX6_GFX8_ ## NAME, SHIFT, WIDTH) +#define COMPUTE_PGM_RSRC1_GFX6_GFX8(NAME, SHIFT, WIDTH) \ + AMDHSA_BITS_ENUM_ENTRY(COMPUTE_PGM_RSRC1_GFX6_GFX8_##NAME, SHIFT, WIDTH) // [GFX6-GFX9]. -#define COMPUTE_PGM_RSRC1_GFX6_GFX9(NAME, SHIFT, WIDTH) \ - AMDHSA_BITS_ENUM_ENTRY(COMPUTE_PGM_RSRC1_GFX6_GFX9_ ## NAME, SHIFT, WIDTH) +#define COMPUTE_PGM_RSRC1_GFX6_GFX9(NAME, SHIFT, WIDTH) \ + AMDHSA_BITS_ENUM_ENTRY(COMPUTE_PGM_RSRC1_GFX6_GFX9_##NAME, SHIFT, WIDTH) // [GFX6-GFX11]. #define COMPUTE_PGM_RSRC1_GFX6_GFX11(NAME, SHIFT, WIDTH) \ AMDHSA_BITS_ENUM_ENTRY(COMPUTE_PGM_RSRC1_GFX6_GFX11_##NAME, SHIFT, WIDTH) +// [GFX6-GFX120]. +#define COMPUTE_PGM_RSRC1_GFX6_GFX120(NAME, SHIFT, WIDTH) \ + AMDHSA_BITS_ENUM_ENTRY(COMPUTE_PGM_RSRC1_GFX6_GFX120_##NAME, SHIFT, WIDTH) // GFX9+. -#define COMPUTE_PGM_RSRC1_GFX9_PLUS(NAME, SHIFT, WIDTH) \ - AMDHSA_BITS_ENUM_ENTRY(COMPUTE_PGM_RSRC1_GFX9_PLUS_ ## NAME, SHIFT, WIDTH) +#define COMPUTE_PGM_RSRC1_GFX9_PLUS(NAME, SHIFT, WIDTH) \ + AMDHSA_BITS_ENUM_ENTRY(COMPUTE_PGM_RSRC1_GFX9_PLUS_##NAME, SHIFT, WIDTH) // GFX10+. -#define COMPUTE_PGM_RSRC1_GFX10_PLUS(NAME, SHIFT, WIDTH) \ - AMDHSA_BITS_ENUM_ENTRY(COMPUTE_PGM_RSRC1_GFX10_PLUS_ ## NAME, SHIFT, WIDTH) +#define COMPUTE_PGM_RSRC1_GFX10_PLUS(NAME, SHIFT, WIDTH) \ + AMDHSA_BITS_ENUM_ENTRY(COMPUTE_PGM_RSRC1_GFX10_PLUS_##NAME, SHIFT, WIDTH) // GFX12+. #define COMPUTE_PGM_RSRC1_GFX12_PLUS(NAME, SHIFT, WIDTH) \ AMDHSA_BITS_ENUM_ENTRY(COMPUTE_PGM_RSRC1_GFX12_PLUS_##NAME, SHIFT, WIDTH) +// [GFX125]. +#define COMPUTE_PGM_RSRC1_GFX125(NAME, SHIFT, WIDTH) \ + AMDHSA_BITS_ENUM_ENTRY(COMPUTE_PGM_RSRC1_GFX125_##NAME, SHIFT, WIDTH) enum : int32_t { COMPUTE_PGM_RSRC1(GRANULATED_WORKITEM_VGPR_COUNT, 0, 6), COMPUTE_PGM_RSRC1(GRANULATED_WAVEFRONT_SGPR_COUNT, 6, 4), @@ -121,8 +127,10 @@ enum : int32_t { COMPUTE_PGM_RSRC1(CDBG_USER, 25, 1), COMPUTE_PGM_RSRC1_GFX6_GFX8(RESERVED0, 26, 1), COMPUTE_PGM_RSRC1_GFX9_PLUS(FP16_OVFL, 26, 1), - COMPUTE_PGM_RSRC1(RESERVED1, 27, 2), - COMPUTE_PGM_RSRC1_GFX6_GFX9(RESERVED2, 29, 3), + COMPUTE_PGM_RSRC1_GFX6_GFX120(RESERVED1, 27, 1), + COMPUTE_PGM_RSRC1_GFX125(FLAT_SCRATCH_IS_NV, 27, 1), + COMPUTE_PGM_RSRC1(RESERVED2, 28, 1), + COMPUTE_PGM_RSRC1_GFX6_GFX9(RESERVED3, 29, 3), COMPUTE_PGM_RSRC1_GFX10_PLUS(WGP_MODE, 29, 1), COMPUTE_PGM_RSRC1_GFX10_PLUS(MEM_ORDERED, 30, 1), COMPUTE_PGM_RSRC1_GFX10_PLUS(FWD_PROGRESS, 31, 1), @@ -131,19 +139,29 @@ enum : int32_t { // Compute program resource register 2. Must match hardware definition. // GFX6+. -#define COMPUTE_PGM_RSRC2(NAME, SHIFT, WIDTH) \ - AMDHSA_BITS_ENUM_ENTRY(COMPUTE_PGM_RSRC2_ ## NAME, SHIFT, WIDTH) +#define COMPUTE_PGM_RSRC2(NAME, SHIFT, WIDTH) \ + AMDHSA_BITS_ENUM_ENTRY(COMPUTE_PGM_RSRC2_##NAME, SHIFT, WIDTH) // [GFX6-GFX11]. #define COMPUTE_PGM_RSRC2_GFX6_GFX11(NAME, SHIFT, WIDTH) \ AMDHSA_BITS_ENUM_ENTRY(COMPUTE_PGM_RSRC2_GFX6_GFX11_##NAME, SHIFT, WIDTH) +// [GFX6-GFX120]. +#define COMPUTE_PGM_RSRC2_GFX6_GFX120(NAME, SHIFT, WIDTH) \ + AMDHSA_BITS_ENUM_ENTRY(COMPUTE_PGM_RSRC2_GFX6_GFX120_##NAME, SHIFT, WIDTH) // GFX12+. #define COMPUTE_PGM_RSRC2_GFX12_PLUS(NAME, SHIFT, WIDTH) \ AMDHSA_BITS_ENUM_ENTRY(COMPUTE_PGM_RSRC2_GFX12_PLUS_##NAME, SHIFT, WIDTH) +// [GFX120]. +#define COMPUTE_PGM_RSRC2_GFX120(NAME, SHIFT, WIDTH) \ + AMDHSA_BITS_ENUM_ENTRY(COMPUTE_PGM_RSRC2_GFX120_##NAME, SHIFT, WIDTH) +// [GFX125]. +#define COMPUTE_PGM_RSRC2_GFX125(NAME, SHIFT, WIDTH) \ + AMDHSA_BITS_ENUM_ENTRY(COMPUTE_PGM_RSRC2_GFX125_##NAME, SHIFT, WIDTH) enum : int32_t { COMPUTE_PGM_RSRC2(ENABLE_PRIVATE_SEGMENT, 0, 1), - COMPUTE_PGM_RSRC2(USER_SGPR_COUNT, 1, 5), + COMPUTE_PGM_RSRC2_GFX6_GFX120(USER_SGPR_COUNT, 1, 5), COMPUTE_PGM_RSRC2_GFX6_GFX11(ENABLE_TRAP_HANDLER, 6, 1), - COMPUTE_PGM_RSRC2_GFX12_PLUS(RESERVED1, 6, 1), + COMPUTE_PGM_RSRC2_GFX120(ENABLE_DYNAMIC_VGPR, 6, 1), + COMPUTE_PGM_RSRC2_GFX125(USER_SGPR_COUNT, 1, 6), COMPUTE_PGM_RSRC2(ENABLE_SGPR_WORKGROUP_ID_X, 7, 1), COMPUTE_PGM_RSRC2(ENABLE_SGPR_WORKGROUP_ID_Y, 8, 1), COMPUTE_PGM_RSRC2(ENABLE_SGPR_WORKGROUP_ID_Z, 9, 1), @@ -178,8 +196,8 @@ enum : int32_t { // Compute program resource register 3 for GFX10+. Must match hardware // definition. // GFX10+. -#define COMPUTE_PGM_RSRC3_GFX10_PLUS(NAME, SHIFT, WIDTH) \ - AMDHSA_BITS_ENUM_ENTRY(COMPUTE_PGM_RSRC3_GFX10_PLUS_ ## NAME, SHIFT, WIDTH) +#define COMPUTE_PGM_RSRC3_GFX10_PLUS(NAME, SHIFT, WIDTH) \ + AMDHSA_BITS_ENUM_ENTRY(COMPUTE_PGM_RSRC3_GFX10_PLUS_##NAME, SHIFT, WIDTH) // [GFX10]. #define COMPUTE_PGM_RSRC3_GFX10(NAME, SHIFT, WIDTH) \ AMDHSA_BITS_ENUM_ENTRY(COMPUTE_PGM_RSRC3_GFX10_##NAME, SHIFT, WIDTH) @@ -212,10 +230,13 @@ enum : int32_t { COMPUTE_PGM_RSRC3_GFX10_PLUS(RESERVED2, 12, 1), COMPUTE_PGM_RSRC3_GFX10_GFX11(RESERVED3, 13, 1), COMPUTE_PGM_RSRC3_GFX12_PLUS(GLG_EN, 13, 1), - COMPUTE_PGM_RSRC3_GFX10_GFX120(RESERVED4, 14, 3), + COMPUTE_PGM_RSRC3_GFX10_GFX120(RESERVED4, 14, 8), COMPUTE_PGM_RSRC3_GFX125(NAMED_BAR_CNT, 14, 3), - COMPUTE_PGM_RSRC3_GFX10_PLUS(RESERVED5, 17, 14), - COMPUTE_PGM_RSRC3_GFX10(RESERVED5, 31, 1), + COMPUTE_PGM_RSRC3_GFX125(ENABLE_DYNAMIC_VGPR, 17, 1), + COMPUTE_PGM_RSRC3_GFX125(TCP_SPLIT, 18, 3), + COMPUTE_PGM_RSRC3_GFX125(ENABLE_DIDT_THROTTLE, 21, 1), + COMPUTE_PGM_RSRC3_GFX10_PLUS(RESERVED5, 22, 9), + COMPUTE_PGM_RSRC3_GFX10(RESERVED6, 31, 1), COMPUTE_PGM_RSRC3_GFX11_PLUS(IMAGE_OP, 31, 1), }; #undef COMPUTE_PGM_RSRC3_GFX10_PLUS diff --git a/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp b/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp index 78a2678808eee..2e21ba4c30b53 100644 --- a/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp +++ b/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp @@ -6410,12 +6410,24 @@ bool AMDGPUAsmParser::ParseDirectiveAMDHSAKernel() { return TokError("amdgpu_user_sgpr_count smaller than than implied by " "enabled user SGPRs"); - if (!isUInt(UserSGPRCount)) - return TokError("too many user SGPRs enabled"); - AMDGPU::MCKernelDescriptor::bits_set( - KD.compute_pgm_rsrc2, MCConstantExpr::create(UserSGPRCount, getContext()), - COMPUTE_PGM_RSRC2_USER_SGPR_COUNT_SHIFT, - COMPUTE_PGM_RSRC2_USER_SGPR_COUNT, getContext()); + if (isGFX1250()) { + if (!isUInt(UserSGPRCount)) + return TokError("too many user SGPRs enabled"); + AMDGPU::MCKernelDescriptor::bits_set( + KD.compute_pgm_rsrc2, + MCConstantExpr::create(UserSGPRCount, getContext()), + COMPUTE_PGM_RSRC2_GFX125_USER_SGPR_COUNT_SHIFT, + COMPUTE_PGM_RSRC2_GFX125_USER_SGPR_COUNT, getContext()); + } else { + if (!isUInt( + UserSGPRCount)) + return TokError("too many user SGPRs enabled"); + AMDGPU::MCKernelDescriptor::bits_set( + KD.compute_pgm_rsrc2, + MCConstantExpr::create(UserSGPRCount, getContext()), + COMPUTE_PGM_RSRC2_GFX6_GFX120_USER_SGPR_COUNT_SHIFT, + COMPUTE_PGM_RSRC2_GFX6_GFX120_USER_SGPR_COUNT, getContext()); + } int64_t IVal = 0; if (!KD.kernarg_size->evaluateAsAbsolute(IVal)) diff --git a/llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.cpp b/llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.cpp index 4b891e48ff273..6a2beeed41dfd 100644 --- a/llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.cpp +++ b/llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.cpp @@ -2284,24 +2284,38 @@ Expected AMDGPUDisassembler::decodeCOMPUTE_PGM_RSRC1( CHECK_RESERVED_BITS(COMPUTE_PGM_RSRC1_BULKY); CHECK_RESERVED_BITS(COMPUTE_PGM_RSRC1_CDBG_USER); - if (isGFX9Plus()) + // Bits [26]. + if (isGFX9Plus()) { PRINT_DIRECTIVE(".amdhsa_fp16_overflow", COMPUTE_PGM_RSRC1_GFX9_PLUS_FP16_OVFL); - - if (!isGFX9Plus()) + } else { CHECK_RESERVED_BITS_DESC_MSG(COMPUTE_PGM_RSRC1_GFX6_GFX8_RESERVED0, "COMPUTE_PGM_RSRC1", "must be zero pre-gfx9"); + } - CHECK_RESERVED_BITS_DESC(COMPUTE_PGM_RSRC1_RESERVED1, "COMPUTE_PGM_RSRC1"); + // Bits [27]. + if (isGFX1250()) { + PRINT_PSEUDO_DIRECTIVE_COMMENT("FLAT_SCRATCH_IS_NV", + COMPUTE_PGM_RSRC1_GFX125_FLAT_SCRATCH_IS_NV); + } else { + CHECK_RESERVED_BITS_DESC(COMPUTE_PGM_RSRC1_GFX6_GFX120_RESERVED1, + "COMPUTE_PGM_RSRC1"); + } - if (!isGFX10Plus()) - CHECK_RESERVED_BITS_DESC_MSG(COMPUTE_PGM_RSRC1_GFX6_GFX9_RESERVED2, - "COMPUTE_PGM_RSRC1", "must be zero pre-gfx10"); + // Bits [28]. + CHECK_RESERVED_BITS_DESC(COMPUTE_PGM_RSRC1_RESERVED2, "COMPUTE_PGM_RSRC1"); + // Bits [29-31]. if (isGFX10Plus()) { - PRINT_DIRECTIVE(".amdhsa_workgroup_processor_mode", - COMPUTE_PGM_RSRC1_GFX10_PLUS_WGP_MODE); + // WGP_MODE is not available on GFX1250. + if (!isGFX1250()) { + PRINT_DIRECTIVE(".amdhsa_workgroup_processor_mode", + COMPUTE_PGM_RSRC1_GFX10_PLUS_WGP_MODE); + } PRINT_DIRECTIVE(".amdhsa_memory_ordered", COMPUTE_PGM_RSRC1_GFX10_PLUS_MEM_ORDERED); PRINT_DIRECTIVE(".amdhsa_forward_progress", COMPUTE_PGM_RSRC1_GFX10_PLUS_FWD_PROGRESS); + } else { + CHECK_RESERVED_BITS_DESC(COMPUTE_PGM_RSRC1_GFX6_GFX9_RESERVED3, + "COMPUTE_PGM_RSRC1"); } if (isGFX12Plus()) @@ -2423,17 +2437,24 @@ Expected AMDGPUDisassembler::decodeCOMPUTE_PGM_RSRC3( "must be zero on gfx10 or gfx11"); } - // Bits [14-16] + // Bits [14-21]. if (isGFX1250()) { PRINT_DIRECTIVE(".amdhsa_named_barrier_count", COMPUTE_PGM_RSRC3_GFX125_NAMED_BAR_CNT); + PRINT_PSEUDO_DIRECTIVE_COMMENT( + "ENABLE_DYNAMIC_VGPR", COMPUTE_PGM_RSRC3_GFX125_ENABLE_DYNAMIC_VGPR); + PRINT_PSEUDO_DIRECTIVE_COMMENT("TCP_SPLIT", + COMPUTE_PGM_RSRC3_GFX125_TCP_SPLIT); + PRINT_PSEUDO_DIRECTIVE_COMMENT( + "ENABLE_DIDT_THROTTLE", + COMPUTE_PGM_RSRC3_GFX125_ENABLE_DIDT_THROTTLE); } else { CHECK_RESERVED_BITS_DESC_MSG(COMPUTE_PGM_RSRC3_GFX10_GFX120_RESERVED4, "COMPUTE_PGM_RSRC3", "must be zero on gfx10+"); } - // Bits [17-30]. + // Bits [22-30]. CHECK_RESERVED_BITS_DESC_MSG(COMPUTE_PGM_RSRC3_GFX10_PLUS_RESERVED5, "COMPUTE_PGM_RSRC3", "must be zero on gfx10+"); @@ -2442,7 +2463,7 @@ Expected AMDGPUDisassembler::decodeCOMPUTE_PGM_RSRC3( PRINT_PSEUDO_DIRECTIVE_COMMENT("IMAGE_OP", COMPUTE_PGM_RSRC3_GFX11_PLUS_IMAGE_OP); } else { - CHECK_RESERVED_BITS_DESC_MSG(COMPUTE_PGM_RSRC3_GFX10_RESERVED5, + CHECK_RESERVED_BITS_DESC_MSG(COMPUTE_PGM_RSRC3_GFX10_RESERVED6, "COMPUTE_PGM_RSRC3", "must be zero on gfx10"); } diff --git a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUTargetStreamer.cpp b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUTargetStreamer.cpp index b58ba947c72e2..0bbab29dbda18 100644 --- a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUTargetStreamer.cpp +++ b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUTargetStreamer.cpp @@ -396,9 +396,17 @@ void AMDGPUTargetAsmStreamer::EmitAmdhsaKernelDescriptor( EmitMCExpr(KD.kernarg_size); OS << '\n'; - PrintField( - KD.compute_pgm_rsrc2, amdhsa::COMPUTE_PGM_RSRC2_USER_SGPR_COUNT_SHIFT, - amdhsa::COMPUTE_PGM_RSRC2_USER_SGPR_COUNT, ".amdhsa_user_sgpr_count"); + if (isGFX1250(STI)) { + PrintField(KD.compute_pgm_rsrc2, + amdhsa::COMPUTE_PGM_RSRC2_GFX125_USER_SGPR_COUNT_SHIFT, + amdhsa::COMPUTE_PGM_RSRC2_GFX125_USER_SGPR_COUNT, + ".amdhsa_user_sgpr_count"); + } else { + PrintField(KD.compute_pgm_rsrc2, + amdhsa::COMPUTE_PGM_RSRC2_GFX6_GFX120_USER_SGPR_COUNT_SHIFT, + amdhsa::COMPUTE_PGM_RSRC2_GFX6_GFX120_USER_SGPR_COUNT, + ".amdhsa_user_sgpr_count"); + } if (!hasArchitectedFlatScratch(STI)) PrintField( diff --git a/llvm/test/MC/AMDGPU/hsa-gfx1250-v4.s b/llvm/test/MC/AMDGPU/hsa-gfx1250-v4.s new file mode 100644 index 0000000000000..3cd1d877600e2 --- /dev/null +++ b/llvm/test/MC/AMDGPU/hsa-gfx1250-v4.s @@ -0,0 +1,323 @@ +// RUN: llvm-mc -triple amdgcn-amd-amdhsa -mcpu=gfx1250 --amdhsa-code-object-version=4 < %s | FileCheck --check-prefixes=ASM %s +// RUN: llvm-mc -triple amdgcn-amd-amdhsa -mcpu=gfx1250 --amdhsa-code-object-version=4 -filetype=obj < %s > %t +// RUN: llvm-readelf -S -r -s %t | FileCheck --check-prefix=READOBJ %s +// RUN: llvm-objdump -s -j .rodata %t | FileCheck --check-prefix=OBJDUMP %s + +// READOBJ: Section Headers +// READOBJ: .text PROGBITS {{[0-9a-f]+}} {{[0-9a-f]+}} {{[0-9a-f]+}} {{[0-9]+}} AX {{[0-9]+}} {{[0-9]+}} 256 +// READOBJ: .rodata PROGBITS {{[0-9a-f]+}} 000540 {{[0-9a-f]+}} {{[0-9]+}} A {{[0-9]+}} {{[0-9]+}} 64 + +// READOBJ: Relocation section '.rela.rodata' at offset +// READOBJ: 0000000000000010 {{[0-9a-f]+}}00000005 R_AMDGPU_REL64 0000000000000000 .text + 10 +// READOBJ: 0000000000000050 {{[0-9a-f]+}}00000005 R_AMDGPU_REL64 0000000000000000 .text + 110 +// READOBJ: 0000000000000090 {{[0-9a-f]+}}00000005 R_AMDGPU_REL64 0000000000000000 .text + 210 +// READOBJ: 00000000000000d0 {{[0-9a-f]+}}00000005 R_AMDGPU_REL64 0000000000000000 .text + 310 +// READOBJ: 0000000000000110 {{[0-9a-f]+}}00000005 R_AMDGPU_REL64 0000000000000000 .text + 410 + +// READOBJ: Symbol table '.symtab' contains {{[0-9]+}} entries: +// READOBJ: 0000000000000000 0 FUNC LOCAL PROTECTED 2 minimal +// READOBJ-NEXT: 0000000000000100 0 FUNC LOCAL PROTECTED 2 complete +// READOBJ-NEXT: 0000000000000200 0 FUNC LOCAL PROTECTED 2 special_sgpr +// READOBJ-NEXT: 0000000000000300 0 FUNC LOCAL PROTECTED 2 disabled_user_sgpr +// READOBJ-NEXT: 0000000000000400 0 FUNC LOCAL PROTECTED 2 max_lds_size +// READOBJ-NEXT: 0000000000000000 64 OBJECT LOCAL DEFAULT 3 minimal.kd +// READOBJ-NEXT: 0000000000000040 64 OBJECT LOCAL DEFAULT 3 complete.kd +// READOBJ-NEXT: 0000000000000080 64 OBJECT LOCAL DEFAULT 3 special_sgpr.kd +// READOBJ-NEXT: 00000000000000c0 64 OBJECT LOCAL DEFAULT 3 disabled_user_sgpr.kd +// READOBJ-NEXT: 0000000000000100 64 OBJECT LOCAL DEFAULT 3 max_lds_size.kd + +// OBJDUMP: Contents of section .rodata +// Note, relocation for KERNEL_CODE_ENTRY_BYTE_OFFSET is not resolved here. +// minimal +// OBJDUMP-NEXT: 0000 00000000 00000000 00000000 00000000 +// OBJDUMP-NEXT: 0010 00000000 00000000 00000000 00000000 +// OBJDUMP-NEXT: 0020 00000000 00000000 00000000 00000000 +// OBJDUMP-NEXT: 0030 00000cc0 80000000 00040000 00000000 +// complete +// OBJDUMP-NEXT: 0040 01000000 01000000 0c000000 00000000 +// OBJDUMP-NEXT: 0050 00000000 00000000 00000000 00000000 +// OBJDUMP-NEXT: 0060 00000000 00000000 00000000 00c00000 +// OBJDUMP-NEXT: 0070 015021c4 410f007f 5e068200 00000000 +// special_sgpr +// OBJDUMP-NEXT: 0080 00000000 00000000 00000000 00000000 +// OBJDUMP-NEXT: 0090 00000000 00000000 00000000 00000000 +// OBJDUMP-NEXT: 00a0 00000000 00000000 00000000 00000000 +// OBJDUMP-NEXT: 00b0 000000c0 80000000 00040000 00000000 +// disabled_user_sgpr +// OBJDUMP-NEXT: 00c0 00000000 00000000 00000000 00000000 +// OBJDUMP-NEXT: 00d0 00000000 00000000 00000000 00000000 +// OBJDUMP-NEXT: 00e0 00000000 00000000 00000000 00000000 +// OBJDUMP-NEXT: 00f0 00000cc0 80000000 00040000 00000000 +// max_lds_size +// OBJDUMP-NEXT: 0100 00000600 00000000 00000000 00000000 +// OBJDUMP-NEXT: 0110 00000000 00000000 00000000 00000000 +// OBJDUMP-NEXT: 0120 00000000 00000000 00000000 00000000 +// OBJDUMP-NEXT: 0130 00000cc0 80000000 00040000 00000000 + +.text + +.amdgcn_target "amdgcn-amd-amdhsa--gfx1250" +// ASM: .amdgcn_target "amdgcn-amd-amdhsa--gfx1250" + +.p2align 8 +.type minimal,@function +minimal: + s_endpgm + +.p2align 8 +.type complete,@function +complete: + s_endpgm + +.p2align 8 +.type special_sgpr,@function +special_sgpr: + s_endpgm + +.p2align 8 +.type disabled_user_sgpr,@function +disabled_user_sgpr: + s_endpgm + +.p2align 8 +.type max_lds_size,@function +max_lds_size: + s_endpgm + +.rodata +// ASM: .rodata + +// Test that only specifying required directives is allowed, and that defaulted +// values are omitted. +.p2align 6 +.amdhsa_kernel minimal + .amdhsa_next_free_vgpr 0 + .amdhsa_next_free_sgpr 0 +.end_amdhsa_kernel + +// ASM: .amdhsa_kernel minimal +// ASM: .amdhsa_next_free_vgpr 0 +// ASM-NEXT: .amdhsa_next_free_sgpr 0 +// ASM: .end_amdhsa_kernel + +// Test that we can specify all available directives with non-default values. +.p2align 6 +.amdhsa_kernel complete + .amdhsa_group_segment_fixed_size 1 + .amdhsa_private_segment_fixed_size 1 + .amdhsa_kernarg_size 12 + .amdhsa_user_sgpr_count 32 + .amdhsa_user_sgpr_dispatch_ptr 1 + .amdhsa_user_sgpr_queue_ptr 1 + .amdhsa_user_sgpr_kernarg_segment_ptr 1 + .amdhsa_user_sgpr_dispatch_id 1 + .amdhsa_user_sgpr_kernarg_preload_length 2 + .amdhsa_user_sgpr_kernarg_preload_offset 1 + .amdhsa_user_sgpr_private_segment_size 1 + .amdhsa_uses_cu_stores 1 + .amdhsa_wavefront_size32 1 + .amdhsa_enable_private_segment 1 + .amdhsa_system_sgpr_workgroup_id_x 0 + .amdhsa_system_sgpr_workgroup_id_y 1 + .amdhsa_system_sgpr_workgroup_id_z 1 + .amdhsa_system_sgpr_workgroup_info 1 + .amdhsa_system_vgpr_workitem_id 1 + .amdhsa_next_free_vgpr 9 + .amdhsa_next_free_sgpr 27 + .amdhsa_named_barrier_count 3 + .amdhsa_reserve_vcc 0 + .amdhsa_float_round_mode_32 1 + .amdhsa_float_round_mode_16_64 1 + .amdhsa_float_denorm_mode_32 1 + .amdhsa_float_denorm_mode_16_64 0 + .amdhsa_fp16_overflow 1 + .amdhsa_memory_ordered 1 + .amdhsa_forward_progress 1 + .amdhsa_round_robin_scheduling 1 + .amdhsa_exception_fp_ieee_invalid_op 1 + .amdhsa_exception_fp_denorm_src 1 + .amdhsa_exception_fp_ieee_div_zero 1 + .amdhsa_exception_fp_ieee_overflow 1 + .amdhsa_exception_fp_ieee_underflow 1 + .amdhsa_exception_fp_ieee_inexact 1 + .amdhsa_exception_int_div_zero 1 +.end_amdhsa_kernel + +// ASM: .amdhsa_kernel complete +// ASM-NEXT: .amdhsa_group_segment_fixed_size 1 +// ASM-NEXT: .amdhsa_private_segment_fixed_size 1 +// ASM-NEXT: .amdhsa_kernarg_size 12 +// ASM-NEXT: .amdhsa_user_sgpr_count 32 +// ASM-NEXT: .amdhsa_user_sgpr_dispatch_ptr 1 +// ASM-NEXT: .amdhsa_user_sgpr_queue_ptr 1 +// ASM-NEXT: .amdhsa_user_sgpr_kernarg_segment_ptr 1 +// ASM-NEXT: .amdhsa_user_sgpr_dispatch_id 1 +// ASM-NEXT: .amdhsa_user_sgpr_kernarg_preload_length 2 +// ASM-NEXT: .amdhsa_user_sgpr_kernarg_preload_offset 1 +// ASM-NEXT: .amdhsa_user_sgpr_private_segment_size 1 +// ASM-NEXT: .amdhsa_uses_cu_stores 1 +// ASM-NEXT: .amdhsa_wavefront_size32 1 +// ASM-NEXT: .amdhsa_enable_private_segment 1 +// ASM-NEXT: .amdhsa_system_sgpr_workgroup_id_x 0 +// ASM-NEXT: .amdhsa_system_sgpr_workgroup_id_y 1 +// ASM-NEXT: .amdhsa_system_sgpr_workgroup_id_z 1 +// ASM-NEXT: .amdhsa_system_sgpr_workgroup_info 1 +// ASM-NEXT: .amdhsa_system_vgpr_workitem_id 1 +// ASM-NEXT: .amdhsa_next_free_vgpr 9 +// ASM-NEXT: .amdhsa_next_free_sgpr 32 +// ASM-NEXT: .amdhsa_named_barrier_count 3 +// ASM-NEXT: .amdhsa_reserve_vcc 0 +// ASM-NEXT: .amdhsa_float_round_mode_32 1 +// ASM-NEXT: .amdhsa_float_round_mode_16_64 1 +// ASM-NEXT: .amdhsa_float_denorm_mode_32 1 +// ASM-NEXT: .amdhsa_float_denorm_mode_16_64 0 +// ASM-NEXT: .amdhsa_fp16_overflow 1 +// ASM-NEXT: .amdhsa_memory_ordered 1 +// ASM-NEXT: .amdhsa_forward_progress 1 +// ASM-NEXT: .amdhsa_inst_pref_size 0 +// ASM-NEXT: .amdhsa_round_robin_scheduling 1 +// ASM-NEXT: .amdhsa_exception_fp_ieee_invalid_op 1 +// ASM-NEXT: .amdhsa_exception_fp_denorm_src 1 +// ASM-NEXT: .amdhsa_exception_fp_ieee_div_zero 1 +// ASM-NEXT: .amdhsa_exception_fp_ieee_overflow 1 +// ASM-NEXT: .amdhsa_exception_fp_ieee_underflow 1 +// ASM-NEXT: .amdhsa_exception_fp_ieee_inexact 1 +// ASM-NEXT: .amdhsa_exception_int_div_zero 1 +// ASM-NEXT: .end_amdhsa_kernel + +// Test that we are including special SGPR usage in the granulated count. +.p2align 6 +.amdhsa_kernel special_sgpr + .amdhsa_next_free_sgpr 27 + + .amdhsa_reserve_vcc 0 + + .amdhsa_float_denorm_mode_16_64 0 + .amdhsa_next_free_vgpr 0 +.end_amdhsa_kernel + +// ASM: .amdhsa_kernel special_sgpr +// ASM: .amdhsa_next_free_vgpr 0 +// ASM-NEXT: .amdhsa_next_free_sgpr 27 +// ASM-NEXT: .amdhsa_named_barrier_count 0 +// ASM-NEXT: .amdhsa_reserve_vcc 0 +// ASM: .amdhsa_float_denorm_mode_16_64 0 +// ASM: .end_amdhsa_kernel + +// Test that explicitly disabling user_sgpr's does not affect the user_sgpr +// count, i.e. this should produce the same descriptor as minimal. +.p2align 6 +.amdhsa_kernel disabled_user_sgpr + .amdhsa_next_free_vgpr 0 + .amdhsa_next_free_sgpr 0 +.end_amdhsa_kernel + +// ASM: .amdhsa_kernel disabled_user_sgpr +// ASM: .amdhsa_next_free_vgpr 0 +// ASM-NEXT: .amdhsa_next_free_sgpr 0 +// ASM: .end_amdhsa_kernel + +.p2align 6 +.amdhsa_kernel max_lds_size + .amdhsa_group_segment_fixed_size 393216 + .amdhsa_next_free_vgpr 1 + .amdhsa_next_free_sgpr 1 +.end_amdhsa_kernel + +// ASM: .amdhsa_kernel max_lds_size +// ASM: .amdhsa_group_segment_fixed_size 393216 +// ASM: .end_amdhsa_kernel + +.section .foo + +.byte .amdgcn.gfx_generation_number +// ASM: .byte 12 + +.byte .amdgcn.gfx_generation_minor +// ASM: .byte 5 + +.byte .amdgcn.gfx_generation_stepping +// ASM: .byte 0 + +.byte .amdgcn.next_free_vgpr +// ASM: .byte 0 +.byte .amdgcn.next_free_sgpr +// ASM: .byte 0 + +v_mov_b32_e32 v16, s3 + +.byte .amdgcn.next_free_vgpr +// ASM: .byte 17 +.byte .amdgcn.next_free_sgpr +// ASM: .byte 4 + +.set .amdgcn.next_free_vgpr, 0 +.set .amdgcn.next_free_sgpr, 0 + +.byte .amdgcn.next_free_vgpr +// ASM: .byte 0 +.byte .amdgcn.next_free_sgpr +// ASM: .byte 0 + +v_mov_b32_e32 v16, s3 + +.byte .amdgcn.next_free_vgpr +// ASM: .byte 17 +.byte .amdgcn.next_free_sgpr +// ASM: .byte 4 + +// Metadata + +.amdgpu_metadata + amdhsa.version: + - 3 + - 0 + amdhsa.kernels: + - .name: amd_kernel_code_t_test_all + .symbol: amd_kernel_code_t_test_all@kd + .kernarg_segment_size: 8 + .group_segment_fixed_size: 16 + .private_segment_fixed_size: 32 + .kernarg_segment_align: 64 + .wavefront_size: 128 + .sgpr_count: 14 + .vgpr_count: 1024 + .max_flat_workgroup_size: 256 + - .name: amd_kernel_code_t_minimal + .symbol: amd_kernel_code_t_minimal@kd + .kernarg_segment_size: 8 + .group_segment_fixed_size: 16 + .private_segment_fixed_size: 32 + .kernarg_segment_align: 64 + .wavefront_size: 128 + .sgpr_count: 14 + .vgpr_count: 40 + .max_flat_workgroup_size: 256 +.end_amdgpu_metadata + +// ASM: .amdgpu_metadata +// ASM: amdhsa.kernels: +// ASM: - .group_segment_fixed_size: 16 +// ASM: .kernarg_segment_align: 64 +// ASM: .kernarg_segment_size: 8 +// ASM: .max_flat_workgroup_size: 256 +// ASM: .name: amd_kernel_code_t_test_all +// ASM: .private_segment_fixed_size: 32 +// ASM: .sgpr_count: 14 +// ASM: .symbol: 'amd_kernel_code_t_test_all@kd' +// ASM: .vgpr_count: 1024 +// ASM: .wavefront_size: 128 +// ASM: - .group_segment_fixed_size: 16 +// ASM: .kernarg_segment_align: 64 +// ASM: .kernarg_segment_size: 8 +// ASM: .max_flat_workgroup_size: 256 +// ASM: .name: amd_kernel_code_t_minimal +// ASM: .private_segment_fixed_size: 32 +// ASM: .sgpr_count: 14 +// ASM: .symbol: 'amd_kernel_code_t_minimal@kd' +// ASM: .vgpr_count: 40 +// ASM: .wavefront_size: 128 +// ASM: amdhsa.version: +// ASM-NEXT: - 3 +// ASM-NEXT: - 0 +// ASM: .end_amdgpu_metadata diff --git a/llvm/test/MC/Disassembler/AMDGPU/kernel-descriptor-rsrc-errors.test b/llvm/test/MC/Disassembler/AMDGPU/kernel-descriptor-rsrc-errors.test index ad7bba076002f..be92aade6b531 100644 --- a/llvm/test/MC/Disassembler/AMDGPU/kernel-descriptor-rsrc-errors.test +++ b/llvm/test/MC/Disassembler/AMDGPU/kernel-descriptor-rsrc-errors.test @@ -39,9 +39,14 @@ # RUN: yaml2obj %s -DGPU=GFX1100 -DSRC1=0300AC60 -DSRC2=80000000 -DSRC3=00000100 \ # RUN: | llvm-objdump --disassemble-symbols=test.kd - | FileCheck %s --check-prefix=RSRC3_10 -# RSRC3_10: ; error decoding test.kd: kernel descriptor COMPUTE_PGM_RSRC3 reserved bits in range (16:14) set, must be zero on gfx10+ +# RSRC3_10: ; error decoding test.kd: kernel descriptor COMPUTE_PGM_RSRC3 reserved bits in range (21:14) set, must be zero on gfx10+ # RSRC3_10-NEXT: ; decoding failed region as bytes +# RUN: yaml2obj %s -DGPU=GFX1100 -DSRC1=0300AC60 -DSRC2=80000000 -DSRC3=00004000 \ +# RUN: | llvm-objdump --disassemble-symbols=test.kd - | FileCheck %s --check-prefix=RSRC3_10_1 +# RSRC3_10_1: ; error decoding test.kd: kernel descriptor COMPUTE_PGM_RSRC3 reserved bits in range (30:22) set, must be zero on gfx10+ +# RSRC3_10_1-NEXT: ; decoding failed region as bytes + # RUN: yaml2obj %s -DGPU=GFX801 -DSRC1=0300AC60 -DSRC2=80000000 -DSRC3=00000001 \ # RUN: | llvm-objdump --disassemble-symbols=test.kd - | FileCheck %s --check-prefix=RSRC3_PRE_9 # RSRC3_PRE_9: ; error decoding test.kd: kernel descriptor COMPUTE_PGM_RSRC3 must be all zero before gfx9 diff --git a/llvm/test/tools/llvm-objdump/ELF/AMDGPU/kd-gfx1250.s b/llvm/test/tools/llvm-objdump/ELF/AMDGPU/kd-gfx1250.s new file mode 100644 index 0000000000000..99a4df3e5adfb --- /dev/null +++ b/llvm/test/tools/llvm-objdump/ELF/AMDGPU/kd-gfx1250.s @@ -0,0 +1,121 @@ +;; Test disassembly for gfx1250 kernel descriptor. + +; RUN: rm -rf %t && split-file %s %t && cd %t + +;--- 1.s +; RUN: llvm-mc --triple=amdgcn-amd-amdhsa -filetype=obj -mcpu=gfx1250 < 1.s > 1.o +; RUN: llvm-objdump --disassemble-symbols=kernel.kd 1.o | tail -n +7 | tee 1-disasm.s | FileCheck 1.s +; RUN: llvm-mc --triple=amdgcn-amd-amdhsa -filetype=obj -mcpu=gfx1250 < 1-disasm.s > 1-disasm.o +; RUN: cmp 1.o 1-disasm.o +; CHECK: .amdhsa_kernel kernel +; CHECK-NEXT: .amdhsa_group_segment_fixed_size 0 +; CHECK-NEXT: .amdhsa_private_segment_fixed_size 0 +; CHECK-NEXT: .amdhsa_kernarg_size 0 +; CHECK-NEXT: .amdhsa_inst_pref_size 0 +; CHECK-NEXT: ; GLG_EN 0 +; CHECK-NEXT: .amdhsa_named_barrier_count 0 +; CHECK-NEXT: ; ENABLE_DYNAMIC_VGPR 0 +; CHECK-NEXT: ; TCP_SPLIT 0 +; CHECK-NEXT: ; ENABLE_DIDT_THROTTLE 0 +; CHECK-NEXT: ; IMAGE_OP 0 +; CHECK-NEXT: .amdhsa_next_free_vgpr 32 +; CHECK-NEXT: .amdhsa_reserve_vcc 0 +; CHECK-NEXT: .amdhsa_reserve_xnack_mask 0 +; CHECK-NEXT: .amdhsa_next_free_sgpr 8 +; CHECK-NEXT: .amdhsa_float_round_mode_32 0 +; CHECK-NEXT: .amdhsa_float_round_mode_16_64 0 +; CHECK-NEXT: .amdhsa_float_denorm_mode_32 0 +; CHECK-NEXT: .amdhsa_float_denorm_mode_16_64 3 +; CHECK-NEXT: .amdhsa_fp16_overflow 0 +; CHECK-NEXT: ; FLAT_SCRATCH_IS_NV 0 +; CHECK-NEXT: .amdhsa_memory_ordered 1 +; CHECK-NEXT: .amdhsa_forward_progress 1 +; CHECK-NEXT: .amdhsa_round_robin_scheduling 0 +; CHECK-NEXT: .amdhsa_enable_private_segment 0 +; CHECK-NEXT: .amdhsa_system_sgpr_workgroup_id_x 1 +; CHECK-NEXT: .amdhsa_system_sgpr_workgroup_id_y 0 +; CHECK-NEXT: .amdhsa_system_sgpr_workgroup_id_z 0 +; CHECK-NEXT: .amdhsa_system_sgpr_workgroup_info 0 +; CHECK-NEXT: .amdhsa_system_vgpr_workitem_id 0 +; CHECK-NEXT: .amdhsa_exception_fp_ieee_invalid_op 0 +; CHECK-NEXT: .amdhsa_exception_fp_denorm_src 0 +; CHECK-NEXT: .amdhsa_exception_fp_ieee_div_zero 0 +; CHECK-NEXT: .amdhsa_exception_fp_ieee_overflow 0 +; CHECK-NEXT: .amdhsa_exception_fp_ieee_underflow 0 +; CHECK-NEXT: .amdhsa_exception_fp_ieee_inexact 0 +; CHECK-NEXT: .amdhsa_exception_int_div_zero 0 +; CHECK-NEXT: .amdhsa_user_sgpr_dispatch_ptr 0 +; CHECK-NEXT: .amdhsa_user_sgpr_queue_ptr 0 +; CHECK-NEXT: .amdhsa_user_sgpr_kernarg_segment_ptr 0 +; CHECK-NEXT: .amdhsa_user_sgpr_dispatch_id 0 +; CHECK-NEXT: .amdhsa_user_sgpr_private_segment_size 0 +; CHECK-NEXT: .amdhsa_uses_cu_stores 1 +; CHECK-NEXT: .amdhsa_wavefront_size32 1 +; CHECK-NEXT: .amdhsa_uses_dynamic_stack 0 +; CHECK-NEXT: .end_amdhsa_kernel +.amdhsa_kernel kernel + .amdhsa_next_free_vgpr 32 + .amdhsa_next_free_sgpr 32 + .amdhsa_inst_pref_size 0 + .amdhsa_uses_cu_stores 1 +.end_amdhsa_kernel + +;--- 2.s +; RUN: llvm-mc --triple=amdgcn-amd-amdhsa -filetype=obj -mcpu=gfx1250 < 2.s > 2.o +; RUN: llvm-objdump --disassemble-symbols=kernel.kd 2.o | tail -n +7 | tee 2-disasm.s | FileCheck 2.s +; RUN: llvm-mc --triple=amdgcn-amd-amdhsa -filetype=obj -mcpu=gfx1250 < 2-disasm.s > 2-disasm.o +; RUN: cmp 2.o 2-disasm.o +; CHECK: .amdhsa_kernel kernel +; CHECK-NEXT: .amdhsa_group_segment_fixed_size 393216 +; CHECK-NEXT: .amdhsa_private_segment_fixed_size 0 +; CHECK-NEXT: .amdhsa_kernarg_size 0 +; CHECK-NEXT: .amdhsa_inst_pref_size 63 +; CHECK-NEXT: ; GLG_EN 0 +; CHECK-NEXT: .amdhsa_named_barrier_count 7 +; CHECK-NEXT: ; ENABLE_DYNAMIC_VGPR 0 +; CHECK-NEXT: ; TCP_SPLIT 0 +; CHECK-NEXT: ; ENABLE_DIDT_THROTTLE 0 +; CHECK-NEXT: ; IMAGE_OP 0 +; CHECK-NEXT: .amdhsa_next_free_vgpr 32 +; CHECK-NEXT: .amdhsa_reserve_vcc 0 +; CHECK-NEXT: .amdhsa_reserve_xnack_mask 0 +; CHECK-NEXT: .amdhsa_next_free_sgpr 8 +; CHECK-NEXT: .amdhsa_float_round_mode_32 0 +; CHECK-NEXT: .amdhsa_float_round_mode_16_64 0 +; CHECK-NEXT: .amdhsa_float_denorm_mode_32 0 +; CHECK-NEXT: .amdhsa_float_denorm_mode_16_64 3 +; CHECK-NEXT: .amdhsa_fp16_overflow 0 +; CHECK-NEXT: ; FLAT_SCRATCH_IS_NV 0 +; CHECK-NEXT: .amdhsa_memory_ordered 1 +; CHECK-NEXT: .amdhsa_forward_progress 1 +; CHECK-NEXT: .amdhsa_round_robin_scheduling 0 +; CHECK-NEXT: .amdhsa_enable_private_segment 0 +; CHECK-NEXT: .amdhsa_system_sgpr_workgroup_id_x 1 +; CHECK-NEXT: .amdhsa_system_sgpr_workgroup_id_y 0 +; CHECK-NEXT: .amdhsa_system_sgpr_workgroup_id_z 0 +; CHECK-NEXT: .amdhsa_system_sgpr_workgroup_info 0 +; CHECK-NEXT: .amdhsa_system_vgpr_workitem_id 0 +; CHECK-NEXT: .amdhsa_exception_fp_ieee_invalid_op 0 +; CHECK-NEXT: .amdhsa_exception_fp_denorm_src 0 +; CHECK-NEXT: .amdhsa_exception_fp_ieee_div_zero 0 +; CHECK-NEXT: .amdhsa_exception_fp_ieee_overflow 0 +; CHECK-NEXT: .amdhsa_exception_fp_ieee_underflow 0 +; CHECK-NEXT: .amdhsa_exception_fp_ieee_inexact 0 +; CHECK-NEXT: .amdhsa_exception_int_div_zero 0 +; CHECK-NEXT: .amdhsa_user_sgpr_dispatch_ptr 0 +; CHECK-NEXT: .amdhsa_user_sgpr_queue_ptr 0 +; CHECK-NEXT: .amdhsa_user_sgpr_kernarg_segment_ptr 0 +; CHECK-NEXT: .amdhsa_user_sgpr_dispatch_id 0 +; CHECK-NEXT: .amdhsa_user_sgpr_private_segment_size 0 +; CHECK-NEXT: .amdhsa_uses_cu_stores 0 +; CHECK-NEXT: .amdhsa_wavefront_size32 1 +; CHECK-NEXT: .amdhsa_uses_dynamic_stack 0 +; CHECK-NEXT: .end_amdhsa_kernel +.amdhsa_kernel kernel + .amdhsa_group_segment_fixed_size 393216 + .amdhsa_next_free_vgpr 32 + .amdhsa_next_free_sgpr 32 + .amdhsa_named_barrier_count 7 + .amdhsa_uses_cu_stores 0 + .amdhsa_inst_pref_size 63 +.end_amdhsa_kernel