diff --git a/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp b/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp index 18d51087ff5fb..62bd864ab06b2 100644 --- a/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp +++ b/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp @@ -2006,8 +2006,12 @@ static bool isInlineableLiteralOp16(int64_t Val, MVT VT, bool HasInv2Pi) { return isInlinableIntLiteral(Val); } - // f16/v2f16 operands work correctly for all values. - return AMDGPU::isInlinableLiteral16(Val, HasInv2Pi); + if (VT.getScalarType() == MVT::f16) + return AMDGPU::isInlinableLiteralFP16(Val, HasInv2Pi); + + assert(VT.getScalarType() == MVT::bf16); + + return AMDGPU::isInlinableLiteralBF16(Val, HasInv2Pi); } bool AMDGPUOperand::isInlinableImm(MVT type) const { @@ -2375,15 +2379,26 @@ void AMDGPUOperand::addLiteralImmOperand(MCInst &Inst, int64_t Val, bool ApplyMo return; case AMDGPU::OPERAND_REG_IMM_INT16: - case AMDGPU::OPERAND_REG_IMM_FP16: - case AMDGPU::OPERAND_REG_IMM_FP16_DEFERRED: case AMDGPU::OPERAND_REG_INLINE_C_INT16: - case AMDGPU::OPERAND_REG_INLINE_C_FP16: case AMDGPU::OPERAND_REG_INLINE_AC_INT16: + if (isSafeTruncation(Val, 16) && + AMDGPU::isInlinableIntLiteral(static_cast(Val))) { + Inst.addOperand(MCOperand::createImm(Val)); + setImmKindConst(); + return; + } + + Inst.addOperand(MCOperand::createImm(Val & 0xffff)); + setImmKindLiteral(); + return; + + case AMDGPU::OPERAND_REG_INLINE_C_FP16: + case AMDGPU::OPERAND_REG_IMM_FP16: + case AMDGPU::OPERAND_REG_IMM_FP16_DEFERRED: case AMDGPU::OPERAND_REG_INLINE_AC_FP16: if (isSafeTruncation(Val, 16) && - AMDGPU::isInlinableLiteral16(static_cast(Val), - AsmParser->hasInv2PiInlineImm())) { + AMDGPU::isInlinableLiteralFP16(static_cast(Val), + AsmParser->hasInv2PiInlineImm())) { Inst.addOperand(MCOperand::createImm(Val)); setImmKindConst(); return; @@ -2410,12 +2425,17 @@ void AMDGPUOperand::addLiteralImmOperand(MCInst &Inst, int64_t Val, bool ApplyMo return; case AMDGPU::OPERAND_REG_INLINE_C_V2INT16: + case AMDGPU::OPERAND_REG_INLINE_AC_V2INT16: { + assert(isSafeTruncation(Val, 16)); + assert(AMDGPU::isInlinableIntLiteral(static_cast(Val))); + Inst.addOperand(MCOperand::createImm(Val)); + return; + } case AMDGPU::OPERAND_REG_INLINE_C_V2FP16: - case AMDGPU::OPERAND_REG_INLINE_AC_V2INT16: case AMDGPU::OPERAND_REG_INLINE_AC_V2FP16: { assert(isSafeTruncation(Val, 16)); - assert(AMDGPU::isInlinableLiteral16(static_cast(Val), - AsmParser->hasInv2PiInlineImm())); + assert(AMDGPU::isInlinableLiteralFP16(static_cast(Val), + AsmParser->hasInv2PiInlineImm())); Inst.addOperand(MCOperand::createImm(Val)); return; @@ -3559,7 +3579,19 @@ bool AMDGPUAsmParser::isInlineConstant(const MCInst &Inst, OperandType == AMDGPU::OPERAND_REG_IMM_V2BF16) return AMDGPU::isInlinableLiteralV2BF16(Val); - return AMDGPU::isInlinableLiteral16(Val, hasInv2PiInlineImm()); + if (OperandType == AMDGPU::OPERAND_REG_IMM_FP16 || + OperandType == AMDGPU::OPERAND_REG_INLINE_C_FP16 || + OperandType == AMDGPU::OPERAND_REG_INLINE_AC_FP16 || + OperandType == AMDGPU::OPERAND_REG_IMM_FP16_DEFERRED) + return AMDGPU::isInlinableLiteralFP16(Val, hasInv2PiInlineImm()); + + if (OperandType == AMDGPU::OPERAND_REG_IMM_BF16 || + OperandType == AMDGPU::OPERAND_REG_INLINE_C_BF16 || + OperandType == AMDGPU::OPERAND_REG_INLINE_AC_BF16 || + OperandType == AMDGPU::OPERAND_REG_IMM_BF16_DEFERRED) + return AMDGPU::isInlinableLiteralBF16(Val, hasInv2PiInlineImm()); + + llvm_unreachable("invalid operand type"); } default: llvm_unreachable("invalid operand size"); diff --git a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUInstPrinter.cpp b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUInstPrinter.cpp index a32be1e50a605..683e8dad796c1 100644 --- a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUInstPrinter.cpp +++ b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUInstPrinter.cpp @@ -460,10 +460,8 @@ void AMDGPUInstPrinter::printImmediateInt16(uint32_t Imm, } } -// This must accept a 32-bit immediate value to correctly handle packed 16-bit -// operations. -static bool printImmediateFloat16(uint32_t Imm, const MCSubtargetInfo &STI, - raw_ostream &O) { +static bool printImmediateFP16(uint32_t Imm, const MCSubtargetInfo &STI, + raw_ostream &O) { if (Imm == 0x3C00) O << "1.0"; else if (Imm == 0xBC00) @@ -529,9 +527,9 @@ void AMDGPUInstPrinter::printImmediateBF16(uint32_t Imm, O << formatHex(static_cast(Imm)); } -void AMDGPUInstPrinter::printImmediate16(uint32_t Imm, - const MCSubtargetInfo &STI, - raw_ostream &O) { +void AMDGPUInstPrinter::printImmediateF16(uint32_t Imm, + const MCSubtargetInfo &STI, + raw_ostream &O) { int16_t SImm = static_cast(Imm); if (isInlinableIntLiteral(SImm)) { O << SImm; @@ -539,7 +537,7 @@ void AMDGPUInstPrinter::printImmediate16(uint32_t Imm, } uint16_t HImm = static_cast(Imm); - if (printImmediateFloat16(HImm, STI, O)) + if (printImmediateFP16(HImm, STI, O)) return; uint64_t Imm16 = static_cast(Imm); @@ -566,7 +564,7 @@ void AMDGPUInstPrinter::printImmediateV216(uint32_t Imm, uint8_t OpType, case AMDGPU::OPERAND_REG_INLINE_C_V2FP16: case AMDGPU::OPERAND_REG_INLINE_AC_V2FP16: if (isUInt<16>(Imm) && - printImmediateFloat16(static_cast(Imm), STI, O)) + printImmediateFP16(static_cast(Imm), STI, O)) return; break; case AMDGPU::OPERAND_REG_IMM_V2BF16: @@ -845,7 +843,7 @@ void AMDGPUInstPrinter::printRegularOperand(const MCInst *MI, unsigned OpNo, case AMDGPU::OPERAND_REG_INLINE_AC_FP16: case AMDGPU::OPERAND_REG_IMM_FP16: case AMDGPU::OPERAND_REG_IMM_FP16_DEFERRED: - printImmediate16(Op.getImm(), STI, O); + printImmediateF16(Op.getImm(), STI, O); break; case AMDGPU::OPERAND_REG_INLINE_C_BF16: case AMDGPU::OPERAND_REG_INLINE_AC_BF16: diff --git a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUInstPrinter.h b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUInstPrinter.h index 15ecbf2e5e591..c801eaf1111e2 100644 --- a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUInstPrinter.h +++ b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUInstPrinter.h @@ -86,10 +86,10 @@ class AMDGPUInstPrinter : public MCInstPrinter { raw_ostream &O); void printImmediateInt16(uint32_t Imm, const MCSubtargetInfo &STI, raw_ostream &O); - void printImmediate16(uint32_t Imm, const MCSubtargetInfo &STI, - raw_ostream &O); void printImmediateBF16(uint32_t Imm, const MCSubtargetInfo &STI, raw_ostream &O); + void printImmediateF16(uint32_t Imm, const MCSubtargetInfo &STI, + raw_ostream &O); void printImmediateV216(uint32_t Imm, uint8_t OpType, const MCSubtargetInfo &STI, raw_ostream &O); bool printImmediateFloat32(uint32_t Imm, const MCSubtargetInfo &STI, diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp index 34c6038115329..30a65bb332652 100644 --- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp +++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp @@ -15405,16 +15405,32 @@ bool SITargetLowering::checkAsmConstraintVal(SDValue Op, StringRef Constraint, llvm_unreachable("Invalid asm constraint"); } -bool SITargetLowering::checkAsmConstraintValA(SDValue Op, - uint64_t Val, +bool SITargetLowering::checkAsmConstraintValA(SDValue Op, uint64_t Val, unsigned MaxSize) const { unsigned Size = std::min(Op.getScalarValueSizeInBits(), MaxSize); bool HasInv2Pi = Subtarget->hasInv2PiInlineImm(); - if ((Size == 16 && AMDGPU::isInlinableLiteral16(Val, HasInv2Pi)) || - (Size == 32 && AMDGPU::isInlinableLiteral32(Val, HasInv2Pi)) || - (Size == 64 && AMDGPU::isInlinableLiteral64(Val, HasInv2Pi))) { + if (Size == 16) { + MVT VT = Op.getSimpleValueType(); + switch (VT.SimpleTy) { + default: + return false; + case MVT::i16: + return AMDGPU::isInlinableLiteralI16(Val, HasInv2Pi); + case MVT::f16: + return AMDGPU::isInlinableLiteralFP16(Val, HasInv2Pi); + case MVT::bf16: + return AMDGPU::isInlinableLiteralBF16(Val, HasInv2Pi); + case MVT::v2i16: + return AMDGPU::getInlineEncodingV2I16(Val).has_value(); + case MVT::v2f16: + return AMDGPU::getInlineEncodingV2F16(Val).has_value(); + case MVT::v2bf16: + return AMDGPU::getInlineEncodingV2BF16(Val).has_value(); + } + } + if ((Size == 32 && AMDGPU::isInlinableLiteral32(Val, HasInv2Pi)) || + (Size == 64 && AMDGPU::isInlinableLiteral64(Val, HasInv2Pi))) return true; - } return false; } diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp index 31ced9d41e15e..edd87e340d10d 100644 --- a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp +++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp @@ -4121,13 +4121,32 @@ bool SIInstrInfo::isInlineConstant(const APInt &Imm) const { ST.hasInv2PiInlineImm()); case 16: return ST.has16BitInsts() && - AMDGPU::isInlinableLiteral16(Imm.getSExtValue(), - ST.hasInv2PiInlineImm()); + AMDGPU::isInlinableLiteralI16(Imm.getSExtValue(), + ST.hasInv2PiInlineImm()); default: llvm_unreachable("invalid bitwidth"); } } +bool SIInstrInfo::isInlineConstant(const APFloat &Imm) const { + APInt IntImm = Imm.bitcastToAPInt(); + int64_t IntImmVal = IntImm.getSExtValue(); + bool HasInv2Pi = ST.hasInv2PiInlineImm(); + switch (APFloat::SemanticsToEnum(Imm.getSemantics())) { + default: + llvm_unreachable("invalid fltSemantics"); + case APFloatBase::S_IEEEsingle: + case APFloatBase::S_IEEEdouble: + return isInlineConstant(IntImm); + case APFloatBase::S_BFloat: + return ST.has16BitInsts() && + AMDGPU::isInlinableLiteralBF16(IntImmVal, HasInv2Pi); + case APFloatBase::S_IEEEhalf: + return ST.has16BitInsts() && + AMDGPU::isInlinableLiteralFP16(IntImmVal, HasInv2Pi); + } +} + bool SIInstrInfo::isInlineConstant(const MachineOperand &MO, uint8_t OperandType) const { assert(!MO.isReg() && "isInlineConstant called on register operand!"); @@ -4200,7 +4219,7 @@ bool SIInstrInfo::isInlineConstant(const MachineOperand &MO, // constants in these cases int16_t Trunc = static_cast(Imm); return ST.has16BitInsts() && - AMDGPU::isInlinableLiteral16(Trunc, ST.hasInv2PiInlineImm()); + AMDGPU::isInlinableLiteralFP16(Trunc, ST.hasInv2PiInlineImm()); } return false; diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.h b/llvm/lib/Target/AMDGPU/SIInstrInfo.h index a8a33a5fecb41..fe271e8ce790c 100644 --- a/llvm/lib/Target/AMDGPU/SIInstrInfo.h +++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.h @@ -984,9 +984,7 @@ class SIInstrInfo final : public AMDGPUGenInstrInfo { bool isInlineConstant(const APInt &Imm) const; - bool isInlineConstant(const APFloat &Imm) const { - return isInlineConstant(Imm.bitcastToAPInt()); - } + bool isInlineConstant(const APFloat &Imm) const; // Returns true if this non-register operand definitely does not need to be // encoded as a 32-bit literal. Note that this function handles all kinds of diff --git a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp index 963dc2882fcc0..63285c06edaf2 100644 --- a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp +++ b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp @@ -2647,13 +2647,19 @@ bool isInlinableLiteralBF16(int16_t Literal, bool HasInv2Pi) { Val == 0x3E22; // 1.0 / (2.0 * pi) } -bool isInlinableLiteral16(int16_t Literal, bool HasInv2Pi) { +bool isInlinableLiteralI16(int16_t Literal, bool HasInv2Pi) { if (!HasInv2Pi) return false; - if (isInlinableIntLiteral(Literal)) return true; + return Literal == static_cast(0x3e22f983); +} +bool isInlinableLiteralFP16(int16_t Literal, bool HasInv2Pi) { + if (!HasInv2Pi) + return false; + if (isInlinableIntLiteral(Literal)) + return true; uint16_t Val = static_cast(Literal); return Val == 0x3C00 || // 1.0 Val == 0xBC00 || // -1.0 diff --git a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h index 6edf01d1217f2..9fcb4caca30b0 100644 --- a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h +++ b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h @@ -1397,7 +1397,13 @@ LLVM_READNONE bool isInlinableLiteralBF16(int16_t Literal, bool HasInv2Pi); LLVM_READNONE -bool isInlinableLiteral16(int16_t Literal, bool HasInv2Pi); +bool isInlinableLiteralFP16(int16_t Literal, bool HasInv2Pi); + +LLVM_READNONE +bool isInlinableLiteralBF16(int16_t Literal, bool HasInv2Pi); + +LLVM_READNONE +bool isInlinableLiteralI16(int16_t Literal, bool HasInv2Pi); LLVM_READNONE std::optional getInlineEncodingV2I16(uint32_t Literal); diff --git a/llvm/test/CodeGen/AMDGPU/immv216.ll b/llvm/test/CodeGen/AMDGPU/immv216.ll index b66ca71a32749..ae51c3edf1c7e 100644 --- a/llvm/test/CodeGen/AMDGPU/immv216.ll +++ b/llvm/test/CodeGen/AMDGPU/immv216.ll @@ -577,40 +577,40 @@ define amdgpu_kernel void @add_inline_imm_64_v2f16(ptr addrspace(1) %out, <2 x h } ; GCN-LABEL: {{^}}mul_inline_imm_0.5_v2i16: -; GFX9: s_mov_b32 [[K:s[0-9]+]], 0x38003800 -; GFX9: v_pk_mul_lo_u16 v0, v0, [[K]] +; GFX9: s_movk_i32 [[K:s[0-9]+]], 0x3800 +; GFX9: v_pk_mul_lo_u16 v0, v0, [[K]] op_sel_hi:[1,0] -; GFX10: v_pk_mul_lo_u16 v0, 0x38003800, v0 ; encoding: [0x{{[0-9a-f]+}},0x{{[0-9a-f]+}},0x{{[0-9a-f]+}},0x{{[0-9a-f]+}},0xff,0x{{[0-9a-f]+}},0x{{[0-9a-f]+}},0x{{[0-9a-f]+}},0x00,0x38,0x00,0x38] +; GFX10: v_pk_mul_lo_u16 v0, 0x3800, v0 op_sel_hi:[0,1] ; encoding: [0x{{[0-9a-f]+}},0x{{[0-9a-f]+}},0x{{[0-9a-f]+}},0x{{[0-9a-f]+}},0xff,0x{{[0-9a-f]+}},0x{{[0-9a-f]+}},0x{{[0-9a-f]+}},0x00,0x38,0x00,0x00] define <2 x i16> @mul_inline_imm_0.5_v2i16(<2 x i16> %x) { %y = mul <2 x i16> %x, bitcast (<2 x half> to <2 x i16>) ret <2 x i16> %y } ; GCN-LABEL: {{^}}mul_inline_imm_neg_0.5_v2i16: -; GFX9: s_mov_b32 [[K:s[0-9]+]], 0xb800b800 -; GFX9: v_pk_mul_lo_u16 v0, v0, [[K]] +; GFX9: s_movk_i32 [[K:s[0-9]+]], 0xb800 +; GFX9: v_pk_mul_lo_u16 v0, v0, [[K]] op_sel_hi:[1,0] -; GFX10: v_pk_mul_lo_u16 v0, 0xb800b800, v0 ; encoding: [0x{{[0-9a-f]+}},0x{{[0-9a-f]+}},0x{{[0-9a-f]+}},0x{{[0-9a-f]+}},0xff,0x{{[0-9a-f]+}},0x{{[0-9a-f]+}},0x{{[0-9a-f]+}},0x00,0xb8,0x00,0xb8] +; GFX10: v_pk_mul_lo_u16 v0, 0xffffb800, v0 op_sel_hi:[0,1] ; encoding: [0x{{[0-9a-f]+}},0x{{[0-9a-f]+}},0x{{[0-9a-f]+}},0x{{[0-9a-f]+}},0xff,0x{{[0-9a-f]+}},0x{{[0-9a-f]+}},0x{{[0-9a-f]+}},0x00,0xb8,0xff,0xff] define <2 x i16> @mul_inline_imm_neg_0.5_v2i16(<2 x i16> %x) { %y = mul <2 x i16> %x, bitcast (<2 x half> to <2 x i16>) ret <2 x i16> %y } ; GCN-LABEL: {{^}}mul_inline_imm_1.0_v2i16: -; GFX9: s_mov_b32 [[K:s[0-9]+]], 0x3c003c00 -; GFX9: v_pk_mul_lo_u16 v0, v0, [[K]] +; GFX9: s_movk_i32 [[K:s[0-9]+]], 0x3c00 +; GFX9: v_pk_mul_lo_u16 v0, v0, [[K]] op_sel_hi:[1,0] -; GFX10: v_pk_mul_lo_u16 v0, 0x3c003c00, v0 ; encoding: [0x{{[0-9a-f]+}},0x{{[0-9a-f]+}},0x{{[0-9a-f]+}},0x{{[0-9a-f]+}},0xff,0x{{[0-9a-f]+}},0x{{[0-9a-f]+}},0x{{[0-9a-f]+}},0x00,0x3c,0x00,0x3c] +; GFX10: v_pk_mul_lo_u16 v0, 0x3c00, v0 op_sel_hi:[0,1] ; encoding: [0x{{[0-9a-f]+}},0x{{[0-9a-f]+}},0x{{[0-9a-f]+}},0x{{[0-9a-f]+}},0xff,0x{{[0-9a-f]+}},0x{{[0-9a-f]+}},0x{{[0-9a-f]+}},0x00,0x3c,0x00,0x00] define <2 x i16> @mul_inline_imm_1.0_v2i16(<2 x i16> %x) { %y = mul <2 x i16> %x, bitcast (<2 x half> to <2 x i16>) ret <2 x i16> %y } ; GCN-LABEL: {{^}}mul_inline_imm_neg_1.0_v2i16: -; GFX9: s_mov_b32 [[K:s[0-9]+]], 0xbc00bc00 -; GFX9: v_pk_mul_lo_u16 v0, v0, [[K]] +; GFX9: s_movk_i32 [[K:s[0-9]+]], 0xbc00 +; GFX9: v_pk_mul_lo_u16 v0, v0, [[K]] op_sel_hi:[1,0] -; GFX10: v_pk_mul_lo_u16 v0, 0xbc00bc00, v0 ; encoding: [0x{{[0-9a-f]+}},0x{{[0-9a-f]+}},0x{{[0-9a-f]+}},0x{{[0-9a-f]+}},0xff,0x{{[0-9a-f]+}},0x{{[0-9a-f]+}},0x{{[0-9a-f]+}},0x00,0xbc,0x00,0xbc] +; GFX10: v_pk_mul_lo_u16 v0, 0xffffbc00, v0 op_sel_hi:[0,1] ; encoding: [0x{{[0-9a-f]+}},0x{{[0-9a-f]+}},0x{{[0-9a-f]+}},0x{{[0-9a-f]+}},0xff,0x{{[0-9a-f]+}},0x{{[0-9a-f]+}},0x{{[0-9a-f]+}},0x00,0xbc,0xff,0xff] define <2 x i16> @mul_inline_imm_neg_1.0_v2i16(<2 x i16> %x) { %y = mul <2 x i16> %x, bitcast (<2 x half> to <2 x i16>) ret <2 x i16> %y @@ -635,10 +635,10 @@ define <2 x i16> @shl_inline_imm_neg_2.0_v2i16(<2 x i16> %x) { } ; GCN-LABEL: {{^}}mul_inline_imm_4.0_v2i16: -; GFX9: s_mov_b32 [[K:s[0-9]+]], 0x44004400 -; GFX9: v_pk_mul_lo_u16 v0, v0, [[K]] +; GFX9: s_movk_i32 [[K:s[0-9]+]], 0x4400 +; GFX9: v_pk_mul_lo_u16 v0, v0, [[K]] op_sel_hi:[1,0] -; GFX10: v_pk_mul_lo_u16 v0, 0x44004400, v0 ; encoding: [0x{{[0-9a-f]+}},0x{{[0-9a-f]+}},0x{{[0-9a-f]+}},0x{{[0-9a-f]+}},0xff,0x{{[0-9a-f]+}},0x{{[0-9a-f]+}},0x{{[0-9a-f]+}},0x00,0x44,0x00,0x44] +; GFX10: v_pk_mul_lo_u16 v0, 0x4400, v0 op_sel_hi:[0,1] ; encoding: [0x{{[0-9a-f]+}},0x{{[0-9a-f]+}},0x{{[0-9a-f]+}},0x{{[0-9a-f]+}},0xff,0x{{[0-9a-f]+}},0x{{[0-9a-f]+}},0x{{[0-9a-f]+}},0x00,0x44,0x00,0x00] define <2 x i16> @mul_inline_imm_4.0_v2i16(<2 x i16> %x) { %y = mul <2 x i16> %x, bitcast (<2 x half> to <2 x i16>) ret <2 x i16> %y @@ -646,20 +646,20 @@ define <2 x i16> @mul_inline_imm_4.0_v2i16(<2 x i16> %x) { } ; GCN-LABEL: {{^}}mul_inline_imm_neg_4.0_v2i16: -; GFX9: s_mov_b32 [[K:s[0-9]+]], 0xc400c400 -; GFX9: v_pk_mul_lo_u16 v0, v0, [[K]] +; GFX9: s_movk_i32 [[K:s[0-9]+]], 0xc400 +; GFX9: v_pk_mul_lo_u16 v0, v0, [[K]] op_sel_hi:[1,0] -; GFX10: v_pk_mul_lo_u16 v0, 0xc400c400, v0 ; encoding: [0x{{[0-9a-f]+}},0x{{[0-9a-f]+}},0x{{[0-9a-f]+}},0x{{[0-9a-f]+}},0xff,0x{{[0-9a-f]+}},0x{{[0-9a-f]+}},0x{{[0-9a-f]+}},0x00,0xc4,0x00,0xc4] +; GFX10: v_pk_mul_lo_u16 v0, 0xffffc400, v0 op_sel_hi:[0,1] ; encoding: [0x{{[0-9a-f]+}},0x{{[0-9a-f]+}},0x{{[0-9a-f]+}},0x{{[0-9a-f]+}},0xff,0x{{[0-9a-f]+}},0x{{[0-9a-f]+}},0x{{[0-9a-f]+}},0x00,0xc4,0xff,0xff] define <2 x i16> @mul_inline_imm_neg_4.0_v2i16(<2 x i16> %x) { %y = mul <2 x i16> %x, bitcast (<2 x half> to <2 x i16>) ret <2 x i16> %y } ; GCN-LABEL: {{^}}mul_inline_imm_inv2pi_v2i16: -; GFX9: s_mov_b32 [[K:s[0-9]+]], 0x31183118 -; GFX9: v_pk_mul_lo_u16 v0, v0, [[K]] +; GFX9: s_movk_i32 [[K:s[0-9]+]], 0x3118 +; GFX9: v_pk_mul_lo_u16 v0, v0, [[K]] op_sel_hi:[1,0] -; GFX10: v_pk_mul_lo_u16 v0, 0x31183118, v0 ; encoding: [0x{{[0-9a-f]+}},0x{{[0-9a-f]+}},0x{{[0-9a-f]+}},0x{{[0-9a-f]+}},0xff,0x{{[0-9a-f]+}},0x{{[0-9a-f]+}},0x{{[0-9a-f]+}},0x18,0x31,0x18,0x31] +; GFX10: v_pk_mul_lo_u16 v0, 0x3118, v0 op_sel_hi:[0,1] ; encoding: [0x{{[0-9a-f]+}},0x{{[0-9a-f]+}},0x{{[0-9a-f]+}},0x{{[0-9a-f]+}},0xff,0x{{[0-9a-f]+}},0x{{[0-9a-f]+}},0x{{[0-9a-f]+}},0x18,0x31,0x00,0x00] define <2 x i16> @mul_inline_imm_inv2pi_v2i16(<2 x i16> %x) { %y = mul <2 x i16> %x, bitcast (<2 x half> to <2 x i16>) ret <2 x i16> %y diff --git a/llvm/test/CodeGen/AMDGPU/inline-constraints.ll b/llvm/test/CodeGen/AMDGPU/inline-constraints.ll index 9ef246fe2e101..7bd6b037386b0 100644 --- a/llvm/test/CodeGen/AMDGPU/inline-constraints.ll +++ b/llvm/test/CodeGen/AMDGPU/inline-constraints.ll @@ -97,7 +97,6 @@ define i32 @inline_A_constraint_H1() { ; NOSI: error: invalid operand for inline asm constraint 'A' ; VI-LABEL: {{^}}inline_A_constraint_H2: -; VI: v_mov_b32 {{v[0-9]+}}, 0x3c00 define i32 @inline_A_constraint_H2() { %v0 = tail call i32 asm "v_mov_b32 $0, $1", "=v,A"(i16 bitcast (half 1.0 to i16)) ret i32 %v0 @@ -105,7 +104,6 @@ define i32 @inline_A_constraint_H2() { ; NOSI: error: invalid operand for inline asm constraint 'A' ; VI-LABEL: {{^}}inline_A_constraint_H3: -; VI: v_mov_b32 {{v[0-9]+}}, 0xbc00 define i32 @inline_A_constraint_H3() { %v0 = tail call i32 asm "v_mov_b32 $0, $1", "=v,A"(i16 bitcast (half -1.0 to i16)) ret i32 %v0 @@ -113,7 +111,6 @@ define i32 @inline_A_constraint_H3() { ; NOSI: error: invalid operand for inline asm constraint 'A' ; VI-LABEL: {{^}}inline_A_constraint_H4: -; VI: v_mov_b32 {{v[0-9]+}}, 0x3118 define i32 @inline_A_constraint_H4() { %v0 = tail call i32 asm "v_mov_b32 $0, $1", "=v,A"(half 0xH3118) ret i32 %v0 @@ -121,7 +118,6 @@ define i32 @inline_A_constraint_H4() { ; NOSI: error: invalid operand for inline asm constraint 'A' ; VI-LABEL: {{^}}inline_A_constraint_H5: -; VI: v_mov_b32 {{v[0-9]+}}, 0x3118 define i32 @inline_A_constraint_H5() { %v0 = tail call i32 asm "v_mov_b32 $0, $1", "=v,A"(i16 bitcast (half 0xH3118 to i16)) ret i32 %v0 @@ -129,7 +125,6 @@ define i32 @inline_A_constraint_H5() { ; NOSI: error: invalid operand for inline asm constraint 'A' ; VI-LABEL: {{^}}inline_A_constraint_H6: -; VI: v_mov_b32 {{v[0-9]+}}, 0xb800 define i32 @inline_A_constraint_H6() { %v0 = tail call i32 asm "v_mov_b32 $0, $1", "=v,A"(half -0.5) ret i32 %v0 @@ -293,7 +288,6 @@ define i32 @inline_A_constraint_V0() { ; NOSI: error: invalid operand for inline asm constraint 'A' ; VI-LABEL: {{^}}inline_A_constraint_V1: -; VI: v_mov_b32 {{v[0-9]+}}, 0xb800 define i32 @inline_A_constraint_V1() { %v0 = tail call i32 asm "v_mov_b32 $0, $1", "=v,A"(<2 x half> ) ret i32 %v0 @@ -970,7 +964,6 @@ define i32 @inline_DA_constraint_H1() { ; NOSI: error: invalid operand for inline asm constraint 'DA' ; VI-LABEL: {{^}}inline_DA_constraint_H2: -; VI: v_mov_b32 {{v[0-9]+}}, 0x3c00 define i32 @inline_DA_constraint_H2() { %v0 = tail call i32 asm "v_mov_b32 $0, $1", "=v,^DA"(i16 bitcast (half 1.0 to i16)) ret i32 %v0 @@ -978,7 +971,6 @@ define i32 @inline_DA_constraint_H2() { ; NOSI: error: invalid operand for inline asm constraint 'DA' ; VI-LABEL: {{^}}inline_DA_constraint_H3: -; VI: v_mov_b32 {{v[0-9]+}}, 0xbc00 define i32 @inline_DA_constraint_H3() { %v0 = tail call i32 asm "v_mov_b32 $0, $1", "=v,^DA"(i16 bitcast (half -1.0 to i16)) ret i32 %v0 @@ -986,7 +978,6 @@ define i32 @inline_DA_constraint_H3() { ; NOSI: error: invalid operand for inline asm constraint 'DA' ; VI-LABEL: {{^}}inline_DA_constraint_H4: -; VI: v_mov_b32 {{v[0-9]+}}, 0x3118 define i32 @inline_DA_constraint_H4() { %v0 = tail call i32 asm "v_mov_b32 $0, $1", "=v,^DA"(half 0xH3118) ret i32 %v0 @@ -994,7 +985,6 @@ define i32 @inline_DA_constraint_H4() { ; NOSI: error: invalid operand for inline asm constraint 'DA' ; VI-LABEL: {{^}}inline_DA_constraint_H5: -; VI: v_mov_b32 {{v[0-9]+}}, 0x3118 define i32 @inline_DA_constraint_H5() { %v0 = tail call i32 asm "v_mov_b32 $0, $1", "=v,^DA"(i16 bitcast (half 0xH3118 to i16)) ret i32 %v0 @@ -1002,7 +992,6 @@ define i32 @inline_DA_constraint_H5() { ; NOSI: error: invalid operand for inline asm constraint 'DA' ; VI-LABEL: {{^}}inline_DA_constraint_H6: -; VI: v_mov_b32 {{v[0-9]+}}, 0xb800 define i32 @inline_DA_constraint_H6() { %v0 = tail call i32 asm "v_mov_b32 $0, $1", "=v,^DA"(half -0.5) ret i32 %v0 @@ -1164,7 +1153,6 @@ define i32 @inline_DA_constraint_V0() { ; NOSI: error: invalid operand for inline asm constraint 'DA' ; VI-LABEL: {{^}}inline_DA_constraint_V1: -; VI: v_mov_b32 {{v[0-9]+}}, 0xb800 define i32 @inline_DA_constraint_V1() { %v0 = tail call i32 asm "v_mov_b32 $0, $1", "=v,^DA"(<2 x half> ) ret i32 %v0 diff --git a/llvm/test/CodeGen/AMDGPU/shrink-add-sub-constant.ll b/llvm/test/CodeGen/AMDGPU/shrink-add-sub-constant.ll index 5de9b0b92c9a0..1a55bf608ebf5 100644 --- a/llvm/test/CodeGen/AMDGPU/shrink-add-sub-constant.ll +++ b/llvm/test/CodeGen/AMDGPU/shrink-add-sub-constant.ll @@ -3400,9 +3400,9 @@ define amdgpu_kernel void @v_test_v2i16_x_add_neg_fpone(ptr addrspace(1) %out, p ; GFX9-SDAG-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-SDAG-NEXT: global_load_dword v1, v0, s[2:3] -; GFX9-SDAG-NEXT: s_mov_b32 s2, 0xc400c400 +; GFX9-SDAG-NEXT: s_movk_i32 s2, 0xc400 ; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0) -; GFX9-SDAG-NEXT: v_pk_add_u16 v1, v1, s2 +; GFX9-SDAG-NEXT: v_pk_add_u16 v1, v1, s2 op_sel_hi:[1,0] ; GFX9-SDAG-NEXT: global_store_dword v0, v1, s[0:1] ; GFX9-SDAG-NEXT: s_endpgm ; @@ -3418,29 +3418,53 @@ define amdgpu_kernel void @v_test_v2i16_x_add_neg_fpone(ptr addrspace(1) %out, p ; GFX9-GISEL-NEXT: global_store_dword v0, v1, s[0:1] ; GFX9-GISEL-NEXT: s_endpgm ; -; GFX10-LABEL: v_test_v2i16_x_add_neg_fpone: -; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 -; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: global_load_dword v1, v0, s[2:3] -; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_pk_add_u16 v1, 0xc400c400, v1 -; GFX10-NEXT: global_store_dword v0, v1, s[0:1] -; GFX10-NEXT: s_endpgm +; GFX10-SDAG-LABEL: v_test_v2i16_x_add_neg_fpone: +; GFX10-SDAG: ; %bb.0: +; GFX10-SDAG-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX10-SDAG-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX10-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-SDAG-NEXT: global_load_dword v1, v0, s[2:3] +; GFX10-SDAG-NEXT: s_waitcnt vmcnt(0) +; GFX10-SDAG-NEXT: v_pk_add_u16 v1, 0xffffc400, v1 op_sel_hi:[0,1] +; GFX10-SDAG-NEXT: global_store_dword v0, v1, s[0:1] +; GFX10-SDAG-NEXT: s_endpgm ; -; GFX11-LABEL: v_test_v2i16_x_add_neg_fpone: -; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 -; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: global_load_b32 v1, v0, s[2:3] -; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_pk_add_u16 v1, 0xc400c400, v1 -; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] -; GFX11-NEXT: s_nop 0 -; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) -; GFX11-NEXT: s_endpgm +; GFX10-GISEL-LABEL: v_test_v2i16_x_add_neg_fpone: +; GFX10-GISEL: ; %bb.0: +; GFX10-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX10-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-GISEL-NEXT: global_load_dword v1, v0, s[2:3] +; GFX10-GISEL-NEXT: s_waitcnt vmcnt(0) +; GFX10-GISEL-NEXT: v_pk_add_u16 v1, 0xc400c400, v1 +; GFX10-GISEL-NEXT: global_store_dword v0, v1, s[0:1] +; GFX10-GISEL-NEXT: s_endpgm +; +; GFX11-SDAG-LABEL: v_test_v2i16_x_add_neg_fpone: +; GFX11-SDAG: ; %bb.0: +; GFX11-SDAG-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX11-SDAG-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-SDAG-NEXT: global_load_b32 v1, v0, s[2:3] +; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0) +; GFX11-SDAG-NEXT: v_pk_add_u16 v1, 0xffffc400, v1 op_sel_hi:[0,1] +; GFX11-SDAG-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX11-SDAG-NEXT: s_nop 0 +; GFX11-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-SDAG-NEXT: s_endpgm +; +; GFX11-GISEL-LABEL: v_test_v2i16_x_add_neg_fpone: +; GFX11-GISEL: ; %bb.0: +; GFX11-GISEL-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX11-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-GISEL-NEXT: global_load_b32 v1, v0, s[2:3] +; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0) +; GFX11-GISEL-NEXT: v_pk_add_u16 v1, 0xc400c400, v1 +; GFX11-GISEL-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX11-GISEL-NEXT: s_nop 0 +; GFX11-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-GISEL-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() %tid.ext = sext i32 %tid to i64 %gep = getelementptr inbounds <2 x i16>, ptr addrspace(1) %in, i64 %tid.ext @@ -3541,9 +3565,9 @@ define amdgpu_kernel void @v_test_v2i16_x_add_neg_negfpone(ptr addrspace(1) %out ; GFX9-SDAG-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-SDAG-NEXT: global_load_dword v1, v0, s[2:3] -; GFX9-SDAG-NEXT: s_mov_b32 s2, 0x44004400 +; GFX9-SDAG-NEXT: s_movk_i32 s2, 0x4400 ; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0) -; GFX9-SDAG-NEXT: v_pk_add_u16 v1, v1, s2 +; GFX9-SDAG-NEXT: v_pk_add_u16 v1, v1, s2 op_sel_hi:[1,0] ; GFX9-SDAG-NEXT: global_store_dword v0, v1, s[0:1] ; GFX9-SDAG-NEXT: s_endpgm ; @@ -3559,29 +3583,53 @@ define amdgpu_kernel void @v_test_v2i16_x_add_neg_negfpone(ptr addrspace(1) %out ; GFX9-GISEL-NEXT: global_store_dword v0, v1, s[0:1] ; GFX9-GISEL-NEXT: s_endpgm ; -; GFX10-LABEL: v_test_v2i16_x_add_neg_negfpone: -; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 -; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: global_load_dword v1, v0, s[2:3] -; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_pk_add_u16 v1, 0x44004400, v1 -; GFX10-NEXT: global_store_dword v0, v1, s[0:1] -; GFX10-NEXT: s_endpgm +; GFX10-SDAG-LABEL: v_test_v2i16_x_add_neg_negfpone: +; GFX10-SDAG: ; %bb.0: +; GFX10-SDAG-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX10-SDAG-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX10-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-SDAG-NEXT: global_load_dword v1, v0, s[2:3] +; GFX10-SDAG-NEXT: s_waitcnt vmcnt(0) +; GFX10-SDAG-NEXT: v_pk_add_u16 v1, 0x4400, v1 op_sel_hi:[0,1] +; GFX10-SDAG-NEXT: global_store_dword v0, v1, s[0:1] +; GFX10-SDAG-NEXT: s_endpgm ; -; GFX11-LABEL: v_test_v2i16_x_add_neg_negfpone: -; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 -; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: global_load_b32 v1, v0, s[2:3] -; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_pk_add_u16 v1, 0x44004400, v1 -; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] -; GFX11-NEXT: s_nop 0 -; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) -; GFX11-NEXT: s_endpgm +; GFX10-GISEL-LABEL: v_test_v2i16_x_add_neg_negfpone: +; GFX10-GISEL: ; %bb.0: +; GFX10-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX10-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-GISEL-NEXT: global_load_dword v1, v0, s[2:3] +; GFX10-GISEL-NEXT: s_waitcnt vmcnt(0) +; GFX10-GISEL-NEXT: v_pk_add_u16 v1, 0x44004400, v1 +; GFX10-GISEL-NEXT: global_store_dword v0, v1, s[0:1] +; GFX10-GISEL-NEXT: s_endpgm +; +; GFX11-SDAG-LABEL: v_test_v2i16_x_add_neg_negfpone: +; GFX11-SDAG: ; %bb.0: +; GFX11-SDAG-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX11-SDAG-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-SDAG-NEXT: global_load_b32 v1, v0, s[2:3] +; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0) +; GFX11-SDAG-NEXT: v_pk_add_u16 v1, 0x4400, v1 op_sel_hi:[0,1] +; GFX11-SDAG-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX11-SDAG-NEXT: s_nop 0 +; GFX11-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-SDAG-NEXT: s_endpgm +; +; GFX11-GISEL-LABEL: v_test_v2i16_x_add_neg_negfpone: +; GFX11-GISEL: ; %bb.0: +; GFX11-GISEL-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX11-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-GISEL-NEXT: global_load_b32 v1, v0, s[2:3] +; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0) +; GFX11-GISEL-NEXT: v_pk_add_u16 v1, 0x44004400, v1 +; GFX11-GISEL-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX11-GISEL-NEXT: s_nop 0 +; GFX11-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-GISEL-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() %tid.ext = sext i32 %tid to i64 %gep = getelementptr inbounds <2 x i16>, ptr addrspace(1) %in, i64 %tid.ext