Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
15 changes: 14 additions & 1 deletion llvm/lib/Target/AMDGPU/AMDGPU.td
Original file line number Diff line number Diff line change
Expand Up @@ -586,6 +586,12 @@ def FeatureRealTrue16Insts : SubtargetFeature<"real-true16",
"Use true 16-bit registers"
>;

def FeatureD16Writes32BitVgpr : SubtargetFeature<"d16-write-vgpr32",
"EnableD16Writes32BitVgpr",
"true",
"D16 instructions potentially have 32-bit data dependencies"
>;

def FeatureBF16TransInsts : SubtargetFeature<"bf16-trans-insts",
"HasBF16TransInsts",
"true",
Expand Down Expand Up @@ -1934,7 +1940,9 @@ def FeatureISAVersion11_Common : FeatureSet<
FeaturePackedTID,
FeatureVcmpxPermlaneHazard,
FeatureMemoryAtomicFAddF32DenormalSupport,
FeatureRealTrue16Insts]>;
FeatureRealTrue16Insts,
FeatureD16Writes32BitVgpr,
]>;

// There are few workarounds that need to be
// added to all targets. This pessimizes codegen
Expand Down Expand Up @@ -2570,6 +2578,11 @@ def UseFakeTrue16Insts : True16PredicateClass<"Subtarget->hasTrue16BitInsts() &&
// FIXME When we default to RealTrue16 instead of Fake, change the line as follows.
// AssemblerPredicate<(all_of FeatureTrue16BitInsts, (not FeatureRealTrue16Insts))>;

def HasD16Writes32BitVgpr: Predicate<"Subtarget->hasD16Writes32BitVgpr()">,
AssemblerPredicate<(all_of FeatureTrue16BitInsts, FeatureRealTrue16Insts, FeatureD16Writes32BitVgpr)>;
def NotHasD16Writes32BitVgpr: Predicate<"!Subtarget->hasD16Writes32BitVgpr()">,
AssemblerPredicate<(all_of FeatureTrue16BitInsts, FeatureRealTrue16Insts, (not FeatureD16Writes32BitVgpr))>;

def HasBF16TransInsts : Predicate<"Subtarget->hasBF16TransInsts()">,
AssemblerPredicate<(all_of FeatureBF16TransInsts)>;

Expand Down
4 changes: 4 additions & 0 deletions llvm/lib/Target/AMDGPU/AMDGPUSubtarget.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -38,6 +38,10 @@ bool AMDGPUSubtarget::useRealTrue16Insts() const {
return hasTrue16BitInsts() && EnableRealTrue16Insts;
}

bool AMDGPUSubtarget::hasD16Writes32BitVgpr() const {
return EnableD16Writes32BitVgpr;
}

// Returns the maximum per-workgroup LDS allocation size (in bytes) that still
// allows the given function to achieve an occupancy of NWaves waves per
// SIMD / EU, taking into account only the function's *maximum* workgroup size.
Expand Down
3 changes: 3 additions & 0 deletions llvm/lib/Target/AMDGPU/AMDGPUSubtarget.h
Original file line number Diff line number Diff line change
Expand Up @@ -59,6 +59,7 @@ class AMDGPUSubtarget {
bool HasCvtPkF16F32Inst = false;
bool HasF32ToF16BF16ConversionSRInsts = false;
bool EnableRealTrue16Insts = false;
bool EnableD16Writes32BitVgpr = false;
bool HasBF16TransInsts = false;
bool HasBF16ConversionInsts = false;
bool HasBF16PackedInsts = false;
Expand Down Expand Up @@ -224,6 +225,8 @@ class AMDGPUSubtarget {
// supported and the support for fake True16 instructions is removed.
bool useRealTrue16Insts() const;

bool hasD16Writes32BitVgpr() const;

bool hasBF16TransInsts() const { return HasBF16TransInsts; }

bool hasBF16ConversionInsts() const {
Expand Down
9 changes: 9 additions & 0 deletions llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -845,6 +845,15 @@ RegInterval WaitcntBrackets::getRegInterval(const MachineInstr *MI,
assert(Result.first >= 0 && Result.first < SQ_MAX_PGM_VGPRS);
assert(Size % 16 == 0);
Result.second = Result.first + (Size / 16);

if (Size == 16 && Context->ST->hasD16Writes32BitVgpr()) {
// Regardless of which lo16/hi16 is used, consider the full 32-bit
// register used.
if (AMDGPU::isHi16Reg(MCReg, *TRI))
Result.first -= 1;
else
Result.second += 1;
}
} else if (TRI->isSGPRReg(*MRI, Op.getReg()) && RegIdx < SQ_MAX_PGM_SGPRS) {
// SGPRs including VCC, TTMPs and EXEC but excluding read-only scalar
// sources like SRC_PRIVATE_BASE.
Expand Down
278 changes: 23 additions & 255 deletions llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.1024bit.ll

Large diffs are not rendered by default.

40 changes: 8 additions & 32 deletions llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.320bit.ll
Original file line number Diff line number Diff line change
Expand Up @@ -5033,6 +5033,7 @@ define <10 x i32> @bitcast_v40i8_to_v10i32(<40 x i8> %a, i32 %b) {
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v28.h, v23.l
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v29.h, v21.l
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v30.h, v19.l
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(4)
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v35.h, v17.l
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v16.h, v14.l
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v17.l, v12.l
Expand All @@ -5059,15 +5060,10 @@ define <10 x i32> @bitcast_v40i8_to_v10i32(<40 x i8> %a, i32 %b) {
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v12.l, 8, v29.l
; GFX11-TRUE16-NEXT: s_mov_b32 s0, exec_lo
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(8)
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v12.h, 8, v33.l
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(7)
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v11.l, 8, v33.h
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(6)
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v11.h, 8, v34.l
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(5)
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v10.l, 8, v34.h
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(4)
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v10.h, 8, v35.l
; GFX11-TRUE16-NEXT: v_cmpx_ne_u32_e32 0, v36
; GFX11-TRUE16-NEXT: s_xor_b32 s0, exec_lo, s0
Expand Down Expand Up @@ -11993,6 +11989,7 @@ define <10 x float> @bitcast_v40i8_to_v10f32(<40 x i8> %a, i32 %b) {
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v28.h, v23.l
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v29.h, v21.l
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v30.h, v19.l
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(4)
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v35.h, v17.l
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v16.h, v14.l
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v17.l, v12.l
Expand All @@ -12019,15 +12016,10 @@ define <10 x float> @bitcast_v40i8_to_v10f32(<40 x i8> %a, i32 %b) {
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v12.l, 8, v29.l
; GFX11-TRUE16-NEXT: s_mov_b32 s0, exec_lo
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(8)
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v12.h, 8, v33.l
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(7)
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v11.l, 8, v33.h
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(6)
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v11.h, 8, v34.l
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(5)
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v10.l, 8, v34.h
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(4)
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v10.h, 8, v35.l
; GFX11-TRUE16-NEXT: v_cmpx_ne_u32_e32 0, v36
; GFX11-TRUE16-NEXT: s_xor_b32 s0, exec_lo, s0
Expand Down Expand Up @@ -18559,6 +18551,7 @@ define <20 x i16> @bitcast_v40i8_to_v20i16(<40 x i8> %a, i32 %b) {
; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v32, off, s32 offset:20
; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v31, off, s32 offset:12
; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v32, off, s32 offset:4
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(5)
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v35.h, v29.l
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v28.h, v27.l
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v30.h, v25.l
Expand Down Expand Up @@ -18596,13 +18589,9 @@ define <20 x i16> @bitcast_v40i8_to_v20i16(<40 x i8> %a, i32 %b) {
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v35.h, 8, v35.h
; GFX11-TRUE16-NEXT: s_mov_b32 s0, exec_lo
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(8)
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v28.h, 8, v33.h
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(7)
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v34.l
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(6)
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.l, 8, v34.h
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(5)
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v35.l
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(4)
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v35.l, 8, v36.l
Expand Down Expand Up @@ -18701,10 +18690,9 @@ define <20 x i16> @bitcast_v40i8_to_v20i16(<40 x i8> %a, i32 %b) {
; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0
; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB50_2
; GFX11-TRUE16-NEXT: .LBB50_4: ; %cmp.true
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(2)
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, v32.h, 3
; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.h, v31.h, 3
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, v32.l, 3
; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.h, v31.l, 3
; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, v28.l, 3
Expand Down Expand Up @@ -24640,6 +24628,7 @@ define <20 x half> @bitcast_v40i8_to_v20f16(<40 x i8> %a, i32 %b) {
; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v32, off, s32 offset:20
; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v31, off, s32 offset:12
; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v32, off, s32 offset:4
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(5)
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v35.h, v29.l
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v28.h, v27.l
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v30.h, v25.l
Expand Down Expand Up @@ -24677,13 +24666,9 @@ define <20 x half> @bitcast_v40i8_to_v20f16(<40 x i8> %a, i32 %b) {
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v35.h, 8, v35.h
; GFX11-TRUE16-NEXT: s_mov_b32 s0, exec_lo
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(8)
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v28.h, 8, v33.h
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(7)
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v34.l
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(6)
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.l, 8, v34.h
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(5)
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v35.l
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(4)
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v35.l, 8, v36.l
Expand Down Expand Up @@ -24782,10 +24767,9 @@ define <20 x half> @bitcast_v40i8_to_v20f16(<40 x i8> %a, i32 %b) {
; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0
; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB62_2
; GFX11-TRUE16-NEXT: .LBB62_4: ; %cmp.true
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(2)
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, v32.h, 3
; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.h, v31.h, 3
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, v32.l, 3
; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.h, v31.l, 3
; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, v28.l, 3
Expand Down Expand Up @@ -28760,6 +28744,7 @@ define <5 x double> @bitcast_v40i8_to_v5f64(<40 x i8> %a, i32 %b) {
; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v31, off, s32 offset:20
; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v32, off, s32 offset:12
; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v32, off, s32 offset:4
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(4)
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v38.h, v29.l
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v39.l, v27.l
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v39.h, v25.l
Expand Down Expand Up @@ -28792,15 +28777,10 @@ define <5 x double> @bitcast_v40i8_to_v5f64(<40 x i8> %a, i32 %b) {
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.l, 8, v38.h
; GFX11-TRUE16-NEXT: s_mov_b32 s0, exec_lo
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(8)
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.h, 8, v36.l
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(7)
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.l, 8, v36.h
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(6)
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v37.l
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(5)
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v16.l, 8, v37.h
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(4)
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v16.h, 8, v38.l
; GFX11-TRUE16-NEXT: v_cmpx_ne_u32_e32 0, v49
; GFX11-TRUE16-NEXT: s_xor_b32 s0, exec_lo, s0
Expand Down Expand Up @@ -32871,6 +32851,7 @@ define <5 x i64> @bitcast_v40i8_to_v5i64(<40 x i8> %a, i32 %b) {
; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v31, off, s32 offset:20
; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v32, off, s32 offset:12
; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v32, off, s32 offset:4
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(4)
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v38.h, v29.l
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v39.l, v27.l
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v39.h, v25.l
Expand Down Expand Up @@ -32903,15 +32884,10 @@ define <5 x i64> @bitcast_v40i8_to_v5i64(<40 x i8> %a, i32 %b) {
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.l, 8, v38.h
; GFX11-TRUE16-NEXT: s_mov_b32 s0, exec_lo
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(8)
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.h, 8, v36.l
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(7)
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.l, 8, v36.h
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(6)
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v37.l
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(5)
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v16.l, 8, v37.h
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(4)
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v16.h, 8, v38.l
; GFX11-TRUE16-NEXT: v_cmpx_ne_u32_e32 0, v49
; GFX11-TRUE16-NEXT: s_xor_b32 s0, exec_lo, s0
Expand Down
Loading