Skip to content

Commit

Permalink
[AMDGPU] Fix latency for implicit vcc_lo operands on GFX10 wave32
Browse files Browse the repository at this point in the history
As described in the comment, the way we change vcc to vcc_lo in these
operands confuses addPhysRegDataDeps into treating them as implicit
pseudo operands. Fix this by setting the correct latency from the
SchedModel after addPhysRegDataDeps wrongly set it to 0.

Differential Revision: https://reviews.llvm.org/D112317
  • Loading branch information
jayfoad committed Oct 22, 2021
1 parent 2915889 commit 3f34f75
Show file tree
Hide file tree
Showing 4 changed files with 18 additions and 9 deletions.
7 changes: 7 additions & 0 deletions llvm/lib/Target/AMDGPU/AMDGPUSubtarget.cpp
Expand Up @@ -967,6 +967,13 @@ void GCNSubtarget::adjustSchedDependency(SUnit *Def, int DefOpIdx, SUnit *Use,
--Lat;
}
Dep.setLatency(Lat);
} else if (Dep.getLatency() == 0 && Dep.getReg() == AMDGPU::VCC_LO) {
// Work around the fact that SIInstrInfo::fixImplicitOperands modifies
// implicit operands which come from the MCInstrDesc, which can fool
// ScheduleDAGInstrs::addPhysRegDataDeps into treating them as implicit
// pseudo operands.
Dep.setLatency(InstrInfo.getSchedModel().computeOperandLatency(
DefI, DefOpIdx, UseI, UseOpIdx));
}
}

Expand Down
2 changes: 2 additions & 0 deletions llvm/lib/Target/AMDGPU/SIInstrInfo.h
Expand Up @@ -1134,6 +1134,8 @@ class SIInstrInfo final : public AMDGPUGenInstrInfo {
}

static unsigned getDSShaderTypeValue(const MachineFunction &MF);

/// Read-only accessor for this object's SchedModel member.
/// \returns a const reference to the stored TargetSchedModel; no copy is made.
const TargetSchedModel &getSchedModel() const {
  return SchedModel;
}
};

/// \brief Returns true if a reg:subreg pair P has a TRC class
Expand Down
14 changes: 7 additions & 7 deletions llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.div.fmas.ll
Expand Up @@ -115,8 +115,8 @@ define amdgpu_ps float @s_div_fmas_f32(float inreg %a, float inreg %b, float inr
; GFX10_W32: ; %bb.0:
; GFX10_W32-NEXT: s_cmp_eq_u32 s3, 0
; GFX10_W32-NEXT: v_mov_b32_e32 v0, s1
; GFX10_W32-NEXT: v_mov_b32_e32 v1, s2
; GFX10_W32-NEXT: s_cselect_b32 s3, 1, 0
; GFX10_W32-NEXT: v_mov_b32_e32 v1, s2
; GFX10_W32-NEXT: s_and_b32 s3, 1, s3
; GFX10_W32-NEXT: v_cmp_ne_u32_e64 vcc_lo, 0, s3
; GFX10_W32-NEXT: v_div_fmas_f32 v0, s0, v0, v1
Expand Down Expand Up @@ -178,12 +178,12 @@ define amdgpu_ps double @s_div_fmas_f64(double inreg %a, double inreg %b, double
; GFX10_W32: ; %bb.0:
; GFX10_W32-NEXT: s_cmp_eq_u32 s6, 0
; GFX10_W32-NEXT: v_mov_b32_e32 v0, s2
; GFX10_W32-NEXT: v_mov_b32_e32 v2, s4
; GFX10_W32-NEXT: v_mov_b32_e32 v1, s3
; GFX10_W32-NEXT: v_mov_b32_e32 v3, s5
; GFX10_W32-NEXT: s_cselect_b32 s6, 1, 0
; GFX10_W32-NEXT: v_mov_b32_e32 v2, s4
; GFX10_W32-NEXT: s_and_b32 s6, 1, s6
; GFX10_W32-NEXT: v_mov_b32_e32 v1, s3
; GFX10_W32-NEXT: v_cmp_ne_u32_e64 vcc_lo, 0, s6
; GFX10_W32-NEXT: v_mov_b32_e32 v3, s5
; GFX10_W32-NEXT: v_div_fmas_f64 v[0:1], s[0:1], v[0:1], v[2:3]
; GFX10_W32-NEXT: v_readfirstlane_b32 s0, v0
; GFX10_W32-NEXT: v_readfirstlane_b32 s1, v1
Expand Down Expand Up @@ -260,8 +260,8 @@ define amdgpu_kernel void @test_div_fmas_f32(float addrspace(1)* %out, [8 x i32]
; GFX10_W32-NEXT: s_waitcnt lgkmcnt(0)
; GFX10_W32-NEXT: s_and_b32 s0, 1, s4
; GFX10_W32-NEXT: v_mov_b32_e32 v0, s5
; GFX10_W32-NEXT: v_mov_b32_e32 v1, s6
; GFX10_W32-NEXT: v_cmp_ne_u32_e64 vcc_lo, 0, s0
; GFX10_W32-NEXT: v_mov_b32_e32 v1, s6
; GFX10_W32-NEXT: v_div_fmas_f32 v0, s7, v0, v1
; GFX10_W32-NEXT: v_mov_b32_e32 v1, 0
; GFX10_W32-NEXT: global_store_dword v1, v0, s[2:3]
Expand Down Expand Up @@ -558,9 +558,9 @@ define amdgpu_kernel void @test_div_fmas_f64(double addrspace(1)* %out, double %
; GFX10_W32-NEXT: s_and_b32 s0, 1, s2
; GFX10_W32-NEXT: v_mov_b32_e32 v0, s8
; GFX10_W32-NEXT: v_mov_b32_e32 v2, s10
; GFX10_W32-NEXT: v_cmp_ne_u32_e64 vcc_lo, 0, s0
; GFX10_W32-NEXT: v_mov_b32_e32 v1, s9
; GFX10_W32-NEXT: v_mov_b32_e32 v3, s11
; GFX10_W32-NEXT: v_cmp_ne_u32_e64 vcc_lo, 0, s0
; GFX10_W32-NEXT: v_div_fmas_f64 v[0:1], s[6:7], v[0:1], v[2:3]
; GFX10_W32-NEXT: v_mov_b32_e32 v2, 0
; GFX10_W32-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5]
Expand Down Expand Up @@ -634,8 +634,8 @@ define amdgpu_kernel void @test_div_fmas_f32_cond_to_vcc(float addrspace(1)* %ou
; GFX10_W32-NEXT: s_waitcnt lgkmcnt(0)
; GFX10_W32-NEXT: s_cmp_eq_u32 s7, 0
; GFX10_W32-NEXT: v_mov_b32_e32 v0, s5
; GFX10_W32-NEXT: v_mov_b32_e32 v1, s6
; GFX10_W32-NEXT: s_cselect_b32 s0, 1, 0
; GFX10_W32-NEXT: v_mov_b32_e32 v1, s6
; GFX10_W32-NEXT: s_and_b32 s0, 1, s0
; GFX10_W32-NEXT: v_cmp_ne_u32_e64 vcc_lo, 0, s0
; GFX10_W32-NEXT: v_div_fmas_f32 v0, s4, v0, v1
Expand Down
4 changes: 2 additions & 2 deletions llvm/test/CodeGen/AMDGPU/ds-sub-offset.ll
Expand Up @@ -83,8 +83,8 @@ define amdgpu_kernel void @write_ds_sub0_offset0_global_clamp_bit(float %dummy.v
; GFX10: ; %bb.0: ; %entry
; GFX10-NEXT: s_load_dword s0, s[0:1], 0x24
; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX10-NEXT: v_mov_b32_e32 v3, 0x7b
; GFX10-NEXT: s_mov_b32 vcc_lo, 0
; GFX10-NEXT: v_mov_b32_e32 v3, 0x7b
; GFX10-NEXT: v_sub_nc_u32_e32 v2, 0, v0
; GFX10-NEXT: v_mov_b32_e32 v0, 0
; GFX10-NEXT: v_mov_b32_e32 v1, 0
Expand Down Expand Up @@ -332,9 +332,9 @@ define amdgpu_kernel void @add_x_shl_neg_to_sub_misaligned_i64_max_offset_clamp_
; GFX10: ; %bb.0:
; GFX10-NEXT: s_load_dword s0, s[0:1], 0x24
; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX10-NEXT: s_mov_b32 vcc_lo, 0
; GFX10-NEXT: v_mov_b32_e32 v3, 0x7b
; GFX10-NEXT: v_mov_b32_e32 v4, 0
; GFX10-NEXT: s_mov_b32 vcc_lo, 0
; GFX10-NEXT: v_sub_nc_u32_e32 v2, 0x3fb, v0
; GFX10-NEXT: v_mov_b32_e32 v0, 0
; GFX10-NEXT: v_mov_b32_e32 v1, 0
Expand Down

0 comments on commit 3f34f75

Please sign in to comment.