Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
16 changes: 13 additions & 3 deletions llvm/lib/Target/AMDGPU/SIPreEmitPeephole.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -631,11 +631,21 @@ void SIPreEmitPeephole::collectUnpackingCandidates(
// latency, add latency of two unpacked instructions (currently estimated
// as 2 cycles).
TotalCyclesBetweenCandidates -= Latency;
// Once we've removed the packed latency, if we're already past the MFMA
// overlap window, later instructions can only increase the distance. Stop
// scanning for more candidates for this MFMA. Subtract 1 to account for
// MFMA issue latency. If the packed instruction cannot be immediately
// issued in the last cycle of the MFMA's execution we still want to
// unpack.
//
// FIXME: We shouldn't need to subtract 1 here; this should be reflected in
// the SchedModel.
if (TotalCyclesBetweenCandidates >= NumMFMACycles - 1)
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Is this redundant with the check on line 612?

I'm also confused about why test_pk_add_unpacking_f32 isn't impacted.

return;

// TODO: improve latency handling based on instruction modeling.
TotalCyclesBetweenCandidates += 2;
// Subtract 1 to account for MFMA issue latency.
if (TotalCyclesBetweenCandidates < NumMFMACycles - 1)
InstrsToUnpack.insert(&Instr);
InstrsToUnpack.insert(&Instr);
}
}

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -1167,3 +1167,69 @@ body: |
$vgpr5 = V_MOV_B32_e32 killed $sgpr15, implicit $exec, implicit $exec
renamable $vgpr16_vgpr17 = nofpexcept V_PK_MUL_F32 8, killed $sgpr30_sgpr31, 11, killed $vgpr4_vgpr5, 0, 0, 0, 0, 0, implicit $mode, implicit $exec
S_ENDPGM 0
...
---
name: test_tie_unpack_minimal
tracksRegLiveness: true

liveins:
- { reg: '$vgpr0_vgpr1_vgpr2_vgpr3' }
- { reg: '$vgpr4_vgpr5_vgpr6_vgpr7' }
- { reg: '$vgpr8' }
- { reg: '$vgpr9' }
- { reg: '$vgpr10_vgpr11' }
- { reg: '$vgpr12_vgpr13' }
- { reg: '$agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15' }
Comment on lines +1175 to +1182
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Suggested change
liveins:
- { reg: '$vgpr0_vgpr1_vgpr2_vgpr3' }
- { reg: '$vgpr4_vgpr5_vgpr6_vgpr7' }
- { reg: '$vgpr8' }
- { reg: '$vgpr9' }
- { reg: '$vgpr10_vgpr11' }
- { reg: '$vgpr12_vgpr13' }
- { reg: '$agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15' }

You don't actually need to record the function live-ins.


body: |
bb.0.entry:
liveins: $vgpr0_vgpr1_vgpr2_vgpr3, $vgpr4_vgpr5_vgpr6_vgpr7, $vgpr8, $vgpr9, $vgpr10_vgpr11, $vgpr12_vgpr13, $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15

; GFX950-LABEL: name: test_tie_unpack_minimal
; GFX950: liveins: $vgpr0_vgpr1_vgpr2_vgpr3, $vgpr4_vgpr5_vgpr6_vgpr7, $vgpr8, $vgpr9, $vgpr10_vgpr11, $vgpr12_vgpr13, $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15
; GFX950-NEXT: {{ $}}
; GFX950-NEXT: early-clobber renamable $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 = V_MFMA_F32_32X32X16_F16_e64 $vgpr0_vgpr1_vgpr2_vgpr3, $vgpr4_vgpr5_vgpr6_vgpr7, $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15, 0, 0, 0, implicit $mode, implicit $exec
; GFX950-NEXT: renamable $vgpr8 = nofpexcept V_ADD_F32_e32 killed $vgpr8, $vgpr9, implicit $mode, implicit $exec
; GFX950-NEXT: $vgpr12 = nofpexcept V_MUL_F32_e64 0, $vgpr10, 0, $vgpr10, 0, 0, implicit $mode, implicit $exec
; GFX950-NEXT: $vgpr13 = nofpexcept V_MUL_F32_e64 0, $vgpr10, 0, $vgpr11, 0, 0, implicit $mode, implicit $exec
; GFX950-NEXT: renamable $vgpr8 = nofpexcept V_ADD_F32_e32 killed $vgpr8, $vgpr9, implicit $mode, implicit $exec
; GFX950-NEXT: renamable $vgpr8 = nofpexcept V_ADD_F32_e32 killed $vgpr8, killed $vgpr9, implicit $mode, implicit $exec
; GFX950-NEXT: $vgpr12 = nofpexcept V_MUL_F32_e64 0, $vgpr10, 0, $vgpr10, 0, 0, implicit $mode, implicit $exec
; GFX950-NEXT: $vgpr13 = nofpexcept V_MUL_F32_e64 0, $vgpr10, 0, $vgpr11, 0, 0, implicit $mode, implicit $exec
; GFX950-NEXT: early-clobber renamable $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 = V_MFMA_F32_32X32X16_F16_e64 killed $vgpr0_vgpr1_vgpr2_vgpr3, killed $vgpr4_vgpr5_vgpr6_vgpr7, $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15, 0, 0, 0, implicit $mode, implicit $exec
; GFX950-NEXT: S_ENDPGM 0
;
; GFX942-LABEL: name: test_tie_unpack_minimal
; GFX942: liveins: $vgpr0_vgpr1_vgpr2_vgpr3, $vgpr4_vgpr5_vgpr6_vgpr7, $vgpr8, $vgpr9, $vgpr10_vgpr11, $vgpr12_vgpr13, $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15
; GFX942-NEXT: {{ $}}
; GFX942-NEXT: early-clobber renamable $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 = V_MFMA_F32_32X32X16_F16_e64 $vgpr0_vgpr1_vgpr2_vgpr3, $vgpr4_vgpr5_vgpr6_vgpr7, $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15, 0, 0, 0, implicit $mode, implicit $exec
; GFX942-NEXT: renamable $vgpr8 = nofpexcept V_ADD_F32_e32 killed $vgpr8, $vgpr9, implicit $mode, implicit $exec
; GFX942-NEXT: $vgpr12 = nofpexcept V_MUL_F32_e64 0, $vgpr10, 0, $vgpr10, 0, 0, implicit $mode, implicit $exec
; GFX942-NEXT: $vgpr13 = nofpexcept V_MUL_F32_e64 0, $vgpr10, 0, $vgpr11, 0, 0, implicit $mode, implicit $exec
; GFX942-NEXT: renamable $vgpr8 = nofpexcept V_ADD_F32_e32 killed $vgpr8, $vgpr9, implicit $mode, implicit $exec
; GFX942-NEXT: renamable $vgpr8 = nofpexcept V_ADD_F32_e32 killed $vgpr8, killed $vgpr9, implicit $mode, implicit $exec
; GFX942-NEXT: $vgpr12 = nofpexcept V_MUL_F32_e64 0, $vgpr10, 0, $vgpr10, 0, 0, implicit $mode, implicit $exec
; GFX942-NEXT: $vgpr13 = nofpexcept V_MUL_F32_e64 0, $vgpr10, 0, $vgpr11, 0, 0, implicit $mode, implicit $exec
; GFX942-NEXT: early-clobber renamable $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 = V_MFMA_F32_32X32X16_F16_e64 killed $vgpr0_vgpr1_vgpr2_vgpr3, killed $vgpr4_vgpr5_vgpr6_vgpr7, $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15, 0, 0, 0, implicit $mode, implicit $exec
; GFX942-NEXT: S_ENDPGM 0
;
; GFX90A-LABEL: name: test_tie_unpack_minimal
; GFX90A: liveins: $vgpr0_vgpr1_vgpr2_vgpr3, $vgpr4_vgpr5_vgpr6_vgpr7, $vgpr8, $vgpr9, $vgpr10_vgpr11, $vgpr12_vgpr13, $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15
; GFX90A-NEXT: {{ $}}
; GFX90A-NEXT: early-clobber renamable $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 = V_MFMA_F32_32X32X16_F16_e64 $vgpr0_vgpr1_vgpr2_vgpr3, $vgpr4_vgpr5_vgpr6_vgpr7, $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15, 0, 0, 0, implicit $mode, implicit $exec
; GFX90A-NEXT: renamable $vgpr8 = nofpexcept V_ADD_F32_e32 killed $vgpr8, $vgpr9, implicit $mode, implicit $exec
; GFX90A-NEXT: renamable $vgpr12_vgpr13 = nofpexcept V_PK_MUL_F32 0, $vgpr10_vgpr11, 8, $vgpr10_vgpr11, 0, 0, 0, 0, 0, implicit $mode, implicit $exec
; GFX90A-NEXT: renamable $vgpr8 = nofpexcept V_ADD_F32_e32 killed $vgpr8, $vgpr9, implicit $mode, implicit $exec
; GFX90A-NEXT: renamable $vgpr8 = nofpexcept V_ADD_F32_e32 killed $vgpr8, killed $vgpr9, implicit $mode, implicit $exec
; GFX90A-NEXT: renamable $vgpr12_vgpr13 = nofpexcept V_PK_MUL_F32 0, $vgpr10_vgpr11, 8, $vgpr10_vgpr11, 0, 0, 0, 0, 0, implicit $mode, implicit $exec
; GFX90A-NEXT: early-clobber renamable $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 = V_MFMA_F32_32X32X16_F16_e64 killed $vgpr0_vgpr1_vgpr2_vgpr3, killed $vgpr4_vgpr5_vgpr6_vgpr7, $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15, 0, 0, 0, implicit $mode, implicit $exec
; GFX90A-NEXT: S_ENDPGM 0
renamable $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 = V_MFMA_F32_32X32X16_F16_e64 $vgpr0_vgpr1_vgpr2_vgpr3, $vgpr4_vgpr5_vgpr6_vgpr7, $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15, 0, 0, 0, implicit $mode, implicit $exec
renamable $vgpr8 = nofpexcept V_ADD_F32_e32 killed $vgpr8, $vgpr9, implicit $mode, implicit $exec
renamable $vgpr12_vgpr13 = nofpexcept V_PK_MUL_F32 0, $vgpr10_vgpr11, 8, $vgpr10_vgpr11, 0, 0, 0, 0, 0, implicit $mode, implicit $exec
renamable $vgpr8 = nofpexcept V_ADD_F32_e32 killed $vgpr8, $vgpr9, implicit $mode, implicit $exec
renamable $vgpr8 = nofpexcept V_ADD_F32_e32 killed $vgpr8, killed $vgpr9, implicit $mode, implicit $exec
renamable $vgpr12_vgpr13 = nofpexcept V_PK_MUL_F32 0, $vgpr10_vgpr11, 8, $vgpr10_vgpr11, 0, 0, 0, 0, 0, implicit $mode, implicit $exec
renamable $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 = V_MFMA_F32_32X32X16_F16_e64 killed $vgpr0_vgpr1_vgpr2_vgpr3, killed $vgpr4_vgpr5_vgpr6_vgpr7, $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15, 0, 0, 0, implicit $mode, implicit $exec
S_ENDPGM 0
...
Loading