[Scheduling] Create the missing dependency edges for store cluster
For a load cluster, we don't need to copy SUb's predecessor edge (SUb->reg) over to SUa, since both loads already depend on the same base register "reg":

     +-------+
+---->  reg  |
|    +---+---+
|        ^
|        |
|        |
|        |
|    +---+---+
|    |  SUa  |  Load 0(reg)
|    +---+---+
|        ^
|        |
|        |
|    +---+---+
+----+  SUb  |  Load 4(reg)
     +-------+

But for a store cluster, we need to copy SUb's predecessor edges over to SUa, as the diagram below shows (the "Missing" edge), so that the instruction defining the value SUb stores cannot be scheduled in between SUa and SUb.

     +-------+
+---->  reg  |
|    +---+---+
|        ^
|        |         Missing       +-------+
|        | +-------------------->+   y   |
|        | |                     +---+---+
|    +---+-+-+                       ^
|    |  SUa  |  Store x 0(reg)       |
|    +---+---+                       |
|        ^                           |
|        |  +------------------------+
|        |  |
|    +---+--++
+----+  SUb  |  Store y 4(reg)
     +-------+
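Concretely, the mutation copies edges in opposite directions for the two cases. The following self-contained sketch uses simplified stand-ins for llvm::SUnit and llvm::SDep invented purely for illustration; the authoritative code is the MachineScheduler.cpp hunk below.

#include <vector>

// Simplified stand-ins for llvm::SUnit / llvm::SDep, for illustration only.
struct SUnit {
  std::vector<SUnit *> Preds; // units this one depends on
  std::vector<SUnit *> Succs; // units that depend on this one
};

// Record that Succ depends on Pred (an "artificial" edge).
static void addArtificialEdge(SUnit &Pred, SUnit &Succ) {
  Pred.Succs.push_back(&Succ);
  Succ.Preds.push_back(&Pred);
}

// Sketch of the edge-copying step in clusterNeighboringMemOps.
static void copyClusterEdges(SUnit &SUa, SUnit &SUb, bool IsLoad) {
  if (IsLoad) {
    // Loads: copy SUa's successors onto SUb so computation that uses
    // SUa's result cannot be scheduled between the two loads.
    for (SUnit *Succ : SUa.Succs)
      if (Succ != &SUb)
        addArtificialEdge(SUb, *Succ);
  } else {
    // Stores: copy SUb's predecessors onto SUa so the instruction that
    // defines SUb's stored value ("y" above) must be ready before SUa.
    for (SUnit *Pred : SUb.Preds)
      if (Pred != &SUa)
        addArtificialEdge(*Pred, SUa);
  }
}

In the stp_missing_preds_edges test added below, this makes SU(4) (the ADDWri that defines the stored value) a predecessor of SU(3), so the scheduler keeps the two STRWui instructions adjacent and they can later be paired into an STP.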

Reviewed By: evandro, arsenm, rampitec, foad, fhahn

Differential Revision: https://reviews.llvm.org/D72031
QingShan Zhang committed Aug 7, 2020
1 parent 96b0280 commit 3359ea6
Showing 10 changed files with 596 additions and 554 deletions.
36 changes: 26 additions & 10 deletions llvm/lib/CodeGen/MachineScheduler.cpp
@@ -1624,16 +1624,32 @@ void BaseMemOpClusterMutation::clusterNeighboringMemOps(
LLVM_DEBUG(dbgs() << "Cluster ld/st SU(" << SUa->NodeNum << ") - SU("
<< SUb->NodeNum << ")\n");

// Copy successor edges from SUa to SUb. Interleaving computation
// dependent on SUa can prevent load combining due to register reuse.
// Predecessor edges do not need to be copied from SUb to SUa since
// nearby loads should have effectively the same inputs.
for (const SDep &Succ : SUa->Succs) {
if (Succ.getSUnit() == SUb)
continue;
LLVM_DEBUG(dbgs() << " Copy Succ SU(" << Succ.getSUnit()->NodeNum
<< ")\n");
DAG->addEdge(Succ.getSUnit(), SDep(SUb, SDep::Artificial));
if (IsLoad) {
// Copy successor edges from SUa to SUb. Interleaving computation
// dependent on SUa can prevent load combining due to register reuse.
// Predecessor edges do not need to be copied from SUb to SUa since
// nearby loads should have effectively the same inputs.
for (const SDep &Succ : SUa->Succs) {
if (Succ.getSUnit() == SUb)
continue;
LLVM_DEBUG(dbgs() << " Copy Succ SU(" << Succ.getSUnit()->NodeNum
<< ")\n");
DAG->addEdge(Succ.getSUnit(), SDep(SUb, SDep::Artificial));
}
} else {
// Copy predecessor edges from SUb to SUa to avoid having the SUnits
// that SUb depends on scheduled in between SUb and SUa. Successor
// edges do not need to be copied from SUa to SUb since no one will
// depend on stores.
// Note that we don't need to care about memory dependencies here:
// two mem ops are never clustered if they have a memory dependency.
for (const SDep &Pred : SUb->Preds) {
if (Pred.getSUnit() == SUa)
continue;
LLVM_DEBUG(dbgs() << " Copy Pred SU(" << Pred.getSUnit()->NodeNum
<< ")\n");
DAG->addEdge(SUa, SDep(Pred.getSUnit(), SDep::Artificial));
}
}

LLVM_DEBUG(dbgs() << " Curr cluster length: " << ClusterLength
19 changes: 19 additions & 0 deletions llvm/test/CodeGen/AArch64/aarch64-stp-cluster.ll
@@ -194,3 +194,22 @@ entry:
store i64 %add6.3, i64* %arrayidx5.3, align 8
ret void
}

; Verify that SU(2) and SU(4) are the preds of SU(3)
; CHECK: ********** MI Scheduling **********
; CHECK-LABEL: stp_missing_preds_edges:%bb.0
; CHECK:Cluster ld/st SU(3) - SU(5)
; CHECK: Copy Pred SU(4)
; CHECK: Copy Pred SU(2)
; CHECK:SU(2): %0:gpr64common = COPY $x0
; CHECK:SU(3): STRWui %1:gpr32, %0:gpr64common, 0
; CHECK:SU(4): %3:gpr32common = nsw ADDWri %2:gpr32common, 5, 0
; CHECK:SU(5): STRWui %3:gpr32common, %0:gpr64common, 1
define void @stp_missing_preds_edges(i32* %p, i32 %m, i32 %n) {
entry:
store i32 %m, i32* %p, align 4
%add = add nsw i32 %n, 5
%arrayidx1 = getelementptr inbounds i32, i32* %p, i64 1
store i32 %add, i32* %arrayidx1, align 4
ret void
}
957 changes: 483 additions & 474 deletions llvm/test/CodeGen/AMDGPU/GlobalISel/extractelement-stack-lower.ll

Large diffs are not rendered by default.

100 changes: 50 additions & 50 deletions llvm/test/CodeGen/AMDGPU/GlobalISel/insertelement-stack-lower.ll
@@ -25,15 +25,15 @@ define amdgpu_kernel void @v_insert_v64i32_varidx(<64 x i32> addrspace(1)* %out.
; GCN-NEXT: v_mov_b32_e32 v6, s15
; GCN-NEXT: v_mov_b32_e32 v8, s16
; GCN-NEXT: v_mov_b32_e32 v10, s17
; GCN-NEXT: v_mov_b32_e32 v12, s18
; GCN-NEXT: v_mov_b32_e32 v14, s19
; GCN-NEXT: s_movk_i32 s5, 0x60
; GCN-NEXT: v_add_u32_e32 v2, 8, v0
; GCN-NEXT: v_add_u32_e32 v3, 12, v0
; GCN-NEXT: v_add_u32_e32 v7, 16, v0
; GCN-NEXT: v_add_u32_e32 v9, 20, v0
; GCN-NEXT: v_add_u32_e32 v11, 24, v0
; GCN-NEXT: v_mov_b32_e32 v12, s18
; GCN-NEXT: v_add_u32_e32 v13, 28, v0
; GCN-NEXT: v_mov_b32_e32 v14, s19
; GCN-NEXT: v_add_u32_e32 v15, 32, v0
; GCN-NEXT: v_mov_b32_e32 v16, s20
; GCN-NEXT: buffer_store_dword v4, v1, s[0:3], 0 offen
@@ -71,7 +71,7 @@ define amdgpu_kernel void @v_insert_v64i32_varidx(<64 x i32> addrspace(1)* %out.
; GCN-NEXT: buffer_store_dword v28, v27, s[0:3], 0 offen
; GCN-NEXT: buffer_store_dword v30, v29, s[0:3], 0 offen
; GCN-NEXT: buffer_store_dword v32, v31, s[0:3], 0 offen
; GCN-NEXT: s_movk_i32 s10, 0x70
; GCN-NEXT: s_movk_i32 s13, 0x70
; GCN-NEXT: v_add_u32_e32 v35, 0x48, v0
; GCN-NEXT: v_mov_b32_e32 v36, s70
; GCN-NEXT: v_add_u32_e32 v37, 0x4c, v0
@@ -96,19 +96,19 @@ define amdgpu_kernel void @v_insert_v64i32_varidx(<64 x i32> addrspace(1)* %out.
; GCN-NEXT: v_add_u32_e32 v26, 0x64, v0
; GCN-NEXT: v_mov_b32_e32 v14, s77
; GCN-NEXT: v_mov_b32_e32 v4, s81
; GCN-NEXT: s_movk_i32 s11, 0x90
; GCN-NEXT: s_movk_i32 s13, 0xa0
; GCN-NEXT: s_movk_i32 s14, 0x90
; GCN-NEXT: s_movk_i32 s15, 0xa0
; GCN-NEXT: v_add_u32_e32 v28, 0x68, v0
; GCN-NEXT: v_mov_b32_e32 v16, s78
; GCN-NEXT: v_add_u32_e32 v30, 0x6c, v0
; GCN-NEXT: v_mov_b32_e32 v18, s79
; GCN-NEXT: v_add_u32_e32 v32, s13, v0
; GCN-NEXT: v_mov_b32_e32 v20, s80
; GCN-NEXT: v_mov_b32_e32 v5, s82
; GCN-NEXT: v_mov_b32_e32 v6, s83
; GCN-NEXT: v_add_u32_e32 v32, s10, v0
; GCN-NEXT: v_add_u32_e32 v34, 0x74, v0
; GCN-NEXT: v_add_u32_e32 v36, 0x78, v0
; GCN-NEXT: v_mov_b32_e32 v5, s82
; GCN-NEXT: v_add_u32_e32 v43, 0x7c, v0
; GCN-NEXT: v_mov_b32_e32 v6, s83
; GCN-NEXT: v_add_u32_e32 v44, 0x80, v0
; GCN-NEXT: v_mov_b32_e32 v8, s52
; GCN-NEXT: buffer_store_dword v14, v26, s[0:3], 0 offen
@@ -121,20 +121,20 @@ define amdgpu_kernel void @v_insert_v64i32_varidx(<64 x i32> addrspace(1)* %out.
; GCN-NEXT: buffer_store_dword v8, v44, s[0:3], 0 offen
; GCN-NEXT: v_add_u32_e32 v45, 0x84, v0
; GCN-NEXT: v_mov_b32_e32 v4, s53
; GCN-NEXT: s_movk_i32 s14, 0xb0
; GCN-NEXT: s_movk_i32 s16, 0xb0
; GCN-NEXT: v_add_u32_e32 v46, 0x88, v0
; GCN-NEXT: v_mov_b32_e32 v5, s54
; GCN-NEXT: v_add_u32_e32 v47, 0x8c, v0
; GCN-NEXT: v_mov_b32_e32 v6, s55
; GCN-NEXT: v_add_u32_e32 v48, s11, v0
; GCN-NEXT: v_add_u32_e32 v48, s14, v0
; GCN-NEXT: v_mov_b32_e32 v8, s56
; GCN-NEXT: v_add_u32_e32 v49, 0x94, v0
; GCN-NEXT: v_mov_b32_e32 v10, s57
; GCN-NEXT: v_add_u32_e32 v50, 0x98, v0
; GCN-NEXT: v_mov_b32_e32 v12, s58
; GCN-NEXT: v_add_u32_e32 v51, 0x9c, v0
; GCN-NEXT: v_mov_b32_e32 v14, s59
; GCN-NEXT: v_add_u32_e32 v52, s13, v0
; GCN-NEXT: v_add_u32_e32 v52, s15, v0
; GCN-NEXT: v_mov_b32_e32 v16, s60
; GCN-NEXT: buffer_store_dword v4, v45, s[0:3], 0 offen
; GCN-NEXT: buffer_store_dword v5, v46, s[0:3], 0 offen
@@ -146,13 +146,13 @@ define amdgpu_kernel void @v_insert_v64i32_varidx(<64 x i32> addrspace(1)* %out.
; GCN-NEXT: buffer_store_dword v16, v52, s[0:3], 0 offen
; GCN-NEXT: v_add_u32_e32 v53, 0xa4, v0
; GCN-NEXT: v_mov_b32_e32 v4, s61
; GCN-NEXT: s_movk_i32 s15, 0xd0
; GCN-NEXT: s_movk_i32 s16, 0xe0
; GCN-NEXT: s_movk_i32 s17, 0xd0
; GCN-NEXT: s_movk_i32 s18, 0xe0
; GCN-NEXT: v_add_u32_e32 v54, 0xa8, v0
; GCN-NEXT: v_mov_b32_e32 v5, s62
; GCN-NEXT: v_add_u32_e32 v55, 0xac, v0
; GCN-NEXT: v_mov_b32_e32 v6, s63
; GCN-NEXT: v_add_u32_e32 v56, s14, v0
; GCN-NEXT: v_add_u32_e32 v56, s16, v0
; GCN-NEXT: v_mov_b32_e32 v8, s64
; GCN-NEXT: v_add_u32_e32 v57, 0xb4, v0
; GCN-NEXT: v_mov_b32_e32 v10, s65
@@ -173,20 +173,20 @@ define amdgpu_kernel void @v_insert_v64i32_varidx(<64 x i32> addrspace(1)* %out.
; GCN-NEXT: v_add_u32_e32 v61, 0xc4, v0
; GCN-NEXT: v_mov_b32_e32 v4, s37
; GCN-NEXT: s_and_b32 s7, s7, 63
; GCN-NEXT: s_movk_i32 s17, 0xf0
; GCN-NEXT: s_movk_i32 s19, 0xf0
; GCN-NEXT: v_add_u32_e32 v62, 0xc8, v0
; GCN-NEXT: v_mov_b32_e32 v5, s38
; GCN-NEXT: v_add_u32_e32 v63, 0xcc, v0
; GCN-NEXT: v_mov_b32_e32 v6, s39
; GCN-NEXT: v_add_u32_e32 v64, s15, v0
; GCN-NEXT: v_add_u32_e32 v64, s17, v0
; GCN-NEXT: v_mov_b32_e32 v8, s40
; GCN-NEXT: v_add_u32_e32 v65, 0xd4, v0
; GCN-NEXT: v_mov_b32_e32 v10, s41
; GCN-NEXT: v_add_u32_e32 v66, 0xd8, v0
; GCN-NEXT: v_mov_b32_e32 v12, s42
; GCN-NEXT: v_add_u32_e32 v67, 0xdc, v0
; GCN-NEXT: v_mov_b32_e32 v14, s43
; GCN-NEXT: v_add_u32_e32 v68, s16, v0
; GCN-NEXT: v_add_u32_e32 v68, s18, v0
; GCN-NEXT: v_mov_b32_e32 v16, s44
; GCN-NEXT: buffer_store_dword v4, v61, s[0:3], 0 offen
; GCN-NEXT: buffer_store_dword v5, v62, s[0:3], 0 offen
@@ -202,7 +202,7 @@ define amdgpu_kernel void @v_insert_v64i32_varidx(<64 x i32> addrspace(1)* %out.
; GCN-NEXT: v_mov_b32_e32 v5, s46
; GCN-NEXT: v_add_u32_e32 v71, 0xec, v0
; GCN-NEXT: v_mov_b32_e32 v6, s47
; GCN-NEXT: v_add_u32_e32 v72, s17, v0
; GCN-NEXT: v_add_u32_e32 v72, s19, v0
; GCN-NEXT: v_mov_b32_e32 v8, s48
; GCN-NEXT: v_add_u32_e32 v73, 0xf4, v0
; GCN-NEXT: v_mov_b32_e32 v10, s49
@@ -217,9 +217,9 @@ define amdgpu_kernel void @v_insert_v64i32_varidx(<64 x i32> addrspace(1)* %out.
; GCN-NEXT: v_mov_b32_e32 v4, s12
; GCN-NEXT: s_lshl_b32 s7, s7, 2
; GCN-NEXT: v_add_u32_e32 v75, 0xfc, v0
; GCN-NEXT: v_mov_b32_e32 v5, s51
; GCN-NEXT: v_mov_b32_e32 v14, s51
; GCN-NEXT: buffer_store_dword v4, off, s[0:3], 0 offset:256
; GCN-NEXT: buffer_store_dword v5, v75, s[0:3], 0 offen
; GCN-NEXT: buffer_store_dword v14, v75, s[0:3], 0 offen
; GCN-NEXT: v_mov_b32_e32 v4, s6
; GCN-NEXT: v_add_u32_e32 v0, s7, v0
; GCN-NEXT: buffer_store_dword v4, v0, s[0:3], 0 offen
@@ -289,78 +289,78 @@ define amdgpu_kernel void @v_insert_v64i32_varidx(<64 x i32> addrspace(1)* %out.
; GCN-NEXT: buffer_load_dword v0, off, s[0:3], 0 offset:256
; GCN-NEXT: s_add_u32 s6, s8, 16
; GCN-NEXT: s_addc_u32 s7, s9, 0
; GCN-NEXT: v_mov_b32_e32 v65, s9
; GCN-NEXT: v_mov_b32_e32 v67, s7
; GCN-NEXT: v_mov_b32_e32 v66, s6
; GCN-NEXT: s_add_u32 s6, s8, 32
; GCN-NEXT: v_mov_b32_e32 v64, s8
; GCN-NEXT: s_addc_u32 s7, s9, 0
; GCN-NEXT: v_mov_b32_e32 v65, s9
; GCN-NEXT: s_add_u32 s10, s8, 48
; GCN-NEXT: v_mov_b32_e32 v64, s8
; GCN-NEXT: s_addc_u32 s11, s9, 0
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: global_store_dwordx4 v[64:65], v[0:3], off
; GCN-NEXT: global_store_dwordx4 v[66:67], v[4:7], off
; GCN-NEXT: v_mov_b32_e32 v0, s6
; GCN-NEXT: v_mov_b32_e32 v1, s7
; GCN-NEXT: s_add_u32 s6, s8, 48
; GCN-NEXT: s_addc_u32 s7, s9, 0
; GCN-NEXT: v_mov_b32_e32 v2, s6
; GCN-NEXT: v_mov_b32_e32 v3, s7
; GCN-NEXT: s_add_u32 s6, s8, 64
; GCN-NEXT: v_mov_b32_e32 v2, s10
; GCN-NEXT: s_addc_u32 s7, s9, 0
; GCN-NEXT: v_mov_b32_e32 v3, s11
; GCN-NEXT: s_add_u32 s10, s8, s4
; GCN-NEXT: s_addc_u32 s11, s9, 0
; GCN-NEXT: s_add_u32 s4, s8, s5
; GCN-NEXT: global_store_dwordx4 v[0:1], v[8:11], off
; GCN-NEXT: global_store_dwordx4 v[2:3], v[12:15], off
; GCN-NEXT: s_addc_u32 s7, s9, 0
; GCN-NEXT: v_mov_b32_e32 v0, s6
; GCN-NEXT: s_addc_u32 s5, s9, 0
; GCN-NEXT: v_mov_b32_e32 v1, s7
; GCN-NEXT: s_add_u32 s6, s8, s4
; GCN-NEXT: s_addc_u32 s7, s9, 0
; GCN-NEXT: s_add_u32 s4, s8, s5
; GCN-NEXT: v_mov_b32_e32 v2, s6
; GCN-NEXT: v_mov_b32_e32 v3, s7
; GCN-NEXT: s_add_u32 s6, s8, s13
; GCN-NEXT: v_mov_b32_e32 v2, s10
; GCN-NEXT: v_mov_b32_e32 v3, s11
; GCN-NEXT: global_store_dwordx4 v[0:1], v[16:19], off
; GCN-NEXT: global_store_dwordx4 v[2:3], v[20:23], off
; GCN-NEXT: s_addc_u32 s5, s9, 0
; GCN-NEXT: s_addc_u32 s7, s9, 0
; GCN-NEXT: v_mov_b32_e32 v0, s4
; GCN-NEXT: v_mov_b32_e32 v1, s5
; GCN-NEXT: s_add_u32 s4, s8, s10
; GCN-NEXT: s_addc_u32 s5, s9, 0
; GCN-NEXT: v_mov_b32_e32 v2, s4
; GCN-NEXT: v_mov_b32_e32 v3, s5
; GCN-NEXT: s_add_u32 s4, s8, 0x80
; GCN-NEXT: v_mov_b32_e32 v2, s6
; GCN-NEXT: s_addc_u32 s5, s9, 0
; GCN-NEXT: v_mov_b32_e32 v3, s7
; GCN-NEXT: s_add_u32 s6, s8, s14
; GCN-NEXT: global_store_dwordx4 v[0:1], v[24:27], off
; GCN-NEXT: global_store_dwordx4 v[2:3], v[28:31], off
; GCN-NEXT: s_addc_u32 s5, s9, 0
; GCN-NEXT: s_addc_u32 s7, s9, 0
; GCN-NEXT: v_mov_b32_e32 v0, s4
; GCN-NEXT: v_mov_b32_e32 v1, s5
; GCN-NEXT: s_add_u32 s4, s8, s11
; GCN-NEXT: s_add_u32 s4, s8, s15
; GCN-NEXT: v_mov_b32_e32 v2, s6
; GCN-NEXT: s_addc_u32 s5, s9, 0
; GCN-NEXT: v_mov_b32_e32 v2, s4
; GCN-NEXT: v_mov_b32_e32 v3, s5
; GCN-NEXT: s_add_u32 s4, s8, s13
; GCN-NEXT: v_mov_b32_e32 v3, s7
; GCN-NEXT: s_add_u32 s6, s8, s16
; GCN-NEXT: global_store_dwordx4 v[0:1], v[32:35], off
; GCN-NEXT: global_store_dwordx4 v[2:3], v[36:39], off
; GCN-NEXT: s_addc_u32 s5, s9, 0
; GCN-NEXT: s_addc_u32 s7, s9, 0
; GCN-NEXT: v_mov_b32_e32 v0, s4
; GCN-NEXT: v_mov_b32_e32 v1, s5
; GCN-NEXT: s_add_u32 s4, s8, s14
; GCN-NEXT: s_addc_u32 s5, s9, 0
; GCN-NEXT: v_mov_b32_e32 v2, s4
; GCN-NEXT: v_mov_b32_e32 v3, s5
; GCN-NEXT: s_add_u32 s4, s8, 0xc0
; GCN-NEXT: v_mov_b32_e32 v2, s6
; GCN-NEXT: v_mov_b32_e32 v3, s7
; GCN-NEXT: global_store_dwordx4 v[0:1], v[40:43], off
; GCN-NEXT: global_store_dwordx4 v[2:3], v[44:47], off
; GCN-NEXT: s_addc_u32 s5, s9, 0
; GCN-NEXT: v_mov_b32_e32 v0, s4
; GCN-NEXT: v_mov_b32_e32 v1, s5
; GCN-NEXT: s_add_u32 s4, s8, s15
; GCN-NEXT: s_add_u32 s4, s8, s17
; GCN-NEXT: s_addc_u32 s5, s9, 0
; GCN-NEXT: v_mov_b32_e32 v2, s4
; GCN-NEXT: v_mov_b32_e32 v3, s5
; GCN-NEXT: s_add_u32 s4, s8, s16
; GCN-NEXT: s_add_u32 s4, s8, s18
; GCN-NEXT: global_store_dwordx4 v[0:1], v[48:51], off
; GCN-NEXT: global_store_dwordx4 v[2:3], v[52:55], off
; GCN-NEXT: s_addc_u32 s5, s9, 0
; GCN-NEXT: v_mov_b32_e32 v0, s4
; GCN-NEXT: v_mov_b32_e32 v1, s5
; GCN-NEXT: s_add_u32 s4, s8, s17
; GCN-NEXT: s_add_u32 s4, s8, s19
; GCN-NEXT: s_addc_u32 s5, s9, 0
; GCN-NEXT: v_mov_b32_e32 v2, s4
; GCN-NEXT: v_mov_b32_e32 v3, s5
8 changes: 4 additions & 4 deletions llvm/test/CodeGen/AMDGPU/call-argument-types.ll
@@ -744,13 +744,13 @@ entry:

; GCN-LABEL: {{^}}tail_call_byval_align16:
; GCN-NOT: s32
; GCN: buffer_load_dword v32, off, s[0:3], s32 offset:12
; GCN: buffer_load_dword v33, off, s[0:3], s32 offset:8
; GCN: buffer_load_dword v32, off, s[0:3], s32 offset:8
; GCN: buffer_load_dword v33, off, s[0:3], s32 offset:12

; GCN: s_getpc_b64

; GCN: buffer_store_dword v32, off, s[0:3], s32 offset:4
; GCN: buffer_store_dword v33, off, s[0:3], s32{{$}}
; GCN: buffer_store_dword v33, off, s[0:3], s32 offset:4
; GCN: buffer_store_dword v32, off, s[0:3], s32{{$}}
; GCN-NOT: s32
; GCN: s_setpc_b64
define void @tail_call_byval_align16(<32 x i32> %val, double %tmp) #0 {
8 changes: 3 additions & 5 deletions llvm/test/CodeGen/AMDGPU/callee-special-input-vgprs.ll
@@ -624,11 +624,10 @@ define void @too_many_args_use_workitem_id_x_byval(


; FIXEDABI: v_mov_b32_e32 [[K0:v[0-9]+]], 0x3e7
; FIXEDABI: buffer_store_dword [[K0]], off, s[0:3], 0 offset:4{{$}}

; FIXEDABI: s_movk_i32 s32, 0x400{{$}}

; FIXEDABI: v_mov_b32_e32 [[K1:v[0-9]+]], 0x140
; FIXEDABI: buffer_store_dword [[K0]], off, s[0:3], 0 offset:4{{$}}

; FIXEDABI: buffer_store_dword [[K1]], off, s[0:3], s32{{$}}

; FIXME: Why this reload?
@@ -670,9 +669,8 @@ define amdgpu_kernel void @kern_call_too_many_args_use_workitem_id_x_byval() #1

; FIXED-ABI-NOT: v31
; FIXEDABI: v_mov_b32_e32 [[K0:v[0-9]+]], 0x3e7{{$}}
; FIXEDABI: buffer_store_dword [[K0]], off, s[0:3], s33{{$}}

; FIXEDABI: v_mov_b32_e32 [[K1:v[0-9]+]], 0x140{{$}}
; FIXEDABI: buffer_store_dword [[K0]], off, s[0:3], s33{{$}}
; FIXEDABI: buffer_store_dword [[K1]], off, s[0:3], s32{{$}}
; FIXEDABI: buffer_load_dword [[RELOAD_BYVAL:v[0-9]+]], off, s[0:3], s33{{$}}

4 changes: 2 additions & 2 deletions llvm/test/CodeGen/AMDGPU/fshr.ll
@@ -1364,11 +1364,11 @@ define <2 x i24> @v_fshr_v2i24(<2 x i24> %src0, <2 x i24> %src1, <2 x i24> %src2
; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2
; GFX9-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc
; GFX9-NEXT: v_lshrrev_b32_e32 v3, 8, v2
; GFX9-NEXT: buffer_store_byte_d16_hi v2, v0, s[0:3], 0 offen offset:5
; GFX9-NEXT: buffer_store_byte v3, v0, s[0:3], 0 offen offset:4
; GFX9-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:3
; GFX9-NEXT: buffer_store_byte_d16_hi v1, v0, s[0:3], 0 offen offset:2
; GFX9-NEXT: buffer_store_short v1, v0, s[0:3], 0 offen
; GFX9-NEXT: buffer_store_byte_d16_hi v2, v0, s[0:3], 0 offen offset:5
; GFX9-NEXT: buffer_store_byte v3, v0, s[0:3], 0 offen offset:4
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
2 changes: 1 addition & 1 deletion llvm/test/CodeGen/AMDGPU/half.ll
@@ -312,6 +312,7 @@ define amdgpu_kernel void @global_extload_v8f16_to_v8f32(<8 x float> addrspace(1
; SI: v_cvt_f32_f16_e32
; SI: v_cvt_f32_f16_e32
; SI: v_cvt_f32_f16_e32
; SI: v_cvt_f32_f16_e32

; GCN: flat_store_dwordx4

@@ -325,7 +326,6 @@ define amdgpu_kernel void @global_extload_v8f16_to_v8f32(<8 x float> addrspace(1
; SI: v_cvt_f32_f16_e32
; SI: v_cvt_f32_f16_e32
; SI: v_cvt_f32_f16_e32
; SI: v_cvt_f32_f16_e32

; VI: v_cvt_f32_f16_e32
; VI: v_cvt_f32_f16_sdwa
