[GlobalISel] Add X,Y<dead> = G_UNMERGE Z -> X = G_TRUNC Z
Add a combiner helper that replaces a G_UNMERGE with a G_TRUNC when all of its
destination lanes except the first one are dead.
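
For instance (mirroring the new AArch64 test below), the combine rewrites

  %1:_(s16), %2:_(s16), %3:_(s16), %4:_(s16) = G_UNMERGE_VALUES %0(s64)
  $h0 = COPY %1(s16)

where %2, %3, and %4 are unused, into

  %1:_(s16) = G_TRUNC %0(s64)
  $h0 = COPY %1(s16)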

Differential Revision: https://reviews.llvm.org/D87174
Quentin Colombet committed Sep 15, 2020
1 parent 8bd0dc5 commit d232112
Showing 9 changed files with 1,581 additions and 100 deletions.
4 changes: 4 additions & 0 deletions llvm/include/llvm/CodeGen/GlobalISel/CombinerHelper.h
@@ -258,6 +258,10 @@ class CombinerHelper {
  bool applyCombineUnmergeConstant(MachineInstr &MI,
                                   SmallVectorImpl<APInt> &Csts);

  /// Transform X, Y<dead> = G_UNMERGE Z -> X = G_TRUNC Z.
  bool matchCombineUnmergeWithDeadLanesToTrunc(MachineInstr &MI);
  bool applyCombineUnmergeWithDeadLanesToTrunc(MachineInstr &MI);

  /// Transform IntToPtr(PtrToInt(x)) to x if cast is in the same address space.
  bool matchCombineI2PToP2I(MachineInstr &MI, Register &Reg);
  bool applyCombineI2PToP2I(MachineInstr &MI, Register &Reg);
10 changes: 9 additions & 1 deletion llvm/include/llvm/Target/GlobalISel/Combine.td
@@ -421,6 +421,14 @@ def unmerge_cst : GICombineRule<
(apply [{ return Helper.applyCombineUnmergeConstant(*${d}, ${info}); }])
>;

// Transform x,y<dead> = unmerge z -> x = trunc z.
def unmerge_dead_to_trunc : GICombineRule<
  (defs root:$d),
  (match (wip_match_opcode G_UNMERGE_VALUES): $d,
    [{ return Helper.matchCombineUnmergeWithDeadLanesToTrunc(*${d}); }]),
  (apply [{ return Helper.applyCombineUnmergeWithDeadLanesToTrunc(*${d}); }])
>;

// FIXME: These should use the custom predicate feature once it lands.
def undef_combines : GICombineGroup<[undef_to_fp_zero, undef_to_int_zero,
undef_to_negative_one,
@@ -452,4 +460,4 @@ def all_combines : GICombineGroup<[trivial_combines, ptr_add_immed_chain,
width_reduction_combines, select_combines,
known_bits_simplifications, ext_ext_fold,
not_cmp_fold, opt_brcond_by_inverting_cond,
unmerge_merge, fabs_fabs_fold, unmerge_cst]>;
unmerge_merge, fabs_fabs_fold, unmerge_cst, unmerge_dead_to_trunc]>;
33 changes: 33 additions & 0 deletions llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp
@@ -1654,6 +1654,39 @@ bool CombinerHelper::applyCombineUnmergeConstant(MachineInstr &MI,
return true;
}

bool CombinerHelper::matchCombineUnmergeWithDeadLanesToTrunc(MachineInstr &MI) {
  assert(MI.getOpcode() == TargetOpcode::G_UNMERGE_VALUES &&
         "Expected an unmerge");
  // Check that all the lanes are dead except the first one.
  for (unsigned Idx = 1, EndIdx = MI.getNumDefs(); Idx != EndIdx; ++Idx) {
    if (!MRI.use_nodbg_empty(MI.getOperand(Idx).getReg()))
      return false;
  }
  return true;
}

bool CombinerHelper::applyCombineUnmergeWithDeadLanesToTrunc(MachineInstr &MI) {
  Builder.setInstrAndDebugLoc(MI);
  Register SrcReg = MI.getOperand(MI.getNumDefs()).getReg();
  // Truncating a vector is going to truncate every single lane,
  // whereas we want the full lowbits.
  // Do the operation on a scalar instead.
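  // E.g., when unmerging two <2 x s16> values from a <2 x s32> source, a
  // vector G_TRUNC would keep the low 16 bits of each 32-bit lane, whereas
  // the first unmerge result is the low 32 bits of the whole 64-bit value.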
  LLT SrcTy = MRI.getType(SrcReg);
  if (SrcTy.isVector())
    SrcReg =
        Builder.buildCast(LLT::scalar(SrcTy.getSizeInBits()), SrcReg).getReg(0);

  Register Dst0Reg = MI.getOperand(0).getReg();
  LLT Dst0Ty = MRI.getType(Dst0Reg);
  if (Dst0Ty.isVector()) {
    auto MIB = Builder.buildTrunc(LLT::scalar(Dst0Ty.getSizeInBits()), SrcReg);
    Builder.buildCast(Dst0Reg, MIB);
  } else
    Builder.buildTrunc(Dst0Reg, SrcReg);
  MI.eraseFromParent();
  return true;
}

bool CombinerHelper::matchCombineShiftToUnmerge(MachineInstr &MI,
unsigned TargetShiftSize,
unsigned &ShiftVal) {
77 changes: 77 additions & 0 deletions llvm/test/CodeGen/AArch64/GlobalISel/combine-unmerge.mir
@@ -292,3 +292,80 @@ body: |
$h2 = COPY %3(s16)
$h3 = COPY %4(s16)
...

# Transform unmerge into trunc when only the first definition is live.
---
name: test_combine_unmerge_dead_to_trunc
body: |
bb.1:
; CHECK-LABEL: name: test_combine_unmerge_dead_to_trunc
; CHECK: [[COPY:%[0-9]+]]:_(s64) = COPY $x0
; CHECK: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[COPY]](s64)
; CHECK: $h0 = COPY [[TRUNC]](s16)
%0:_(s64) = COPY $x0
%1:_(s16),%2:_(s16),%3:_(s16),%4:_(s16) = G_UNMERGE_VALUES %0(s64)
$h0 = COPY %1(s16)
...

# Don't transform unmerge into trunc when middle lanes are live.
---
name: test_dont_combine_unmerge_dead_to_trunc
body: |
bb.1:
; CHECK-LABEL: name: test_dont_combine_unmerge_dead_to_trunc
; CHECK: [[COPY:%[0-9]+]]:_(s64) = COPY $x0
; CHECK: [[UV:%[0-9]+]]:_(s16), [[UV1:%[0-9]+]]:_(s16), [[UV2:%[0-9]+]]:_(s16), [[UV3:%[0-9]+]]:_(s16) = G_UNMERGE_VALUES [[COPY]](s64)
; CHECK: $h0 = COPY [[UV2]](s16)
%0:_(s64) = COPY $x0
%1:_(s16),%2:_(s16),%3:_(s16),%4:_(s16) = G_UNMERGE_VALUES %0(s64)
$h0 = COPY %3(s16)
...

# Transform unmerge into trunc when only the first definition is live, even
# if the input and output types are vectors.
---
name: test_combine_unmerge_dead_to_trunc_vec_in_n_out
body: |
bb.1:
; CHECK-LABEL: name: test_combine_unmerge_dead_to_trunc_vec_in_n_out
; CHECK: [[COPY:%[0-9]+]]:_(<2 x s32>) = COPY $x0
; CHECK: [[BITCAST:%[0-9]+]]:_(s64) = G_BITCAST [[COPY]](<2 x s32>)
; CHECK: [[TRUNC:%[0-9]+]]:_(s32) = G_TRUNC [[BITCAST]](s64)
; CHECK: [[BITCAST1:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[TRUNC]](s32)
; CHECK: $w0 = COPY [[BITCAST1]](<2 x s16>)
%0:_(<2 x s32>) = COPY $x0
%1:_(<2 x s16>),%2:_(<2 x s16>) = G_UNMERGE_VALUES %0(<2 x s32>)
$w0 = COPY %1(<2 x s16>)
...

# Transform unmerge into trunc when only the first definition is live, even
# if the input type is a vector.
---
name: test_combine_unmerge_dead_to_trunc_vec_in
body: |
bb.1:
; CHECK-LABEL: name: test_combine_unmerge_dead_to_trunc_vec_in
; CHECK: [[COPY:%[0-9]+]]:_(<2 x s32>) = COPY $x0
; CHECK: [[BITCAST:%[0-9]+]]:_(s64) = G_BITCAST [[COPY]](<2 x s32>)
; CHECK: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST]](s64)
; CHECK: $h0 = COPY [[TRUNC]](s16)
%0:_(<2 x s32>) = COPY $x0
%1:_(s16),%2:_(s16),%3:_(s16),%4:_(s16) = G_UNMERGE_VALUES %0(<2 x s32>)
$h0 = COPY %1(s16)
...

# Transform unmerge into trunc when only the first definition is live, even
# if the output type is a vector.
---
name: test_combine_unmerge_dead_to_trunc_vec_out
body: |
bb.1:
; CHECK-LABEL: name: test_combine_unmerge_dead_to_trunc_vec_out
; CHECK: [[COPY:%[0-9]+]]:_(s64) = COPY $x0
; CHECK: [[TRUNC:%[0-9]+]]:_(s32) = G_TRUNC [[COPY]](s64)
; CHECK: [[BITCAST:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[TRUNC]](s32)
; CHECK: $w0 = COPY [[BITCAST]](<2 x s16>)
%0:_(s64) = COPY $x0
%1:_(<2 x s16>),%2:_(<2 x s16>) = G_UNMERGE_VALUES %0(s64)
$w0 = COPY %1(<2 x s16>)
...
16 changes: 8 additions & 8 deletions llvm/test/CodeGen/AMDGPU/GlobalISel/combine-shl-narrow.mir
@@ -12,9 +12,9 @@ body: |
; CHECK-LABEL: name: narrow_shl_s64_32_s64amt
; CHECK: liveins: $vgpr0_vgpr1
; CHECK: [[COPY:%[0-9]+]]:_(s64) = COPY $vgpr0_vgpr1
; CHECK: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[COPY]](s64)
; CHECK: [[TRUNC:%[0-9]+]]:_(s32) = G_TRUNC [[COPY]](s64)
; CHECK: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
; CHECK: [[MV:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[C]](s32), [[UV]](s32)
; CHECK: [[MV:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[C]](s32), [[TRUNC]](s32)
; CHECK: $vgpr0_vgpr1 = COPY [[MV]](s64)
%0:_(s64) = COPY $vgpr0_vgpr1
%1:_(s64) = G_CONSTANT i64 32
@@ -32,9 +32,9 @@ body: |
; CHECK-LABEL: name: narrow_shl_s64_32
; CHECK: liveins: $vgpr0_vgpr1
; CHECK: [[COPY:%[0-9]+]]:_(s64) = COPY $vgpr0_vgpr1
; CHECK: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[COPY]](s64)
; CHECK: [[TRUNC:%[0-9]+]]:_(s32) = G_TRUNC [[COPY]](s64)
; CHECK: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
; CHECK: [[MV:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[C]](s32), [[UV]](s32)
; CHECK: [[MV:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[C]](s32), [[TRUNC]](s32)
; CHECK: $vgpr0_vgpr1 = COPY [[MV]](s64)
%0:_(s64) = COPY $vgpr0_vgpr1
%1:_(s32) = G_CONSTANT i32 32
@@ -52,9 +52,9 @@ body: |
; CHECK-LABEL: name: narrow_shl_s64_33
; CHECK: liveins: $vgpr0_vgpr1
; CHECK: [[COPY:%[0-9]+]]:_(s64) = COPY $vgpr0_vgpr1
; CHECK: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[COPY]](s64)
; CHECK: [[TRUNC:%[0-9]+]]:_(s32) = G_TRUNC [[COPY]](s64)
; CHECK: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 1
; CHECK: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[UV]], [[C]](s32)
; CHECK: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[TRUNC]], [[C]](s32)
; CHECK: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
; CHECK: [[MV:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[C1]](s32), [[SHL]](s32)
; CHECK: $vgpr0_vgpr1 = COPY [[MV]](s64)
@@ -93,9 +93,9 @@ body: |
; CHECK-LABEL: name: narrow_shl_s64_63
; CHECK: liveins: $vgpr0_vgpr1
; CHECK: [[COPY:%[0-9]+]]:_(s64) = COPY $vgpr0_vgpr1
; CHECK: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[COPY]](s64)
; CHECK: [[TRUNC:%[0-9]+]]:_(s32) = G_TRUNC [[COPY]](s64)
; CHECK: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 31
; CHECK: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[UV]], [[C]](s32)
; CHECK: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[TRUNC]], [[C]](s32)
; CHECK: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
; CHECK: [[MV:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[C1]](s32), [[SHL]](s32)
; CHECK: $vgpr0_vgpr1 = COPY [[MV]](s64)
@@ -110,15 +110,16 @@ define amdgpu_ps void @image_store_v3f16(<8 x i32> inreg %rsrc, i32 %s, i32 %t,
; UNPACKED: [[COPY11:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr3
; UNPACKED: [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32)
; UNPACKED: [[DEF:%[0-9]+]]:_(<2 x s16>) = G_IMPLICIT_DEF
; UNPACKED: [[BUILD_VECTOR1:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32)
; UNPACKED: [[BITCAST:%[0-9]+]]:_(s32) = G_BITCAST [[COPY10]](<2 x s16>)
; UNPACKED: [[CONCAT_VECTORS:%[0-9]+]]:_(<6 x s16>) = G_CONCAT_VECTORS [[COPY10]](<2 x s16>), [[COPY11]](<2 x s16>), [[DEF]](<2 x s16>)
; UNPACKED: [[BITCAST:%[0-9]+]]:_(s96) = G_BITCAST [[CONCAT_VECTORS]](<6 x s16>)
; UNPACKED: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[BITCAST]](s96)
; UNPACKED: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 16
; UNPACKED: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST]], [[C]](s32)
; UNPACKED: [[BITCAST1:%[0-9]+]]:_(s32) = G_BITCAST [[COPY11]](<2 x s16>)
; UNPACKED: [[LSHR1:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST1]], [[C]](s32)
; UNPACKED: [[COPY12:%[0-9]+]]:_(s32) = COPY [[BITCAST]](s32)
; UNPACKED: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[UV]], [[C]](s32)
; UNPACKED: [[LSHR1:%[0-9]+]]:_(s32) = G_LSHR [[UV1]], [[C]](s32)
; UNPACKED: [[BUILD_VECTOR1:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32)
; UNPACKED: [[COPY12:%[0-9]+]]:_(s32) = COPY [[UV]](s32)
; UNPACKED: [[COPY13:%[0-9]+]]:_(s32) = COPY [[LSHR]](s32)
; UNPACKED: [[COPY14:%[0-9]+]]:_(s32) = COPY [[BITCAST1]](s32)
; UNPACKED: [[COPY14:%[0-9]+]]:_(s32) = COPY [[UV1]](s32)
; UNPACKED: [[BUILD_VECTOR2:%[0-9]+]]:_(<3 x s32>) = G_BUILD_VECTOR [[COPY12]](s32), [[COPY13]](s32), [[COPY14]](s32)
; UNPACKED: G_AMDGPU_INTRIN_IMAGE_STORE intrinsic(@llvm.amdgcn.image.store.2d), [[BUILD_VECTOR2]](<3 x s32>), 7, [[BUILD_VECTOR1]](<2 x s32>), $noreg, [[BUILD_VECTOR]](<8 x s32>), 0, 0, 0 :: (dereferenceable store 6 into custom "TargetCustom8", align 8)
; UNPACKED: S_ENDPGM 0
Expand All @@ -140,9 +141,29 @@ define amdgpu_ps void @image_store_v3f16(<8 x i32> inreg %rsrc, i32 %s, i32 %t,
; PACKED: [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32)
; PACKED: [[DEF:%[0-9]+]]:_(<2 x s16>) = G_IMPLICIT_DEF
; PACKED: [[CONCAT_VECTORS:%[0-9]+]]:_(<6 x s16>) = G_CONCAT_VECTORS [[COPY10]](<2 x s16>), [[COPY11]](<2 x s16>), [[DEF]](<2 x s16>)
; PACKED: [[UV:%[0-9]+]]:_(<3 x s16>), [[UV1:%[0-9]+]]:_(<3 x s16>) = G_UNMERGE_VALUES [[CONCAT_VECTORS]](<6 x s16>)
; PACKED: [[BITCAST:%[0-9]+]]:_(s96) = G_BITCAST [[CONCAT_VECTORS]](<6 x s16>)
; PACKED: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[BITCAST]](s96)
; PACKED: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 16
; PACKED: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[UV]], [[C]](s32)
; PACKED: [[LSHR1:%[0-9]+]]:_(s32) = G_LSHR [[UV1]], [[C]](s32)
; PACKED: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 65535
; PACKED: [[COPY12:%[0-9]+]]:_(s32) = COPY [[UV]](s32)
; PACKED: [[AND:%[0-9]+]]:_(s32) = G_AND [[COPY12]], [[C1]]
; PACKED: [[COPY13:%[0-9]+]]:_(s32) = COPY [[LSHR]](s32)
; PACKED: [[AND1:%[0-9]+]]:_(s32) = G_AND [[COPY13]], [[C1]]
; PACKED: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[AND1]], [[C]](s32)
; PACKED: [[OR:%[0-9]+]]:_(s32) = G_OR [[AND]], [[SHL]]
; PACKED: [[BITCAST1:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[OR]](s32)
; PACKED: [[COPY14:%[0-9]+]]:_(s32) = COPY [[UV1]](s32)
; PACKED: [[AND2:%[0-9]+]]:_(s32) = G_AND [[COPY14]], [[C1]]
; PACKED: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
; PACKED: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[C2]], [[C]](s32)
; PACKED: [[OR1:%[0-9]+]]:_(s32) = G_OR [[AND2]], [[SHL1]]
; PACKED: [[BITCAST2:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[OR1]](s32)
; PACKED: [[CONCAT_VECTORS1:%[0-9]+]]:_(<6 x s16>) = G_CONCAT_VECTORS [[BITCAST1]](<2 x s16>), [[BITCAST2]](<2 x s16>), [[DEF]](<2 x s16>)
; PACKED: [[EXTRACT:%[0-9]+]]:_(<3 x s16>) = G_EXTRACT [[CONCAT_VECTORS1]](<6 x s16>), 0
; PACKED: [[BUILD_VECTOR1:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32)
; PACKED: G_AMDGPU_INTRIN_IMAGE_STORE intrinsic(@llvm.amdgcn.image.store.2d), [[UV]](<3 x s16>), 7, [[BUILD_VECTOR1]](<2 x s32>), $noreg, [[BUILD_VECTOR]](<8 x s32>), 0, 0, 0 :: (dereferenceable store 6 into custom "TargetCustom8", align 8)
; PACKED: G_AMDGPU_INTRIN_IMAGE_STORE intrinsic(@llvm.amdgcn.image.store.2d), [[EXTRACT]](<3 x s16>), 7, [[BUILD_VECTOR1]](<2 x s32>), $noreg, [[BUILD_VECTOR]](<8 x s32>), 0, 0, 0 :: (dereferenceable store 6 into custom "TargetCustom8", align 8)
; PACKED: S_ENDPGM 0
call void @llvm.amdgcn.image.store.2d.v3f16.i32(<3 x half> %in, i32 7, i32 %s, i32 %t, <8 x i32> %rsrc, i32 0, i32 0)
ret void
