Skip to content

Commit 6daf84e

Browse files
committed
[DAG] Fold mismatched widened avg idioms to narrow form (#147946)
This fold corrects mismatched widened averaging idioms by folding `trunc(avgceilu(sext(x), sext(y))) -> avgceils(x, y)` and `trunc(avgceils(zext(x), zext(y))) -> avgceilu(x, y)`. When both inputs are sign-extended (or both zero-extended) from the narrow type, the unsigned and signed averaging operations produce identical results after truncation, allowing us to use the semantically correct narrow operation. alive2: https://alive2.llvm.org/ce/z/ZRbfHT
1 parent 53ddeb4 commit 6daf84e

File tree

2 files changed

+103
-2
lines changed

2 files changed

+103
-2
lines changed

llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp

Lines changed: 22 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -16482,10 +16482,30 @@ SDValue DAGCombiner::visitTRUNCATE(SDNode *N) {
1648216482
DAG, DL);
1648316483
}
1648416484
break;
16485-
case ISD::AVGFLOORS:
16486-
case ISD::AVGFLOORU:
1648716485
case ISD::AVGCEILS:
1648816486
case ISD::AVGCEILU:
16487+
// trunc (avgceilu (sext (x), sext (y))) -> avgceils(x, y)
16488+
// trunc (avgceils (zext (x), zext (y))) -> avgceilu(x, y)
16489+
if (N0.hasOneUse()) {
16490+
SDValue Op0 = N0.getOperand(0);
16491+
SDValue Op1 = N0.getOperand(1);
16492+
if (N0.getOpcode() == ISD::AVGCEILU) {
16493+
if (TLI.isOperationLegalOrCustom(ISD::AVGCEILS, VT) &&
16494+
(Op0.getOpcode() == ISD::SIGN_EXTEND ||
16495+
Op0.getOpcode() == ISD::SIGN_EXTEND_INREG) &&
16496+
(Op1.getOpcode() == ISD::SIGN_EXTEND ||
16497+
Op1.getOpcode() == ISD::SIGN_EXTEND_INREG))
16498+
return DAG.getNode(ISD::AVGCEILS, DL, VT, Op0.getOperand(0),
16499+
Op1.getOperand(0));
16500+
} else {
16501+
if (TLI.isOperationLegalOrCustom(ISD::AVGCEILU, VT) &&
16502+
Op0.getOpcode() == ISD::ZERO_EXTEND &&
16503+
Op1.getOpcode() == ISD::ZERO_EXTEND)
16504+
return DAG.getNode(ISD::AVGCEILU, DL, VT, Op0.getOperand(0),
16505+
Op1.getOperand(0));
16506+
}
16507+
}
16508+
[[fallthrough]];
1648916509
case ISD::ABDS:
1649016510
case ISD::ABDU:
1649116511
// (trunc (avg a, b)) -> (avg (trunc a), (trunc b))

llvm/test/CodeGen/AArch64/sve-hadd.ll

Lines changed: 81 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,87 @@
22
; RUN: llc < %s -mtriple=aarch64-none-linux-gnu -mattr=+sve | FileCheck %s -check-prefixes=CHECK,SVE
33
; RUN: llc < %s -mtriple=aarch64-none-linux-gnu -mattr=+sve2 | FileCheck %s -check-prefixes=CHECK,SVE2
44

5+
define <8 x i8> @srhadd_v8i8_trunc(<8 x i8> %s0, <8 x i8> %s1) {
6+
; CHECK-LABEL: srhadd_v8i8_trunc:
7+
; CHECK: // %bb.0:
8+
; CHECK-NEXT: srhadd v0.8b, v0.8b, v1.8b
9+
; CHECK-NEXT: ret
10+
%s0s = sext <8 x i8> %s0 to <8 x i16>
11+
%s1s = sext <8 x i8> %s1 to <8 x i16>
12+
%s = call <8 x i16> @llvm.aarch64.neon.urhadd.v8i16(<8 x i16> %s0s, <8 x i16> %s1s)
13+
%s2 = trunc <8 x i16> %s to <8 x i8>
14+
ret <8 x i8> %s2
15+
}
16+
17+
define <4 x i16> @srhadd_v4i16_trunc(<4 x i16> %s0, <4 x i16> %s1) {
18+
; CHECK-LABEL: srhadd_v4i16_trunc:
19+
; CHECK: // %bb.0:
20+
; CHECK-NEXT: srhadd v0.4h, v0.4h, v1.4h
21+
; CHECK-NEXT: ret
22+
%s0s = sext <4 x i16> %s0 to <4 x i32>
23+
%s1s = sext <4 x i16> %s1 to <4 x i32>
24+
%s = call <4 x i32> @llvm.aarch64.neon.urhadd.v4i32(<4 x i32> %s0s, <4 x i32> %s1s)
25+
%s2 = trunc <4 x i32> %s to <4 x i16>
26+
ret <4 x i16> %s2
27+
}
28+
29+
define <2 x i32> @srhadd_v2i32_trunc(<2 x i32> %s0, <2 x i32> %s1) {
30+
; CHECK-LABEL: srhadd_v2i32_trunc:
31+
; CHECK: // %bb.0:
32+
; CHECK-NEXT: sshll v0.2d, v0.2s, #0
33+
; CHECK-NEXT: sshll v1.2d, v1.2s, #0
34+
; CHECK-NEXT: eor v2.16b, v0.16b, v1.16b
35+
; CHECK-NEXT: orr v0.16b, v0.16b, v1.16b
36+
; CHECK-NEXT: ushr v1.2d, v2.2d, #1
37+
; CHECK-NEXT: sub v0.2d, v0.2d, v1.2d
38+
; CHECK-NEXT: xtn v0.2s, v0.2d
39+
; CHECK-NEXT: ret
40+
%s0s = sext <2 x i32> %s0 to <2 x i64>
41+
%s1s = sext <2 x i32> %s1 to <2 x i64>
42+
%s = call <2 x i64> @llvm.aarch64.neon.urhadd.v2i64(<2 x i64> %s0s, <2 x i64> %s1s)
43+
%s2 = trunc <2 x i64> %s to <2 x i32>
44+
ret <2 x i32> %s2
45+
}
46+
47+
define <8 x i8> @urhadd_v8i8_trunc(<8 x i8> %s0, <8 x i8> %s1) {
48+
; CHECK-LABEL: urhadd_v8i8_trunc:
49+
; CHECK: // %bb.0:
50+
; CHECK-NEXT: urhadd v0.8b, v0.8b, v1.8b
51+
; CHECK-NEXT: ret
52+
%s0s = zext <8 x i8> %s0 to <8 x i16>
53+
%s1s = zext <8 x i8> %s1 to <8 x i16>
54+
%s = call <8 x i16> @llvm.aarch64.neon.srhadd.v8i16(<8 x i16> %s0s, <8 x i16> %s1s)
55+
%s2 = trunc <8 x i16> %s to <8 x i8>
56+
ret <8 x i8> %s2
57+
}
58+
59+
define <4 x i16> @urhadd_v4i16_trunc(<4 x i16> %s0, <4 x i16> %s1) {
60+
; CHECK-LABEL: urhadd_v4i16_trunc:
61+
; CHECK: // %bb.0:
62+
; CHECK-NEXT: urhadd v0.4h, v0.4h, v1.4h
63+
; CHECK-NEXT: ret
64+
%s0s = zext <4 x i16> %s0 to <4 x i32>
65+
%s1s = zext <4 x i16> %s1 to <4 x i32>
66+
%s = call <4 x i32> @llvm.aarch64.neon.srhadd.v4i32(<4 x i32> %s0s, <4 x i32> %s1s)
67+
%s2 = trunc <4 x i32> %s to <4 x i16>
68+
ret <4 x i16> %s2
69+
}
70+
71+
define <2 x i32> @urhadd_v2i32_trunc(<2 x i32> %s0, <2 x i32> %s1) {
72+
; CHECK-LABEL: urhadd_v2i32_trunc:
73+
; CHECK: // %bb.0:
74+
; CHECK-NEXT: mov z2.d, #1 // =0x1
75+
; CHECK-NEXT: uaddl v0.2d, v0.2s, v1.2s
76+
; CHECK-NEXT: add v0.2d, v0.2d, v2.2d
77+
; CHECK-NEXT: shrn v0.2s, v0.2d, #1
78+
; CHECK-NEXT: ret
79+
%s0s = zext <2 x i32> %s0 to <2 x i64>
80+
%s1s = zext <2 x i32> %s1 to <2 x i64>
81+
%s = call <2 x i64> @llvm.aarch64.neon.srhadd.v2i64(<2 x i64> %s0s, <2 x i64> %s1s)
82+
%s2 = trunc <2 x i64> %s to <2 x i32>
83+
ret <2 x i32> %s2
84+
}
85+
586
define <vscale x 2 x i64> @hadds_v2i64(<vscale x 2 x i64> %s0, <vscale x 2 x i64> %s1) {
687
; SVE-LABEL: hadds_v2i64:
788
; SVE: // %bb.0: // %entry

0 commit comments

Comments
 (0)