Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[RISCV][ISel] Remove redundant min/max in saturating truncation #75145

Merged
merged 10 commits into from
Dec 29, 2023

Conversation

sun-jacobi
Copy link
Member

@sun-jacobi sun-jacobi commented Dec 12, 2023

This patch closed #73424, which is also a missed-optimization case similar to #68466 on X86.

Source Code

define void @trunc_sat_i8i16(ptr %x, ptr %y) {
  %1 = load <8 x i16>, ptr %x, align 16
  %2 = tail call <8 x i16> @llvm.smax.v8i16(<8 x i16> %1, <8 x i16> <i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128>)
  %3 = tail call <8 x i16> @llvm.smin.v8i16(<8 x i16> %2, <8 x i16> <i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127>)
  %4 = trunc <8 x i16> %3 to <8 x i8>
  store <8 x i8> %4, ptr %y, align 8
  ret void
}

Before this patch:

trunc_sat_i8i16:                  # @trunc_maxmin_id_i8i16
        vsetivli        zero, 8, e16, m1, ta, ma
        vle16.v v8, (a0)
        li      a0, -128
        vmax.vx v8, v8, a0
        li      a0, 127
        vmin.vx v8, v8, a0
        vsetvli zero, zero, e8, mf2, ta, ma
        vnsrl.wi        v8, v8, 0
        vse8.v  v8, (a1)
        ret

After this patch:

trunc_sat_i8i16:                  # @trunc_maxmin_id_i8i16
	vsetivli	zero, 8, e8, mf2, ta, ma
	vle16.v	v8, (a0)
	csrwi	vxrm, 0
	vnclip.wi	v8, v8, 0
	vse8.v	v8, (a1)
	ret

@llvmbot
Copy link
Collaborator

llvmbot commented Dec 12, 2023

@llvm/pr-subscribers-backend-risc-v

Author: Chia (sun-jacobi)

Changes

This patch is aiming at fixing a missed-optimization case similar to #68466 on X86.

Source Code

define void @<!-- -->trunc_maxmin_id_i8i16(ptr %x, ptr %y) {
  %1 = load &lt;8 x i16&gt;, ptr %x, align 16
  %2 = tail call &lt;8 x i16&gt; @<!-- -->llvm.smax.v8i16(&lt;8 x i16&gt; %1, &lt;8 x i16&gt; &lt;i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128&gt;)
  %3 = tail call &lt;8 x i16&gt; @<!-- -->llvm.smin.v8i16(&lt;8 x i16&gt; %2, &lt;8 x i16&gt; &lt;i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127&gt;)
  %4 = trunc &lt;8 x i16&gt; %3 to &lt;8 x i8&gt;
  store &lt;8 x i8&gt; %4, ptr %y, align 8
  ret void
}

Before this patch:

trunc_maxmin_id_i8i16:                  # @<!-- -->trunc_maxmin_id_i8i16
        vsetivli        zero, 8, e16, m1, ta, ma
        vle16.v v8, (a0)
        li      a0, -128
        vmax.vx v8, v8, a0
        li      a0, 127
        vmin.vx v8, v8, a0
        vsetvli zero, zero, e8, mf2, ta, ma
        vnsrl.wi        v8, v8, 0
        vse8.v  v8, (a1)
        ret

After this patch:

trunc_maxmin_id_i8i16:                  # @<!-- -->trunc_maxmin_id_i8i16
	vsetivli	zero, 8, e8, mf2, ta, ma
	vle16.v	v8, (a0)
	vnsrl.wi	v8, v8, 0
	vse8.v	v8, (a1)
	ret

This issue is also inspired by #73424, but not using vnclip


Full diff: https://github.com/llvm/llvm-project/pull/75145.diff

1 Files Affected:

  • (modified) llvm/lib/Target/RISCV/RISCVInstrInfoVVLPatterns.td (+54)
diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfoVVLPatterns.td b/llvm/lib/Target/RISCV/RISCVInstrInfoVVLPatterns.td
index dc6b57fad3210..91eb9d775682c 100644
--- a/llvm/lib/Target/RISCV/RISCVInstrInfoVVLPatterns.td
+++ b/llvm/lib/Target/RISCV/RISCVInstrInfoVVLPatterns.td
@@ -1618,6 +1618,60 @@ multiclass VPatBinaryFPWVL_VV_VF_WV_WF_RM<SDNode vop, SDNode vop_w, string instr
   }
 }
 
+
+multiclass VPatTruncSplatMaxMinIdentityBase<VTypeInfo vti, VTypeInfo wti,
+  SDPatternOperator vop1, int vid1, SDPatternOperator vop2, int vid2> {
+  let Predicates = !listconcat(GetVTypePredicates<vti>.Predicates,
+                               GetVTypePredicates<wti>.Predicates) in
+  def : Pat<(vti.Vector (riscv_trunc_vector_vl
+    (wti.Vector (vop1
+        (wti.Vector (vop2
+            (wti.Vector wti.RegClass:$rs1),
+            (wti.Vector (riscv_vmv_v_x_vl (wti.Vector undef), vid2, (XLenVT srcvalue))),
+            (wti.Vector undef),(wti.Mask V0), VLOpFrag)),
+        (wti.Vector (riscv_vmv_v_x_vl (wti.Vector undef), vid1, (XLenVT srcvalue))),
+        (wti.Vector undef), (wti.Mask V0), VLOpFrag)),
+    (vti.Mask V0), VLOpFrag)),
+    (!cast<Instruction>("PseudoVNSRL_WI_"#vti.LMul.MX#"_MASK")
+      (vti.Vector (IMPLICIT_DEF)), wti.RegClass:$rs1, 0,
+      (vti.Mask V0), GPR:$vl, vti.Log2SEW, TA_MA)>;
+}
+
+multiclass VPatTruncSplatMinIdentityBase<VTypeInfo vti, VTypeInfo wti,
+  SDPatternOperator vop, int vid> {
+  let Predicates = !listconcat(GetVTypePredicates<vti>.Predicates,
+                               GetVTypePredicates<wti>.Predicates) in
+  def : Pat<(vti.Vector (riscv_trunc_vector_vl
+    (wti.Vector (vop
+      (wti.Vector wti.RegClass:$rs1),
+      (wti.Vector (riscv_vmv_v_x_vl (wti.Vector undef), vid, (XLenVT srcvalue))),
+      (wti.Vector undef), (wti.Mask V0), VLOpFrag)),
+    (vti.Mask V0), VLOpFrag)),
+    (!cast<Instruction>("PseudoVNSRL_WI_"#vti.LMul.MX#"_MASK")
+      (vti.Vector (IMPLICIT_DEF)), wti.RegClass:$rs1, 0,
+      (vti.Mask V0), GPR:$vl, vti.Log2SEW, TA_MA)>;
+}
+
+
+multiclass VPatTruncSplatMaxMinIdentity<VTypeInfo vti, VTypeInfo wti> {
+  defvar sew = vti.SEW;
+  defvar umin_id = !sub(!shl(1, sew), 1);
+  defvar umax_id = 0;
+  defvar smin_id = !sub(!shl(1, !sub(sew, 1)), 1);
+  defvar smax_id = !sub(0, !shl(1, !sub(sew, 1)));
+
+  defm : VPatTruncSplatMaxMinIdentityBase<vti, wti, riscv_umax_vl, umax_id, riscv_umin_vl, umin_id>;
+  defm : VPatTruncSplatMaxMinIdentityBase<vti, wti, riscv_umin_vl, umin_id, riscv_umax_vl, umax_id>;
+  defm : VPatTruncSplatMaxMinIdentityBase<vti, wti, riscv_smin_vl, smin_id, riscv_smax_vl, smax_id>;
+  defm : VPatTruncSplatMaxMinIdentityBase<vti, wti, riscv_smax_vl, smax_id, riscv_smin_vl, smin_id>;
+
+  defm : VPatTruncSplatMinIdentityBase<vti, wti, riscv_umin_vl, umin_id>;
+
+}
+
+foreach vtiToWti = AllWidenableIntVectors in
+  defm : VPatTruncSplatMaxMinIdentity<vtiToWti.Vti, vtiToWti.Wti>;
+
 multiclass VPatNarrowShiftSplatExt_WX<SDNode op, PatFrags extop, string instruction_name> {
   foreach vtiToWti = AllWidenableIntVectors in {
     defvar vti = vtiToWti.Vti;

@topperc
Copy link
Collaborator

topperc commented Dec 12, 2023

How can you use just vnsrl? vnsrl discards the upper bits without any max or min.

@sun-jacobi
Copy link
Member Author

sun-jacobi commented Dec 12, 2023

How can you use just vnsrl? vnsrl discards the upper bits without any max or min.

Yes, you are right.

But if the range created by a pair of max/min operation is precisely the range of the truncation destination,
then the pair could be removed.

In the above case, [-128, 127] is the value range of u8.
Since we truncate the u16 vector into a u8 vector, removing the max/min won't change the final result, AFAIU.

@topperc
Copy link
Collaborator

topperc commented Dec 12, 2023

How can you use just vnsrl? vnsrl discards the upper bits without any max or min.

Yes, you are right.

But if the range created by a pair of max/min operation is precisely the range of the truncation destination,

then the pair could be removed.

In the above case, [-128, 127] is the value range of u8.

Since we truncate the u16 vector into a u8 vector, removing the max/min won't change the final result, AFAIU.

Let's imagine the input is -32767(0x8001) in i16. The smax with -128 should give -128(0xff80). Then it will be truncated to 0x80 which is -128 in i8.

A vnsrl by itself will drop the upper bits and give 0x01. This is incorrect.

Copy link
Collaborator

@topperc topperc left a comment

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I don't think this is correct. We need to use vnclip.

@sun-jacobi
Copy link
Member Author

Thank you for correcting me. I will fix it soon.

defvar sminval = !sub(!shl(1, !sub(sew, 1)), 1);
defvar smaxval = !sub(0, !shl(1, !sub(sew, 1)));

defm : VPatTruncSatClipMaxMin<"PseudoVNCLIP", vti, wti, riscv_umax_vl,
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Shouldn't umax/umin use VNCLIPU and smax/smin use VNCLIP?

defvar smaxval = !sub(0, !shl(1, !sub(sew, 1)));

defm : VPatTruncSatClipMaxMin<"PseudoVNCLIPU", vti, wti, riscv_umax_vl,
umaxval, riscv_umin_vl, uminval>;
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

this is indented too far

defm : VPatTruncSatClipMaxMin<"PseudoVNCLIPU", vti, wti, riscv_umax_vl,
umaxval, riscv_umin_vl, uminval>;
defm : VPatTruncSatClipMaxMin<"PseudoVNCLIP", vti, wti, riscv_smin_vl,
sminval, riscv_smax_vl, smaxval>;
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

same here

defvar sminval = !sub(!shl(1, !sub(sew, 1)), 1);
defvar smaxval = !sub(0, !shl(1, !sub(sew, 1)));

defm : VPatTruncSatClipMaxMin<"PseudoVNCLIPU", vti, wti, riscv_umax_vl,
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Is this unsigned pattern needed? Won't a umax with 0 be deleted by DAG combiner? Leaving only the umin.

Copy link
Collaborator

@topperc topperc left a comment

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

LGTM

@sun-jacobi
Copy link
Member Author

@topperc

Could you help me request more reviews?
It seems that I can not request review (or merge) without the write access.

SDPatternOperator op1, int op1_value, SDPatternOperator op2, int op2_value> {
let Predicates = !listconcat(GetVTypePredicates<vti>.Predicates,
GetVTypePredicates<wti>.Predicates) in
def : Pat<(vti.Vector (riscv_trunc_vector_vl
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

There is just one def in VPatTruncSatClipMaxMinBase and VPatTruncSatClipUMin, can we use class inheritances here?

@ChunyuLiao ChunyuLiao merged commit 87779fd into llvm:main Dec 29, 2023
4 checks passed
@sun-jacobi sun-jacobi deleted the combine-trunc-minmax-identity branch December 29, 2023 14:11
sun-jacobi added a commit that referenced this pull request Apr 18, 2024
Similar to #75145, but for scalable vectors.

Specifically, this patch works for the below optimization case:

## Source Code
```
define void @trunc_sat_i8i16_maxmin(ptr %x, ptr %y) {
  %1 = load <vscale x 4 x i16>, ptr %x, align 16
  %2 = tail call <vscale x 4 x i16> @llvm.smax.v4i16(<vscale x 4 x i16> %1, <vscale x 4 x i16> splat (i16 -128))
  %3 = tail call <vscale x 4 x i16> @llvm.smin.v4i16(<vscale x 4 x i16> %2, <vscale x 4 x i16> splat (i16 127))
  %4 = trunc <vscale x 4 x i16> %3 to <vscale x 4 x i8>
  store <vscale x 4 x i8> %4, ptr %y, align 8
  ret void
}
```
## Before this patch
[Compiler Explorer](https://godbolt.org/z/EKc9eGvo8)
```
trunc_sat_i8i16_maxmin:
        vl1re16.v       v8, (a0)
        li      a0, -128
        vsetvli a2, zero, e16, m1, ta, ma
        vmax.vx v8, v8, a0
        li      a0, 127
        vmin.vx v8, v8, a0
        vsetvli zero, zero, e8, mf2, ta, ma
        vnsrl.wi        v8, v8, 0
        vse8.v  v8, (a1)
        ret
```
## After this patch
```
trunc_sat_i8i16_maxmin:
        vsetivli zero, 4, e8, mf4, ta, ma
        vle16.v v8, (a0)
        vnclip.wi v8, v8, 0
        vse8.v v8, (a1)
        ret
```
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment
Projects
None yet
Development

Successfully merging this pull request may close these issues.

[RISCV] Vector saturating truncation not using VNCLIP
5 participants