948 changes: 948 additions & 0 deletions llvm/test/CodeGen/RISCV/rvv/fixed-vectors-abd.ll
@@ -0,0 +1,948 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -mtriple=riscv32 -mattr=+v,+d -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,RV32
; RUN: llc -mtriple=riscv64 -mattr=+v,+d -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,RV64
;
; SABD
;
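; These tests cover the signed absolute-difference pattern: sign-extend the
; operands, subtract, take llvm.abs, then truncate back. RVV has no dedicated
; absolute-difference instruction, so the checks expect a widening vwsub.vv
; followed by vrsub.vi + vmax.vv and, where the result is narrower than the
; difference, a truncating vnsrl.wi.
;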

define <8 x i8> @sabd_8b_as_16b(<8 x i8> %a, <8 x i8> %b) {
;
; CHECK-LABEL: sabd_8b_as_16b:
; CHECK: # %bb.0:
; CHECK-NEXT: vsetivli zero, 8, e8, mf2, ta, ma
; CHECK-NEXT: vwsub.vv v10, v8, v9
; CHECK-NEXT: vsetvli zero, zero, e16, m1, ta, ma
; CHECK-NEXT: vrsub.vi v8, v10, 0
; CHECK-NEXT: vmax.vv v8, v10, v8
; CHECK-NEXT: vsetvli zero, zero, e8, mf2, ta, ma
; CHECK-NEXT: vnsrl.wi v8, v8, 0
; CHECK-NEXT: ret
%a.sext = sext <8 x i8> %a to <8 x i16>
%b.sext = sext <8 x i8> %b to <8 x i16>
%sub = sub <8 x i16> %a.sext, %b.sext
%abs = call <8 x i16> @llvm.abs.v8i16(<8 x i16> %sub, i1 true)
%trunc = trunc <8 x i16> %abs to <8 x i8>
ret <8 x i8> %trunc
}

define <8 x i8> @sabd_8b_as_32b(<8 x i8> %a, <8 x i8> %b) {
;
; CHECK-LABEL: sabd_8b_as_32b:
; CHECK: # %bb.0:
; CHECK-NEXT: vsetivli zero, 8, e16, m1, ta, ma
; CHECK-NEXT: vsext.vf2 v10, v8
; CHECK-NEXT: vsext.vf2 v8, v9
; CHECK-NEXT: vwsub.vv v12, v10, v8
; CHECK-NEXT: vsetvli zero, zero, e32, m2, ta, ma
; CHECK-NEXT: vrsub.vi v8, v12, 0
; CHECK-NEXT: vmax.vv v8, v12, v8
; CHECK-NEXT: vsetvli zero, zero, e16, m1, ta, ma
; CHECK-NEXT: vnsrl.wi v10, v8, 0
; CHECK-NEXT: vsetvli zero, zero, e8, mf2, ta, ma
; CHECK-NEXT: vnsrl.wi v8, v10, 0
; CHECK-NEXT: ret
%a.sext = sext <8 x i8> %a to <8 x i32>
%b.sext = sext <8 x i8> %b to <8 x i32>
%sub = sub <8 x i32> %a.sext, %b.sext
%abs = call <8 x i32> @llvm.abs.v8i32(<8 x i32> %sub, i1 true)
%trunc = trunc <8 x i32> %abs to <8 x i8>
ret <8 x i8> %trunc
}

define <16 x i8> @sabd_16b(<16 x i8> %a, <16 x i8> %b) {
;
; CHECK-LABEL: sabd_16b:
; CHECK: # %bb.0:
; CHECK-NEXT: vsetivli zero, 16, e8, m1, ta, ma
; CHECK-NEXT: vwsub.vv v10, v8, v9
; CHECK-NEXT: vsetvli zero, zero, e16, m2, ta, ma
; CHECK-NEXT: vrsub.vi v8, v10, 0
; CHECK-NEXT: vmax.vv v10, v10, v8
; CHECK-NEXT: vsetvli zero, zero, e8, m1, ta, ma
; CHECK-NEXT: vnsrl.wi v8, v10, 0
; CHECK-NEXT: ret
%a.sext = sext <16 x i8> %a to <16 x i16>
%b.sext = sext <16 x i8> %b to <16 x i16>
%sub = sub <16 x i16> %a.sext, %b.sext
%abs = call <16 x i16> @llvm.abs.v16i16(<16 x i16> %sub, i1 true)
%trunc = trunc <16 x i16> %abs to <16 x i8>
ret <16 x i8> %trunc
}

define <4 x i16> @sabd_4h(<4 x i16> %a, <4 x i16> %b) {
;
; CHECK-LABEL: sabd_4h:
; CHECK: # %bb.0:
; CHECK-NEXT: vsetivli zero, 4, e16, mf2, ta, ma
; CHECK-NEXT: vwsub.vv v10, v8, v9
; CHECK-NEXT: vsetvli zero, zero, e32, m1, ta, ma
; CHECK-NEXT: vrsub.vi v8, v10, 0
; CHECK-NEXT: vmax.vv v8, v10, v8
; CHECK-NEXT: vsetvli zero, zero, e16, mf2, ta, ma
; CHECK-NEXT: vnsrl.wi v8, v8, 0
; CHECK-NEXT: ret
%a.sext = sext <4 x i16> %a to <4 x i32>
%b.sext = sext <4 x i16> %b to <4 x i32>
%sub = sub <4 x i32> %a.sext, %b.sext
%abs = call <4 x i32> @llvm.abs.v4i32(<4 x i32> %sub, i1 true)
%trunc = trunc <4 x i32> %abs to <4 x i16>
ret <4 x i16> %trunc
}

define <4 x i16> @sabd_4h_promoted_ops(<4 x i8> %a, <4 x i8> %b) {
;
; CHECK-LABEL: sabd_4h_promoted_ops:
; CHECK: # %bb.0:
; CHECK-NEXT: vsetivli zero, 4, e8, mf4, ta, ma
; CHECK-NEXT: vwsub.vv v10, v8, v9
; CHECK-NEXT: vsetvli zero, zero, e16, mf2, ta, ma
; CHECK-NEXT: vrsub.vi v8, v10, 0
; CHECK-NEXT: vmax.vv v8, v10, v8
; CHECK-NEXT: ret
%a.sext = sext <4 x i8> %a to <4 x i16>
%b.sext = sext <4 x i8> %b to <4 x i16>
%sub = sub <4 x i16> %a.sext, %b.sext
%abs = call <4 x i16> @llvm.abs.v4i16(<4 x i16> %sub, i1 true)
ret <4 x i16> %abs
}

define <8 x i16> @sabd_8h(<8 x i16> %a, <8 x i16> %b) {
;
; CHECK-LABEL: sabd_8h:
; CHECK: # %bb.0:
; CHECK-NEXT: vsetivli zero, 8, e16, m1, ta, ma
; CHECK-NEXT: vwsub.vv v10, v8, v9
; CHECK-NEXT: vsetvli zero, zero, e32, m2, ta, ma
; CHECK-NEXT: vrsub.vi v8, v10, 0
; CHECK-NEXT: vmax.vv v10, v10, v8
; CHECK-NEXT: vsetvli zero, zero, e16, m1, ta, ma
; CHECK-NEXT: vnsrl.wi v8, v10, 0
; CHECK-NEXT: ret
%a.sext = sext <8 x i16> %a to <8 x i32>
%b.sext = sext <8 x i16> %b to <8 x i32>
%sub = sub <8 x i32> %a.sext, %b.sext
%abs = call <8 x i32> @llvm.abs.v8i32(<8 x i32> %sub, i1 true)
%trunc = trunc <8 x i32> %abs to <8 x i16>
ret <8 x i16> %trunc
}

define <8 x i16> @sabd_8h_promoted_ops(<8 x i8> %a, <8 x i8> %b) {
;
; CHECK-LABEL: sabd_8h_promoted_ops:
; CHECK: # %bb.0:
; CHECK-NEXT: vsetivli zero, 8, e8, mf2, ta, ma
; CHECK-NEXT: vwsub.vv v10, v8, v9
; CHECK-NEXT: vsetvli zero, zero, e16, m1, ta, ma
; CHECK-NEXT: vrsub.vi v8, v10, 0
; CHECK-NEXT: vmax.vv v8, v10, v8
; CHECK-NEXT: ret
%a.sext = sext <8 x i8> %a to <8 x i16>
%b.sext = sext <8 x i8> %b to <8 x i16>
%sub = sub <8 x i16> %a.sext, %b.sext
%abs = call <8 x i16> @llvm.abs.v8i16(<8 x i16> %sub, i1 true)
ret <8 x i16> %abs
}

define <2 x i32> @sabd_2s(<2 x i32> %a, <2 x i32> %b) {
;
; CHECK-LABEL: sabd_2s:
; CHECK: # %bb.0:
; CHECK-NEXT: vsetivli zero, 2, e32, mf2, ta, ma
; CHECK-NEXT: vwsub.vv v10, v8, v9
; CHECK-NEXT: vsetvli zero, zero, e64, m1, ta, ma
; CHECK-NEXT: vrsub.vi v8, v10, 0
; CHECK-NEXT: vmax.vv v8, v10, v8
; CHECK-NEXT: vsetvli zero, zero, e32, mf2, ta, ma
; CHECK-NEXT: vnsrl.wi v8, v8, 0
; CHECK-NEXT: ret
%a.sext = sext <2 x i32> %a to <2 x i64>
%b.sext = sext <2 x i32> %b to <2 x i64>
%sub = sub <2 x i64> %a.sext, %b.sext
%abs = call <2 x i64> @llvm.abs.v2i64(<2 x i64> %sub, i1 true)
%trunc = trunc <2 x i64> %abs to <2 x i32>
ret <2 x i32> %trunc
}

define <2 x i32> @sabd_2s_promoted_ops(<2 x i16> %a, <2 x i16> %b) {
;
; CHECK-LABEL: sabd_2s_promoted_ops:
; CHECK: # %bb.0:
; CHECK-NEXT: vsetivli zero, 2, e16, mf4, ta, ma
; CHECK-NEXT: vwsub.vv v10, v8, v9
; CHECK-NEXT: vsetvli zero, zero, e32, mf2, ta, ma
; CHECK-NEXT: vrsub.vi v8, v10, 0
; CHECK-NEXT: vmax.vv v8, v10, v8
; CHECK-NEXT: ret
%a.sext = sext <2 x i16> %a to <2 x i32>
%b.sext = sext <2 x i16> %b to <2 x i32>
%sub = sub <2 x i32> %a.sext, %b.sext
%abs = call <2 x i32> @llvm.abs.v2i32(<2 x i32> %sub, i1 true)
ret <2 x i32> %abs
}

define <4 x i32> @sabd_4s(<4 x i32> %a, <4 x i32> %b) {
;
; CHECK-LABEL: sabd_4s:
; CHECK: # %bb.0:
; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, ma
; CHECK-NEXT: vwsub.vv v10, v8, v9
; CHECK-NEXT: vsetvli zero, zero, e64, m2, ta, ma
; CHECK-NEXT: vrsub.vi v8, v10, 0
; CHECK-NEXT: vmax.vv v10, v10, v8
; CHECK-NEXT: vsetvli zero, zero, e32, m1, ta, ma
; CHECK-NEXT: vnsrl.wi v8, v10, 0
; CHECK-NEXT: ret
%a.sext = sext <4 x i32> %a to <4 x i64>
%b.sext = sext <4 x i32> %b to <4 x i64>
%sub = sub <4 x i64> %a.sext, %b.sext
%abs = call <4 x i64> @llvm.abs.v4i64(<4 x i64> %sub, i1 true)
%trunc = trunc <4 x i64> %abs to <4 x i32>
ret <4 x i32> %trunc
}

define <4 x i32> @sabd_4s_promoted_ops(<4 x i16> %a, <4 x i16> %b) {
;
; CHECK-LABEL: sabd_4s_promoted_ops:
; CHECK: # %bb.0:
; CHECK-NEXT: vsetivli zero, 4, e16, mf2, ta, ma
; CHECK-NEXT: vwsub.vv v10, v8, v9
; CHECK-NEXT: vsetvli zero, zero, e32, m1, ta, ma
; CHECK-NEXT: vrsub.vi v8, v10, 0
; CHECK-NEXT: vmax.vv v8, v10, v8
; CHECK-NEXT: ret
%a.sext = sext <4 x i16> %a to <4 x i32>
%b.sext = sext <4 x i16> %b to <4 x i32>
%sub = sub <4 x i32> %a.sext, %b.sext
%abs = call <4 x i32> @llvm.abs.v4i32(<4 x i32> %sub, i1 true)
ret <4 x i32> %abs
}

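; The <2 x i64> case widens to <2 x i128>, which is not a legal vector type,
; so the difference is computed in scalar code (split into 32-bit halves with
; explicit borrow handling on RV32) and the result is rebuilt into a vector.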
define <2 x i64> @sabd_2d(<2 x i64> %a, <2 x i64> %b) {
; RV32-LABEL: sabd_2d:
; RV32: # %bb.0:
; RV32-NEXT: vsetivli zero, 1, e64, m1, ta, ma
; RV32-NEXT: vslidedown.vi v10, v8, 1
; RV32-NEXT: li a1, 32
; RV32-NEXT: vsrl.vx v11, v10, a1
; RV32-NEXT: vmv.x.s a3, v11
; RV32-NEXT: vsrl.vx v11, v8, a1
; RV32-NEXT: vmv.x.s a5, v11
; RV32-NEXT: srai t1, a5, 31
; RV32-NEXT: vmv.x.s a4, v10
; RV32-NEXT: vmv.x.s a0, v8
; RV32-NEXT: vslidedown.vi v8, v9, 1
; RV32-NEXT: vsrl.vx v10, v8, a1
; RV32-NEXT: vmv.x.s a6, v10
; RV32-NEXT: vsrl.vx v10, v9, a1
; RV32-NEXT: vmv.x.s a7, v10
; RV32-NEXT: srai t4, a7, 31
; RV32-NEXT: vmv.x.s a1, v9
; RV32-NEXT: sltu a2, a0, a1
; RV32-NEXT: vmv.x.s t0, v8
; RV32-NEXT: mv t5, a2
; RV32-NEXT: beq a5, a7, .LBB11_2
; RV32-NEXT: # %bb.1:
; RV32-NEXT: sltu t5, a5, a7
; RV32-NEXT: .LBB11_2:
; RV32-NEXT: srai t2, a3, 31
; RV32-NEXT: srai t3, a6, 31
; RV32-NEXT: sub t6, t1, t4
; RV32-NEXT: sltu t5, t6, t5
; RV32-NEXT: sltu t1, t1, t4
; RV32-NEXT: sltu t4, a4, t0
; RV32-NEXT: sub t1, t6, t1
; RV32-NEXT: mv t6, t4
; RV32-NEXT: beq a3, a6, .LBB11_4
; RV32-NEXT: # %bb.3:
; RV32-NEXT: sltu t6, a3, a6
; RV32-NEXT: .LBB11_4:
; RV32-NEXT: sub t1, t1, t5
; RV32-NEXT: sub t5, t2, t3
; RV32-NEXT: sltu t6, t5, t6
; RV32-NEXT: sltu t2, t2, t3
; RV32-NEXT: sub t2, t5, t2
; RV32-NEXT: sub t2, t2, t6
; RV32-NEXT: sub a5, a5, a7
; RV32-NEXT: sub a3, a3, a6
; RV32-NEXT: sub a3, a3, t4
; RV32-NEXT: sub a4, a4, t0
; RV32-NEXT: bgez t2, .LBB11_6
; RV32-NEXT: # %bb.5:
; RV32-NEXT: snez a6, a4
; RV32-NEXT: neg a6, a6
; RV32-NEXT: sub a3, a6, a3
; RV32-NEXT: neg a4, a4
; RV32-NEXT: .LBB11_6:
; RV32-NEXT: sub a5, a5, a2
; RV32-NEXT: sub a0, a0, a1
; RV32-NEXT: bgez t1, .LBB11_8
; RV32-NEXT: # %bb.7:
; RV32-NEXT: snez a1, a0
; RV32-NEXT: neg a1, a1
; RV32-NEXT: sub a5, a1, a5
; RV32-NEXT: neg a0, a0
; RV32-NEXT: .LBB11_8:
; RV32-NEXT: vsetivli zero, 2, e32, mf2, ta, ma
; RV32-NEXT: vmv.v.x v8, a0
; RV32-NEXT: vslide1down.vx v8, v8, a5
; RV32-NEXT: vmv.v.x v9, a4
; RV32-NEXT: vslide1down.vx v9, v9, a3
; RV32-NEXT: vsetvli zero, zero, e64, m1, ta, ma
; RV32-NEXT: vslideup.vi v8, v9, 1
; RV32-NEXT: ret
;
; RV64-LABEL: sabd_2d:
; RV64: # %bb.0:
; RV64-NEXT: vsetivli zero, 1, e64, m1, ta, ma
; RV64-NEXT: vslidedown.vi v10, v8, 1
; RV64-NEXT: vmv.x.s a1, v10
; RV64-NEXT: srai a4, a1, 63
; RV64-NEXT: vmv.x.s a0, v8
; RV64-NEXT: srai a3, a0, 63
; RV64-NEXT: vslidedown.vi v8, v9, 1
; RV64-NEXT: vmv.x.s a5, v8
; RV64-NEXT: srai a6, a5, 63
; RV64-NEXT: vmv.x.s a2, v9
; RV64-NEXT: srai a7, a2, 63
; RV64-NEXT: sltu t0, a0, a2
; RV64-NEXT: sub a3, a3, a7
; RV64-NEXT: sub a3, a3, t0
; RV64-NEXT: sltu a7, a1, a5
; RV64-NEXT: sub a4, a4, a6
; RV64-NEXT: sub a4, a4, a7
; RV64-NEXT: sub a1, a1, a5
; RV64-NEXT: bgez a4, .LBB11_2
; RV64-NEXT: # %bb.1:
; RV64-NEXT: neg a1, a1
; RV64-NEXT: .LBB11_2:
; RV64-NEXT: sub a0, a0, a2
; RV64-NEXT: bgez a3, .LBB11_4
; RV64-NEXT: # %bb.3:
; RV64-NEXT: neg a0, a0
; RV64-NEXT: .LBB11_4:
; RV64-NEXT: vmv.s.x v8, a0
; RV64-NEXT: vmv.s.x v9, a1
; RV64-NEXT: vsetivli zero, 2, e64, m1, ta, ma
; RV64-NEXT: vslideup.vi v8, v9, 1
; RV64-NEXT: ret
%a.sext = sext <2 x i64> %a to <2 x i128>
%b.sext = sext <2 x i64> %b to <2 x i128>
%sub = sub <2 x i128> %a.sext, %b.sext
%abs = call <2 x i128> @llvm.abs.v2i128(<2 x i128> %sub, i1 true)
%trunc = trunc <2 x i128> %abs to <2 x i64>
ret <2 x i64> %trunc
}

define <2 x i64> @sabd_2d_promoted_ops(<2 x i32> %a, <2 x i32> %b) {
;
; CHECK-LABEL: sabd_2d_promoted_ops:
; CHECK: # %bb.0:
; CHECK-NEXT: vsetivli zero, 2, e32, mf2, ta, ma
; CHECK-NEXT: vwsub.vv v10, v8, v9
; CHECK-NEXT: vsetvli zero, zero, e64, m1, ta, ma
; CHECK-NEXT: vrsub.vi v8, v10, 0
; CHECK-NEXT: vmax.vv v8, v10, v8
; CHECK-NEXT: ret
%a.sext = sext <2 x i32> %a to <2 x i64>
%b.sext = sext <2 x i32> %b to <2 x i64>
%sub = sub <2 x i64> %a.sext, %b.sext
%abs = call <2 x i64> @llvm.abs.v2i64(<2 x i64> %sub, i1 true)
ret <2 x i64> %abs
}

;
; UABD
;
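; Same pattern as above with zero-extended operands; the checks expect the
; unsigned widening subtract vwsubu.vv instead of vwsub.vv.
;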

define <8 x i8> @uabd_8b(<8 x i8> %a, <8 x i8> %b) {
;
; CHECK-LABEL: uabd_8b:
; CHECK: # %bb.0:
; CHECK-NEXT: vsetivli zero, 8, e8, mf2, ta, ma
; CHECK-NEXT: vwsubu.vv v10, v8, v9
; CHECK-NEXT: vsetvli zero, zero, e16, m1, ta, ma
; CHECK-NEXT: vrsub.vi v8, v10, 0
; CHECK-NEXT: vmax.vv v8, v10, v8
; CHECK-NEXT: vsetvli zero, zero, e8, mf2, ta, ma
; CHECK-NEXT: vnsrl.wi v8, v8, 0
; CHECK-NEXT: ret
%a.zext = zext <8 x i8> %a to <8 x i16>
%b.zext = zext <8 x i8> %b to <8 x i16>
%sub = sub <8 x i16> %a.zext, %b.zext
%abs = call <8 x i16> @llvm.abs.v8i16(<8 x i16> %sub, i1 true)
%trunc = trunc <8 x i16> %abs to <8 x i8>
ret <8 x i8> %trunc
}

define <16 x i8> @uabd_16b(<16 x i8> %a, <16 x i8> %b) {
;
; CHECK-LABEL: uabd_16b:
; CHECK: # %bb.0:
; CHECK-NEXT: vsetivli zero, 16, e8, m1, ta, ma
; CHECK-NEXT: vwsubu.vv v10, v8, v9
; CHECK-NEXT: vsetvli zero, zero, e16, m2, ta, ma
; CHECK-NEXT: vrsub.vi v8, v10, 0
; CHECK-NEXT: vmax.vv v10, v10, v8
; CHECK-NEXT: vsetvli zero, zero, e8, m1, ta, ma
; CHECK-NEXT: vnsrl.wi v8, v10, 0
; CHECK-NEXT: ret
%a.zext = zext <16 x i8> %a to <16 x i16>
%b.zext = zext <16 x i8> %b to <16 x i16>
%sub = sub <16 x i16> %a.zext, %b.zext
%abs = call <16 x i16> @llvm.abs.v16i16(<16 x i16> %sub, i1 true)
%trunc = trunc <16 x i16> %abs to <16 x i8>
ret <16 x i8> %trunc
}

define <4 x i16> @uabd_4h(<4 x i16> %a, <4 x i16> %b) {
;
; CHECK-LABEL: uabd_4h:
; CHECK: # %bb.0:
; CHECK-NEXT: vsetivli zero, 4, e16, mf2, ta, ma
; CHECK-NEXT: vwsubu.vv v10, v8, v9
; CHECK-NEXT: vsetvli zero, zero, e32, m1, ta, ma
; CHECK-NEXT: vrsub.vi v8, v10, 0
; CHECK-NEXT: vmax.vv v8, v10, v8
; CHECK-NEXT: vsetvli zero, zero, e16, mf2, ta, ma
; CHECK-NEXT: vnsrl.wi v8, v8, 0
; CHECK-NEXT: ret
%a.zext = zext <4 x i16> %a to <4 x i32>
%b.zext = zext <4 x i16> %b to <4 x i32>
%sub = sub <4 x i32> %a.zext, %b.zext
%abs = call <4 x i32> @llvm.abs.v4i32(<4 x i32> %sub, i1 true)
%trunc = trunc <4 x i32> %abs to <4 x i16>
ret <4 x i16> %trunc
}

define <4 x i16> @uabd_4h_promoted_ops(<4 x i8> %a, <4 x i8> %b) {
;
; CHECK-LABEL: uabd_4h_promoted_ops:
; CHECK: # %bb.0:
; CHECK-NEXT: vsetivli zero, 4, e8, mf4, ta, ma
; CHECK-NEXT: vwsubu.vv v10, v8, v9
; CHECK-NEXT: vsetvli zero, zero, e16, mf2, ta, ma
; CHECK-NEXT: vrsub.vi v8, v10, 0
; CHECK-NEXT: vmax.vv v8, v10, v8
; CHECK-NEXT: ret
%a.zext = zext <4 x i8> %a to <4 x i16>
%b.zext = zext <4 x i8> %b to <4 x i16>
%sub = sub <4 x i16> %a.zext, %b.zext
%abs = call <4 x i16> @llvm.abs.v4i16(<4 x i16> %sub, i1 true)
ret <4 x i16> %abs
}

define <8 x i16> @uabd_8h(<8 x i16> %a, <8 x i16> %b) {
;
; CHECK-LABEL: uabd_8h:
; CHECK: # %bb.0:
; CHECK-NEXT: vsetivli zero, 8, e16, m1, ta, ma
; CHECK-NEXT: vwsubu.vv v10, v8, v9
; CHECK-NEXT: vsetvli zero, zero, e32, m2, ta, ma
; CHECK-NEXT: vrsub.vi v8, v10, 0
; CHECK-NEXT: vmax.vv v10, v10, v8
; CHECK-NEXT: vsetvli zero, zero, e16, m1, ta, ma
; CHECK-NEXT: vnsrl.wi v8, v10, 0
; CHECK-NEXT: ret
%a.zext = zext <8 x i16> %a to <8 x i32>
%b.zext = zext <8 x i16> %b to <8 x i32>
%sub = sub <8 x i32> %a.zext, %b.zext
%abs = call <8 x i32> @llvm.abs.v8i32(<8 x i32> %sub, i1 true)
%trunc = trunc <8 x i32> %abs to <8 x i16>
ret <8 x i16> %trunc
}

define <8 x i16> @uabd_8h_promoted_ops(<8 x i8> %a, <8 x i8> %b) {
;
; CHECK-LABEL: uabd_8h_promoted_ops:
; CHECK: # %bb.0:
; CHECK-NEXT: vsetivli zero, 8, e8, mf2, ta, ma
; CHECK-NEXT: vwsubu.vv v10, v8, v9
; CHECK-NEXT: vsetvli zero, zero, e16, m1, ta, ma
; CHECK-NEXT: vrsub.vi v8, v10, 0
; CHECK-NEXT: vmax.vv v8, v10, v8
; CHECK-NEXT: ret
%a.zext = zext <8 x i8> %a to <8 x i16>
%b.zext = zext <8 x i8> %b to <8 x i16>
%sub = sub <8 x i16> %a.zext, %b.zext
%abs = call <8 x i16> @llvm.abs.v8i16(<8 x i16> %sub, i1 true)
ret <8 x i16> %abs
}

define <2 x i32> @uabd_2s(<2 x i32> %a, <2 x i32> %b) {
;
; CHECK-LABEL: uabd_2s:
; CHECK: # %bb.0:
; CHECK-NEXT: vsetivli zero, 2, e32, mf2, ta, ma
; CHECK-NEXT: vwsubu.vv v10, v8, v9
; CHECK-NEXT: vsetvli zero, zero, e64, m1, ta, ma
; CHECK-NEXT: vrsub.vi v8, v10, 0
; CHECK-NEXT: vmax.vv v8, v10, v8
; CHECK-NEXT: vsetvli zero, zero, e32, mf2, ta, ma
; CHECK-NEXT: vnsrl.wi v8, v8, 0
; CHECK-NEXT: ret
%a.zext = zext <2 x i32> %a to <2 x i64>
%b.zext = zext <2 x i32> %b to <2 x i64>
%sub = sub <2 x i64> %a.zext, %b.zext
%abs = call <2 x i64> @llvm.abs.v2i64(<2 x i64> %sub, i1 true)
%trunc = trunc <2 x i64> %abs to <2 x i32>
ret <2 x i32> %trunc
}

define <2 x i32> @uabd_2s_promoted_ops(<2 x i16> %a, <2 x i16> %b) {
;
; CHECK-LABEL: uabd_2s_promoted_ops:
; CHECK: # %bb.0:
; CHECK-NEXT: vsetivli zero, 2, e16, mf4, ta, ma
; CHECK-NEXT: vwsubu.vv v10, v8, v9
; CHECK-NEXT: vsetvli zero, zero, e32, mf2, ta, ma
; CHECK-NEXT: vrsub.vi v8, v10, 0
; CHECK-NEXT: vmax.vv v8, v10, v8
; CHECK-NEXT: ret
%a.zext = zext <2 x i16> %a to <2 x i32>
%b.zext = zext <2 x i16> %b to <2 x i32>
%sub = sub <2 x i32> %a.zext, %b.zext
%abs = call <2 x i32> @llvm.abs.v2i32(<2 x i32> %sub, i1 true)
ret <2 x i32> %abs
}

define <4 x i32> @uabd_4s(<4 x i32> %a, <4 x i32> %b) {
;
; CHECK-LABEL: uabd_4s:
; CHECK: # %bb.0:
; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, ma
; CHECK-NEXT: vwsubu.vv v10, v8, v9
; CHECK-NEXT: vsetvli zero, zero, e64, m2, ta, ma
; CHECK-NEXT: vrsub.vi v8, v10, 0
; CHECK-NEXT: vmax.vv v10, v10, v8
; CHECK-NEXT: vsetvli zero, zero, e32, m1, ta, ma
; CHECK-NEXT: vnsrl.wi v8, v10, 0
; CHECK-NEXT: ret
%a.zext = zext <4 x i32> %a to <4 x i64>
%b.zext = zext <4 x i32> %b to <4 x i64>
%sub = sub <4 x i64> %a.zext, %b.zext
%abs = call <4 x i64> @llvm.abs.v4i64(<4 x i64> %sub, i1 true)
%trunc = trunc <4 x i64> %abs to <4 x i32>
ret <4 x i32> %trunc
}

define <4 x i32> @uabd_4s_promoted_ops(<4 x i16> %a, <4 x i16> %b) {
;
; CHECK-LABEL: uabd_4s_promoted_ops:
; CHECK: # %bb.0:
; CHECK-NEXT: vsetivli zero, 4, e16, mf2, ta, ma
; CHECK-NEXT: vwsubu.vv v10, v8, v9
; CHECK-NEXT: vsetvli zero, zero, e32, m1, ta, ma
; CHECK-NEXT: vrsub.vi v8, v10, 0
; CHECK-NEXT: vmax.vv v8, v10, v8
; CHECK-NEXT: ret
%a.zext = zext <4 x i16> %a to <4 x i32>
%b.zext = zext <4 x i16> %b to <4 x i32>
%sub = sub <4 x i32> %a.zext, %b.zext
%abs = call <4 x i32> @llvm.abs.v4i32(<4 x i32> %sub, i1 true)
ret <4 x i32> %abs
}

define <2 x i64> @uabd_2d(<2 x i64> %a, <2 x i64> %b) {
; RV32-LABEL: uabd_2d:
; RV32: # %bb.0:
; RV32-NEXT: vsetivli zero, 1, e64, m1, ta, ma
; RV32-NEXT: vslidedown.vi v10, v8, 1
; RV32-NEXT: li a4, 32
; RV32-NEXT: vsrl.vx v11, v10, a4
; RV32-NEXT: vmv.x.s a0, v11
; RV32-NEXT: vsrl.vx v11, v8, a4
; RV32-NEXT: vmv.x.s a3, v11
; RV32-NEXT: vmv.x.s a1, v10
; RV32-NEXT: vmv.x.s a2, v8
; RV32-NEXT: vslidedown.vi v8, v9, 1
; RV32-NEXT: vsrl.vx v10, v8, a4
; RV32-NEXT: vmv.x.s a7, v10
; RV32-NEXT: vsrl.vx v10, v9, a4
; RV32-NEXT: vmv.x.s a6, v10
; RV32-NEXT: vmv.x.s a4, v9
; RV32-NEXT: sltu a5, a2, a4
; RV32-NEXT: vmv.x.s t0, v8
; RV32-NEXT: mv t2, a5
; RV32-NEXT: beq a3, a6, .LBB23_2
; RV32-NEXT: # %bb.1:
; RV32-NEXT: sltu t2, a3, a6
; RV32-NEXT: .LBB23_2:
; RV32-NEXT: sltu t3, a1, t0
; RV32-NEXT: mv t4, t3
; RV32-NEXT: beq a0, a7, .LBB23_4
; RV32-NEXT: # %bb.3:
; RV32-NEXT: sltu t4, a0, a7
; RV32-NEXT: .LBB23_4:
; RV32-NEXT: sub t1, a3, a6
; RV32-NEXT: neg a3, t2
; RV32-NEXT: neg a6, t4
; RV32-NEXT: sub a0, a0, a7
; RV32-NEXT: sub a0, a0, t3
; RV32-NEXT: sub a1, a1, t0
; RV32-NEXT: bgez a6, .LBB23_6
; RV32-NEXT: # %bb.5:
; RV32-NEXT: snez a7, a1
; RV32-NEXT: neg a7, a7
; RV32-NEXT: sub a0, a7, a0
; RV32-NEXT: .LBB23_6:
; RV32-NEXT: sub a5, t1, a5
; RV32-NEXT: sub a2, a2, a4
; RV32-NEXT: bltz a3, .LBB23_11
; RV32-NEXT: # %bb.7:
; RV32-NEXT: bltz a6, .LBB23_12
; RV32-NEXT: .LBB23_8:
; RV32-NEXT: bgez a3, .LBB23_10
; RV32-NEXT: .LBB23_9:
; RV32-NEXT: neg a2, a2
; RV32-NEXT: .LBB23_10:
; RV32-NEXT: vsetivli zero, 2, e32, mf2, ta, ma
; RV32-NEXT: vmv.v.x v8, a2
; RV32-NEXT: vslide1down.vx v8, v8, a5
; RV32-NEXT: vmv.v.x v9, a1
; RV32-NEXT: vslide1down.vx v9, v9, a0
; RV32-NEXT: vsetvli zero, zero, e64, m1, ta, ma
; RV32-NEXT: vslideup.vi v8, v9, 1
; RV32-NEXT: ret
; RV32-NEXT: .LBB23_11:
; RV32-NEXT: snez a4, a2
; RV32-NEXT: neg a4, a4
; RV32-NEXT: sub a5, a4, a5
; RV32-NEXT: bgez a6, .LBB23_8
; RV32-NEXT: .LBB23_12:
; RV32-NEXT: neg a1, a1
; RV32-NEXT: bltz a3, .LBB23_9
; RV32-NEXT: j .LBB23_10
;
; RV64-LABEL: uabd_2d:
; RV64: # %bb.0:
; RV64-NEXT: vsetivli zero, 1, e64, m1, ta, ma
; RV64-NEXT: vslidedown.vi v10, v8, 1
; RV64-NEXT: vmv.x.s a0, v10
; RV64-NEXT: vmv.x.s a1, v8
; RV64-NEXT: vslidedown.vi v8, v9, 1
; RV64-NEXT: vmv.x.s a4, v8
; RV64-NEXT: vmv.x.s a2, v9
; RV64-NEXT: sltu a3, a1, a2
; RV64-NEXT: neg a3, a3
; RV64-NEXT: sltu a5, a0, a4
; RV64-NEXT: neg a5, a5
; RV64-NEXT: sub a0, a0, a4
; RV64-NEXT: bgez a5, .LBB23_2
; RV64-NEXT: # %bb.1:
; RV64-NEXT: neg a0, a0
; RV64-NEXT: .LBB23_2:
; RV64-NEXT: sub a1, a1, a2
; RV64-NEXT: bgez a3, .LBB23_4
; RV64-NEXT: # %bb.3:
; RV64-NEXT: neg a1, a1
; RV64-NEXT: .LBB23_4:
; RV64-NEXT: vmv.s.x v8, a1
; RV64-NEXT: vmv.s.x v9, a0
; RV64-NEXT: vsetivli zero, 2, e64, m1, ta, ma
; RV64-NEXT: vslideup.vi v8, v9, 1
; RV64-NEXT: ret
%a.zext = zext <2 x i64> %a to <2 x i128>
%b.zext = zext <2 x i64> %b to <2 x i128>
%sub = sub <2 x i128> %a.zext, %b.zext
%abs = call <2 x i128> @llvm.abs.v2i128(<2 x i128> %sub, i1 true)
%trunc = trunc <2 x i128> %abs to <2 x i64>
ret <2 x i64> %trunc
}

define <2 x i64> @uabd_2d_promoted_ops(<2 x i32> %a, <2 x i32> %b) {
;
; CHECK-LABEL: uabd_2d_promoted_ops:
; CHECK: # %bb.0:
; CHECK-NEXT: vsetivli zero, 2, e32, mf2, ta, ma
; CHECK-NEXT: vwsubu.vv v10, v8, v9
; CHECK-NEXT: vsetvli zero, zero, e64, m1, ta, ma
; CHECK-NEXT: vrsub.vi v8, v10, 0
; CHECK-NEXT: vmax.vv v8, v10, v8
; CHECK-NEXT: ret
%a.zext = zext <2 x i32> %a to <2 x i64>
%b.zext = zext <2 x i32> %b to <2 x i64>
%sub = sub <2 x i64> %a.zext, %b.zext
%abs = call <2 x i64> @llvm.abs.v2i64(<2 x i64> %sub, i1 true)
ret <2 x i64> %abs
}

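; abs of a sub that already carries nuw/nsw: no extension is involved, so the
; lowering stays at the original element width as vsub.vv followed by
; vrsub.vi + vmax.vv.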
define <16 x i8> @uabd_v16i8_nuw(<16 x i8> %a, <16 x i8> %b) {
;
; CHECK-LABEL: uabd_v16i8_nuw:
; CHECK: # %bb.0:
; CHECK-NEXT: vsetivli zero, 16, e8, m1, ta, ma
; CHECK-NEXT: vsub.vv v8, v8, v9
; CHECK-NEXT: vrsub.vi v9, v8, 0
; CHECK-NEXT: vmax.vv v8, v8, v9
; CHECK-NEXT: ret
%sub = sub nuw <16 x i8> %a, %b
%abs = call <16 x i8> @llvm.abs.v16i8(<16 x i8> %sub, i1 true)
ret <16 x i8> %abs
}

define <8 x i16> @uabd_v8i16_nuw(<8 x i16> %a, <8 x i16> %b) {
;
; CHECK-LABEL: uabd_v8i16_nuw:
; CHECK: # %bb.0:
; CHECK-NEXT: vsetivli zero, 8, e16, m1, ta, ma
; CHECK-NEXT: vsub.vv v8, v8, v9
; CHECK-NEXT: vrsub.vi v9, v8, 0
; CHECK-NEXT: vmax.vv v8, v8, v9
; CHECK-NEXT: ret
%sub = sub nuw <8 x i16> %a, %b
%abs = call <8 x i16> @llvm.abs.v8i16(<8 x i16> %sub, i1 true)
ret <8 x i16> %abs
}

define <4 x i32> @uabd_v4i32_nuw(<4 x i32> %a, <4 x i32> %b) {
;
; CHECK-LABEL: uabd_v4i32_nuw:
; CHECK: # %bb.0:
; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, ma
; CHECK-NEXT: vsub.vv v8, v8, v9
; CHECK-NEXT: vrsub.vi v9, v8, 0
; CHECK-NEXT: vmax.vv v8, v8, v9
; CHECK-NEXT: ret
%sub = sub nuw <4 x i32> %a, %b
%abs = call <4 x i32> @llvm.abs.v4i32(<4 x i32> %sub, i1 true)
ret <4 x i32> %abs
}

define <2 x i64> @uabd_v2i64_nuw(<2 x i64> %a, <2 x i64> %b) {
;
; CHECK-LABEL: uabd_v2i64_nuw:
; CHECK: # %bb.0:
; CHECK-NEXT: vsetivli zero, 2, e64, m1, ta, ma
; CHECK-NEXT: vsub.vv v8, v8, v9
; CHECK-NEXT: vrsub.vi v9, v8, 0
; CHECK-NEXT: vmax.vv v8, v8, v9
; CHECK-NEXT: ret
%sub = sub nuw <2 x i64> %a, %b
%abs = call <2 x i64> @llvm.abs.v2i64(<2 x i64> %sub, i1 true)
ret <2 x i64> %abs
}

define <16 x i8> @sabd_v16i8_nsw(<16 x i8> %a, <16 x i8> %b) {
;
; CHECK-LABEL: sabd_v16i8_nsw:
; CHECK: # %bb.0:
; CHECK-NEXT: vsetivli zero, 16, e8, m1, ta, ma
; CHECK-NEXT: vsub.vv v8, v8, v9
; CHECK-NEXT: vrsub.vi v9, v8, 0
; CHECK-NEXT: vmax.vv v8, v8, v9
; CHECK-NEXT: ret
%sub = sub nsw <16 x i8> %a, %b
%abs = call <16 x i8> @llvm.abs.v16i8(<16 x i8> %sub, i1 true)
ret <16 x i8> %abs
}

define <8 x i16> @sabd_v8i16_nsw(<8 x i16> %a, <8 x i16> %b) {
;
; CHECK-LABEL: sabd_v8i16_nsw:
; CHECK: # %bb.0:
; CHECK-NEXT: vsetivli zero, 8, e16, m1, ta, ma
; CHECK-NEXT: vsub.vv v8, v8, v9
; CHECK-NEXT: vrsub.vi v9, v8, 0
; CHECK-NEXT: vmax.vv v8, v8, v9
; CHECK-NEXT: ret
%sub = sub nsw <8 x i16> %a, %b
%abs = call <8 x i16> @llvm.abs.v8i16(<8 x i16> %sub, i1 true)
ret <8 x i16> %abs
}

define <4 x i32> @sabd_v4i32_nsw(<4 x i32> %a, <4 x i32> %b) {
;
; CHECK-LABEL: sabd_v4i32_nsw:
; CHECK: # %bb.0:
; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, ma
; CHECK-NEXT: vsub.vv v8, v8, v9
; CHECK-NEXT: vrsub.vi v9, v8, 0
; CHECK-NEXT: vmax.vv v8, v8, v9
; CHECK-NEXT: ret
%sub = sub nsw <4 x i32> %a, %b
%abs = call <4 x i32> @llvm.abs.v4i32(<4 x i32> %sub, i1 true)
ret <4 x i32> %abs
}

define <2 x i64> @sabd_v2i64_nsw(<2 x i64> %a, <2 x i64> %b) {
;
; CHECK-LABEL: sabd_v2i64_nsw:
; CHECK: # %bb.0:
; CHECK-NEXT: vsetivli zero, 2, e64, m1, ta, ma
; CHECK-NEXT: vsub.vv v8, v8, v9
; CHECK-NEXT: vrsub.vi v9, v8, 0
; CHECK-NEXT: vmax.vv v8, v8, v9
; CHECK-NEXT: ret
%sub = sub nsw <2 x i64> %a, %b
%abs = call <2 x i64> @llvm.abs.v2i64(<2 x i64> %sub, i1 true)
ret <2 x i64> %abs
}

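; max(a,b) - min(a,b) is another spelling of the absolute difference; the
; signed and unsigned variants are expected to lower directly to
; vmax.vv/vmin.vv (or vmaxu.vv/vminu.vv) followed by vsub.vv.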
define <16 x i8> @smaxmin_v16i8(<16 x i8> %0, <16 x i8> %1) {
;
; CHECK-LABEL: smaxmin_v16i8:
; CHECK: # %bb.0:
; CHECK-NEXT: vsetivli zero, 16, e8, m1, ta, ma
; CHECK-NEXT: vmax.vv v10, v8, v9
; CHECK-NEXT: vmin.vv v8, v8, v9
; CHECK-NEXT: vsub.vv v8, v10, v8
; CHECK-NEXT: ret
%a = tail call <16 x i8> @llvm.smax.v16i8(<16 x i8> %0, <16 x i8> %1)
%b = tail call <16 x i8> @llvm.smin.v16i8(<16 x i8> %0, <16 x i8> %1)
%sub = sub <16 x i8> %a, %b
ret <16 x i8> %sub
}

define <8 x i16> @smaxmin_v8i16(<8 x i16> %0, <8 x i16> %1) {
;
; CHECK-LABEL: smaxmin_v8i16:
; CHECK: # %bb.0:
; CHECK-NEXT: vsetivli zero, 8, e16, m1, ta, ma
; CHECK-NEXT: vmax.vv v10, v8, v9
; CHECK-NEXT: vmin.vv v8, v8, v9
; CHECK-NEXT: vsub.vv v8, v10, v8
; CHECK-NEXT: ret
%a = tail call <8 x i16> @llvm.smax.v8i16(<8 x i16> %0, <8 x i16> %1)
%b = tail call <8 x i16> @llvm.smin.v8i16(<8 x i16> %0, <8 x i16> %1)
%sub = sub <8 x i16> %a, %b
ret <8 x i16> %sub
}

define <4 x i32> @smaxmin_v4i32(<4 x i32> %0, <4 x i32> %1) {
;
; CHECK-LABEL: smaxmin_v4i32:
; CHECK: # %bb.0:
; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, ma
; CHECK-NEXT: vmax.vv v10, v8, v9
; CHECK-NEXT: vmin.vv v8, v8, v9
; CHECK-NEXT: vsub.vv v8, v10, v8
; CHECK-NEXT: ret
%a = tail call <4 x i32> @llvm.smax.v4i32(<4 x i32> %0, <4 x i32> %1)
%b = tail call <4 x i32> @llvm.smin.v4i32(<4 x i32> %0, <4 x i32> %1)
%sub = sub <4 x i32> %a, %b
ret <4 x i32> %sub
}

define <2 x i64> @smaxmin_v2i64(<2 x i64> %0, <2 x i64> %1) {
;
; CHECK-LABEL: smaxmin_v2i64:
; CHECK: # %bb.0:
; CHECK-NEXT: vsetivli zero, 2, e64, m1, ta, ma
; CHECK-NEXT: vmax.vv v10, v8, v9
; CHECK-NEXT: vmin.vv v8, v8, v9
; CHECK-NEXT: vsub.vv v8, v10, v8
; CHECK-NEXT: ret
%a = tail call <2 x i64> @llvm.smax.v2i64(<2 x i64> %0, <2 x i64> %1)
%b = tail call <2 x i64> @llvm.smin.v2i64(<2 x i64> %0, <2 x i64> %1)
%sub = sub <2 x i64> %a, %b
ret <2 x i64> %sub
}

define <16 x i8> @umaxmin_v16i8(<16 x i8> %0, <16 x i8> %1) {
;
; CHECK-LABEL: umaxmin_v16i8:
; CHECK: # %bb.0:
; CHECK-NEXT: vsetivli zero, 16, e8, m1, ta, ma
; CHECK-NEXT: vmaxu.vv v10, v8, v9
; CHECK-NEXT: vminu.vv v8, v8, v9
; CHECK-NEXT: vsub.vv v8, v10, v8
; CHECK-NEXT: ret
%a = tail call <16 x i8> @llvm.umax.v16i8(<16 x i8> %0, <16 x i8> %1)
%b = tail call <16 x i8> @llvm.umin.v16i8(<16 x i8> %0, <16 x i8> %1)
%sub = sub <16 x i8> %a, %b
ret <16 x i8> %sub
}

define <8 x i16> @umaxmin_v8i16(<8 x i16> %0, <8 x i16> %1) {
;
; CHECK-LABEL: umaxmin_v8i16:
; CHECK: # %bb.0:
; CHECK-NEXT: vsetivli zero, 8, e16, m1, ta, ma
; CHECK-NEXT: vmaxu.vv v10, v8, v9
; CHECK-NEXT: vminu.vv v8, v8, v9
; CHECK-NEXT: vsub.vv v8, v10, v8
; CHECK-NEXT: ret
%a = tail call <8 x i16> @llvm.umax.v8i16(<8 x i16> %0, <8 x i16> %1)
%b = tail call <8 x i16> @llvm.umin.v8i16(<8 x i16> %0, <8 x i16> %1)
%sub = sub <8 x i16> %a, %b
ret <8 x i16> %sub
}

define <4 x i32> @umaxmin_v4i32(<4 x i32> %0, <4 x i32> %1) {
;
; CHECK-LABEL: umaxmin_v4i32:
; CHECK: # %bb.0:
; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, ma
; CHECK-NEXT: vmaxu.vv v10, v8, v9
; CHECK-NEXT: vminu.vv v8, v8, v9
; CHECK-NEXT: vsub.vv v8, v10, v8
; CHECK-NEXT: ret
%a = tail call <4 x i32> @llvm.umax.v4i32(<4 x i32> %0, <4 x i32> %1)
%b = tail call <4 x i32> @llvm.umin.v4i32(<4 x i32> %0, <4 x i32> %1)
%sub = sub <4 x i32> %a, %b
ret <4 x i32> %sub
}

define <2 x i64> @umaxmin_v2i64(<2 x i64> %0, <2 x i64> %1) {
;
; CHECK-LABEL: umaxmin_v2i64:
; CHECK: # %bb.0:
; CHECK-NEXT: vsetivli zero, 2, e64, m1, ta, ma
; CHECK-NEXT: vmaxu.vv v10, v8, v9
; CHECK-NEXT: vminu.vv v8, v8, v9
; CHECK-NEXT: vsub.vv v8, v10, v8
; CHECK-NEXT: ret
%a = tail call <2 x i64> @llvm.umax.v2i64(<2 x i64> %0, <2 x i64> %1)
%b = tail call <2 x i64> @llvm.umin.v2i64(<2 x i64> %0, <2 x i64> %1)
%sub = sub <2 x i64> %a, %b
ret <2 x i64> %sub
}

define <16 x i8> @umaxmin_v16i8_com1(<16 x i8> %0, <16 x i8> %1) {
;
; CHECK-LABEL: umaxmin_v16i8_com1:
; CHECK: # %bb.0:
; CHECK-NEXT: vsetivli zero, 16, e8, m1, ta, ma
; CHECK-NEXT: vmaxu.vv v10, v8, v9
; CHECK-NEXT: vminu.vv v8, v9, v8
; CHECK-NEXT: vsub.vv v8, v10, v8
; CHECK-NEXT: ret
%a = tail call <16 x i8> @llvm.umax.v16i8(<16 x i8> %0, <16 x i8> %1)
%b = tail call <16 x i8> @llvm.umin.v16i8(<16 x i8> %1, <16 x i8> %0)
%sub = sub <16 x i8> %a, %b
ret <16 x i8> %sub
}

declare <8 x i8> @llvm.abs.v8i8(<8 x i8>, i1)
declare <16 x i8> @llvm.abs.v16i8(<16 x i8>, i1)

declare <4 x i16> @llvm.abs.v4i16(<4 x i16>, i1)
declare <8 x i16> @llvm.abs.v8i16(<8 x i16>, i1)
declare <16 x i16> @llvm.abs.v16i16(<16 x i16>, i1)

declare <2 x i32> @llvm.abs.v2i32(<2 x i32>, i1)
declare <4 x i32> @llvm.abs.v4i32(<4 x i32>, i1)
declare <8 x i32> @llvm.abs.v8i32(<8 x i32>, i1)

declare <2 x i64> @llvm.abs.v2i64(<2 x i64>, i1)
declare <4 x i64> @llvm.abs.v4i64(<4 x i64>, i1)

declare <2 x i128> @llvm.abs.v2i128(<2 x i128>, i1)

declare <16 x i8> @llvm.smax.v16i8(<16 x i8>, <16 x i8>)
declare <8 x i16> @llvm.smax.v8i16(<8 x i16>, <8 x i16>)
declare <4 x i32> @llvm.smax.v4i32(<4 x i32>, <4 x i32>)
declare <2 x i64> @llvm.smax.v2i64(<2 x i64>, <2 x i64>)
declare <16 x i8> @llvm.smin.v16i8(<16 x i8>, <16 x i8>)
declare <8 x i16> @llvm.smin.v8i16(<8 x i16>, <8 x i16>)
declare <4 x i32> @llvm.smin.v4i32(<4 x i32>, <4 x i32>)
declare <2 x i64> @llvm.smin.v2i64(<2 x i64>, <2 x i64>)
declare <16 x i8> @llvm.umax.v16i8(<16 x i8>, <16 x i8>)
declare <8 x i16> @llvm.umax.v8i16(<8 x i16>, <8 x i16>)
declare <4 x i32> @llvm.umax.v4i32(<4 x i32>, <4 x i32>)
declare <2 x i64> @llvm.umax.v2i64(<2 x i64>, <2 x i64>)
declare <16 x i8> @llvm.umin.v16i8(<16 x i8>, <16 x i8>)
declare <8 x i16> @llvm.umin.v8i16(<8 x i16>, <8 x i16>)
declare <4 x i32> @llvm.umin.v4i32(<4 x i32>, <4 x i32>)
declare <2 x i64> @llvm.umin.v2i64(<2 x i64>, <2 x i64>)

191 changes: 191 additions & 0 deletions llvm/test/CodeGen/RISCV/rvv/fixed-vectors-sad.ll
@@ -0,0 +1,191 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 2
; RUN: llc < %s -mtriple=riscv32 -mattr=+v | FileCheck %s
; RUN: llc < %s -mtriple=riscv64 -mattr=+v | FileCheck %s

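; Sum-of-absolute-differences: the absolute difference of zero-extended
; operands is reduced with llvm.vector.reduce.add. The checks expect the
; vwsubu.vv + vrsub.vi + vmax.vv sequence feeding vredsum.vs, or a widening
; vwredsumu.vs when the accumulator is wider than the difference.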
define signext i16 @sad_4x8_as_i16(<4 x i8> %a, <4 x i8> %b) {
; CHECK-LABEL: sad_4x8_as_i16:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetivli zero, 4, e8, mf4, ta, ma
; CHECK-NEXT: vwsubu.vv v10, v8, v9
; CHECK-NEXT: vsetvli zero, zero, e16, mf2, ta, ma
; CHECK-NEXT: vrsub.vi v8, v10, 0
; CHECK-NEXT: vmax.vv v8, v10, v8
; CHECK-NEXT: vmv.s.x v9, zero
; CHECK-NEXT: vredsum.vs v8, v8, v9
; CHECK-NEXT: vmv.x.s a0, v8
; CHECK-NEXT: ret
entry:
%1 = zext <4 x i8> %a to <4 x i16>
%3 = zext <4 x i8> %b to <4 x i16>
%4 = sub nsw <4 x i16> %1, %3
%5 = tail call <4 x i16> @llvm.abs.v4i16(<4 x i16> %4, i1 true)
%6 = tail call i16 @llvm.vector.reduce.add.v4i16(<4 x i16> %5)
ret i16 %6
}

define signext i32 @sad_4x8_as_i32(<4 x i8> %a, <4 x i8> %b) {
; CHECK-LABEL: sad_4x8_as_i32:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetivli zero, 4, e8, mf4, ta, ma
; CHECK-NEXT: vwsubu.vv v10, v8, v9
; CHECK-NEXT: vsetvli zero, zero, e16, mf2, ta, ma
; CHECK-NEXT: vrsub.vi v8, v10, 0
; CHECK-NEXT: vmax.vv v8, v10, v8
; CHECK-NEXT: vsetvli zero, zero, e32, m1, ta, ma
; CHECK-NEXT: vmv.s.x v9, zero
; CHECK-NEXT: vsetvli zero, zero, e16, mf2, ta, ma
; CHECK-NEXT: vwredsumu.vs v8, v8, v9
; CHECK-NEXT: vsetvli zero, zero, e32, m1, ta, ma
; CHECK-NEXT: vmv.x.s a0, v8
; CHECK-NEXT: ret
entry:
%1 = zext <4 x i8> %a to <4 x i32>
%3 = zext <4 x i8> %b to <4 x i32>
%4 = sub nsw <4 x i32> %1, %3
%5 = tail call <4 x i32> @llvm.abs.v4i32(<4 x i32> %4, i1 true)
%6 = tail call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %5)
ret i32 %6
}

define signext i16 @sad_16x8_as_i16(<16 x i8> %a, <16 x i8> %b) {
; CHECK-LABEL: sad_16x8_as_i16:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetivli zero, 16, e8, m1, ta, ma
; CHECK-NEXT: vwsubu.vv v10, v8, v9
; CHECK-NEXT: vsetvli zero, zero, e16, m2, ta, ma
; CHECK-NEXT: vrsub.vi v8, v10, 0
; CHECK-NEXT: vmax.vv v8, v10, v8
; CHECK-NEXT: vmv.s.x v10, zero
; CHECK-NEXT: vredsum.vs v8, v8, v10
; CHECK-NEXT: vmv.x.s a0, v8
; CHECK-NEXT: ret
entry:
%1 = zext <16 x i8> %a to <16 x i16>
%3 = zext <16 x i8> %b to <16 x i16>
%4 = sub nsw <16 x i16> %1, %3
%5 = tail call <16 x i16> @llvm.abs.v16i16(<16 x i16> %4, i1 true)
%6 = tail call i16 @llvm.vector.reduce.add.v16i16(<16 x i16> %5)
ret i16 %6
}

define signext i32 @sad_16x8_as_i32(<16 x i8> %a, <16 x i8> %b) {
; CHECK-LABEL: sad_16x8_as_i32:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetivli zero, 16, e8, m1, ta, ma
; CHECK-NEXT: vwsubu.vv v10, v8, v9
; CHECK-NEXT: vsetvli zero, zero, e16, m2, ta, ma
; CHECK-NEXT: vrsub.vi v8, v10, 0
; CHECK-NEXT: vmax.vv v8, v10, v8
; CHECK-NEXT: vsetvli zero, zero, e32, m4, ta, ma
; CHECK-NEXT: vmv.s.x v10, zero
; CHECK-NEXT: vsetvli zero, zero, e16, m2, ta, ma
; CHECK-NEXT: vwredsumu.vs v8, v8, v10
; CHECK-NEXT: vsetvli zero, zero, e32, m4, ta, ma
; CHECK-NEXT: vmv.x.s a0, v8
; CHECK-NEXT: ret
entry:
%1 = zext <16 x i8> %a to <16 x i32>
%3 = zext <16 x i8> %b to <16 x i32>
%4 = sub nsw <16 x i32> %1, %3
%5 = tail call <16 x i32> @llvm.abs.v16i32(<16 x i32> %4, i1 true)
%6 = tail call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> %5)
ret i32 %6
}

define signext i32 @sad_2block_16xi8_as_i32(ptr %a, ptr %b, i32 signext %stridea, i32 signext %strideb) {
; CHECK-LABEL: sad_2block_16xi8_as_i32:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetivli zero, 16, e8, m1, ta, ma
; CHECK-NEXT: vle8.v v8, (a0)
; CHECK-NEXT: vle8.v v9, (a1)
; CHECK-NEXT: vwsubu.vv v10, v8, v9
; CHECK-NEXT: vsetvli zero, zero, e16, m2, ta, ma
; CHECK-NEXT: add a0, a0, a2
; CHECK-NEXT: add a1, a1, a3
; CHECK-NEXT: vle8.v v8, (a0)
; CHECK-NEXT: vle8.v v9, (a1)
; CHECK-NEXT: vrsub.vi v12, v10, 0
; CHECK-NEXT: vmax.vv v12, v10, v12
; CHECK-NEXT: vsetvli zero, zero, e8, m1, ta, ma
; CHECK-NEXT: vwsubu.vv v10, v8, v9
; CHECK-NEXT: vsetvli zero, zero, e16, m2, ta, ma
; CHECK-NEXT: vrsub.vi v8, v10, 0
; CHECK-NEXT: add a0, a0, a2
; CHECK-NEXT: add a1, a1, a3
; CHECK-NEXT: vle8.v v14, (a0)
; CHECK-NEXT: vle8.v v15, (a1)
; CHECK-NEXT: vmax.vv v16, v10, v8
; CHECK-NEXT: vwaddu.vv v8, v16, v12
; CHECK-NEXT: vsetvli zero, zero, e8, m1, ta, ma
; CHECK-NEXT: vwsubu.vv v12, v14, v15
; CHECK-NEXT: vsetvli zero, zero, e16, m2, ta, ma
; CHECK-NEXT: vrsub.vi v14, v12, 0
; CHECK-NEXT: add a0, a0, a2
; CHECK-NEXT: add a1, a1, a3
; CHECK-NEXT: vle8.v v16, (a0)
; CHECK-NEXT: vle8.v v17, (a1)
; CHECK-NEXT: vmax.vv v12, v12, v14
; CHECK-NEXT: vwaddu.wv v8, v8, v12
; CHECK-NEXT: vsetvli zero, zero, e8, m1, ta, ma
; CHECK-NEXT: vwsubu.vv v12, v16, v17
; CHECK-NEXT: vsetvli zero, zero, e16, m2, ta, ma
; CHECK-NEXT: vrsub.vi v14, v12, 0
; CHECK-NEXT: vmax.vv v12, v12, v14
; CHECK-NEXT: vwaddu.wv v8, v8, v12
; CHECK-NEXT: vsetvli zero, zero, e32, m4, ta, ma
; CHECK-NEXT: vmv.s.x v12, zero
; CHECK-NEXT: vredsum.vs v8, v8, v12
; CHECK-NEXT: vmv.x.s a0, v8
; CHECK-NEXT: ret
entry:
%idx.ext8 = sext i32 %strideb to i64
%idx.ext = sext i32 %stridea to i64
%0 = load <16 x i8>, ptr %a, align 1
%1 = zext <16 x i8> %0 to <16 x i32>
%2 = load <16 x i8>, ptr %b, align 1
%3 = zext <16 x i8> %2 to <16 x i32>
%4 = sub nsw <16 x i32> %1, %3
%5 = tail call <16 x i32> @llvm.abs.v16i32(<16 x i32> %4, i1 true)
%6 = tail call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> %5)
%add.ptr = getelementptr inbounds i8, ptr %a, i64 %idx.ext
%add.ptr9 = getelementptr inbounds i8, ptr %b, i64 %idx.ext8
%7 = load <16 x i8>, ptr %add.ptr, align 1
%8 = zext <16 x i8> %7 to <16 x i32>
%9 = load <16 x i8>, ptr %add.ptr9, align 1
%10 = zext <16 x i8> %9 to <16 x i32>
%11 = sub nsw <16 x i32> %8, %10
%12 = tail call <16 x i32> @llvm.abs.v16i32(<16 x i32> %11, i1 true)
%13 = tail call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> %12)
%op.rdx.1 = add i32 %13, %6
%add.ptr.1 = getelementptr inbounds i8, ptr %add.ptr, i64 %idx.ext
%add.ptr9.1 = getelementptr inbounds i8, ptr %add.ptr9, i64 %idx.ext8
%14 = load <16 x i8>, ptr %add.ptr.1, align 1
%15 = zext <16 x i8> %14 to <16 x i32>
%16 = load <16 x i8>, ptr %add.ptr9.1, align 1
%17 = zext <16 x i8> %16 to <16 x i32>
%18 = sub nsw <16 x i32> %15, %17
%19 = tail call <16 x i32> @llvm.abs.v16i32(<16 x i32> %18, i1 true)
%20 = tail call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> %19)
%op.rdx.2 = add i32 %20, %op.rdx.1
%add.ptr.2 = getelementptr inbounds i8, ptr %add.ptr.1, i64 %idx.ext
%add.ptr9.2 = getelementptr inbounds i8, ptr %add.ptr9.1, i64 %idx.ext8
%21 = load <16 x i8>, ptr %add.ptr.2, align 1
%22 = zext <16 x i8> %21 to <16 x i32>
%23 = load <16 x i8>, ptr %add.ptr9.2, align 1
%24 = zext <16 x i8> %23 to <16 x i32>
%25 = sub nsw <16 x i32> %22, %24
%26 = tail call <16 x i32> @llvm.abs.v16i32(<16 x i32> %25, i1 true)
%27 = tail call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> %26)
%op.rdx.3 = add i32 %27, %op.rdx.2
ret i32 %op.rdx.3
}

declare <4 x i32> @llvm.abs.v4i32(<4 x i32>, i1)
declare i32 @llvm.vector.reduce.add.v4i32(<4 x i32>)
declare <4 x i16> @llvm.abs.v4i16(<4 x i16>, i1)
declare i16 @llvm.vector.reduce.add.v4i16(<4 x i16>)

declare <16 x i32> @llvm.abs.v16i32(<16 x i32>, i1)
declare i32 @llvm.vector.reduce.add.v16i32(<16 x i32>)
declare <16 x i16> @llvm.abs.v16i16(<16 x i16>, i1)
declare i16 @llvm.vector.reduce.add.v16i16(<16 x i16>)