@@ -1,6 +1,6 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -mtriple=riscv32 -mattr=+v,+m,+zbb -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,RV32
; RUN: llc -mtriple=riscv64 -mattr=+v,+m,+zbb -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,RV64
; RUN: llc -mtriple=riscv32 -mattr=+v,+m -verify-machineinstrs < %s | FileCheck %s
; RUN: llc -mtriple=riscv64 -mattr=+v,+m -verify-machineinstrs < %s | FileCheck %s
define i32 @reduce_sum_2xi32(<2 x i32> %v) {
; CHECK-LABEL: reduce_sum_2xi32:
@@ -448,336 +448,3 @@ define i32 @reduce_sum_16xi32_prefix15(ptr %p) {
%add13 = add i32 %add12, %e14
ret i32 %add13
}
;; Most of the corner cases are exercised above; the following just
;; makes sure that other opcodes work as expected.
define i32 @reduce_xor_16xi32_prefix2(ptr %p) {
; CHECK-LABEL: reduce_xor_16xi32_prefix2:
; CHECK: # %bb.0:
; CHECK-NEXT: vsetivli zero, 2, e32, mf2, ta, ma
; CHECK-NEXT: vle32.v v8, (a0)
; CHECK-NEXT: vmv.s.x v9, zero
; CHECK-NEXT: vredxor.vs v8, v8, v9
; CHECK-NEXT: vmv.x.s a0, v8
; CHECK-NEXT: ret
%v = load <16 x i32>, ptr %p, align 256
%e0 = extractelement <16 x i32> %v, i32 0
%e1 = extractelement <16 x i32> %v, i32 1
%xor0 = xor i32 %e0, %e1
ret i32 %xor0
}
define i32 @reduce_xor_16xi32_prefix5(ptr %p) {
; CHECK-LABEL: reduce_xor_16xi32_prefix5:
; CHECK: # %bb.0:
; CHECK-NEXT: li a1, 224
; CHECK-NEXT: vsetivli zero, 8, e8, mf2, ta, ma
; CHECK-NEXT: vmv.s.x v0, a1
; CHECK-NEXT: vmv.v.i v8, -1
; CHECK-NEXT: vmerge.vim v8, v8, 0, v0
; CHECK-NEXT: vsetvli zero, zero, e32, m2, ta, ma
; CHECK-NEXT: vle32.v v10, (a0)
; CHECK-NEXT: vsext.vf4 v12, v8
; CHECK-NEXT: vand.vv v8, v10, v12
; CHECK-NEXT: vmv.s.x v10, zero
; CHECK-NEXT: vredxor.vs v8, v8, v10
; CHECK-NEXT: vmv.x.s a0, v8
; CHECK-NEXT: ret
%v = load <16 x i32>, ptr %p, align 256
%e0 = extractelement <16 x i32> %v, i32 0
%e1 = extractelement <16 x i32> %v, i32 1
%e2 = extractelement <16 x i32> %v, i32 2
%e3 = extractelement <16 x i32> %v, i32 3
%e4 = extractelement <16 x i32> %v, i32 4
%xor0 = xor i32 %e0, %e1
%xor1 = xor i32 %xor0, %e2
%xor2 = xor i32 %xor1, %e3
%xor3 = xor i32 %xor2, %e4
ret i32 %xor3
}
define i32 @reduce_and_16xi32_prefix2(ptr %p) {
; CHECK-LABEL: reduce_and_16xi32_prefix2:
; CHECK: # %bb.0:
; CHECK-NEXT: vsetivli zero, 2, e32, mf2, ta, ma
; CHECK-NEXT: vle32.v v8, (a0)
; CHECK-NEXT: vredand.vs v8, v8, v8
; CHECK-NEXT: vmv.x.s a0, v8
; CHECK-NEXT: ret
%v = load <16 x i32>, ptr %p, align 256
%e0 = extractelement <16 x i32> %v, i32 0
%e1 = extractelement <16 x i32> %v, i32 1
%and0 = and i32 %e0, %e1
ret i32 %and0
}
define i32 @reduce_and_16xi32_prefix5(ptr %p) {
; CHECK-LABEL: reduce_and_16xi32_prefix5:
; CHECK: # %bb.0:
; CHECK-NEXT: vsetivli zero, 8, e32, m1, ta, ma
; CHECK-NEXT: vmv.v.i v8, -1
; CHECK-NEXT: vsetivli zero, 8, e32, m2, ta, ma
; CHECK-NEXT: vle32.v v10, (a0)
; CHECK-NEXT: vsetivli zero, 6, e32, m2, tu, ma
; CHECK-NEXT: vslideup.vi v10, v8, 5
; CHECK-NEXT: vsetivli zero, 7, e32, m2, tu, ma
; CHECK-NEXT: vslideup.vi v10, v8, 6
; CHECK-NEXT: vsetivli zero, 8, e32, m2, ta, ma
; CHECK-NEXT: vslideup.vi v10, v8, 7
; CHECK-NEXT: vredand.vs v8, v10, v10
; CHECK-NEXT: vmv.x.s a0, v8
; CHECK-NEXT: ret
%v = load <16 x i32>, ptr %p, align 256
%e0 = extractelement <16 x i32> %v, i32 0
%e1 = extractelement <16 x i32> %v, i32 1
%e2 = extractelement <16 x i32> %v, i32 2
%e3 = extractelement <16 x i32> %v, i32 3
%e4 = extractelement <16 x i32> %v, i32 4
%and0 = and i32 %e0, %e1
%and1 = and i32 %and0, %e2
%and2 = and i32 %and1, %e3
%and3 = and i32 %and2, %e4
ret i32 %and3
}
define i32 @reduce_or_16xi32_prefix2(ptr %p) {
; CHECK-LABEL: reduce_or_16xi32_prefix2:
; CHECK: # %bb.0:
; CHECK-NEXT: vsetivli zero, 2, e32, mf2, ta, ma
; CHECK-NEXT: vle32.v v8, (a0)
; CHECK-NEXT: vredor.vs v8, v8, v8
; CHECK-NEXT: vmv.x.s a0, v8
; CHECK-NEXT: ret
%v = load <16 x i32>, ptr %p, align 256
%e0 = extractelement <16 x i32> %v, i32 0
%e1 = extractelement <16 x i32> %v, i32 1
%or0 = or i32 %e0, %e1
ret i32 %or0
}
define i32 @reduce_or_16xi32_prefix5(ptr %p) {
; CHECK-LABEL: reduce_or_16xi32_prefix5:
; CHECK: # %bb.0:
; CHECK-NEXT: li a1, 224
; CHECK-NEXT: vsetivli zero, 8, e8, mf2, ta, ma
; CHECK-NEXT: vmv.s.x v0, a1
; CHECK-NEXT: vmv.v.i v8, -1
; CHECK-NEXT: vmerge.vim v8, v8, 0, v0
; CHECK-NEXT: vsetvli zero, zero, e32, m2, ta, ma
; CHECK-NEXT: vle32.v v10, (a0)
; CHECK-NEXT: vsext.vf4 v12, v8
; CHECK-NEXT: vand.vv v8, v10, v12
; CHECK-NEXT: vredor.vs v8, v8, v8
; CHECK-NEXT: vmv.x.s a0, v8
; CHECK-NEXT: ret
%v = load <16 x i32>, ptr %p, align 256
%e0 = extractelement <16 x i32> %v, i32 0
%e1 = extractelement <16 x i32> %v, i32 1
%e2 = extractelement <16 x i32> %v, i32 2
%e3 = extractelement <16 x i32> %v, i32 3
%e4 = extractelement <16 x i32> %v, i32 4
%or0 = or i32 %e0, %e1
%or1 = or i32 %or0, %e2
%or2 = or i32 %or1, %e3
%or3 = or i32 %or2, %e4
ret i32 %or3
}
declare i32 @llvm.smax.i32(i32 %a, i32 %b)
declare i32 @llvm.smin.i32(i32 %a, i32 %b)
declare i32 @llvm.umax.i32(i32 %a, i32 %b)
declare i32 @llvm.umin.i32(i32 %a, i32 %b)
define i32 @reduce_smax_16xi32_prefix2(ptr %p) {
; CHECK-LABEL: reduce_smax_16xi32_prefix2:
; CHECK: # %bb.0:
; CHECK-NEXT: vsetivli zero, 2, e32, mf2, ta, ma
; CHECK-NEXT: vle32.v v8, (a0)
; CHECK-NEXT: vredmax.vs v8, v8, v8
; CHECK-NEXT: vmv.x.s a0, v8
; CHECK-NEXT: ret
%v = load <16 x i32>, ptr %p, align 256
%e0 = extractelement <16 x i32> %v, i32 0
%e1 = extractelement <16 x i32> %v, i32 1
%smax0 = call i32 @llvm.smax.i32(i32 %e0, i32 %e1)
ret i32 %smax0
}
define i32 @reduce_smax_16xi32_prefix5(ptr %p) {
; CHECK-LABEL: reduce_smax_16xi32_prefix5:
; CHECK: # %bb.0:
; CHECK-NEXT: lui a1, 524288
; CHECK-NEXT: vsetivli zero, 8, e32, m2, ta, ma
; CHECK-NEXT: vle32.v v8, (a0)
; CHECK-NEXT: vmv.s.x v10, a1
; CHECK-NEXT: vsetivli zero, 6, e32, m2, tu, ma
; CHECK-NEXT: vslideup.vi v8, v10, 5
; CHECK-NEXT: vsetivli zero, 7, e32, m2, tu, ma
; CHECK-NEXT: vslideup.vi v8, v10, 6
; CHECK-NEXT: vsetivli zero, 8, e32, m2, ta, ma
; CHECK-NEXT: vslideup.vi v8, v10, 7
; CHECK-NEXT: vredmax.vs v8, v8, v8
; CHECK-NEXT: vmv.x.s a0, v8
; CHECK-NEXT: ret
%v = load <16 x i32>, ptr %p, align 256
%e0 = extractelement <16 x i32> %v, i32 0
%e1 = extractelement <16 x i32> %v, i32 1
%e2 = extractelement <16 x i32> %v, i32 2
%e3 = extractelement <16 x i32> %v, i32 3
%e4 = extractelement <16 x i32> %v, i32 4
%smax0 = call i32 @llvm.smax.i32(i32 %e0, i32 %e1)
%smax1 = call i32 @llvm.smax.i32(i32 %smax0, i32 %e2)
%smax2 = call i32 @llvm.smax.i32(i32 %smax1, i32 %e3)
%smax3 = call i32 @llvm.smax.i32(i32 %smax2, i32 %e4)
ret i32 %smax3
}
define i32 @reduce_smin_16xi32_prefix2(ptr %p) {
; CHECK-LABEL: reduce_smin_16xi32_prefix2:
; CHECK: # %bb.0:
; CHECK-NEXT: vsetivli zero, 2, e32, mf2, ta, ma
; CHECK-NEXT: vle32.v v8, (a0)
; CHECK-NEXT: vredmin.vs v8, v8, v8
; CHECK-NEXT: vmv.x.s a0, v8
; CHECK-NEXT: ret
%v = load <16 x i32>, ptr %p, align 256
%e0 = extractelement <16 x i32> %v, i32 0
%e1 = extractelement <16 x i32> %v, i32 1
%smin0 = call i32 @llvm.smin.i32(i32 %e0, i32 %e1)
ret i32 %smin0
}
define i32 @reduce_smin_16xi32_prefix5(ptr %p) {
; RV32-LABEL: reduce_smin_16xi32_prefix5:
; RV32: # %bb.0:
; RV32-NEXT: lui a1, 524288
; RV32-NEXT: addi a1, a1, -1
; RV32-NEXT: vsetivli zero, 8, e32, m2, ta, ma
; RV32-NEXT: vle32.v v8, (a0)
; RV32-NEXT: vmv.s.x v10, a1
; RV32-NEXT: vsetivli zero, 6, e32, m2, tu, ma
; RV32-NEXT: vslideup.vi v8, v10, 5
; RV32-NEXT: vsetivli zero, 7, e32, m2, tu, ma
; RV32-NEXT: vslideup.vi v8, v10, 6
; RV32-NEXT: vsetivli zero, 8, e32, m2, ta, ma
; RV32-NEXT: vslideup.vi v8, v10, 7
; RV32-NEXT: vredmin.vs v8, v8, v8
; RV32-NEXT: vmv.x.s a0, v8
; RV32-NEXT: ret
;
; RV64-LABEL: reduce_smin_16xi32_prefix5:
; RV64: # %bb.0:
; RV64-NEXT: lui a1, 524288
; RV64-NEXT: addiw a1, a1, -1
; RV64-NEXT: vsetivli zero, 8, e32, m2, ta, ma
; RV64-NEXT: vle32.v v8, (a0)
; RV64-NEXT: vmv.s.x v10, a1
; RV64-NEXT: vsetivli zero, 6, e32, m2, tu, ma
; RV64-NEXT: vslideup.vi v8, v10, 5
; RV64-NEXT: vsetivli zero, 7, e32, m2, tu, ma
; RV64-NEXT: vslideup.vi v8, v10, 6
; RV64-NEXT: vsetivli zero, 8, e32, m2, ta, ma
; RV64-NEXT: vslideup.vi v8, v10, 7
; RV64-NEXT: vredmin.vs v8, v8, v8
; RV64-NEXT: vmv.x.s a0, v8
; RV64-NEXT: ret
%v = load <16 x i32>, ptr %p, align 256
%e0 = extractelement <16 x i32> %v, i32 0
%e1 = extractelement <16 x i32> %v, i32 1
%e2 = extractelement <16 x i32> %v, i32 2
%e3 = extractelement <16 x i32> %v, i32 3
%e4 = extractelement <16 x i32> %v, i32 4
%smin0 = call i32 @llvm.smin.i32(i32 %e0, i32 %e1)
%smin1 = call i32 @llvm.smin.i32(i32 %smin0, i32 %e2)
%smin2 = call i32 @llvm.smin.i32(i32 %smin1, i32 %e3)
%smin3 = call i32 @llvm.smin.i32(i32 %smin2, i32 %e4)
ret i32 %smin3
}
define i32 @reduce_umax_16xi32_prefix2(ptr %p) {
; CHECK-LABEL: reduce_umax_16xi32_prefix2:
; CHECK: # %bb.0:
; CHECK-NEXT: vsetivli zero, 2, e32, mf2, ta, ma
; CHECK-NEXT: vle32.v v8, (a0)
; CHECK-NEXT: vredmaxu.vs v8, v8, v8
; CHECK-NEXT: vmv.x.s a0, v8
; CHECK-NEXT: ret
%v = load <16 x i32>, ptr %p, align 256
%e0 = extractelement <16 x i32> %v, i32 0
%e1 = extractelement <16 x i32> %v, i32 1
%umax0 = call i32 @llvm.umax.i32(i32 %e0, i32 %e1)
ret i32 %umax0
}
define i32 @reduce_umax_16xi32_prefix5(ptr %p) {
; CHECK-LABEL: reduce_umax_16xi32_prefix5:
; CHECK: # %bb.0:
; CHECK-NEXT: li a1, 224
; CHECK-NEXT: vsetivli zero, 8, e8, mf2, ta, ma
; CHECK-NEXT: vmv.s.x v0, a1
; CHECK-NEXT: vmv.v.i v8, -1
; CHECK-NEXT: vmerge.vim v8, v8, 0, v0
; CHECK-NEXT: vsetvli zero, zero, e32, m2, ta, ma
; CHECK-NEXT: vle32.v v10, (a0)
; CHECK-NEXT: vsext.vf4 v12, v8
; CHECK-NEXT: vand.vv v8, v10, v12
; CHECK-NEXT: vredmaxu.vs v8, v8, v8
; CHECK-NEXT: vmv.x.s a0, v8
; CHECK-NEXT: ret
%v = load <16 x i32>, ptr %p, align 256
%e0 = extractelement <16 x i32> %v, i32 0
%e1 = extractelement <16 x i32> %v, i32 1
%e2 = extractelement <16 x i32> %v, i32 2
%e3 = extractelement <16 x i32> %v, i32 3
%e4 = extractelement <16 x i32> %v, i32 4
%umax0 = call i32 @llvm.umax.i32(i32 %e0, i32 %e1)
%umax1 = call i32 @llvm.umax.i32(i32 %umax0, i32 %e2)
%umax2 = call i32 @llvm.umax.i32(i32 %umax1, i32 %e3)
%umax3 = call i32 @llvm.umax.i32(i32 %umax2, i32 %e4)
ret i32 %umax3
}
define i32 @reduce_umin_16xi32_prefix2(ptr %p) {
; CHECK-LABEL: reduce_umin_16xi32_prefix2:
; CHECK: # %bb.0:
; CHECK-NEXT: vsetivli zero, 2, e32, mf2, ta, ma
; CHECK-NEXT: vle32.v v8, (a0)
; CHECK-NEXT: vredminu.vs v8, v8, v8
; CHECK-NEXT: vmv.x.s a0, v8
; CHECK-NEXT: ret
%v = load <16 x i32>, ptr %p, align 256
%e0 = extractelement <16 x i32> %v, i32 0
%e1 = extractelement <16 x i32> %v, i32 1
%umin0 = call i32 @llvm.umin.i32(i32 %e0, i32 %e1)
ret i32 %umin0
}
define i32 @reduce_umin_16xi32_prefix5(ptr %p) {
; CHECK-LABEL: reduce_umin_16xi32_prefix5:
; CHECK: # %bb.0:
; CHECK-NEXT: vsetivli zero, 8, e32, m1, ta, ma
; CHECK-NEXT: vmv.v.i v8, -1
; CHECK-NEXT: vsetivli zero, 8, e32, m2, ta, ma
; CHECK-NEXT: vle32.v v10, (a0)
; CHECK-NEXT: vsetivli zero, 6, e32, m2, tu, ma
; CHECK-NEXT: vslideup.vi v10, v8, 5
; CHECK-NEXT: vsetivli zero, 7, e32, m2, tu, ma
; CHECK-NEXT: vslideup.vi v10, v8, 6
; CHECK-NEXT: vsetivli zero, 8, e32, m2, ta, ma
; CHECK-NEXT: vslideup.vi v10, v8, 7
; CHECK-NEXT: vredminu.vs v8, v10, v10
; CHECK-NEXT: vmv.x.s a0, v8
; CHECK-NEXT: ret
%v = load <16 x i32>, ptr %p, align 256
%e0 = extractelement <16 x i32> %v, i32 0
%e1 = extractelement <16 x i32> %v, i32 1
%e2 = extractelement <16 x i32> %v, i32 2
%e3 = extractelement <16 x i32> %v, i32 3
%e4 = extractelement <16 x i32> %v, i32 4
%umin0 = call i32 @llvm.umin.i32(i32 %e0, i32 %e1)
%umin1 = call i32 @llvm.umin.i32(i32 %umin0, i32 %e2)
%umin2 = call i32 @llvm.umin.i32(i32 %umin1, i32 %e3)
%umin3 = call i32 @llvm.umin.i32(i32 %umin2, i32 %e4)
ret i32 %umin3
}