55 changes: 33 additions & 22 deletions llvm/test/CodeGen/RISCV/rvv/fixed-vectors-int-shuffles.ll
Original file line number Diff line number Diff line change
Expand Up @@ -645,19 +645,39 @@ entry:
ret <4 x i16> %5
}

define <8 x i8> @merge_start_into_end(<8 x i8> %v, <8 x i8> %w) {
; CHECK-LABEL: merge_start_into_end:
define <8 x i8> @concat_4xi8_start(<8 x i8> %v, <8 x i8> %w) {
; CHECK-LABEL: concat_4xi8_start:
; CHECK: # %bb.0:
; CHECK-NEXT: vsetivli zero, 8, e8, mf2, ta, ma
; CHECK-NEXT: vslideup.vi v8, v9, 4
; CHECK-NEXT: ret
%res = shufflevector <8 x i8> %v, <8 x i8> %w, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 8, i32 9, i32 10, i32 11>
ret <8 x i8> %res
}

; Shuffle of two <8 x i8> inputs whose result takes lanes 0-3 from %v and
; places elements 0, 2 and 3 of %w (mask indices 8, 10, 11) in lanes 4, 6
; and 7; lane 5 is left undef by the mask.
; The assertions below expect a single vslideup.vi at offset 4 under an
; 8-element e8/mf2 vsetivli, with no mask or gather instructions.
; NOTE(review): presumably v8/v9 carry %v/%w and v8 carries the result per
; the vector calling convention -- confirm against the file's RUN lines.
define <8 x i8> @concat_4xi8_start_undef(<8 x i8> %v, <8 x i8> %w) {
; CHECK-LABEL: concat_4xi8_start_undef:
; CHECK: # %bb.0:
; CHECK-NEXT: vsetivli zero, 8, e8, mf2, ta, ma
; CHECK-NEXT: vslideup.vi v8, v9, 4
; CHECK-NEXT: ret
%res = shufflevector <8 x i8> %v, <8 x i8> %w, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 8, i32 undef, i32 10, i32 11>
ret <8 x i8> %res
}

define <8 x i8> @concat_4xi8_start_undef_at_start(<8 x i8> %v, <8 x i8> %w) {
; CHECK-LABEL: concat_4xi8_start_undef_at_start:
; CHECK: # %bb.0:
; CHECK-NEXT: vsetivli zero, 8, e8, mf2, ta, mu
; CHECK-NEXT: vid.v v11
; CHECK-NEXT: vrgather.vv v10, v8, v11
; CHECK-NEXT: li a0, 240
; CHECK-NEXT: li a0, 224
; CHECK-NEXT: vmv.s.x v0, a0
; CHECK-NEXT: vadd.vi v8, v11, -4
; CHECK-NEXT: vrgather.vv v10, v9, v8, v0.t
; CHECK-NEXT: vmv1r.v v8, v10
; CHECK-NEXT: ret
%res = shufflevector <8 x i8> %v, <8 x i8> %w, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 8, i32 9, i32 10, i32 11>
%res = shufflevector <8 x i8> %v, <8 x i8> %w, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 undef, i32 9, i32 10, i32 11>
ret <8 x i8> %res
}

Expand All @@ -680,10 +700,9 @@ define <8 x i8> @merge_start_into_end_non_contiguous(<8 x i8> %v, <8 x i8> %w) {
define <8 x i8> @merge_end_into_end(<8 x i8> %v, <8 x i8> %w) {
; CHECK-LABEL: merge_end_into_end:
; CHECK: # %bb.0:
; CHECK-NEXT: li a0, 15
; CHECK-NEXT: vsetivli zero, 8, e8, mf2, ta, ma
; CHECK-NEXT: vmv.s.x v0, a0
; CHECK-NEXT: vmerge.vvm v8, v9, v8, v0
; CHECK-NEXT: vsetivli zero, 4, e8, mf2, tu, ma
; CHECK-NEXT: vslideup.vi v9, v8, 0
; CHECK-NEXT: vmv1r.v v8, v9
; CHECK-NEXT: ret
%res = shufflevector <8 x i8> %v, <8 x i8> %w, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 12, i32 13, i32 14, i32 15>
ret <8 x i8> %res
Expand All @@ -692,14 +711,8 @@ define <8 x i8> @merge_end_into_end(<8 x i8> %v, <8 x i8> %w) {
define <8 x i8> @merge_start_into_middle(<8 x i8> %v, <8 x i8> %w) {
; CHECK-LABEL: merge_start_into_middle:
; CHECK: # %bb.0:
; CHECK-NEXT: vsetivli zero, 8, e8, mf2, ta, mu
; CHECK-NEXT: vid.v v11
; CHECK-NEXT: vrgather.vv v10, v8, v11
; CHECK-NEXT: li a0, 30
; CHECK-NEXT: vmv.s.x v0, a0
; CHECK-NEXT: vadd.vi v8, v11, -1
; CHECK-NEXT: vrgather.vv v10, v9, v8, v0.t
; CHECK-NEXT: vmv1r.v v8, v10
; CHECK-NEXT: vsetivli zero, 5, e8, mf2, tu, ma
; CHECK-NEXT: vslideup.vi v8, v9, 1
; CHECK-NEXT: ret
%res = shufflevector <8 x i8> %v, <8 x i8> %w, <8 x i32> <i32 0, i32 8, i32 9, i32 10, i32 11, i32 5, i32 6, i32 7>
ret <8 x i8> %res
Expand All @@ -708,10 +721,8 @@ define <8 x i8> @merge_start_into_middle(<8 x i8> %v, <8 x i8> %w) {
define <8 x i8> @merge_start_into_start(<8 x i8> %v, <8 x i8> %w) {
; CHECK-LABEL: merge_start_into_start:
; CHECK: # %bb.0:
; CHECK-NEXT: li a0, 240
; CHECK-NEXT: vsetivli zero, 8, e8, mf2, ta, ma
; CHECK-NEXT: vmv.s.x v0, a0
; CHECK-NEXT: vmerge.vvm v8, v9, v8, v0
; CHECK-NEXT: vsetivli zero, 4, e8, mf2, tu, ma
; CHECK-NEXT: vslideup.vi v8, v9, 0
; CHECK-NEXT: ret
%res = shufflevector <8 x i8> %v, <8 x i8> %w, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 4, i32 5, i32 6, i32 7>
ret <8 x i8> %res
Expand Down Expand Up @@ -758,8 +769,8 @@ define <8 x i8> @unmergable(<8 x i8> %v, <8 x i8> %w) {
; CHECK-NEXT: vsetivli zero, 8, e8, mf2, ta, mu
; CHECK-NEXT: vid.v v10
; CHECK-NEXT: vadd.vi v11, v10, 2
; CHECK-NEXT: lui a0, %hi(.LCPI44_0)
; CHECK-NEXT: addi a0, a0, %lo(.LCPI44_0)
; CHECK-NEXT: lui a0, %hi(.LCPI46_0)
; CHECK-NEXT: addi a0, a0, %lo(.LCPI46_0)
; CHECK-NEXT: vle8.v v12, (a0)
; CHECK-NEXT: li a0, 234
; CHECK-NEXT: vmv.s.x v0, a0
Expand Down
36 changes: 12 additions & 24 deletions llvm/test/CodeGen/RISCV/rvv/fixed-vectors-shufflevector-vnsrl.ll
Original file line number Diff line number Diff line change
Expand Up @@ -150,13 +150,10 @@ define void @vnsrl_0_i32(ptr %in, ptr %out) {
; ZVE32F: # %bb.0: # %entry
; ZVE32F-NEXT: vsetivli zero, 4, e32, m1, ta, ma
; ZVE32F-NEXT: vle32.v v8, (a0)
; ZVE32F-NEXT: vsetivli zero, 2, e32, m1, ta, mu
; ZVE32F-NEXT: vsetivli zero, 2, e32, m1, ta, ma
; ZVE32F-NEXT: vslidedown.vi v9, v8, 2
; ZVE32F-NEXT: li a0, 2
; ZVE32F-NEXT: vmv.s.x v0, a0
; ZVE32F-NEXT: vrgather.vi v10, v8, 0
; ZVE32F-NEXT: vrgather.vi v10, v9, 0, v0.t
; ZVE32F-NEXT: vse32.v v10, (a1)
; ZVE32F-NEXT: vslideup.vi v8, v9, 1
; ZVE32F-NEXT: vse32.v v8, (a1)
; ZVE32F-NEXT: ret
entry:
%0 = load <4 x i32>, ptr %in, align 4
Expand Down Expand Up @@ -209,13 +206,10 @@ define void @vnsrl_0_float(ptr %in, ptr %out) {
; ZVE32F: # %bb.0: # %entry
; ZVE32F-NEXT: vsetivli zero, 4, e32, m1, ta, ma
; ZVE32F-NEXT: vle32.v v8, (a0)
; ZVE32F-NEXT: vsetivli zero, 2, e32, m1, ta, mu
; ZVE32F-NEXT: vsetivli zero, 2, e32, m1, ta, ma
; ZVE32F-NEXT: vslidedown.vi v9, v8, 2
; ZVE32F-NEXT: li a0, 2
; ZVE32F-NEXT: vmv.s.x v0, a0
; ZVE32F-NEXT: vrgather.vi v10, v8, 0
; ZVE32F-NEXT: vrgather.vi v10, v9, 0, v0.t
; ZVE32F-NEXT: vse32.v v10, (a1)
; ZVE32F-NEXT: vslideup.vi v8, v9, 1
; ZVE32F-NEXT: vse32.v v8, (a1)
; ZVE32F-NEXT: ret
entry:
%0 = load <4 x float>, ptr %in, align 4
Expand Down Expand Up @@ -259,13 +253,10 @@ define void @vnsrl_0_i64(ptr %in, ptr %out) {
; V: # %bb.0: # %entry
; V-NEXT: vsetivli zero, 4, e64, m1, ta, ma
; V-NEXT: vle64.v v8, (a0)
; V-NEXT: vsetivli zero, 2, e64, m1, ta, mu
; V-NEXT: vsetivli zero, 2, e64, m1, ta, ma
; V-NEXT: vslidedown.vi v9, v8, 2
; V-NEXT: li a0, 2
; V-NEXT: vmv.s.x v0, a0
; V-NEXT: vrgather.vi v10, v8, 0
; V-NEXT: vrgather.vi v10, v9, 0, v0.t
; V-NEXT: vse64.v v10, (a1)
; V-NEXT: vslideup.vi v8, v9, 1
; V-NEXT: vse64.v v8, (a1)
; V-NEXT: ret
;
; ZVE32F-LABEL: vnsrl_0_i64:
Expand Down Expand Up @@ -315,13 +306,10 @@ define void @vnsrl_0_double(ptr %in, ptr %out) {
; V: # %bb.0: # %entry
; V-NEXT: vsetivli zero, 4, e64, m1, ta, ma
; V-NEXT: vle64.v v8, (a0)
; V-NEXT: vsetivli zero, 2, e64, m1, ta, mu
; V-NEXT: vsetivli zero, 2, e64, m1, ta, ma
; V-NEXT: vslidedown.vi v9, v8, 2
; V-NEXT: li a0, 2
; V-NEXT: vmv.s.x v0, a0
; V-NEXT: vrgather.vi v10, v8, 0
; V-NEXT: vrgather.vi v10, v9, 0, v0.t
; V-NEXT: vse64.v v10, (a1)
; V-NEXT: vslideup.vi v8, v9, 1
; V-NEXT: vse64.v v8, (a1)
; V-NEXT: ret
;
; ZVE32F-LABEL: vnsrl_0_double:
Expand Down
16 changes: 6 additions & 10 deletions llvm/test/CodeGen/RISCV/rvv/vector-deinterleave-fixed.ll
Original file line number Diff line number Diff line change
Expand Up @@ -292,15 +292,13 @@ define {<2 x i64>, <2 x i64>} @vector_deinterleave_v2i64_v4i64(<4 x i64> %vec) {
; CHECK-LABEL: vector_deinterleave_v2i64_v4i64:
; CHECK: # %bb.0:
; CHECK-NEXT: vsetivli zero, 2, e64, m2, ta, ma
; CHECK-NEXT: vslidedown.vi v12, v8, 2
; CHECK-NEXT: vslidedown.vi v10, v8, 2
; CHECK-NEXT: li a0, 2
; CHECK-NEXT: vmv.s.x v0, a0
; CHECK-NEXT: vsetivli zero, 2, e64, m1, ta, mu
; CHECK-NEXT: vrgather.vi v10, v8, 0
; CHECK-NEXT: vrgather.vi v10, v12, 0, v0.t
; CHECK-NEXT: vrgather.vi v9, v8, 1
; CHECK-NEXT: vrgather.vi v9, v12, 1, v0.t
; CHECK-NEXT: vmv.v.v v8, v10
; CHECK-NEXT: vrgather.vi v9, v10, 1, v0.t
; CHECK-NEXT: vslideup.vi v8, v10, 1
; CHECK-NEXT: ret
%retval = call {<2 x i64>, <2 x i64>} @llvm.experimental.vector.deinterleave2.v4i64(<4 x i64> %vec)
ret {<2 x i64>, <2 x i64>} %retval
Expand Down Expand Up @@ -381,15 +379,13 @@ define {<2 x double>, <2 x double>} @vector_deinterleave_v2f64_v4f64(<4 x double
; CHECK-LABEL: vector_deinterleave_v2f64_v4f64:
; CHECK: # %bb.0:
; CHECK-NEXT: vsetivli zero, 2, e64, m2, ta, ma
; CHECK-NEXT: vslidedown.vi v12, v8, 2
; CHECK-NEXT: vslidedown.vi v10, v8, 2
; CHECK-NEXT: li a0, 2
; CHECK-NEXT: vmv.s.x v0, a0
; CHECK-NEXT: vsetivli zero, 2, e64, m1, ta, mu
; CHECK-NEXT: vrgather.vi v10, v8, 0
; CHECK-NEXT: vrgather.vi v10, v12, 0, v0.t
; CHECK-NEXT: vrgather.vi v9, v8, 1
; CHECK-NEXT: vrgather.vi v9, v12, 1, v0.t
; CHECK-NEXT: vmv.v.v v8, v10
; CHECK-NEXT: vrgather.vi v9, v10, 1, v0.t
; CHECK-NEXT: vslideup.vi v8, v10, 1
; CHECK-NEXT: ret
%retval = call {<2 x double>, <2 x double>} @llvm.experimental.vector.deinterleave2.v4f64(<4 x double> %vec)
ret {<2 x double>, <2 x double>} %retval
Expand Down
72 changes: 30 additions & 42 deletions llvm/test/Transforms/SLPVectorizer/RISCV/math-function.ll
Original file line number Diff line number Diff line change
Expand Up @@ -199,13 +199,11 @@ define <4 x float> @exp_4x(ptr %a) {
; CHECK-NEXT: [[VECEXT_1:%.*]] = extractelement <4 x float> [[TMP0]], i32 1
; CHECK-NEXT: [[TMP2:%.*]] = tail call fast float @expf(float [[VECEXT_1]])
; CHECK-NEXT: [[VECINS_1:%.*]] = insertelement <4 x float> [[VECINS]], float [[TMP2]], i32 1
; CHECK-NEXT: [[VECEXT_2:%.*]] = extractelement <4 x float> [[TMP0]], i32 2
; CHECK-NEXT: [[TMP3:%.*]] = tail call fast float @expf(float [[VECEXT_2]])
; CHECK-NEXT: [[VECINS_2:%.*]] = insertelement <4 x float> [[VECINS_1]], float [[TMP3]], i32 2
; CHECK-NEXT: [[VECEXT_3:%.*]] = extractelement <4 x float> [[TMP0]], i32 3
; CHECK-NEXT: [[TMP4:%.*]] = tail call fast float @expf(float [[VECEXT_3]])
; CHECK-NEXT: [[VECINS_3:%.*]] = insertelement <4 x float> [[VECINS_2]], float [[TMP4]], i32 3
; CHECK-NEXT: ret <4 x float> [[VECINS_3]]
; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <4 x float> [[TMP0]], <4 x float> poison, <2 x i32> <i32 2, i32 3>
; CHECK-NEXT: [[TMP4:%.*]] = call fast <2 x float> @llvm.exp.v2f32(<2 x float> [[TMP3]])
; CHECK-NEXT: [[TMP5:%.*]] = shufflevector <2 x float> [[TMP4]], <2 x float> poison, <4 x i32> <i32 0, i32 1, i32 undef, i32 undef>
; CHECK-NEXT: [[VECINS_31:%.*]] = shufflevector <4 x float> [[VECINS_1]], <4 x float> [[TMP5]], <4 x i32> <i32 0, i32 1, i32 4, i32 5>
; CHECK-NEXT: ret <4 x float> [[VECINS_31]]
;
; DEFAULT-LABEL: define <4 x float> @exp_4x
; DEFAULT-SAME: (ptr [[A:%.*]]) #[[ATTR1]] {
Expand Down Expand Up @@ -256,13 +254,11 @@ define <4 x float> @int_exp_4x(ptr %a) {
; CHECK-NEXT: [[VECEXT_1:%.*]] = extractelement <4 x float> [[TMP0]], i32 1
; CHECK-NEXT: [[TMP2:%.*]] = tail call fast float @llvm.exp.f32(float [[VECEXT_1]])
; CHECK-NEXT: [[VECINS_1:%.*]] = insertelement <4 x float> [[VECINS]], float [[TMP2]], i32 1
; CHECK-NEXT: [[VECEXT_2:%.*]] = extractelement <4 x float> [[TMP0]], i32 2
; CHECK-NEXT: [[TMP3:%.*]] = tail call fast float @llvm.exp.f32(float [[VECEXT_2]])
; CHECK-NEXT: [[VECINS_2:%.*]] = insertelement <4 x float> [[VECINS_1]], float [[TMP3]], i32 2
; CHECK-NEXT: [[VECEXT_3:%.*]] = extractelement <4 x float> [[TMP0]], i32 3
; CHECK-NEXT: [[TMP4:%.*]] = tail call fast float @llvm.exp.f32(float [[VECEXT_3]])
; CHECK-NEXT: [[VECINS_3:%.*]] = insertelement <4 x float> [[VECINS_2]], float [[TMP4]], i32 3
; CHECK-NEXT: ret <4 x float> [[VECINS_3]]
; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <4 x float> [[TMP0]], <4 x float> poison, <2 x i32> <i32 2, i32 3>
; CHECK-NEXT: [[TMP4:%.*]] = call fast <2 x float> @llvm.exp.v2f32(<2 x float> [[TMP3]])
; CHECK-NEXT: [[TMP5:%.*]] = shufflevector <2 x float> [[TMP4]], <2 x float> poison, <4 x i32> <i32 0, i32 1, i32 undef, i32 undef>
; CHECK-NEXT: [[VECINS_31:%.*]] = shufflevector <4 x float> [[VECINS_1]], <4 x float> [[TMP5]], <4 x i32> <i32 0, i32 1, i32 4, i32 5>
; CHECK-NEXT: ret <4 x float> [[VECINS_31]]
;
; DEFAULT-LABEL: define <4 x float> @int_exp_4x
; DEFAULT-SAME: (ptr [[A:%.*]]) #[[ATTR1]] {
Expand Down Expand Up @@ -313,13 +309,11 @@ define <4 x float> @log_4x(ptr %a) {
; CHECK-NEXT: [[VECEXT_1:%.*]] = extractelement <4 x float> [[TMP0]], i32 1
; CHECK-NEXT: [[TMP2:%.*]] = tail call fast float @logf(float [[VECEXT_1]])
; CHECK-NEXT: [[VECINS_1:%.*]] = insertelement <4 x float> [[VECINS]], float [[TMP2]], i32 1
; CHECK-NEXT: [[VECEXT_2:%.*]] = extractelement <4 x float> [[TMP0]], i32 2
; CHECK-NEXT: [[TMP3:%.*]] = tail call fast float @logf(float [[VECEXT_2]])
; CHECK-NEXT: [[VECINS_2:%.*]] = insertelement <4 x float> [[VECINS_1]], float [[TMP3]], i32 2
; CHECK-NEXT: [[VECEXT_3:%.*]] = extractelement <4 x float> [[TMP0]], i32 3
; CHECK-NEXT: [[TMP4:%.*]] = tail call fast float @logf(float [[VECEXT_3]])
; CHECK-NEXT: [[VECINS_3:%.*]] = insertelement <4 x float> [[VECINS_2]], float [[TMP4]], i32 3
; CHECK-NEXT: ret <4 x float> [[VECINS_3]]
; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <4 x float> [[TMP0]], <4 x float> poison, <2 x i32> <i32 2, i32 3>
; CHECK-NEXT: [[TMP4:%.*]] = call fast <2 x float> @llvm.log.v2f32(<2 x float> [[TMP3]])
; CHECK-NEXT: [[TMP5:%.*]] = shufflevector <2 x float> [[TMP4]], <2 x float> poison, <4 x i32> <i32 0, i32 1, i32 undef, i32 undef>
; CHECK-NEXT: [[VECINS_31:%.*]] = shufflevector <4 x float> [[VECINS_1]], <4 x float> [[TMP5]], <4 x i32> <i32 0, i32 1, i32 4, i32 5>
; CHECK-NEXT: ret <4 x float> [[VECINS_31]]
;
; DEFAULT-LABEL: define <4 x float> @log_4x
; DEFAULT-SAME: (ptr [[A:%.*]]) #[[ATTR1]] {
Expand Down Expand Up @@ -370,13 +364,11 @@ define <4 x float> @int_log_4x(ptr %a) {
; CHECK-NEXT: [[VECEXT_1:%.*]] = extractelement <4 x float> [[TMP0]], i32 1
; CHECK-NEXT: [[TMP2:%.*]] = tail call fast float @llvm.log.f32(float [[VECEXT_1]])
; CHECK-NEXT: [[VECINS_1:%.*]] = insertelement <4 x float> [[VECINS]], float [[TMP2]], i32 1
; CHECK-NEXT: [[VECEXT_2:%.*]] = extractelement <4 x float> [[TMP0]], i32 2
; CHECK-NEXT: [[TMP3:%.*]] = tail call fast float @llvm.log.f32(float [[VECEXT_2]])
; CHECK-NEXT: [[VECINS_2:%.*]] = insertelement <4 x float> [[VECINS_1]], float [[TMP3]], i32 2
; CHECK-NEXT: [[VECEXT_3:%.*]] = extractelement <4 x float> [[TMP0]], i32 3
; CHECK-NEXT: [[TMP4:%.*]] = tail call fast float @llvm.log.f32(float [[VECEXT_3]])
; CHECK-NEXT: [[VECINS_3:%.*]] = insertelement <4 x float> [[VECINS_2]], float [[TMP4]], i32 3
; CHECK-NEXT: ret <4 x float> [[VECINS_3]]
; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <4 x float> [[TMP0]], <4 x float> poison, <2 x i32> <i32 2, i32 3>
; CHECK-NEXT: [[TMP4:%.*]] = call fast <2 x float> @llvm.log.v2f32(<2 x float> [[TMP3]])
; CHECK-NEXT: [[TMP5:%.*]] = shufflevector <2 x float> [[TMP4]], <2 x float> poison, <4 x i32> <i32 0, i32 1, i32 undef, i32 undef>
; CHECK-NEXT: [[VECINS_31:%.*]] = shufflevector <4 x float> [[VECINS_1]], <4 x float> [[TMP5]], <4 x i32> <i32 0, i32 1, i32 4, i32 5>
; CHECK-NEXT: ret <4 x float> [[VECINS_31]]
;
; DEFAULT-LABEL: define <4 x float> @int_log_4x
; DEFAULT-SAME: (ptr [[A:%.*]]) #[[ATTR1]] {
Expand Down Expand Up @@ -427,13 +419,11 @@ define <4 x float> @sin_4x(ptr %a) {
; CHECK-NEXT: [[VECEXT_1:%.*]] = extractelement <4 x float> [[TMP0]], i32 1
; CHECK-NEXT: [[TMP2:%.*]] = tail call fast float @sinf(float [[VECEXT_1]])
; CHECK-NEXT: [[VECINS_1:%.*]] = insertelement <4 x float> [[VECINS]], float [[TMP2]], i32 1
; CHECK-NEXT: [[VECEXT_2:%.*]] = extractelement <4 x float> [[TMP0]], i32 2
; CHECK-NEXT: [[TMP3:%.*]] = tail call fast float @sinf(float [[VECEXT_2]])
; CHECK-NEXT: [[VECINS_2:%.*]] = insertelement <4 x float> [[VECINS_1]], float [[TMP3]], i32 2
; CHECK-NEXT: [[VECEXT_3:%.*]] = extractelement <4 x float> [[TMP0]], i32 3
; CHECK-NEXT: [[TMP4:%.*]] = tail call fast float @sinf(float [[VECEXT_3]])
; CHECK-NEXT: [[VECINS_3:%.*]] = insertelement <4 x float> [[VECINS_2]], float [[TMP4]], i32 3
; CHECK-NEXT: ret <4 x float> [[VECINS_3]]
; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <4 x float> [[TMP0]], <4 x float> poison, <2 x i32> <i32 2, i32 3>
; CHECK-NEXT: [[TMP4:%.*]] = call fast <2 x float> @llvm.sin.v2f32(<2 x float> [[TMP3]])
; CHECK-NEXT: [[TMP5:%.*]] = shufflevector <2 x float> [[TMP4]], <2 x float> poison, <4 x i32> <i32 0, i32 1, i32 undef, i32 undef>
; CHECK-NEXT: [[VECINS_31:%.*]] = shufflevector <4 x float> [[VECINS_1]], <4 x float> [[TMP5]], <4 x i32> <i32 0, i32 1, i32 4, i32 5>
; CHECK-NEXT: ret <4 x float> [[VECINS_31]]
;
; DEFAULT-LABEL: define <4 x float> @sin_4x
; DEFAULT-SAME: (ptr [[A:%.*]]) #[[ATTR1]] {
Expand Down Expand Up @@ -484,13 +474,11 @@ define <4 x float> @int_sin_4x(ptr %a) {
; CHECK-NEXT: [[VECEXT_1:%.*]] = extractelement <4 x float> [[TMP0]], i32 1
; CHECK-NEXT: [[TMP2:%.*]] = tail call fast float @llvm.sin.f32(float [[VECEXT_1]])
; CHECK-NEXT: [[VECINS_1:%.*]] = insertelement <4 x float> [[VECINS]], float [[TMP2]], i32 1
; CHECK-NEXT: [[VECEXT_2:%.*]] = extractelement <4 x float> [[TMP0]], i32 2
; CHECK-NEXT: [[TMP3:%.*]] = tail call fast float @llvm.sin.f32(float [[VECEXT_2]])
; CHECK-NEXT: [[VECINS_2:%.*]] = insertelement <4 x float> [[VECINS_1]], float [[TMP3]], i32 2
; CHECK-NEXT: [[VECEXT_3:%.*]] = extractelement <4 x float> [[TMP0]], i32 3
; CHECK-NEXT: [[TMP4:%.*]] = tail call fast float @llvm.sin.f32(float [[VECEXT_3]])
; CHECK-NEXT: [[VECINS_3:%.*]] = insertelement <4 x float> [[VECINS_2]], float [[TMP4]], i32 3
; CHECK-NEXT: ret <4 x float> [[VECINS_3]]
; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <4 x float> [[TMP0]], <4 x float> poison, <2 x i32> <i32 2, i32 3>
; CHECK-NEXT: [[TMP4:%.*]] = call fast <2 x float> @llvm.sin.v2f32(<2 x float> [[TMP3]])
; CHECK-NEXT: [[TMP5:%.*]] = shufflevector <2 x float> [[TMP4]], <2 x float> poison, <4 x i32> <i32 0, i32 1, i32 undef, i32 undef>
; CHECK-NEXT: [[VECINS_31:%.*]] = shufflevector <4 x float> [[VECINS_1]], <4 x float> [[TMP5]], <4 x i32> <i32 0, i32 1, i32 4, i32 5>
; CHECK-NEXT: ret <4 x float> [[VECINS_31]]
;
; DEFAULT-LABEL: define <4 x float> @int_sin_4x
; DEFAULT-SAME: (ptr [[A:%.*]]) #[[ATTR1]] {
Expand Down
175 changes: 116 additions & 59 deletions llvm/test/Transforms/SLPVectorizer/RISCV/reductions.ll
Original file line number Diff line number Diff line change
@@ -1,10 +1,13 @@
; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
; RUN: opt < %s -passes=slp-vectorizer -mtriple=riscv64 -mattr=+v \
; RUN: -riscv-v-vector-bits-min=128 -S | FileCheck %s --check-prefixes=CHECK
; RUN: -riscv-v-vector-bits-min=128 -riscv-v-slp-max-vf=0 -S \
; RUN: | FileCheck %s --check-prefixes=CHECK
; RUN: opt < %s -passes=slp-vectorizer -mtriple=riscv64 -mattr=+v \
; RUN: -riscv-v-vector-bits-min=256 -S | FileCheck %s --check-prefixes=CHECK
; RUN: -riscv-v-vector-bits-min=256 -riscv-v-slp-max-vf=0 -S \
; RUN: | FileCheck %s --check-prefixes=CHECK
; RUN: opt < %s -passes=slp-vectorizer -mtriple=riscv64 -mattr=+v \
; RUN: -riscv-v-vector-bits-min=512 -S | FileCheck %s --check-prefixes=CHECK
; RUN: -riscv-v-vector-bits-min=512 -riscv-v-slp-max-vf=0 -S \
; RUN: | FileCheck %s --check-prefixes=CHECK

target datalayout = "e-m:e-p:64:64-i64:64-i128:128-n64-S128"
target triple = "riscv64"
Expand Down Expand Up @@ -823,64 +826,24 @@ entry:

declare i32 @llvm.abs.i32(i32, i1)

; FIXME: This horizontal reduction occurs because the cost model thinks it can
; vectorize the loads here. However, because -riscv-v-slp-max-vf is set to 1 by
; default, tryToVectorizeList fails and we end up with this very expensive
; scalarized load.
;
; This is the code the cost model thinks it's going to generate, which you can
; get by passing -riscv-v-slp-max-vf=0
;
; define i32 @stride_sum_abs_diff(ptr %p, ptr %q, i64 %stride) #0 {
; %p.2 = getelementptr inbounds i32, ptr %p, i64 %stride
; %q.2 = getelementptr inbounds i32, ptr %q, i64 %stride
; %p.3 = getelementptr inbounds i32, ptr %p.2, i64 1
; %q.3 = getelementptr inbounds i32, ptr %q.2, i64 1
; %1 = load <2 x i32>, ptr %p, align 4
; %2 = load <2 x i32>, ptr %q, align 4
; %x.2 = load i32, ptr %p.2, align 4
; %y.2 = load i32, ptr %q.2, align 4
; %x.3 = load i32, ptr %p.3, align 4
; %y.3 = load i32, ptr %q.3, align 4
; %3 = shufflevector <2 x i32> %1, <2 x i32> poison, <4 x i32> <i32 0, i32 1, i32 undef, i32 undef>
; %4 = insertelement <4 x i32> %3, i32 %x.2, i32 2
; %5 = insertelement <4 x i32> %4, i32 %x.3, i32 3
; %6 = shufflevector <2 x i32> %2, <2 x i32> poison, <4 x i32> <i32 0, i32 1, i32 undef, i32 undef>
; %7 = insertelement <4 x i32> %6, i32 %y.2, i32 2
; %8 = insertelement <4 x i32> %7, i32 %y.3, i32 3
; %9 = sub <4 x i32> %5, %8
; %10 = call <4 x i32> @llvm.abs.v4i32(<4 x i32> %9, i1 true)
; %11 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %10)
; ret i32 %11
; }
define i32 @stride_sum_abs_diff(ptr %p, ptr %q, i64 %stride) {
; CHECK-LABEL: @stride_sum_abs_diff(
; CHECK-NEXT: [[P_1:%.*]] = getelementptr inbounds i32, ptr [[P:%.*]], i64 1
; CHECK-NEXT: [[Q_1:%.*]] = getelementptr inbounds i32, ptr [[Q:%.*]], i64 1
; CHECK-NEXT: [[P_2:%.*]] = getelementptr inbounds i32, ptr [[P]], i64 [[STRIDE:%.*]]
; CHECK-NEXT: [[Q_2:%.*]] = getelementptr inbounds i32, ptr [[Q]], i64 [[STRIDE]]
; CHECK-NEXT: [[P_3:%.*]] = getelementptr inbounds i32, ptr [[P_2]], i64 1
; CHECK-NEXT: [[Q_3:%.*]] = getelementptr inbounds i32, ptr [[Q_2]], i64 1
; CHECK-NEXT: [[X_0:%.*]] = load i32, ptr [[P]], align 4
; CHECK-NEXT: [[Y_0:%.*]] = load i32, ptr [[Q]], align 4
; CHECK-NEXT: [[X_1:%.*]] = load i32, ptr [[P_1]], align 4
; CHECK-NEXT: [[Y_1:%.*]] = load i32, ptr [[Q_1]], align 4
; CHECK-NEXT: [[X_2:%.*]] = load i32, ptr [[P_2]], align 4
; CHECK-NEXT: [[Y_2:%.*]] = load i32, ptr [[Q_2]], align 4
; CHECK-NEXT: [[X_3:%.*]] = load i32, ptr [[P_3]], align 4
; CHECK-NEXT: [[Y_3:%.*]] = load i32, ptr [[Q_3]], align 4
; CHECK-NEXT: [[TMP1:%.*]] = insertelement <4 x i32> poison, i32 [[X_0]], i32 0
; CHECK-NEXT: [[TMP2:%.*]] = insertelement <4 x i32> [[TMP1]], i32 [[X_1]], i32 1
; CHECK-NEXT: [[TMP3:%.*]] = insertelement <4 x i32> [[TMP2]], i32 [[X_2]], i32 2
; CHECK-NEXT: [[TMP4:%.*]] = insertelement <4 x i32> [[TMP3]], i32 [[X_3]], i32 3
; CHECK-NEXT: [[TMP5:%.*]] = insertelement <4 x i32> poison, i32 [[Y_0]], i32 0
; CHECK-NEXT: [[TMP6:%.*]] = insertelement <4 x i32> [[TMP5]], i32 [[Y_1]], i32 1
; CHECK-NEXT: [[TMP7:%.*]] = insertelement <4 x i32> [[TMP6]], i32 [[Y_2]], i32 2
; CHECK-NEXT: [[TMP8:%.*]] = insertelement <4 x i32> [[TMP7]], i32 [[Y_3]], i32 3
; CHECK-NEXT: [[TMP9:%.*]] = sub <4 x i32> [[TMP4]], [[TMP8]]
; CHECK-NEXT: [[TMP10:%.*]] = call <4 x i32> @llvm.abs.v4i32(<4 x i32> [[TMP9]], i1 true)
; CHECK-NEXT: [[TMP11:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[TMP10]])
; CHECK-NEXT: ret i32 [[TMP11]]
; CHECK-NEXT: [[P_2:%.*]] = getelementptr inbounds i32, ptr [[P:%.*]], i64 [[STRIDE:%.*]]
; CHECK-NEXT: [[Q_2:%.*]] = getelementptr inbounds i32, ptr [[Q:%.*]], i64 [[STRIDE]]
; CHECK-NEXT: [[TMP1:%.*]] = load <2 x i32>, ptr [[P]], align 4
; CHECK-NEXT: [[TMP2:%.*]] = load <2 x i32>, ptr [[Q]], align 4
; CHECK-NEXT: [[TMP3:%.*]] = load <2 x i32>, ptr [[P_2]], align 4
; CHECK-NEXT: [[TMP4:%.*]] = load <2 x i32>, ptr [[Q_2]], align 4
; CHECK-NEXT: [[TMP5:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> poison, <4 x i32> <i32 0, i32 1, i32 undef, i32 undef>
; CHECK-NEXT: [[TMP6:%.*]] = shufflevector <2 x i32> [[TMP3]], <2 x i32> poison, <4 x i32> <i32 0, i32 1, i32 undef, i32 undef>
; CHECK-NEXT: [[TMP7:%.*]] = shufflevector <4 x i32> [[TMP5]], <4 x i32> [[TMP6]], <4 x i32> <i32 0, i32 1, i32 4, i32 5>
; CHECK-NEXT: [[TMP8:%.*]] = shufflevector <2 x i32> [[TMP2]], <2 x i32> poison, <4 x i32> <i32 0, i32 1, i32 undef, i32 undef>
; CHECK-NEXT: [[TMP9:%.*]] = shufflevector <2 x i32> [[TMP4]], <2 x i32> poison, <4 x i32> <i32 0, i32 1, i32 undef, i32 undef>
; CHECK-NEXT: [[TMP10:%.*]] = shufflevector <4 x i32> [[TMP8]], <4 x i32> [[TMP9]], <4 x i32> <i32 0, i32 1, i32 4, i32 5>
; CHECK-NEXT: [[TMP11:%.*]] = sub <4 x i32> [[TMP7]], [[TMP10]]
; CHECK-NEXT: [[TMP12:%.*]] = call <4 x i32> @llvm.abs.v4i32(<4 x i32> [[TMP11]], i1 true)
; CHECK-NEXT: [[TMP13:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[TMP12]])
; CHECK-NEXT: ret i32 [[TMP13]]
;
%x.0 = load i32, ptr %p
%y.0 = load i32, ptr %q
Expand Down Expand Up @@ -914,3 +877,97 @@ define i32 @stride_sum_abs_diff(ptr %p, ptr %q, i64 %stride) {

ret i32 %sum.2
}

; Sums four consecutive i8 elements from %p and four from %q, each
; zero-extended to i32, and returns the i32 total. The scalar add chain
; alternates between the two arrays: p[i] and q[i] are accumulated together
; for each index i before moving on to i+1.
; The assertions below expect one <4 x i8> load per pointer, the two loads
; widened and concatenated into a single <8 x i8> (the %q load in the low
; half, the %p load in the high half), one zext to <8 x i32>, and one
; llvm.vector.reduce.add call.
define i32 @reduce_sum_2arrays_a(ptr noalias %p, ptr noalias %q) {
; CHECK-LABEL: @reduce_sum_2arrays_a(
; CHECK-NEXT: entry:
; CHECK-NEXT: [[TMP0:%.*]] = load <4 x i8>, ptr [[P:%.*]], align 1
; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i8>, ptr [[Q:%.*]], align 1
; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <4 x i8> [[TMP1]], <4 x i8> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef>
; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <4 x i8> [[TMP0]], <4 x i8> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef>
; CHECK-NEXT: [[TMP4:%.*]] = shufflevector <8 x i8> [[TMP2]], <8 x i8> [[TMP3]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 8, i32 9, i32 10, i32 11>
; CHECK-NEXT: [[TMP5:%.*]] = zext <8 x i8> [[TMP4]] to <8 x i32>
; CHECK-NEXT: [[TMP6:%.*]] = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> [[TMP5]])
; CHECK-NEXT: ret i32 [[TMP6]]
;
entry:
; Index 0: start the running sum with p[0] + q[0].
%x.0 = load i8, ptr %p, align 1
%conv = zext i8 %x.0 to i32
%y.0 = load i8, ptr %q, align 1
%conv3 = zext i8 %y.0 to i32
%add4 = add nuw nsw i32 %conv, %conv3

; Index 1: add p[1], then q[1], to the running sum.
%arrayidx.1 = getelementptr inbounds i8, ptr %p, i64 1
%x.1 = load i8, ptr %arrayidx.1, align 1
%conv.1 = zext i8 %x.1 to i32
%arrayidx2.1 = getelementptr inbounds i8, ptr %q, i64 1
%y.1 = load i8, ptr %arrayidx2.1, align 1
%conv3.1 = zext i8 %y.1 to i32
%add.1 = add nuw nsw i32 %add4, %conv.1
%add4.1 = add nuw nsw i32 %add.1, %conv3.1

; Index 2: add p[2], then q[2], to the running sum.
%arrayidx.2 = getelementptr inbounds i8, ptr %p, i64 2
%x.2 = load i8, ptr %arrayidx.2, align 1
%conv.2 = zext i8 %x.2 to i32
%arrayidx2.2 = getelementptr inbounds i8, ptr %q, i64 2
%y.2 = load i8, ptr %arrayidx2.2, align 1
%conv3.2 = zext i8 %y.2 to i32
%add.2 = add nuw nsw i32 %add4.1, %conv.2
%add4.2 = add nuw nsw i32 %add.2, %conv3.2

; Index 3: add p[3], then q[3]; %add4.3 is the final total.
%arrayidx.3 = getelementptr inbounds i8, ptr %p, i64 3
%x.3 = load i8, ptr %arrayidx.3, align 1
%conv.3 = zext i8 %x.3 to i32
%arrayidx2.3 = getelementptr inbounds i8, ptr %q, i64 3
%y.3 = load i8, ptr %arrayidx2.3, align 1
%conv3.3 = zext i8 %y.3 to i32
%add.3 = add nuw nsw i32 %add4.2, %conv.3
%add4.3 = add nuw nsw i32 %add.3, %conv3.3

ret i32 %add4.3
}

; Sums four consecutive i8 elements from %x and four from %y, each
; zero-extended to i32, and returns the i32 total. The scalar add chain is
; sequential: all four %x elements are accumulated first, then the four %y
; elements.
; The assertions below expect one <4 x i8> load per pointer, the two loads
; widened and concatenated into a single <8 x i8> (the %x load in the low
; half, the %y load in the high half), one zext to <8 x i32>, and one
; llvm.vector.reduce.add call.
define i32 @reduce_sum_2arrays_b(ptr noalias noundef %x, ptr noalias %y) {
; CHECK-LABEL: @reduce_sum_2arrays_b(
; CHECK-NEXT: entry:
; CHECK-NEXT: [[TMP0:%.*]] = load <4 x i8>, ptr [[X:%.*]], align 1
; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i8>, ptr [[Y:%.*]], align 1
; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <4 x i8> [[TMP0]], <4 x i8> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef>
; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <4 x i8> [[TMP1]], <4 x i8> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef>
; CHECK-NEXT: [[TMP4:%.*]] = shufflevector <8 x i8> [[TMP2]], <8 x i8> [[TMP3]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 8, i32 9, i32 10, i32 11>
; CHECK-NEXT: [[TMP5:%.*]] = zext <8 x i8> [[TMP4]] to <8 x i32>
; CHECK-NEXT: [[TMP6:%.*]] = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> [[TMP5]])
; CHECK-NEXT: ret i32 [[TMP6]]
;
entry:
; First half of the chain: x[0] + x[1] + x[2] + x[3].
%0 = load i8, ptr %x, align 1
%conv = zext i8 %0 to i32
%arrayidx.1 = getelementptr inbounds i8, ptr %x, i64 1
%1 = load i8, ptr %arrayidx.1, align 1
%conv.1 = zext i8 %1 to i32
%add.1 = add nuw nsw i32 %conv, %conv.1
%arrayidx.2 = getelementptr inbounds i8, ptr %x, i64 2
%2 = load i8, ptr %arrayidx.2, align 1
%conv.2 = zext i8 %2 to i32
%add.2 = add nuw nsw i32 %add.1, %conv.2
%arrayidx.3 = getelementptr inbounds i8, ptr %x, i64 3
%3 = load i8, ptr %arrayidx.3, align 1
%conv.3 = zext i8 %3 to i32
%add.3 = add nuw nsw i32 %add.2, %conv.3
; Second half of the chain: continue the same running sum with y[0..3].
%4 = load i8, ptr %y, align 1
%conv9 = zext i8 %4 to i32
%add10 = add nuw nsw i32 %add.3, %conv9
%arrayidx8.1 = getelementptr inbounds i8, ptr %y, i64 1
%5 = load i8, ptr %arrayidx8.1, align 1
%conv9.1 = zext i8 %5 to i32
%add10.1 = add nuw nsw i32 %add10, %conv9.1
%arrayidx8.2 = getelementptr inbounds i8, ptr %y, i64 2
%6 = load i8, ptr %arrayidx8.2, align 1
%conv9.2 = zext i8 %6 to i32
%add10.2 = add nuw nsw i32 %add10.1, %conv9.2
%arrayidx8.3 = getelementptr inbounds i8, ptr %y, i64 3
%7 = load i8, ptr %arrayidx8.3, align 1
%conv9.3 = zext i8 %7 to i32
%add10.3 = add nuw nsw i32 %add10.2, %conv9.3
ret i32 %add10.3
}