@@ -1,10 +1,13 @@
; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
; RUN: opt < %s -passes=slp-vectorizer -mtriple=riscv64 -mattr=+v \
; RUN: -riscv-v-vector-bits-min=128 -riscv-v-slp-max-vf=0 -S \
; RUN: | FileCheck %s --check-prefixes=CHECK
; RUN: opt < %s -passes=slp-vectorizer -mtriple=riscv64 -mattr=+v \
; RUN: -riscv-v-vector-bits-min=256 -riscv-v-slp-max-vf=0 -S \
; RUN: | FileCheck %s --check-prefixes=CHECK
; RUN: opt < %s -passes=slp-vectorizer -mtriple=riscv64 -mattr=+v \
; RUN: -riscv-v-vector-bits-min=512 -riscv-v-slp-max-vf=0 -S \
; RUN: | FileCheck %s --check-prefixes=CHECK
target datalayout = "e-m:e-p:64:64-i64:64-i128:128-n64-S128"
target triple = "riscv64"
; ... (unchanged lines elided from this excerpt) ...
; Scalar i32 absolute-value intrinsic. Per the LLVM LangRef, the i1 operand
; states whether an INT_MIN input yields poison.
; NOTE(review): no call to the scalar form is visible in this excerpt;
; presumably it is used by the elided body of @stride_sum_abs_diff - confirm.
declare i32 @llvm.abs.i32 (i32 , i1 )
; FIXME: This horizontal reduction occurs because the cost model thinks it can
; vectorize the loads here. However, because -riscv-v-slp-max-vf is set to 1 by
; default, tryToVectorizeList fails and we end up with this very expensive
; scalarized load.
;
; This is the code the cost model thinks it's going to generate, which you can
; get by passing -riscv-v-slp-max-vf=0
;
; define i32 @stride_sum_abs_diff(ptr %p, ptr %q, i64 %stride) #0 {
; %p.2 = getelementptr inbounds i32, ptr %p, i64 %stride
; %q.2 = getelementptr inbounds i32, ptr %q, i64 %stride
; %p.3 = getelementptr inbounds i32, ptr %p.2, i64 1
; %q.3 = getelementptr inbounds i32, ptr %q.2, i64 1
; %1 = load <2 x i32>, ptr %p, align 4
; %2 = load <2 x i32>, ptr %q, align 4
; %x.2 = load i32, ptr %p.2, align 4
; %y.2 = load i32, ptr %q.2, align 4
; %x.3 = load i32, ptr %p.3, align 4
; %y.3 = load i32, ptr %q.3, align 4
; %3 = shufflevector <2 x i32> %1, <2 x i32> poison, <4 x i32> <i32 0, i32 1, i32 undef, i32 undef>
; %4 = insertelement <4 x i32> %3, i32 %x.2, i32 2
; %5 = insertelement <4 x i32> %4, i32 %x.3, i32 3
; %6 = shufflevector <2 x i32> %2, <2 x i32> poison, <4 x i32> <i32 0, i32 1, i32 undef, i32 undef>
; %7 = insertelement <4 x i32> %6, i32 %y.2, i32 2
; %8 = insertelement <4 x i32> %7, i32 %y.3, i32 3
; %9 = sub <4 x i32> %5, %8
; %10 = call <4 x i32> @llvm.abs.v4i32(<4 x i32> %9, i1 true)
; %11 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %10)
; ret i32 %11
; }
define i32 @stride_sum_abs_diff (ptr %p , ptr %q , i64 %stride ) {
; CHECK-LABEL: @stride_sum_abs_diff(
; CHECK-NEXT: [[P_1:%.*]] = getelementptr inbounds i32, ptr [[P:%.*]], i64 1
; CHECK-NEXT: [[Q_1:%.*]] = getelementptr inbounds i32, ptr [[Q:%.*]], i64 1
; CHECK-NEXT: [[P_2:%.*]] = getelementptr inbounds i32, ptr [[P]], i64 [[STRIDE:%.*]]
; CHECK-NEXT: [[Q_2:%.*]] = getelementptr inbounds i32, ptr [[Q]], i64 [[STRIDE]]
; CHECK-NEXT: [[P_3:%.*]] = getelementptr inbounds i32, ptr [[P_2]], i64 1
; CHECK-NEXT: [[Q_3:%.*]] = getelementptr inbounds i32, ptr [[Q_2]], i64 1
; CHECK-NEXT: [[X_0:%.*]] = load i32, ptr [[P]], align 4
; CHECK-NEXT: [[Y_0:%.*]] = load i32, ptr [[Q]], align 4
; CHECK-NEXT: [[X_1:%.*]] = load i32, ptr [[P_1]], align 4
; CHECK-NEXT: [[Y_1:%.*]] = load i32, ptr [[Q_1]], align 4
; CHECK-NEXT: [[X_2:%.*]] = load i32, ptr [[P_2]], align 4
; CHECK-NEXT: [[Y_2:%.*]] = load i32, ptr [[Q_2]], align 4
; CHECK-NEXT: [[X_3:%.*]] = load i32, ptr [[P_3]], align 4
; CHECK-NEXT: [[Y_3:%.*]] = load i32, ptr [[Q_3]], align 4
; CHECK-NEXT: [[TMP1:%.*]] = insertelement <4 x i32> poison, i32 [[X_0]], i32 0
; CHECK-NEXT: [[TMP2:%.*]] = insertelement <4 x i32> [[TMP1]], i32 [[X_1]], i32 1
; CHECK-NEXT: [[TMP3:%.*]] = insertelement <4 x i32> [[TMP2]], i32 [[X_2]], i32 2
; CHECK-NEXT: [[TMP4:%.*]] = insertelement <4 x i32> [[TMP3]], i32 [[X_3]], i32 3
; CHECK-NEXT: [[TMP5:%.*]] = insertelement <4 x i32> poison, i32 [[Y_0]], i32 0
; CHECK-NEXT: [[TMP6:%.*]] = insertelement <4 x i32> [[TMP5]], i32 [[Y_1]], i32 1
; CHECK-NEXT: [[TMP7:%.*]] = insertelement <4 x i32> [[TMP6]], i32 [[Y_2]], i32 2
; CHECK-NEXT: [[TMP8:%.*]] = insertelement <4 x i32> [[TMP7]], i32 [[Y_3]], i32 3
; CHECK-NEXT: [[TMP9:%.*]] = sub <4 x i32> [[TMP4]], [[TMP8]]
; CHECK-NEXT: [[TMP10:%.*]] = call <4 x i32> @llvm.abs.v4i32(<4 x i32> [[TMP9]], i1 true)
; CHECK-NEXT: [[TMP11:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[TMP10]])
; CHECK-NEXT: ret i32 [[TMP11]]
; CHECK-NEXT: [[P_2:%.*]] = getelementptr inbounds i32, ptr [[P:%.*]], i64 [[STRIDE:%.*]]
; CHECK-NEXT: [[Q_2:%.*]] = getelementptr inbounds i32, ptr [[Q:%.*]], i64 [[STRIDE]]
; CHECK-NEXT: [[TMP1:%.*]] = load <2 x i32>, ptr [[P]], align 4
; CHECK-NEXT: [[TMP2:%.*]] = load <2 x i32>, ptr [[Q]], align 4
; CHECK-NEXT: [[TMP3:%.*]] = load <2 x i32>, ptr [[P_2]], align 4
; CHECK-NEXT: [[TMP4:%.*]] = load <2 x i32>, ptr [[Q_2]], align 4
; CHECK-NEXT: [[TMP5:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> poison, <4 x i32> <i32 0, i32 1, i32 undef, i32 undef>
; CHECK-NEXT: [[TMP6:%.*]] = shufflevector <2 x i32> [[TMP3]], <2 x i32> poison, <4 x i32> <i32 0, i32 1, i32 undef, i32 undef>
; CHECK-NEXT: [[TMP7:%.*]] = shufflevector <4 x i32> [[TMP5]], <4 x i32> [[TMP6]], <4 x i32> <i32 0, i32 1, i32 4, i32 5>
; CHECK-NEXT: [[TMP8:%.*]] = shufflevector <2 x i32> [[TMP2]], <2 x i32> poison, <4 x i32> <i32 0, i32 1, i32 undef, i32 undef>
; CHECK-NEXT: [[TMP9:%.*]] = shufflevector <2 x i32> [[TMP4]], <2 x i32> poison, <4 x i32> <i32 0, i32 1, i32 undef, i32 undef>
; CHECK-NEXT: [[TMP10:%.*]] = shufflevector <4 x i32> [[TMP8]], <4 x i32> [[TMP9]], <4 x i32> <i32 0, i32 1, i32 4, i32 5>
; CHECK-NEXT: [[TMP11:%.*]] = sub <4 x i32> [[TMP7]], [[TMP10]]
; CHECK-NEXT: [[TMP12:%.*]] = call <4 x i32> @llvm.abs.v4i32(<4 x i32> [[TMP11]], i1 true)
; CHECK-NEXT: [[TMP13:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[TMP12]])
; CHECK-NEXT: ret i32 [[TMP13]]
;
%x.0 = load i32 , ptr %p
%y.0 = load i32 , ptr %q
; ... (remainder of the @stride_sum_abs_diff body elided from this excerpt) ...
ret i32 %sum.2
}
; Test input: horizontal add reduction over two noalias i8 arrays in which the
; scalar add chain alternates between the arrays (p[0], q[0], p[1], q[1], ...).
; Every byte is zero-extended to i32 before it is accumulated.
; The autogenerated assertions below expect one <4 x i8> load per array, a
; shuffle-based concatenation into <8 x i8> (q's lanes first), a single zext to
; <8 x i32>, and one llvm.vector.reduce.add.v8i32 call.
define i32 @reduce_sum_2arrays_a (ptr noalias %p , ptr noalias %q ) {
; CHECK-LABEL: @reduce_sum_2arrays_a(
; CHECK-NEXT: entry:
; CHECK-NEXT: [[TMP0:%.*]] = load <4 x i8>, ptr [[P:%.*]], align 1
; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i8>, ptr [[Q:%.*]], align 1
; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <4 x i8> [[TMP1]], <4 x i8> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef>
; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <4 x i8> [[TMP0]], <4 x i8> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef>
; CHECK-NEXT: [[TMP4:%.*]] = shufflevector <8 x i8> [[TMP2]], <8 x i8> [[TMP3]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 8, i32 9, i32 10, i32 11>
; CHECK-NEXT: [[TMP5:%.*]] = zext <8 x i8> [[TMP4]] to <8 x i32>
; CHECK-NEXT: [[TMP6:%.*]] = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> [[TMP5]])
; CHECK-NEXT: ret i32 [[TMP6]]
;
entry:
; Index 0: p[0] and q[0] seed the running sum.
%x.0 = load i8 , ptr %p , align 1
%conv = zext i8 %x.0 to i32
%y.0 = load i8 , ptr %q , align 1
%conv3 = zext i8 %y.0 to i32
%add4 = add nuw nsw i32 %conv , %conv3
; Index 1: both bytes are loaded and widened, then p[1] is added before q[1].
%arrayidx.1 = getelementptr inbounds i8 , ptr %p , i64 1
%x.1 = load i8 , ptr %arrayidx.1 , align 1
%conv.1 = zext i8 %x.1 to i32
%arrayidx2.1 = getelementptr inbounds i8 , ptr %q , i64 1
%y.1 = load i8 , ptr %arrayidx2.1 , align 1
%conv3.1 = zext i8 %y.1 to i32
%add.1 = add nuw nsw i32 %add4 , %conv.1
%add4.1 = add nuw nsw i32 %add.1 , %conv3.1
; Index 2: same pattern, continuing the chain from %add4.1.
%arrayidx.2 = getelementptr inbounds i8 , ptr %p , i64 2
%x.2 = load i8 , ptr %arrayidx.2 , align 1
%conv.2 = zext i8 %x.2 to i32
%arrayidx2.2 = getelementptr inbounds i8 , ptr %q , i64 2
%y.2 = load i8 , ptr %arrayidx2.2 , align 1
%conv3.2 = zext i8 %y.2 to i32
%add.2 = add nuw nsw i32 %add4.1 , %conv.2
%add4.2 = add nuw nsw i32 %add.2 , %conv3.2
; Index 3: last pair; %add4.3 is the total of all eight widened bytes.
%arrayidx.3 = getelementptr inbounds i8 , ptr %p , i64 3
%x.3 = load i8 , ptr %arrayidx.3 , align 1
%conv.3 = zext i8 %x.3 to i32
%arrayidx2.3 = getelementptr inbounds i8 , ptr %q , i64 3
%y.3 = load i8 , ptr %arrayidx2.3 , align 1
%conv3.3 = zext i8 %y.3 to i32
%add.3 = add nuw nsw i32 %add4.2 , %conv.3
%add4.3 = add nuw nsw i32 %add.3 , %conv3.3
ret i32 %add4.3
}
; Test input: horizontal add reduction over two noalias i8 arrays in which the
; scalar add chain first accumulates x[0..3] and only then y[0..3].
; Every byte is zero-extended to i32 before it is accumulated.
; The autogenerated assertions below expect one <4 x i8> load per array, a
; shuffle-based concatenation into <8 x i8> (x's lanes first), a single zext to
; <8 x i32>, and one llvm.vector.reduce.add.v8i32 call.
define i32 @reduce_sum_2arrays_b (ptr noalias noundef %x , ptr noalias %y ) {
; CHECK-LABEL: @reduce_sum_2arrays_b(
; CHECK-NEXT: entry:
; CHECK-NEXT: [[TMP0:%.*]] = load <4 x i8>, ptr [[X:%.*]], align 1
; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i8>, ptr [[Y:%.*]], align 1
; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <4 x i8> [[TMP0]], <4 x i8> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef>
; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <4 x i8> [[TMP1]], <4 x i8> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef>
; CHECK-NEXT: [[TMP4:%.*]] = shufflevector <8 x i8> [[TMP2]], <8 x i8> [[TMP3]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 8, i32 9, i32 10, i32 11>
; CHECK-NEXT: [[TMP5:%.*]] = zext <8 x i8> [[TMP4]] to <8 x i32>
; CHECK-NEXT: [[TMP6:%.*]] = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> [[TMP5]])
; CHECK-NEXT: ret i32 [[TMP6]]
;
entry:
; First half of the chain: x[0] + x[1] + x[2] + x[3], each widened to i32.
%0 = load i8 , ptr %x , align 1
%conv = zext i8 %0 to i32
%arrayidx.1 = getelementptr inbounds i8 , ptr %x , i64 1
%1 = load i8 , ptr %arrayidx.1 , align 1
%conv.1 = zext i8 %1 to i32
%add.1 = add nuw nsw i32 %conv , %conv.1
%arrayidx.2 = getelementptr inbounds i8 , ptr %x , i64 2
%2 = load i8 , ptr %arrayidx.2 , align 1
%conv.2 = zext i8 %2 to i32
%add.2 = add nuw nsw i32 %add.1 , %conv.2
%arrayidx.3 = getelementptr inbounds i8 , ptr %x , i64 3
%3 = load i8 , ptr %arrayidx.3 , align 1
%conv.3 = zext i8 %3 to i32
%add.3 = add nuw nsw i32 %add.2 , %conv.3
; Second half: the same chain continues from %add.3 with y[0..3].
%4 = load i8 , ptr %y , align 1
%conv9 = zext i8 %4 to i32
%add10 = add nuw nsw i32 %add.3 , %conv9
%arrayidx8.1 = getelementptr inbounds i8 , ptr %y , i64 1
%5 = load i8 , ptr %arrayidx8.1 , align 1
%conv9.1 = zext i8 %5 to i32
%add10.1 = add nuw nsw i32 %add10 , %conv9.1
%arrayidx8.2 = getelementptr inbounds i8 , ptr %y , i64 2
%6 = load i8 , ptr %arrayidx8.2 , align 1
%conv9.2 = zext i8 %6 to i32
%add10.2 = add nuw nsw i32 %add10.1 , %conv9.2
%arrayidx8.3 = getelementptr inbounds i8 , ptr %y , i64 3
%7 = load i8 , ptr %arrayidx8.3 , align 1
%conv9.3 = zext i8 %7 to i32
; %add10.3 is the total of all eight widened bytes.
%add10.3 = add nuw nsw i32 %add10.2 , %conv9.3
ret i32 %add10.3
}