77 changes: 26 additions & 51 deletions llvm/test/Transforms/SLPVectorizer/X86/shift-lshr.ll
@@ -431,10 +431,14 @@ define void @lshr_v32i16() {
; AVX-NEXT: ret void
;
; AVX512-LABEL: @lshr_v32i16(
; AVX512-NEXT: [[TMP1:%.*]] = load <32 x i16>, <32 x i16>* bitcast ([32 x i16]* @a16 to <32 x i16>*), align 2
; AVX512-NEXT: [[TMP2:%.*]] = load <32 x i16>, <32 x i16>* bitcast ([32 x i16]* @b16 to <32 x i16>*), align 2
; AVX512-NEXT: [[TMP3:%.*]] = lshr <32 x i16> [[TMP1]], [[TMP2]]
; AVX512-NEXT: store <32 x i16> [[TMP3]], <32 x i16>* bitcast ([32 x i16]* @c16 to <32 x i16>*), align 2
; AVX512-NEXT: [[TMP1:%.*]] = load <16 x i16>, <16 x i16>* bitcast ([32 x i16]* @a16 to <16 x i16>*), align 2
; AVX512-NEXT: [[TMP2:%.*]] = load <16 x i16>, <16 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 16) to <16 x i16>*), align 2
; AVX512-NEXT: [[TMP3:%.*]] = load <16 x i16>, <16 x i16>* bitcast ([32 x i16]* @b16 to <16 x i16>*), align 2
; AVX512-NEXT: [[TMP4:%.*]] = load <16 x i16>, <16 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 16) to <16 x i16>*), align 2
; AVX512-NEXT: [[TMP5:%.*]] = lshr <16 x i16> [[TMP1]], [[TMP3]]
; AVX512-NEXT: [[TMP6:%.*]] = lshr <16 x i16> [[TMP2]], [[TMP4]]
; AVX512-NEXT: store <16 x i16> [[TMP5]], <16 x i16>* bitcast ([32 x i16]* @c16 to <16 x i16>*), align 2
; AVX512-NEXT: store <16 x i16> [[TMP6]], <16 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 16) to <16 x i16>*), align 2
; AVX512-NEXT: ret void
;
; XOP-LABEL: @lshr_v32i16(
@@ -580,53 +584,24 @@ define void @lshr_v32i16() {
}

define void @lshr_v64i8() {
; SSE-LABEL: @lshr_v64i8(
; SSE-NEXT: [[TMP1:%.*]] = load <16 x i8>, <16 x i8>* bitcast ([64 x i8]* @a8 to <16 x i8>*), align 1
; SSE-NEXT: [[TMP2:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 16) to <16 x i8>*), align 1
; SSE-NEXT: [[TMP3:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 32) to <16 x i8>*), align 1
; SSE-NEXT: [[TMP4:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 48) to <16 x i8>*), align 1
; SSE-NEXT: [[TMP5:%.*]] = load <16 x i8>, <16 x i8>* bitcast ([64 x i8]* @b8 to <16 x i8>*), align 1
; SSE-NEXT: [[TMP6:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 16) to <16 x i8>*), align 1
; SSE-NEXT: [[TMP7:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 32) to <16 x i8>*), align 1
; SSE-NEXT: [[TMP8:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 48) to <16 x i8>*), align 1
; SSE-NEXT: [[TMP9:%.*]] = lshr <16 x i8> [[TMP1]], [[TMP5]]
; SSE-NEXT: [[TMP10:%.*]] = lshr <16 x i8> [[TMP2]], [[TMP6]]
; SSE-NEXT: [[TMP11:%.*]] = lshr <16 x i8> [[TMP3]], [[TMP7]]
; SSE-NEXT: [[TMP12:%.*]] = lshr <16 x i8> [[TMP4]], [[TMP8]]
; SSE-NEXT: store <16 x i8> [[TMP9]], <16 x i8>* bitcast ([64 x i8]* @c8 to <16 x i8>*), align 1
; SSE-NEXT: store <16 x i8> [[TMP10]], <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 16) to <16 x i8>*), align 1
; SSE-NEXT: store <16 x i8> [[TMP11]], <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 32) to <16 x i8>*), align 1
; SSE-NEXT: store <16 x i8> [[TMP12]], <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 48) to <16 x i8>*), align 1
; SSE-NEXT: ret void
;
; AVX-LABEL: @lshr_v64i8(
; AVX-NEXT: [[TMP1:%.*]] = load <32 x i8>, <32 x i8>* bitcast ([64 x i8]* @a8 to <32 x i8>*), align 1
; AVX-NEXT: [[TMP2:%.*]] = load <32 x i8>, <32 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 32) to <32 x i8>*), align 1
; AVX-NEXT: [[TMP3:%.*]] = load <32 x i8>, <32 x i8>* bitcast ([64 x i8]* @b8 to <32 x i8>*), align 1
; AVX-NEXT: [[TMP4:%.*]] = load <32 x i8>, <32 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 32) to <32 x i8>*), align 1
; AVX-NEXT: [[TMP5:%.*]] = lshr <32 x i8> [[TMP1]], [[TMP3]]
; AVX-NEXT: [[TMP6:%.*]] = lshr <32 x i8> [[TMP2]], [[TMP4]]
; AVX-NEXT: store <32 x i8> [[TMP5]], <32 x i8>* bitcast ([64 x i8]* @c8 to <32 x i8>*), align 1
; AVX-NEXT: store <32 x i8> [[TMP6]], <32 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 32) to <32 x i8>*), align 1
; AVX-NEXT: ret void
;
; AVX512-LABEL: @lshr_v64i8(
; AVX512-NEXT: [[TMP1:%.*]] = load <64 x i8>, <64 x i8>* bitcast ([64 x i8]* @a8 to <64 x i8>*), align 1
; AVX512-NEXT: [[TMP2:%.*]] = load <64 x i8>, <64 x i8>* bitcast ([64 x i8]* @b8 to <64 x i8>*), align 1
; AVX512-NEXT: [[TMP3:%.*]] = lshr <64 x i8> [[TMP1]], [[TMP2]]
; AVX512-NEXT: store <64 x i8> [[TMP3]], <64 x i8>* bitcast ([64 x i8]* @c8 to <64 x i8>*), align 1
; AVX512-NEXT: ret void
;
; XOP-LABEL: @lshr_v64i8(
; XOP-NEXT: [[TMP1:%.*]] = load <32 x i8>, <32 x i8>* bitcast ([64 x i8]* @a8 to <32 x i8>*), align 1
; XOP-NEXT: [[TMP2:%.*]] = load <32 x i8>, <32 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 32) to <32 x i8>*), align 1
; XOP-NEXT: [[TMP3:%.*]] = load <32 x i8>, <32 x i8>* bitcast ([64 x i8]* @b8 to <32 x i8>*), align 1
; XOP-NEXT: [[TMP4:%.*]] = load <32 x i8>, <32 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 32) to <32 x i8>*), align 1
; XOP-NEXT: [[TMP5:%.*]] = lshr <32 x i8> [[TMP1]], [[TMP3]]
; XOP-NEXT: [[TMP6:%.*]] = lshr <32 x i8> [[TMP2]], [[TMP4]]
; XOP-NEXT: store <32 x i8> [[TMP5]], <32 x i8>* bitcast ([64 x i8]* @c8 to <32 x i8>*), align 1
; XOP-NEXT: store <32 x i8> [[TMP6]], <32 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 32) to <32 x i8>*), align 1
; XOP-NEXT: ret void
; CHECK-LABEL: @lshr_v64i8(
; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i8>, <16 x i8>* bitcast ([64 x i8]* @a8 to <16 x i8>*), align 1
; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 16) to <16 x i8>*), align 1
; CHECK-NEXT: [[TMP3:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 32) to <16 x i8>*), align 1
; CHECK-NEXT: [[TMP4:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 48) to <16 x i8>*), align 1
; CHECK-NEXT: [[TMP5:%.*]] = load <16 x i8>, <16 x i8>* bitcast ([64 x i8]* @b8 to <16 x i8>*), align 1
; CHECK-NEXT: [[TMP6:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 16) to <16 x i8>*), align 1
; CHECK-NEXT: [[TMP7:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 32) to <16 x i8>*), align 1
; CHECK-NEXT: [[TMP8:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 48) to <16 x i8>*), align 1
; CHECK-NEXT: [[TMP9:%.*]] = lshr <16 x i8> [[TMP1]], [[TMP5]]
; CHECK-NEXT: [[TMP10:%.*]] = lshr <16 x i8> [[TMP2]], [[TMP6]]
; CHECK-NEXT: [[TMP11:%.*]] = lshr <16 x i8> [[TMP3]], [[TMP7]]
; CHECK-NEXT: [[TMP12:%.*]] = lshr <16 x i8> [[TMP4]], [[TMP8]]
; CHECK-NEXT: store <16 x i8> [[TMP9]], <16 x i8>* bitcast ([64 x i8]* @c8 to <16 x i8>*), align 1
; CHECK-NEXT: store <16 x i8> [[TMP10]], <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 16) to <16 x i8>*), align 1
; CHECK-NEXT: store <16 x i8> [[TMP11]], <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 32) to <16 x i8>*), align 1
; CHECK-NEXT: store <16 x i8> [[TMP12]], <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 48) to <16 x i8>*), align 1
; CHECK-NEXT: ret void
;
%a0 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 0 ), align 1
%a1 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 1 ), align 1
77 changes: 26 additions & 51 deletions llvm/test/Transforms/SLPVectorizer/X86/shift-shl.ll
@@ -383,10 +383,14 @@ define void @shl_v32i16() {
; AVX-NEXT: ret void
;
; AVX512-LABEL: @shl_v32i16(
; AVX512-NEXT: [[TMP1:%.*]] = load <32 x i16>, <32 x i16>* bitcast ([32 x i16]* @a16 to <32 x i16>*), align 2
; AVX512-NEXT: [[TMP2:%.*]] = load <32 x i16>, <32 x i16>* bitcast ([32 x i16]* @b16 to <32 x i16>*), align 2
; AVX512-NEXT: [[TMP3:%.*]] = shl <32 x i16> [[TMP1]], [[TMP2]]
; AVX512-NEXT: store <32 x i16> [[TMP3]], <32 x i16>* bitcast ([32 x i16]* @c16 to <32 x i16>*), align 2
; AVX512-NEXT: [[TMP1:%.*]] = load <16 x i16>, <16 x i16>* bitcast ([32 x i16]* @a16 to <16 x i16>*), align 2
; AVX512-NEXT: [[TMP2:%.*]] = load <16 x i16>, <16 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 16) to <16 x i16>*), align 2
; AVX512-NEXT: [[TMP3:%.*]] = load <16 x i16>, <16 x i16>* bitcast ([32 x i16]* @b16 to <16 x i16>*), align 2
; AVX512-NEXT: [[TMP4:%.*]] = load <16 x i16>, <16 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 16) to <16 x i16>*), align 2
; AVX512-NEXT: [[TMP5:%.*]] = shl <16 x i16> [[TMP1]], [[TMP3]]
; AVX512-NEXT: [[TMP6:%.*]] = shl <16 x i16> [[TMP2]], [[TMP4]]
; AVX512-NEXT: store <16 x i16> [[TMP5]], <16 x i16>* bitcast ([32 x i16]* @c16 to <16 x i16>*), align 2
; AVX512-NEXT: store <16 x i16> [[TMP6]], <16 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 16) to <16 x i16>*), align 2
; AVX512-NEXT: ret void
;
; XOP-LABEL: @shl_v32i16(
@@ -532,53 +536,24 @@ define void @shl_v32i16() {
}

define void @shl_v64i8() {
; SSE-LABEL: @shl_v64i8(
; SSE-NEXT: [[TMP1:%.*]] = load <16 x i8>, <16 x i8>* bitcast ([64 x i8]* @a8 to <16 x i8>*), align 1
; SSE-NEXT: [[TMP2:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 16) to <16 x i8>*), align 1
; SSE-NEXT: [[TMP3:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 32) to <16 x i8>*), align 1
; SSE-NEXT: [[TMP4:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 48) to <16 x i8>*), align 1
; SSE-NEXT: [[TMP5:%.*]] = load <16 x i8>, <16 x i8>* bitcast ([64 x i8]* @b8 to <16 x i8>*), align 1
; SSE-NEXT: [[TMP6:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 16) to <16 x i8>*), align 1
; SSE-NEXT: [[TMP7:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 32) to <16 x i8>*), align 1
; SSE-NEXT: [[TMP8:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 48) to <16 x i8>*), align 1
; SSE-NEXT: [[TMP9:%.*]] = shl <16 x i8> [[TMP1]], [[TMP5]]
; SSE-NEXT: [[TMP10:%.*]] = shl <16 x i8> [[TMP2]], [[TMP6]]
; SSE-NEXT: [[TMP11:%.*]] = shl <16 x i8> [[TMP3]], [[TMP7]]
; SSE-NEXT: [[TMP12:%.*]] = shl <16 x i8> [[TMP4]], [[TMP8]]
; SSE-NEXT: store <16 x i8> [[TMP9]], <16 x i8>* bitcast ([64 x i8]* @c8 to <16 x i8>*), align 1
; SSE-NEXT: store <16 x i8> [[TMP10]], <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 16) to <16 x i8>*), align 1
; SSE-NEXT: store <16 x i8> [[TMP11]], <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 32) to <16 x i8>*), align 1
; SSE-NEXT: store <16 x i8> [[TMP12]], <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 48) to <16 x i8>*), align 1
; SSE-NEXT: ret void
;
; AVX-LABEL: @shl_v64i8(
; AVX-NEXT: [[TMP1:%.*]] = load <32 x i8>, <32 x i8>* bitcast ([64 x i8]* @a8 to <32 x i8>*), align 1
; AVX-NEXT: [[TMP2:%.*]] = load <32 x i8>, <32 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 32) to <32 x i8>*), align 1
; AVX-NEXT: [[TMP3:%.*]] = load <32 x i8>, <32 x i8>* bitcast ([64 x i8]* @b8 to <32 x i8>*), align 1
; AVX-NEXT: [[TMP4:%.*]] = load <32 x i8>, <32 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 32) to <32 x i8>*), align 1
; AVX-NEXT: [[TMP5:%.*]] = shl <32 x i8> [[TMP1]], [[TMP3]]
; AVX-NEXT: [[TMP6:%.*]] = shl <32 x i8> [[TMP2]], [[TMP4]]
; AVX-NEXT: store <32 x i8> [[TMP5]], <32 x i8>* bitcast ([64 x i8]* @c8 to <32 x i8>*), align 1
; AVX-NEXT: store <32 x i8> [[TMP6]], <32 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 32) to <32 x i8>*), align 1
; AVX-NEXT: ret void
;
; AVX512-LABEL: @shl_v64i8(
; AVX512-NEXT: [[TMP1:%.*]] = load <64 x i8>, <64 x i8>* bitcast ([64 x i8]* @a8 to <64 x i8>*), align 1
; AVX512-NEXT: [[TMP2:%.*]] = load <64 x i8>, <64 x i8>* bitcast ([64 x i8]* @b8 to <64 x i8>*), align 1
; AVX512-NEXT: [[TMP3:%.*]] = shl <64 x i8> [[TMP1]], [[TMP2]]
; AVX512-NEXT: store <64 x i8> [[TMP3]], <64 x i8>* bitcast ([64 x i8]* @c8 to <64 x i8>*), align 1
; AVX512-NEXT: ret void
;
; XOP-LABEL: @shl_v64i8(
; XOP-NEXT: [[TMP1:%.*]] = load <32 x i8>, <32 x i8>* bitcast ([64 x i8]* @a8 to <32 x i8>*), align 1
; XOP-NEXT: [[TMP2:%.*]] = load <32 x i8>, <32 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 32) to <32 x i8>*), align 1
; XOP-NEXT: [[TMP3:%.*]] = load <32 x i8>, <32 x i8>* bitcast ([64 x i8]* @b8 to <32 x i8>*), align 1
; XOP-NEXT: [[TMP4:%.*]] = load <32 x i8>, <32 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 32) to <32 x i8>*), align 1
; XOP-NEXT: [[TMP5:%.*]] = shl <32 x i8> [[TMP1]], [[TMP3]]
; XOP-NEXT: [[TMP6:%.*]] = shl <32 x i8> [[TMP2]], [[TMP4]]
; XOP-NEXT: store <32 x i8> [[TMP5]], <32 x i8>* bitcast ([64 x i8]* @c8 to <32 x i8>*), align 1
; XOP-NEXT: store <32 x i8> [[TMP6]], <32 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 32) to <32 x i8>*), align 1
; XOP-NEXT: ret void
; CHECK-LABEL: @shl_v64i8(
; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i8>, <16 x i8>* bitcast ([64 x i8]* @a8 to <16 x i8>*), align 1
; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 16) to <16 x i8>*), align 1
; CHECK-NEXT: [[TMP3:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 32) to <16 x i8>*), align 1
; CHECK-NEXT: [[TMP4:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 48) to <16 x i8>*), align 1
; CHECK-NEXT: [[TMP5:%.*]] = load <16 x i8>, <16 x i8>* bitcast ([64 x i8]* @b8 to <16 x i8>*), align 1
; CHECK-NEXT: [[TMP6:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 16) to <16 x i8>*), align 1
; CHECK-NEXT: [[TMP7:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 32) to <16 x i8>*), align 1
; CHECK-NEXT: [[TMP8:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 48) to <16 x i8>*), align 1
; CHECK-NEXT: [[TMP9:%.*]] = shl <16 x i8> [[TMP1]], [[TMP5]]
; CHECK-NEXT: [[TMP10:%.*]] = shl <16 x i8> [[TMP2]], [[TMP6]]
; CHECK-NEXT: [[TMP11:%.*]] = shl <16 x i8> [[TMP3]], [[TMP7]]
; CHECK-NEXT: [[TMP12:%.*]] = shl <16 x i8> [[TMP4]], [[TMP8]]
; CHECK-NEXT: store <16 x i8> [[TMP9]], <16 x i8>* bitcast ([64 x i8]* @c8 to <16 x i8>*), align 1
; CHECK-NEXT: store <16 x i8> [[TMP10]], <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 16) to <16 x i8>*), align 1
; CHECK-NEXT: store <16 x i8> [[TMP11]], <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 32) to <16 x i8>*), align 1
; CHECK-NEXT: store <16 x i8> [[TMP12]], <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 48) to <16 x i8>*), align 1
; CHECK-NEXT: ret void
;
%a0 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 0 ), align 1
%a1 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 1 ), align 1
7 changes: 4 additions & 3 deletions llvm/test/Transforms/SLPVectorizer/X86/store-jumbled.ll
@@ -11,20 +11,21 @@ define i32 @jumbled-load(i32* noalias nocapture %in, i32* noalias nocapture %inn
; CHECK-NEXT: [[GEP_3:%.*]] = getelementptr inbounds i32, i32* [[IN_ADDR]], i64 3
; CHECK-NEXT: [[TMP1:%.*]] = bitcast i32* [[IN_ADDR]] to <4 x i32>*
; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, <4 x i32>* [[TMP1]], align 4
; CHECK-NEXT: [[REORDER_SHUFFLE:%.*]] = shufflevector <4 x i32> [[TMP2]], <4 x i32> undef, <4 x i32> <i32 1, i32 3, i32 0, i32 2>
; CHECK-NEXT: [[INN_ADDR:%.*]] = getelementptr inbounds i32, i32* [[INN:%.*]], i64 0
; CHECK-NEXT: [[GEP_4:%.*]] = getelementptr inbounds i32, i32* [[INN_ADDR]], i64 1
; CHECK-NEXT: [[GEP_5:%.*]] = getelementptr inbounds i32, i32* [[INN_ADDR]], i64 2
; CHECK-NEXT: [[GEP_6:%.*]] = getelementptr inbounds i32, i32* [[INN_ADDR]], i64 3
; CHECK-NEXT: [[TMP3:%.*]] = bitcast i32* [[INN_ADDR]] to <4 x i32>*
; CHECK-NEXT: [[TMP4:%.*]] = load <4 x i32>, <4 x i32>* [[TMP3]], align 4
; CHECK-NEXT: [[TMP5:%.*]] = mul <4 x i32> [[TMP2]], [[TMP4]]
; CHECK-NEXT: [[REORDER_SHUFFLE1:%.*]] = shufflevector <4 x i32> [[TMP4]], <4 x i32> undef, <4 x i32> <i32 1, i32 3, i32 0, i32 2>
; CHECK-NEXT: [[TMP5:%.*]] = mul <4 x i32> [[REORDER_SHUFFLE]], [[REORDER_SHUFFLE1]]
; CHECK-NEXT: [[GEP_7:%.*]] = getelementptr inbounds i32, i32* [[OUT:%.*]], i64 0
; CHECK-NEXT: [[GEP_8:%.*]] = getelementptr inbounds i32, i32* [[OUT]], i64 1
; CHECK-NEXT: [[GEP_9:%.*]] = getelementptr inbounds i32, i32* [[OUT]], i64 2
; CHECK-NEXT: [[GEP_10:%.*]] = getelementptr inbounds i32, i32* [[OUT]], i64 3
; CHECK-NEXT: [[REORDER_SHUFFLE:%.*]] = shufflevector <4 x i32> [[TMP5]], <4 x i32> undef, <4 x i32> <i32 1, i32 3, i32 0, i32 2>
; CHECK-NEXT: [[TMP6:%.*]] = bitcast i32* [[GEP_7]] to <4 x i32>*
; CHECK-NEXT: store <4 x i32> [[REORDER_SHUFFLE]], <4 x i32>* [[TMP6]], align 4
; CHECK-NEXT: store <4 x i32> [[TMP5]], <4 x i32>* [[TMP6]], align 4
; CHECK-NEXT: ret i32 undef
;
%in.addr = getelementptr inbounds i32, i32* %in, i64 0
45 changes: 29 additions & 16 deletions llvm/test/Transforms/SLPVectorizer/X86/stores_vectorize.ll
@@ -92,14 +92,15 @@ define void @store_reverse(i64* %p3) {
; CHECK-NEXT: [[ARRAYIDX11:%.*]] = getelementptr inbounds i64, i64* [[P3]], i64 3
; CHECK-NEXT: [[TMP0:%.*]] = bitcast i64* [[P3]] to <4 x i64>*
; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i64>, <4 x i64>* [[TMP0]], align 8
; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <4 x i64> [[TMP1]], <4 x i64> undef, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
; CHECK-NEXT: [[ARRAYIDX12:%.*]] = getelementptr inbounds i64, i64* [[P3]], i64 11
; CHECK-NEXT: [[TMP2:%.*]] = bitcast i64* [[ARRAYIDX1]] to <4 x i64>*
; CHECK-NEXT: [[TMP3:%.*]] = load <4 x i64>, <4 x i64>* [[TMP2]], align 8
; CHECK-NEXT: [[TMP4:%.*]] = shl <4 x i64> [[TMP1]], [[TMP3]]
; CHECK-NEXT: [[ARRAYIDX14:%.*]] = getelementptr inbounds i64, i64* [[P3]], i64 4
; CHECK-NEXT: [[TMP3:%.*]] = bitcast i64* [[ARRAYIDX1]] to <4 x i64>*
; CHECK-NEXT: [[TMP4:%.*]] = load <4 x i64>, <4 x i64>* [[TMP3]], align 8
; CHECK-NEXT: [[TMP5:%.*]] = shufflevector <4 x i64> [[TMP4]], <4 x i64> undef, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
; CHECK-NEXT: [[TMP6:%.*]] = bitcast i64* [[ARRAYIDX14]] to <4 x i64>*
; CHECK-NEXT: store <4 x i64> [[TMP5]], <4 x i64>* [[TMP6]], align 8
; CHECK-NEXT: [[TMP6:%.*]] = shl <4 x i64> [[TMP2]], [[TMP5]]
; CHECK-NEXT: [[ARRAYIDX14:%.*]] = getelementptr inbounds i64, i64* [[P3]], i64 4
; CHECK-NEXT: [[TMP7:%.*]] = bitcast i64* [[ARRAYIDX14]] to <4 x i64>*
; CHECK-NEXT: store <4 x i64> [[TMP6]], <4 x i64>* [[TMP7]], align 8
; CHECK-NEXT: ret void
;
entry:
@@ -146,11 +147,18 @@ define void @store15(float* %p1, i32 %p2, i64* %p3, float* %p4) {
; CHECK-NEXT: [[ADD:%.*]] = fadd float [[TMP0]], [[TMP1]]
; CHECK-NEXT: store float [[ADD]], float* [[ARRAYIDX2]], align 4
; CHECK-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds i64, i64* [[P3]], i64 1
; CHECK-NEXT: [[TMP2:%.*]] = bitcast i64* [[P3]] to <2 x i64>*
; CHECK-NEXT: [[TMP3:%.*]] = load <2 x i64>, <2 x i64>* [[TMP2]], align 8
; CHECK-NEXT: [[TMP4:%.*]] = lshr <2 x i64> [[TMP3]], <i64 5, i64 5>
; CHECK-NEXT: [[TMP5:%.*]] = bitcast i64* [[P3]] to <2 x i64>*
; CHECK-NEXT: store <2 x i64> [[TMP4]], <2 x i64>* [[TMP5]], align 8
; CHECK-NEXT: [[ARRAYIDX6:%.*]] = getelementptr inbounds i64, i64* [[P3]], i64 2
; CHECK-NEXT: [[TMP6:%.*]] = load i64, i64* [[ARRAYIDX6]], align 8
; CHECK-NEXT: [[SHR7:%.*]] = lshr i64 [[TMP6]], 5
; CHECK-NEXT: store i64 [[SHR7]], i64* [[ARRAYIDX6]], align 8
; CHECK-NEXT: [[ARRAYIDX8:%.*]] = getelementptr inbounds i64, i64* [[P3]], i64 3
; CHECK-NEXT: [[TMP2:%.*]] = bitcast i64* [[P3]] to <4 x i64>*
; CHECK-NEXT: [[TMP3:%.*]] = load <4 x i64>, <4 x i64>* [[TMP2]], align 8
; CHECK-NEXT: [[TMP4:%.*]] = lshr <4 x i64> [[TMP3]], <i64 5, i64 5, i64 5, i64 5>
; CHECK-NEXT: [[TMP7:%.*]] = load i64, i64* [[ARRAYIDX8]], align 8
; CHECK-NEXT: [[SHR9:%.*]] = lshr i64 [[TMP7]], 5
; CHECK-NEXT: [[ARRAYIDX9:%.*]] = getelementptr inbounds i64, i64* [[P3]], i64 5
; CHECK-NEXT: store i64 5, i64* [[ARRAYIDX9]], align 8
; CHECK-NEXT: store i64 5, i64* [[ARRAYIDX9]], align 8
@@ -167,8 +175,7 @@ define void @store15(float* %p1, i32 %p2, i64* %p3, float* %p4) {
; CHECK-NEXT: store i64 5, i64* [[ARRAYIDX9]], align 8
; CHECK-NEXT: store i64 5, i64* [[ARRAYIDX9]], align 8
; CHECK-NEXT: store i64 5, i64* [[ARRAYIDX9]], align 8
; CHECK-NEXT: [[TMP5:%.*]] = bitcast i64* [[P3]] to <4 x i64>*
; CHECK-NEXT: store <4 x i64> [[TMP4]], <4 x i64>* [[TMP5]], align 8
; CHECK-NEXT: store i64 [[SHR9]], i64* [[ARRAYIDX8]], align 8
; CHECK-NEXT: ret void
;
entry:
@@ -228,11 +235,18 @@ define void @store16(float* %p1, i32 %p2, i64* %p3, float* %p4) {
; CHECK-NEXT: [[ADD:%.*]] = fadd float [[TMP0]], [[TMP1]]
; CHECK-NEXT: store float [[ADD]], float* [[ARRAYIDX2]], align 4
; CHECK-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds i64, i64* [[P3]], i64 1
; CHECK-NEXT: [[TMP2:%.*]] = bitcast i64* [[P3]] to <2 x i64>*
; CHECK-NEXT: [[TMP3:%.*]] = load <2 x i64>, <2 x i64>* [[TMP2]], align 8
; CHECK-NEXT: [[TMP4:%.*]] = lshr <2 x i64> [[TMP3]], <i64 5, i64 5>
; CHECK-NEXT: [[TMP5:%.*]] = bitcast i64* [[P3]] to <2 x i64>*
; CHECK-NEXT: store <2 x i64> [[TMP4]], <2 x i64>* [[TMP5]], align 8
; CHECK-NEXT: [[ARRAYIDX6:%.*]] = getelementptr inbounds i64, i64* [[P3]], i64 2
; CHECK-NEXT: [[TMP6:%.*]] = load i64, i64* [[ARRAYIDX6]], align 8
; CHECK-NEXT: [[SHR7:%.*]] = lshr i64 [[TMP6]], 5
; CHECK-NEXT: store i64 [[SHR7]], i64* [[ARRAYIDX6]], align 8
; CHECK-NEXT: [[ARRAYIDX8:%.*]] = getelementptr inbounds i64, i64* [[P3]], i64 3
; CHECK-NEXT: [[TMP2:%.*]] = bitcast i64* [[P3]] to <4 x i64>*
; CHECK-NEXT: [[TMP3:%.*]] = load <4 x i64>, <4 x i64>* [[TMP2]], align 8
; CHECK-NEXT: [[TMP4:%.*]] = lshr <4 x i64> [[TMP3]], <i64 5, i64 5, i64 5, i64 5>
; CHECK-NEXT: [[TMP7:%.*]] = load i64, i64* [[ARRAYIDX8]], align 8
; CHECK-NEXT: [[SHR9:%.*]] = lshr i64 [[TMP7]], 5
; CHECK-NEXT: [[ARRAYIDX9:%.*]] = getelementptr inbounds i64, i64* [[P3]], i64 5
; CHECK-NEXT: store i64 5, i64* [[ARRAYIDX9]], align 8
; CHECK-NEXT: store i64 5, i64* [[ARRAYIDX9]], align 8
@@ -250,8 +264,7 @@ define void @store16(float* %p1, i32 %p2, i64* %p3, float* %p4) {
; CHECK-NEXT: store i64 5, i64* [[ARRAYIDX9]], align 8
; CHECK-NEXT: store i64 5, i64* [[ARRAYIDX9]], align 8
; CHECK-NEXT: store i64 5, i64* [[ARRAYIDX9]], align 8
; CHECK-NEXT: [[TMP5:%.*]] = bitcast i64* [[P3]] to <4 x i64>*
; CHECK-NEXT: store <4 x i64> [[TMP4]], <4 x i64>* [[TMP5]], align 8
; CHECK-NEXT: store i64 [[SHR9]], i64* [[ARRAYIDX8]], align 8
; CHECK-NEXT: ret void
;
entry: