164 changes: 58 additions & 106 deletions llvm/test/Analysis/CostModel/AMDGPU/arith-sminmax.ll

Large diffs are not rendered by default.

164 changes: 58 additions & 106 deletions llvm/test/Analysis/CostModel/AMDGPU/arith-uminmax.ll

Large diffs are not rendered by default.

4 changes: 2 additions & 2 deletions llvm/test/Analysis/CostModel/ARM/cmps.ll
Original file line number Diff line number Diff line change
Expand Up @@ -227,7 +227,7 @@ define void @minmax() {
; CHECK-V8R-RECIP-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %c3 = icmp slt i32 undef, undef
; CHECK-V8R-RECIP-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %s3 = select i1 %c3, i32 undef, i32 undef
; CHECK-V8R-RECIP-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %c4 = icmp slt <4 x i32> undef, undef
; CHECK-V8R-RECIP-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %s4 = select <4 x i1> %c4, <4 x i32> undef, <4 x i32> undef
; CHECK-V8R-RECIP-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %s4 = select <4 x i1> %c4, <4 x i32> undef, <4 x i32> undef
; CHECK-V8R-RECIP-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %c5 = icmp slt ptr undef, undef
; CHECK-V8R-RECIP-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %s5 = select i1 %c5, ptr undef, ptr undef
; CHECK-V8R-RECIP-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %c6 = icmp slt <4 x ptr> undef, undef
Expand Down Expand Up @@ -287,7 +287,7 @@ define void @minmax() {
; CHECK-V8R-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %c3 = icmp slt i32 undef, undef
; CHECK-V8R-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %s3 = select i1 %c3, i32 undef, i32 undef
; CHECK-V8R-SIZE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %c4 = icmp slt <4 x i32> undef, undef
; CHECK-V8R-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %s4 = select <4 x i1> %c4, <4 x i32> undef, <4 x i32> undef
; CHECK-V8R-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %s4 = select <4 x i1> %c4, <4 x i32> undef, <4 x i32> undef
; CHECK-V8R-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %c5 = icmp slt ptr undef, undef
; CHECK-V8R-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %s5 = select i1 %c5, ptr undef, ptr undef
; CHECK-V8R-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %c6 = icmp slt <4 x ptr> undef, undef
Expand Down
12 changes: 6 additions & 6 deletions llvm/test/Analysis/CostModel/ARM/reduce-smax.ll
Original file line number Diff line number Diff line change
Expand Up @@ -48,7 +48,7 @@ define i32 @reduce_i32(i32 %arg) {
; V8M-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef
;
; NEON-LABEL: 'reduce_i32'
; NEON-NEXT: Cost Model: Found an estimated cost of 17 for instruction: %V2 = call i32 @llvm.vector.reduce.smax.v2i32(<2 x i32> undef)
; NEON-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V2 = call i32 @llvm.vector.reduce.smax.v2i32(<2 x i32> undef)
; NEON-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V4 = call i32 @llvm.vector.reduce.smax.v4i32(<4 x i32> undef)
; NEON-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V8 = call i32 @llvm.vector.reduce.smax.v8i32(<8 x i32> undef)
; NEON-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V16 = call i32 @llvm.vector.reduce.smax.v16i32(<16 x i32> undef)
Expand Down Expand Up @@ -82,8 +82,8 @@ define i32 @reduce_i16(i32 %arg) {
; V8M-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef
;
; NEON-LABEL: 'reduce_i16'
; NEON-NEXT: Cost Model: Found an estimated cost of 17 for instruction: %V2 = call i16 @llvm.vector.reduce.smax.v2i16(<2 x i16> undef)
; NEON-NEXT: Cost Model: Found an estimated cost of 55 for instruction: %V4 = call i16 @llvm.vector.reduce.smax.v4i16(<4 x i16> undef)
; NEON-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V2 = call i16 @llvm.vector.reduce.smax.v2i16(<2 x i16> undef)
; NEON-NEXT: Cost Model: Found an estimated cost of 53 for instruction: %V4 = call i16 @llvm.vector.reduce.smax.v4i16(<4 x i16> undef)
; NEON-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V8 = call i16 @llvm.vector.reduce.smax.v8i16(<8 x i16> undef)
; NEON-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V16 = call i16 @llvm.vector.reduce.smax.v16i16(<16 x i16> undef)
; NEON-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V32 = call i16 @llvm.vector.reduce.smax.v32i16(<32 x i16> undef)
Expand Down Expand Up @@ -120,9 +120,9 @@ define i32 @reduce_i8(i32 %arg) {
; V8M-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef
;
; NEON-LABEL: 'reduce_i8'
; NEON-NEXT: Cost Model: Found an estimated cost of 17 for instruction: %V2 = call i8 @llvm.vector.reduce.smax.v2i8(<2 x i8> undef)
; NEON-NEXT: Cost Model: Found an estimated cost of 55 for instruction: %V4 = call i8 @llvm.vector.reduce.smax.v4i8(<4 x i8> undef)
; NEON-NEXT: Cost Model: Found an estimated cost of 153 for instruction: %V8 = call i8 @llvm.vector.reduce.smax.v8i8(<8 x i8> undef)
; NEON-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V2 = call i8 @llvm.vector.reduce.smax.v2i8(<2 x i8> undef)
; NEON-NEXT: Cost Model: Found an estimated cost of 53 for instruction: %V4 = call i8 @llvm.vector.reduce.smax.v4i8(<4 x i8> undef)
; NEON-NEXT: Cost Model: Found an estimated cost of 150 for instruction: %V8 = call i8 @llvm.vector.reduce.smax.v8i8(<8 x i8> undef)
; NEON-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V16 = call i8 @llvm.vector.reduce.smax.v16i8(<16 x i8> undef)
; NEON-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V32 = call i8 @llvm.vector.reduce.smax.v32i8(<32 x i8> undef)
; NEON-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %V64 = call i8 @llvm.vector.reduce.smax.v64i8(<64 x i8> undef)
Expand Down
12 changes: 6 additions & 6 deletions llvm/test/Analysis/CostModel/ARM/reduce-smin.ll
Original file line number Diff line number Diff line change
Expand Up @@ -48,7 +48,7 @@ define i32 @reduce_i32(i32 %arg) {
; V8M-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef
;
; NEON-LABEL: 'reduce_i32'
; NEON-NEXT: Cost Model: Found an estimated cost of 17 for instruction: %V2 = call i32 @llvm.vector.reduce.smin.v2i32(<2 x i32> undef)
; NEON-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V2 = call i32 @llvm.vector.reduce.smin.v2i32(<2 x i32> undef)
; NEON-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V4 = call i32 @llvm.vector.reduce.smin.v4i32(<4 x i32> undef)
; NEON-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V8 = call i32 @llvm.vector.reduce.smin.v8i32(<8 x i32> undef)
; NEON-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V16 = call i32 @llvm.vector.reduce.smin.v16i32(<16 x i32> undef)
Expand Down Expand Up @@ -82,8 +82,8 @@ define i32 @reduce_i16(i32 %arg) {
; V8M-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef
;
; NEON-LABEL: 'reduce_i16'
; NEON-NEXT: Cost Model: Found an estimated cost of 17 for instruction: %V2 = call i16 @llvm.vector.reduce.smin.v2i16(<2 x i16> undef)
; NEON-NEXT: Cost Model: Found an estimated cost of 55 for instruction: %V4 = call i16 @llvm.vector.reduce.smin.v4i16(<4 x i16> undef)
; NEON-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V2 = call i16 @llvm.vector.reduce.smin.v2i16(<2 x i16> undef)
; NEON-NEXT: Cost Model: Found an estimated cost of 53 for instruction: %V4 = call i16 @llvm.vector.reduce.smin.v4i16(<4 x i16> undef)
; NEON-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V8 = call i16 @llvm.vector.reduce.smin.v8i16(<8 x i16> undef)
; NEON-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V16 = call i16 @llvm.vector.reduce.smin.v16i16(<16 x i16> undef)
; NEON-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V32 = call i16 @llvm.vector.reduce.smin.v32i16(<32 x i16> undef)
Expand Down Expand Up @@ -120,9 +120,9 @@ define i32 @reduce_i8(i32 %arg) {
; V8M-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef
;
; NEON-LABEL: 'reduce_i8'
; NEON-NEXT: Cost Model: Found an estimated cost of 17 for instruction: %V2 = call i8 @llvm.vector.reduce.smin.v2i8(<2 x i8> undef)
; NEON-NEXT: Cost Model: Found an estimated cost of 55 for instruction: %V4 = call i8 @llvm.vector.reduce.smin.v4i8(<4 x i8> undef)
; NEON-NEXT: Cost Model: Found an estimated cost of 153 for instruction: %V8 = call i8 @llvm.vector.reduce.smin.v8i8(<8 x i8> undef)
; NEON-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V2 = call i8 @llvm.vector.reduce.smin.v2i8(<2 x i8> undef)
; NEON-NEXT: Cost Model: Found an estimated cost of 53 for instruction: %V4 = call i8 @llvm.vector.reduce.smin.v4i8(<4 x i8> undef)
; NEON-NEXT: Cost Model: Found an estimated cost of 150 for instruction: %V8 = call i8 @llvm.vector.reduce.smin.v8i8(<8 x i8> undef)
; NEON-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V16 = call i8 @llvm.vector.reduce.smin.v16i8(<16 x i8> undef)
; NEON-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V32 = call i8 @llvm.vector.reduce.smin.v32i8(<32 x i8> undef)
; NEON-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %V64 = call i8 @llvm.vector.reduce.smin.v64i8(<64 x i8> undef)
Expand Down
12 changes: 6 additions & 6 deletions llvm/test/Analysis/CostModel/ARM/reduce-umax.ll
Original file line number Diff line number Diff line change
Expand Up @@ -48,7 +48,7 @@ define i32 @reduce_i32(i32 %arg) {
; V8M-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef
;
; NEON-LABEL: 'reduce_i32'
; NEON-NEXT: Cost Model: Found an estimated cost of 17 for instruction: %V2 = call i32 @llvm.vector.reduce.umax.v2i32(<2 x i32> undef)
; NEON-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V2 = call i32 @llvm.vector.reduce.umax.v2i32(<2 x i32> undef)
; NEON-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V4 = call i32 @llvm.vector.reduce.umax.v4i32(<4 x i32> undef)
; NEON-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V8 = call i32 @llvm.vector.reduce.umax.v8i32(<8 x i32> undef)
; NEON-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V16 = call i32 @llvm.vector.reduce.umax.v16i32(<16 x i32> undef)
Expand Down Expand Up @@ -82,8 +82,8 @@ define i32 @reduce_i16(i32 %arg) {
; V8M-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef
;
; NEON-LABEL: 'reduce_i16'
; NEON-NEXT: Cost Model: Found an estimated cost of 17 for instruction: %V2 = call i16 @llvm.vector.reduce.umax.v2i16(<2 x i16> undef)
; NEON-NEXT: Cost Model: Found an estimated cost of 55 for instruction: %V4 = call i16 @llvm.vector.reduce.umax.v4i16(<4 x i16> undef)
; NEON-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V2 = call i16 @llvm.vector.reduce.umax.v2i16(<2 x i16> undef)
; NEON-NEXT: Cost Model: Found an estimated cost of 53 for instruction: %V4 = call i16 @llvm.vector.reduce.umax.v4i16(<4 x i16> undef)
; NEON-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V8 = call i16 @llvm.vector.reduce.umax.v8i16(<8 x i16> undef)
; NEON-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V16 = call i16 @llvm.vector.reduce.umax.v16i16(<16 x i16> undef)
; NEON-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V32 = call i16 @llvm.vector.reduce.umax.v32i16(<32 x i16> undef)
Expand Down Expand Up @@ -120,9 +120,9 @@ define i32 @reduce_i8(i32 %arg) {
; V8M-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef
;
; NEON-LABEL: 'reduce_i8'
; NEON-NEXT: Cost Model: Found an estimated cost of 17 for instruction: %V2 = call i8 @llvm.vector.reduce.umax.v2i8(<2 x i8> undef)
; NEON-NEXT: Cost Model: Found an estimated cost of 55 for instruction: %V4 = call i8 @llvm.vector.reduce.umax.v4i8(<4 x i8> undef)
; NEON-NEXT: Cost Model: Found an estimated cost of 153 for instruction: %V8 = call i8 @llvm.vector.reduce.umax.v8i8(<8 x i8> undef)
; NEON-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V2 = call i8 @llvm.vector.reduce.umax.v2i8(<2 x i8> undef)
; NEON-NEXT: Cost Model: Found an estimated cost of 53 for instruction: %V4 = call i8 @llvm.vector.reduce.umax.v4i8(<4 x i8> undef)
; NEON-NEXT: Cost Model: Found an estimated cost of 150 for instruction: %V8 = call i8 @llvm.vector.reduce.umax.v8i8(<8 x i8> undef)
; NEON-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V16 = call i8 @llvm.vector.reduce.umax.v16i8(<16 x i8> undef)
; NEON-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V32 = call i8 @llvm.vector.reduce.umax.v32i8(<32 x i8> undef)
; NEON-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %V64 = call i8 @llvm.vector.reduce.umax.v64i8(<64 x i8> undef)
Expand Down
12 changes: 6 additions & 6 deletions llvm/test/Analysis/CostModel/ARM/reduce-umin.ll
Original file line number Diff line number Diff line change
Expand Up @@ -48,7 +48,7 @@ define i32 @reduce_i32(i32 %arg) {
; V8M-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef
;
; NEON-LABEL: 'reduce_i32'
; NEON-NEXT: Cost Model: Found an estimated cost of 17 for instruction: %V2 = call i32 @llvm.vector.reduce.umin.v2i32(<2 x i32> undef)
; NEON-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V2 = call i32 @llvm.vector.reduce.umin.v2i32(<2 x i32> undef)
; NEON-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V4 = call i32 @llvm.vector.reduce.umin.v4i32(<4 x i32> undef)
; NEON-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V8 = call i32 @llvm.vector.reduce.umin.v8i32(<8 x i32> undef)
; NEON-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V16 = call i32 @llvm.vector.reduce.umin.v16i32(<16 x i32> undef)
Expand Down Expand Up @@ -82,8 +82,8 @@ define i32 @reduce_i16(i32 %arg) {
; V8M-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef
;
; NEON-LABEL: 'reduce_i16'
; NEON-NEXT: Cost Model: Found an estimated cost of 17 for instruction: %V2 = call i16 @llvm.vector.reduce.umin.v2i16(<2 x i16> undef)
; NEON-NEXT: Cost Model: Found an estimated cost of 55 for instruction: %V4 = call i16 @llvm.vector.reduce.umin.v4i16(<4 x i16> undef)
; NEON-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V2 = call i16 @llvm.vector.reduce.umin.v2i16(<2 x i16> undef)
; NEON-NEXT: Cost Model: Found an estimated cost of 53 for instruction: %V4 = call i16 @llvm.vector.reduce.umin.v4i16(<4 x i16> undef)
; NEON-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V8 = call i16 @llvm.vector.reduce.umin.v8i16(<8 x i16> undef)
; NEON-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V16 = call i16 @llvm.vector.reduce.umin.v16i16(<16 x i16> undef)
; NEON-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V32 = call i16 @llvm.vector.reduce.umin.v32i16(<32 x i16> undef)
Expand Down Expand Up @@ -120,9 +120,9 @@ define i32 @reduce_i8(i32 %arg) {
; V8M-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef
;
; NEON-LABEL: 'reduce_i8'
; NEON-NEXT: Cost Model: Found an estimated cost of 17 for instruction: %V2 = call i8 @llvm.vector.reduce.umin.v2i8(<2 x i8> undef)
; NEON-NEXT: Cost Model: Found an estimated cost of 55 for instruction: %V4 = call i8 @llvm.vector.reduce.umin.v4i8(<4 x i8> undef)
; NEON-NEXT: Cost Model: Found an estimated cost of 153 for instruction: %V8 = call i8 @llvm.vector.reduce.umin.v8i8(<8 x i8> undef)
; NEON-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V2 = call i8 @llvm.vector.reduce.umin.v2i8(<2 x i8> undef)
; NEON-NEXT: Cost Model: Found an estimated cost of 53 for instruction: %V4 = call i8 @llvm.vector.reduce.umin.v4i8(<4 x i8> undef)
; NEON-NEXT: Cost Model: Found an estimated cost of 150 for instruction: %V8 = call i8 @llvm.vector.reduce.umin.v8i8(<8 x i8> undef)
; NEON-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V16 = call i8 @llvm.vector.reduce.umin.v16i8(<16 x i8> undef)
; NEON-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V32 = call i8 @llvm.vector.reduce.umin.v32i8(<32 x i8> undef)
; NEON-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %V64 = call i8 @llvm.vector.reduce.umin.v64i8(<64 x i8> undef)
Expand Down
76 changes: 59 additions & 17 deletions llvm/test/Transforms/SLPVectorizer/AMDGPU/min_max.ll
Original file line number Diff line number Diff line change
Expand Up @@ -18,8 +18,15 @@ define <2 x i16> @uadd_sat_v2i16(<2 x i16> %arg0, <2 x i16> %arg1) {
;
; GFX8-LABEL: @uadd_sat_v2i16(
; GFX8-NEXT: bb:
; GFX8-NEXT: [[TMP0:%.*]] = call <2 x i16> @llvm.umin.v2i16(<2 x i16> [[ARG0:%.*]], <2 x i16> [[ARG1:%.*]])
; GFX8-NEXT: ret <2 x i16> [[TMP0]]
; GFX8-NEXT: [[ARG0_0:%.*]] = extractelement <2 x i16> [[ARG0:%.*]], i64 0
; GFX8-NEXT: [[ARG0_1:%.*]] = extractelement <2 x i16> [[ARG0]], i64 1
; GFX8-NEXT: [[ARG1_0:%.*]] = extractelement <2 x i16> [[ARG1:%.*]], i64 0
; GFX8-NEXT: [[ARG1_1:%.*]] = extractelement <2 x i16> [[ARG1]], i64 1
; GFX8-NEXT: [[ADD_0:%.*]] = call i16 @llvm.umin.i16(i16 [[ARG0_0]], i16 [[ARG1_0]])
; GFX8-NEXT: [[ADD_1:%.*]] = call i16 @llvm.umin.i16(i16 [[ARG0_1]], i16 [[ARG1_1]])
; GFX8-NEXT: [[INS_0:%.*]] = insertelement <2 x i16> poison, i16 [[ADD_0]], i64 0
; GFX8-NEXT: [[INS_1:%.*]] = insertelement <2 x i16> [[INS_0]], i16 [[ADD_1]], i64 1
; GFX8-NEXT: ret <2 x i16> [[INS_1]]
;
; GFX9-LABEL: @uadd_sat_v2i16(
; GFX9-NEXT: bb:
Expand Down Expand Up @@ -53,8 +60,15 @@ define <2 x i16> @usub_sat_v2i16(<2 x i16> %arg0, <2 x i16> %arg1) {
;
; GFX8-LABEL: @usub_sat_v2i16(
; GFX8-NEXT: bb:
; GFX8-NEXT: [[TMP0:%.*]] = call <2 x i16> @llvm.umax.v2i16(<2 x i16> [[ARG0:%.*]], <2 x i16> [[ARG1:%.*]])
; GFX8-NEXT: ret <2 x i16> [[TMP0]]
; GFX8-NEXT: [[ARG0_0:%.*]] = extractelement <2 x i16> [[ARG0:%.*]], i64 0
; GFX8-NEXT: [[ARG0_1:%.*]] = extractelement <2 x i16> [[ARG0]], i64 1
; GFX8-NEXT: [[ARG1_0:%.*]] = extractelement <2 x i16> [[ARG1:%.*]], i64 0
; GFX8-NEXT: [[ARG1_1:%.*]] = extractelement <2 x i16> [[ARG1]], i64 1
; GFX8-NEXT: [[ADD_0:%.*]] = call i16 @llvm.umax.i16(i16 [[ARG0_0]], i16 [[ARG1_0]])
; GFX8-NEXT: [[ADD_1:%.*]] = call i16 @llvm.umax.i16(i16 [[ARG0_1]], i16 [[ARG1_1]])
; GFX8-NEXT: [[INS_0:%.*]] = insertelement <2 x i16> poison, i16 [[ADD_0]], i64 0
; GFX8-NEXT: [[INS_1:%.*]] = insertelement <2 x i16> [[INS_0]], i16 [[ADD_1]], i64 1
; GFX8-NEXT: ret <2 x i16> [[INS_1]]
;
; GFX9-LABEL: @usub_sat_v2i16(
; GFX9-NEXT: bb:
Expand Down Expand Up @@ -88,8 +102,15 @@ define <2 x i16> @sadd_sat_v2i16(<2 x i16> %arg0, <2 x i16> %arg1) {
;
; GFX8-LABEL: @sadd_sat_v2i16(
; GFX8-NEXT: bb:
; GFX8-NEXT: [[TMP0:%.*]] = call <2 x i16> @llvm.smin.v2i16(<2 x i16> [[ARG0:%.*]], <2 x i16> [[ARG1:%.*]])
; GFX8-NEXT: ret <2 x i16> [[TMP0]]
; GFX8-NEXT: [[ARG0_0:%.*]] = extractelement <2 x i16> [[ARG0:%.*]], i64 0
; GFX8-NEXT: [[ARG0_1:%.*]] = extractelement <2 x i16> [[ARG0]], i64 1
; GFX8-NEXT: [[ARG1_0:%.*]] = extractelement <2 x i16> [[ARG1:%.*]], i64 0
; GFX8-NEXT: [[ARG1_1:%.*]] = extractelement <2 x i16> [[ARG1]], i64 1
; GFX8-NEXT: [[ADD_0:%.*]] = call i16 @llvm.smin.i16(i16 [[ARG0_0]], i16 [[ARG1_0]])
; GFX8-NEXT: [[ADD_1:%.*]] = call i16 @llvm.smin.i16(i16 [[ARG0_1]], i16 [[ARG1_1]])
; GFX8-NEXT: [[INS_0:%.*]] = insertelement <2 x i16> poison, i16 [[ADD_0]], i64 0
; GFX8-NEXT: [[INS_1:%.*]] = insertelement <2 x i16> [[INS_0]], i16 [[ADD_1]], i64 1
; GFX8-NEXT: ret <2 x i16> [[INS_1]]
;
; GFX9-LABEL: @sadd_sat_v2i16(
; GFX9-NEXT: bb:
Expand Down Expand Up @@ -123,8 +144,15 @@ define <2 x i16> @ssub_sat_v2i16(<2 x i16> %arg0, <2 x i16> %arg1) {
;
; GFX8-LABEL: @ssub_sat_v2i16(
; GFX8-NEXT: bb:
; GFX8-NEXT: [[TMP0:%.*]] = call <2 x i16> @llvm.smax.v2i16(<2 x i16> [[ARG0:%.*]], <2 x i16> [[ARG1:%.*]])
; GFX8-NEXT: ret <2 x i16> [[TMP0]]
; GFX8-NEXT: [[ARG0_0:%.*]] = extractelement <2 x i16> [[ARG0:%.*]], i64 0
; GFX8-NEXT: [[ARG0_1:%.*]] = extractelement <2 x i16> [[ARG0]], i64 1
; GFX8-NEXT: [[ARG1_0:%.*]] = extractelement <2 x i16> [[ARG1:%.*]], i64 0
; GFX8-NEXT: [[ARG1_1:%.*]] = extractelement <2 x i16> [[ARG1]], i64 1
; GFX8-NEXT: [[ADD_0:%.*]] = call i16 @llvm.smax.i16(i16 [[ARG0_0]], i16 [[ARG1_0]])
; GFX8-NEXT: [[ADD_1:%.*]] = call i16 @llvm.smax.i16(i16 [[ARG0_1]], i16 [[ARG1_1]])
; GFX8-NEXT: [[INS_0:%.*]] = insertelement <2 x i16> poison, i16 [[ADD_0]], i64 0
; GFX8-NEXT: [[INS_1:%.*]] = insertelement <2 x i16> [[INS_0]], i16 [[ADD_1]], i64 1
; GFX8-NEXT: ret <2 x i16> [[INS_1]]
;
; GFX9-LABEL: @ssub_sat_v2i16(
; GFX9-NEXT: bb:
Expand Down Expand Up @@ -262,11 +290,18 @@ define <3 x i16> @uadd_sat_v3i16(<3 x i16> %arg0, <3 x i16> %arg1) {
;
; GFX8-LABEL: @uadd_sat_v3i16(
; GFX8-NEXT: bb:
; GFX8-NEXT: [[ARG0_2:%.*]] = extractelement <3 x i16> [[ARG0:%.*]], i64 2
; GFX8-NEXT: [[ARG1_2:%.*]] = extractelement <3 x i16> [[ARG1:%.*]], i64 2
; GFX8-NEXT: [[TMP0:%.*]] = call <3 x i16> @llvm.umin.v3i16(<3 x i16> [[ARG0]], <3 x i16> [[ARG1]])
; GFX8-NEXT: [[ARG0_0:%.*]] = extractelement <3 x i16> [[ARG0:%.*]], i64 0
; GFX8-NEXT: [[ARG0_1:%.*]] = extractelement <3 x i16> [[ARG0]], i64 1
; GFX8-NEXT: [[ARG0_2:%.*]] = extractelement <3 x i16> [[ARG0]], i64 2
; GFX8-NEXT: [[ARG1_0:%.*]] = extractelement <3 x i16> [[ARG1:%.*]], i64 0
; GFX8-NEXT: [[ARG1_1:%.*]] = extractelement <3 x i16> [[ARG1]], i64 1
; GFX8-NEXT: [[ARG1_2:%.*]] = extractelement <3 x i16> [[ARG1]], i64 2
; GFX8-NEXT: [[ADD_0:%.*]] = call i16 @llvm.umin.i16(i16 [[ARG0_0]], i16 [[ARG1_0]])
; GFX8-NEXT: [[ADD_1:%.*]] = call i16 @llvm.umin.i16(i16 [[ARG0_1]], i16 [[ARG1_1]])
; GFX8-NEXT: [[ADD_2:%.*]] = call i16 @llvm.umin.i16(i16 [[ARG0_2]], i16 [[ARG1_2]])
; GFX8-NEXT: [[INS_2:%.*]] = insertelement <3 x i16> [[TMP0]], i16 [[ADD_2]], i64 2
; GFX8-NEXT: [[INS_0:%.*]] = insertelement <3 x i16> poison, i16 [[ADD_0]], i64 0
; GFX8-NEXT: [[INS_1:%.*]] = insertelement <3 x i16> [[INS_0]], i16 [[ADD_1]], i64 1
; GFX8-NEXT: [[INS_2:%.*]] = insertelement <3 x i16> [[INS_1]], i16 [[ADD_2]], i64 2
; GFX8-NEXT: ret <3 x i16> [[INS_2]]
;
; GFX9-LABEL: @uadd_sat_v3i16(
Expand Down Expand Up @@ -317,11 +352,18 @@ define <4 x i16> @uadd_sat_v4i16(<4 x i16> %arg0, <4 x i16> %arg1) {
;
; GFX8-LABEL: @uadd_sat_v4i16(
; GFX8-NEXT: bb:
; GFX8-NEXT: [[TMP0:%.*]] = call <4 x i16> @llvm.umin.v4i16(<4 x i16> [[ARG0:%.*]], <4 x i16> [[ARG1:%.*]])
; GFX8-NEXT: [[TMP1:%.*]] = call <4 x i16> @llvm.umin.v4i16(<4 x i16> [[ARG0]], <4 x i16> [[ARG1]])
; GFX8-NEXT: [[TMP2:%.*]] = shufflevector <4 x i16> [[TMP1]], <4 x i16> poison, <2 x i32> <i32 2, i32 3>
; GFX8-NEXT: [[TMP3:%.*]] = shufflevector <2 x i16> [[TMP2]], <2 x i16> poison, <4 x i32> <i32 0, i32 1, i32 poison, i32 poison>
; GFX8-NEXT: [[INS_31:%.*]] = shufflevector <4 x i16> [[TMP0]], <4 x i16> [[TMP3]], <4 x i32> <i32 0, i32 1, i32 4, i32 5>
; GFX8-NEXT: [[ARG0_0:%.*]] = extractelement <4 x i16> [[ARG0:%.*]], i64 0
; GFX8-NEXT: [[ARG0_1:%.*]] = extractelement <4 x i16> [[ARG0]], i64 1
; GFX8-NEXT: [[ARG1_0:%.*]] = extractelement <4 x i16> [[ARG1:%.*]], i64 0
; GFX8-NEXT: [[ARG1_1:%.*]] = extractelement <4 x i16> [[ARG1]], i64 1
; GFX8-NEXT: [[ADD_0:%.*]] = call i16 @llvm.umin.i16(i16 [[ARG0_0]], i16 [[ARG1_0]])
; GFX8-NEXT: [[ADD_1:%.*]] = call i16 @llvm.umin.i16(i16 [[ARG0_1]], i16 [[ARG1_1]])
; GFX8-NEXT: [[TMP0:%.*]] = call <4 x i16> @llvm.umin.v4i16(<4 x i16> [[ARG0]], <4 x i16> [[ARG1]])
; GFX8-NEXT: [[TMP1:%.*]] = shufflevector <4 x i16> [[TMP0]], <4 x i16> poison, <2 x i32> <i32 2, i32 3>
; GFX8-NEXT: [[INS_0:%.*]] = insertelement <4 x i16> poison, i16 [[ADD_0]], i64 0
; GFX8-NEXT: [[INS_1:%.*]] = insertelement <4 x i16> [[INS_0]], i16 [[ADD_1]], i64 1
; GFX8-NEXT: [[TMP2:%.*]] = shufflevector <2 x i16> [[TMP1]], <2 x i16> poison, <4 x i32> <i32 0, i32 1, i32 poison, i32 poison>
; GFX8-NEXT: [[INS_31:%.*]] = shufflevector <4 x i16> [[INS_1]], <4 x i16> [[TMP2]], <4 x i32> <i32 0, i32 1, i32 4, i32 5>
; GFX8-NEXT: ret <4 x i16> [[INS_31]]
;
; GFX9-LABEL: @uadd_sat_v4i16(
Expand Down