122 changes: 55 additions & 67 deletions llvm/test/CodeGen/AArch64/fptoui-sat-vector.ll
Original file line number Diff line number Diff line change
Expand Up @@ -2195,28 +2195,28 @@ define <8 x i100> @test_unsigned_v8f16_v8i100(<8 x half> %f) {
; CHECK-NEXT: ldr q0, [sp, #32] // 16-byte Folded Reload
; CHECK-NEXT: mov w8, #1904214015
; CHECK-NEXT: fcmp s8, #0.0
; CHECK-NEXT: mov x21, #68719476735
; CHECK-NEXT: mov x23, #68719476735
; CHECK-NEXT: mov h0, v0.h[3]
; CHECK-NEXT: fmov s9, w8
; CHECK-NEXT: csel x8, xzr, x1, lt
; CHECK-NEXT: csel x9, xzr, x0, lt
; CHECK-NEXT: csel x8, xzr, x0, lt
; CHECK-NEXT: csel x9, xzr, x1, lt
; CHECK-NEXT: fcmp s8, s9
; CHECK-NEXT: fcvt s8, h0
; CHECK-NEXT: csinv x9, x9, xzr, le
; CHECK-NEXT: csel x20, x21, x8, gt
; CHECK-NEXT: csel x9, x23, x9, gt
; CHECK-NEXT: csinv x8, x8, xzr, le
; CHECK-NEXT: fmov s0, s8
; CHECK-NEXT: str x9, [sp, #24] // 8-byte Folded Spill
; CHECK-NEXT: stp x8, x9, [sp, #16] // 16-byte Folded Spill
; CHECK-NEXT: bl __fixunssfti
; CHECK-NEXT: fcmp s8, #0.0
; CHECK-NEXT: ldr q0, [sp, #32] // 16-byte Folded Reload
; CHECK-NEXT: csel x8, xzr, x1, lt
; CHECK-NEXT: csel x9, xzr, x0, lt
; CHECK-NEXT: csel x8, xzr, x0, lt
; CHECK-NEXT: csel x9, xzr, x1, lt
; CHECK-NEXT: fcmp s8, s9
; CHECK-NEXT: fcvt s8, h0
; CHECK-NEXT: csinv x9, x9, xzr, le
; CHECK-NEXT: csel x23, x21, x8, gt
; CHECK-NEXT: csel x9, x23, x9, gt
; CHECK-NEXT: csinv x24, x8, xzr, le
; CHECK-NEXT: fmov s0, s8
; CHECK-NEXT: str x9, [sp, #16] // 8-byte Folded Spill
; CHECK-NEXT: str x9, [sp, #8] // 8-byte Folded Spill
; CHECK-NEXT: bl __fixunssfti
; CHECK-NEXT: ldr q0, [sp, #32] // 16-byte Folded Reload
; CHECK-NEXT: fcmp s8, #0.0
Expand All @@ -2226,7 +2226,7 @@ define <8 x i100> @test_unsigned_v8f16_v8i100(<8 x half> %f) {
; CHECK-NEXT: fcmp s8, s9
; CHECK-NEXT: fcvt s8, h0
; CHECK-NEXT: csinv x8, x8, xzr, le
; CHECK-NEXT: csel x24, x21, x9, gt
; CHECK-NEXT: csel x25, x23, x9, gt
; CHECK-NEXT: str x8, [sp, #32] // 8-byte Folded Spill
; CHECK-NEXT: fmov s0, s8
; CHECK-NEXT: bl __fixunssfti
Expand All @@ -2238,29 +2238,29 @@ define <8 x i100> @test_unsigned_v8f16_v8i100(<8 x half> %f) {
; CHECK-NEXT: fcmp s8, s9
; CHECK-NEXT: fcvt s8, h0
; CHECK-NEXT: csinv x8, x8, xzr, le
; CHECK-NEXT: csel x26, x21, x9, gt
; CHECK-NEXT: str x8, [sp, #8] // 8-byte Folded Spill
; CHECK-NEXT: csel x27, x23, x9, gt
; CHECK-NEXT: str x8, [sp] // 8-byte Folded Spill
; CHECK-NEXT: fmov s0, s8
; CHECK-NEXT: bl __fixunssfti
; CHECK-NEXT: ldr q0, [sp, #48] // 16-byte Folded Reload
; CHECK-NEXT: fcmp s8, #0.0
; CHECK-NEXT: mov h0, v0.h[3]
; CHECK-NEXT: csel x8, xzr, x1, lt
; CHECK-NEXT: csel x9, xzr, x0, lt
; CHECK-NEXT: csel x8, xzr, x0, lt
; CHECK-NEXT: csel x9, xzr, x1, lt
; CHECK-NEXT: fcmp s8, s9
; CHECK-NEXT: fcvt s8, h0
; CHECK-NEXT: csinv x29, x9, xzr, le
; CHECK-NEXT: csel x28, x21, x8, gt
; CHECK-NEXT: csel x29, x23, x9, gt
; CHECK-NEXT: csinv x26, x8, xzr, le
; CHECK-NEXT: fmov s0, s8
; CHECK-NEXT: bl __fixunssfti
; CHECK-NEXT: fcmp s8, #0.0
; CHECK-NEXT: ldr q0, [sp, #48] // 16-byte Folded Reload
; CHECK-NEXT: csel x8, xzr, x1, lt
; CHECK-NEXT: csel x9, xzr, x0, lt
; CHECK-NEXT: csel x8, xzr, x0, lt
; CHECK-NEXT: csel x9, xzr, x1, lt
; CHECK-NEXT: fcmp s8, s9
; CHECK-NEXT: fcvt s8, h0
; CHECK-NEXT: csinv x27, x9, xzr, le
; CHECK-NEXT: csel x22, x21, x8, gt
; CHECK-NEXT: csel x28, x23, x9, gt
; CHECK-NEXT: csinv x20, x8, xzr, le
; CHECK-NEXT: fmov s0, s8
; CHECK-NEXT: bl __fixunssfti
; CHECK-NEXT: ldr q0, [sp, #48] // 16-byte Folded Reload
Expand All @@ -2270,58 +2270,46 @@ define <8 x i100> @test_unsigned_v8f16_v8i100(<8 x half> %f) {
; CHECK-NEXT: csel x9, xzr, x1, lt
; CHECK-NEXT: fcmp s8, s9
; CHECK-NEXT: fcvt s8, h0
; CHECK-NEXT: csinv x8, x8, xzr, le
; CHECK-NEXT: csel x25, x21, x9, gt
; CHECK-NEXT: str x8, [sp] // 8-byte Folded Spill
; CHECK-NEXT: csel x21, x23, x9, gt
; CHECK-NEXT: csinv x22, x8, xzr, le
; CHECK-NEXT: fmov s0, s8
; CHECK-NEXT: bl __fixunssfti
; CHECK-NEXT: ldr x11, [sp, #8] // 8-byte Folded Reload
; CHECK-NEXT: fmov d0, x27
; CHECK-NEXT: fmov d1, x29
; CHECK-NEXT: fcmp s8, #0.0
; CHECK-NEXT: lsr x10, x22, #28
; CHECK-NEXT: stur x11, [x19, #75]
; CHECK-NEXT: lsr x11, x28, #28
; CHECK-NEXT: mov v0.d[1], x22
; CHECK-NEXT: ldr x12, [sp, #32] // 8-byte Folded Reload
; CHECK-NEXT: mov v1.d[1], x28
; CHECK-NEXT: extr x8, x28, x20, #28
; CHECK-NEXT: bfi x21, x26, #36, #28
; CHECK-NEXT: extr x9, x29, x26, #28
; CHECK-NEXT: lsr x11, x29, #28
; CHECK-NEXT: str x22, [x19]
; CHECK-NEXT: stur x8, [x19, #41]
; CHECK-NEXT: csel x8, xzr, x0, lt
; CHECK-NEXT: csel x9, xzr, x1, lt
; CHECK-NEXT: csel x10, xzr, x1, lt
; CHECK-NEXT: fcmp s8, s9
; CHECK-NEXT: stur x12, [x19, #50]
; CHECK-NEXT: fmov x12, d0
; CHECK-NEXT: fmov x13, d1
; CHECK-NEXT: stp x21, x9, [x19, #8]
; CHECK-NEXT: lsr x9, x28, #28
; CHECK-NEXT: strb w11, [x19, #24]
; CHECK-NEXT: bfi x27, x24, #36, #28
; CHECK-NEXT: csel x10, x23, x10, gt
; CHECK-NEXT: csinv x8, x8, xzr, le
; CHECK-NEXT: ldp d0, d1, [sp, #16] // 16-byte Folded Reload
; CHECK-NEXT: csel x9, x21, x9, gt
; CHECK-NEXT: strb w10, [x19, #49]
; CHECK-NEXT: extr x10, x22, x12, #28
; CHECK-NEXT: bfi x9, x12, #36, #28
; CHECK-NEXT: bfi x10, x20, #36, #28
; CHECK-NEXT: strb w9, [x19, #49]
; CHECK-NEXT: stur x8, [x19, #25]
; CHECK-NEXT: extr x8, x28, x13, #28
; CHECK-NEXT: mov v0.d[1], x23
; CHECK-NEXT: strb w11, [x19, #24]
; CHECK-NEXT: mov v1.d[1], x20
; CHECK-NEXT: stur x10, [x19, #41]
; CHECK-NEXT: stur x9, [x19, #33]
; CHECK-NEXT: bfi x25, x13, #36, #28
; CHECK-NEXT: str x8, [x19, #16]
; CHECK-NEXT: lsr x9, x23, #28
; CHECK-NEXT: fmov x8, d0
; CHECK-NEXT: ldr x12, [sp] // 8-byte Folded Reload
; CHECK-NEXT: fmov x11, d1
; CHECK-NEXT: lsr x10, x20, #28
; CHECK-NEXT: strb w9, [x19, #99]
; CHECK-NEXT: stp x12, x25, [x19]
; CHECK-NEXT: extr x12, x23, x8, #28
; CHECK-NEXT: bfi x26, x8, #36, #28
; CHECK-NEXT: extr x8, x20, x11, #28
; CHECK-NEXT: bfi x24, x11, #36, #28
; CHECK-NEXT: strb w10, [x19, #74]
; CHECK-NEXT: stur x12, [x19, #91]
; CHECK-NEXT: stur x26, [x19, #83]
; CHECK-NEXT: stur x8, [x19, #66]
; CHECK-NEXT: stur x24, [x19, #58]
; CHECK-NEXT: stur x10, [x19, #33]
; CHECK-NEXT: ldp x9, x12, [sp] // 16-byte Folded Reload
; CHECK-NEXT: stur x9, [x19, #75]
; CHECK-NEXT: extr x8, x12, x24, #28
; CHECK-NEXT: ldr x9, [sp, #32] // 8-byte Folded Reload
; CHECK-NEXT: stur x9, [x19, #50]
; CHECK-NEXT: ldp x11, x10, [sp, #16] // 16-byte Folded Reload
; CHECK-NEXT: stur x8, [x19, #91]
; CHECK-NEXT: lsr x8, x12, #28
; CHECK-NEXT: stur x27, [x19, #83]
; CHECK-NEXT: extr x9, x10, x11, #28
; CHECK-NEXT: bfi x25, x11, #36, #28
; CHECK-NEXT: strb w8, [x19, #99]
; CHECK-NEXT: stur x9, [x19, #66]
; CHECK-NEXT: lsr x9, x10, #28
; CHECK-NEXT: stur x25, [x19, #58]
; CHECK-NEXT: strb w9, [x19, #74]
; CHECK-NEXT: ldp x20, x19, [sp, #160] // 16-byte Folded Reload
; CHECK-NEXT: ldp x22, x21, [sp, #144] // 16-byte Folded Reload
; CHECK-NEXT: ldp x24, x23, [sp, #128] // 16-byte Folded Reload
Expand Down
372 changes: 181 additions & 191 deletions llvm/test/CodeGen/AMDGPU/amdgpu-codegenprepare-idiv.ll

Large diffs are not rendered by default.

74 changes: 37 additions & 37 deletions llvm/test/CodeGen/AMDGPU/cvt_f32_ubyte.ll
Original file line number Diff line number Diff line change
Expand Up @@ -1735,94 +1735,94 @@ define amdgpu_kernel void @load_v7i8_to_v7f32(<7 x float> addrspace(1)* noalias
; GFX10: ; %bb.0:
; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c
; GFX10-NEXT: v_lshlrev_b32_e32 v0, 3, v0
; GFX10-NEXT: v_mov_b32_e32 v4, 0
; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX10-NEXT: v_mov_b32_e32 v7, 0
; GFX10-NEXT: v_mov_b32_e32 v8, 0
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: s_clause 0x5
; GFX10-NEXT: global_load_ubyte v5, v0, s[2:3] offset:6
; GFX10-NEXT: global_load_ubyte v4, v0, s[2:3] offset:6
; GFX10-NEXT: global_load_ubyte v1, v0, s[2:3] offset:3
; GFX10-NEXT: global_load_ubyte v2, v0, s[2:3] offset:2
; GFX10-NEXT: global_load_ubyte v6, v0, s[2:3] offset:1
; GFX10-NEXT: global_load_short_d16 v4, v0, s[2:3] offset:4
; GFX10-NEXT: global_load_ubyte v5, v0, s[2:3] offset:1
; GFX10-NEXT: global_load_short_d16 v7, v0, s[2:3] offset:4
; GFX10-NEXT: global_load_ubyte v0, v0, s[2:3]
; GFX10-NEXT: s_waitcnt vmcnt(5)
; GFX10-NEXT: v_cvt_f32_ubyte0_e32 v6, v4
; GFX10-NEXT: s_waitcnt vmcnt(4)
; GFX10-NEXT: v_cvt_f32_ubyte0_e32 v3, v1
; GFX10-NEXT: s_waitcnt vmcnt(3)
; GFX10-NEXT: v_cvt_f32_ubyte0_e32 v2, v2
; GFX10-NEXT: s_waitcnt vmcnt(2)
; GFX10-NEXT: v_cvt_f32_ubyte0_e32 v1, v6
; GFX10-NEXT: v_cvt_f32_ubyte0_e32 v6, v5
; GFX10-NEXT: v_cvt_f32_ubyte0_e32 v1, v5
; GFX10-NEXT: s_waitcnt vmcnt(1)
; GFX10-NEXT: v_cvt_f32_ubyte1_e32 v5, v4
; GFX10-NEXT: v_cvt_f32_ubyte0_e32 v4, v4
; GFX10-NEXT: v_cvt_f32_ubyte1_e32 v5, v7
; GFX10-NEXT: v_cvt_f32_ubyte0_e32 v4, v7
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: v_cvt_f32_ubyte0_e32 v0, v0
; GFX10-NEXT: global_store_dwordx3 v7, v[4:6], s[0:1] offset:16
; GFX10-NEXT: global_store_dwordx4 v7, v[0:3], s[0:1]
; GFX10-NEXT: global_store_dwordx3 v8, v[4:6], s[0:1] offset:16
; GFX10-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1]
; GFX10-NEXT: s_endpgm
;
; GFX9-LABEL: load_v7i8_to_v7f32:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x2c
; GFX9-NEXT: v_lshlrev_b32_e32 v0, 3, v0
; GFX9-NEXT: v_mov_b32_e32 v8, 0
; GFX9-NEXT: v_mov_b32_e32 v10, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: global_load_ushort v1, v0, s[0:1] offset:4
; GFX9-NEXT: global_load_ubyte v2, v0, s[0:1] offset:6
; GFX9-NEXT: global_load_ubyte v1, v0, s[0:1] offset:6
; GFX9-NEXT: global_load_ushort v2, v0, s[0:1] offset:4
; GFX9-NEXT: global_load_ubyte v3, v0, s[0:1] offset:3
; GFX9-NEXT: global_load_ubyte v4, v0, s[0:1] offset:2
; GFX9-NEXT: global_load_ubyte v5, v0, s[0:1] offset:1
; GFX9-NEXT: global_load_ubyte v7, v0, s[0:1]
; GFX9-NEXT: global_load_ubyte v7, v0, s[0:1] offset:2
; GFX9-NEXT: global_load_ubyte v8, v0, s[0:1] offset:1
; GFX9-NEXT: global_load_ubyte v9, v0, s[0:1]
; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
; GFX9-NEXT: s_waitcnt vmcnt(5)
; GFX9-NEXT: v_and_b32_e32 v9, 0xffff, v1
; GFX9-NEXT: v_cvt_f32_ubyte0_e32 v6, v1
; GFX9-NEXT: s_waitcnt vmcnt(4)
; GFX9-NEXT: v_cvt_f32_ubyte0_e32 v6, v2
; GFX9-NEXT: v_cvt_f32_ubyte1_e32 v5, v2
; GFX9-NEXT: v_cvt_f32_ubyte0_e32 v4, v2
; GFX9-NEXT: s_waitcnt vmcnt(3)
; GFX9-NEXT: v_cvt_f32_ubyte0_e32 v3, v3
; GFX9-NEXT: s_waitcnt vmcnt(2)
; GFX9-NEXT: v_cvt_f32_ubyte0_e32 v2, v4
; GFX9-NEXT: v_cvt_f32_ubyte0_e32 v2, v7
; GFX9-NEXT: s_waitcnt vmcnt(1)
; GFX9-NEXT: v_cvt_f32_ubyte0_e32 v1, v5
; GFX9-NEXT: v_cvt_f32_ubyte0_e32 v1, v8
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_cvt_f32_ubyte0_e32 v0, v7
; GFX9-NEXT: v_cvt_f32_ubyte1_e32 v5, v9
; GFX9-NEXT: v_cvt_f32_ubyte0_e32 v4, v9
; GFX9-NEXT: v_cvt_f32_ubyte0_e32 v0, v9
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1]
; GFX9-NEXT: global_store_dwordx3 v8, v[4:6], s[0:1] offset:16
; GFX9-NEXT: global_store_dwordx4 v10, v[0:3], s[0:1]
; GFX9-NEXT: global_store_dwordx3 v10, v[4:6], s[0:1] offset:16
; GFX9-NEXT: s_endpgm
;
; GFX11-LABEL: load_v7i8_to_v7f32:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x2c
; GFX11-NEXT: v_dual_mov_b32 v7, 0 :: v_dual_lshlrev_b32 v0, 3, v0
; GFX11-NEXT: v_mov_b32_e32 v4, 0
; GFX11-NEXT: v_lshlrev_b32_e32 v0, 3, v0
; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
; GFX11-NEXT: v_mov_b32_e32 v8, 0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: s_clause 0x5
; GFX11-NEXT: global_load_u8 v5, v0, s[2:3] offset:6
; GFX11-NEXT: global_load_u8 v4, v0, s[2:3] offset:6
; GFX11-NEXT: global_load_u8 v1, v0, s[2:3] offset:3
; GFX11-NEXT: global_load_u8 v2, v0, s[2:3] offset:2
; GFX11-NEXT: global_load_u8 v6, v0, s[2:3] offset:1
; GFX11-NEXT: global_load_d16_b16 v4, v0, s[2:3] offset:4
; GFX11-NEXT: global_load_u8 v5, v0, s[2:3] offset:1
; GFX11-NEXT: global_load_d16_b16 v7, v0, s[2:3] offset:4
; GFX11-NEXT: global_load_u8 v0, v0, s[2:3]
; GFX11-NEXT: s_waitcnt vmcnt(5)
; GFX11-NEXT: v_cvt_f32_ubyte0_e32 v6, v4
; GFX11-NEXT: s_waitcnt vmcnt(4)
; GFX11-NEXT: v_cvt_f32_ubyte0_e32 v3, v1
; GFX11-NEXT: s_waitcnt vmcnt(3)
; GFX11-NEXT: v_cvt_f32_ubyte0_e32 v2, v2
; GFX11-NEXT: s_waitcnt vmcnt(2)
; GFX11-NEXT: v_cvt_f32_ubyte0_e32 v1, v6
; GFX11-NEXT: v_cvt_f32_ubyte0_e32 v6, v5
; GFX11-NEXT: v_cvt_f32_ubyte0_e32 v1, v5
; GFX11-NEXT: s_waitcnt vmcnt(1)
; GFX11-NEXT: v_cvt_f32_ubyte1_e32 v5, v4
; GFX11-NEXT: v_cvt_f32_ubyte0_e32 v4, v4
; GFX11-NEXT: v_cvt_f32_ubyte1_e32 v5, v7
; GFX11-NEXT: v_cvt_f32_ubyte0_e32 v4, v7
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_cvt_f32_ubyte0_e32 v0, v0
; GFX11-NEXT: s_clause 0x1
; GFX11-NEXT: global_store_b96 v7, v[4:6], s[0:1] offset:16
; GFX11-NEXT: global_store_b128 v7, v[0:3], s[0:1]
; GFX11-NEXT: global_store_b96 v8, v[4:6], s[0:1] offset:16
; GFX11-NEXT: global_store_b128 v8, v[0:3], s[0:1]
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
%tid = call i32 @llvm.amdgcn.workitem.id.x()
Expand Down
12 changes: 4 additions & 8 deletions llvm/test/CodeGen/AMDGPU/local-memory.amdgcn.ll
Original file line number Diff line number Diff line change
Expand Up @@ -51,11 +51,10 @@ define amdgpu_kernel void @local_memory_two_objects(i32 addrspace(1)* %out) #0 {
; SI-NEXT: s_mov_b32 m0, -1
; SI-NEXT: ds_write2_b32 v1, v0, v2 offset1:4
; SI-NEXT: v_sub_i32_e32 v0, vcc, 12, v1
; SI-NEXT: v_sub_i32_e32 v2, vcc, 16, v1
; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: s_barrier
; SI-NEXT: v_add_i32_e32 v2, vcc, 12, v2
; SI-NEXT: v_sub_i32_e32 v2, vcc, 28, v1
; SI-NEXT: ds_read_b32 v0, v0
; SI-NEXT: ds_read_b32 v3, v2
; SI-NEXT: s_mov_b32 s3, 0xf000
Expand All @@ -77,16 +76,13 @@ define amdgpu_kernel void @local_memory_two_objects(i32 addrspace(1)* %out) #0 {
; CI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: s_barrier
; CI-NEXT: v_sub_i32_e32 v2, vcc, 16, v1
; CI-NEXT: ds_read_b32 v0, v0 offset:12
; CI-NEXT: ds_read_b32 v3, v2 offset:12
; CI-NEXT: ds_read2_b32 v[3:4], v0 offset0:3 offset1:7
; CI-NEXT: s_mov_b32 s3, 0xf000
; CI-NEXT: s_mov_b32 s2, 0
; CI-NEXT: v_mov_b32_e32 v2, 0
; CI-NEXT: s_waitcnt lgkmcnt(1)
; CI-NEXT: buffer_store_dword v0, v[1:2], s[0:3], 0 addr64
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: buffer_store_dword v3, v[1:2], s[0:3], 0 addr64 offset:16
; CI-NEXT: buffer_store_dword v3, v[1:2], s[0:3], 0 addr64
; CI-NEXT: buffer_store_dword v4, v[1:2], s[0:3], 0 addr64 offset:16
; CI-NEXT: s_endpgm
entry:
%x.i = call i32 @llvm.amdgcn.workitem.id.x()
Expand Down
8 changes: 4 additions & 4 deletions llvm/test/CodeGen/AMDGPU/scalar_to_vector.ll
Original file line number Diff line number Diff line change
Expand Up @@ -38,8 +38,8 @@ define amdgpu_kernel void @scalar_to_vector_v2i32(<4 x i16> addrspace(1)* %out,
; VI-NEXT: s_mov_b32 s4, s0
; VI-NEXT: s_mov_b32 s5, s1
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_and_b32_e32 v1, 0xffff0000, v0
; VI-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
; VI-NEXT: v_lshrrev_b32_e32 v1, 16, v0
; VI-NEXT: v_alignbit_b32 v0, v1, v0, 16
; VI-NEXT: v_mov_b32_e32 v1, v0
; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
; VI-NEXT: s_endpgm
Expand Down Expand Up @@ -85,8 +85,8 @@ define amdgpu_kernel void @scalar_to_vector_v2f32(<4 x i16> addrspace(1)* %out,
; VI-NEXT: s_mov_b32 s4, s0
; VI-NEXT: s_mov_b32 s5, s1
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_and_b32_e32 v1, 0xffff0000, v0
; VI-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
; VI-NEXT: v_lshrrev_b32_e32 v1, 16, v0
; VI-NEXT: v_alignbit_b32 v0, v1, v0, 16
; VI-NEXT: v_mov_b32_e32 v1, v0
; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
; VI-NEXT: s_endpgm
Expand Down
19 changes: 11 additions & 8 deletions llvm/test/CodeGen/AMDGPU/store-weird-sizes.ll
Original file line number Diff line number Diff line change
Expand Up @@ -56,19 +56,22 @@ define amdgpu_kernel void @local_store_i55(i55 addrspace(3)* %ptr, i55 %arg) #0
; HAWAII-NEXT: v_mov_b32_e32 v0, s0
; HAWAII-NEXT: v_mov_b32_e32 v1, s5
; HAWAII-NEXT: flat_load_ubyte v0, v[0:1]
; HAWAII-NEXT: s_load_dword s0, s[4:5], 0x0
; HAWAII-NEXT: s_load_dword s1, s[4:5], 0x2
; HAWAII-NEXT: s_load_dword s2, s[4:5], 0x3
; HAWAII-NEXT: s_load_dword s0, s[4:5], 0x3
; HAWAII-NEXT: s_load_dword s1, s[4:5], 0x0
; HAWAII-NEXT: s_load_dword s2, s[4:5], 0x2
; HAWAII-NEXT: s_mov_b32 m0, -1
; HAWAII-NEXT: s_waitcnt lgkmcnt(0)
; HAWAII-NEXT: v_mov_b32_e32 v1, s0
; HAWAII-NEXT: v_mov_b32_e32 v2, s1
; HAWAII-NEXT: s_and_b32 s3, s0, 0xffff
; HAWAII-NEXT: v_mov_b32_e32 v1, s1
; HAWAII-NEXT: v_mov_b32_e32 v2, s0
; HAWAII-NEXT: v_mov_b32_e32 v3, s2
; HAWAII-NEXT: ds_write_b16 v1, v3 offset:4
; HAWAII-NEXT: ds_write_b16 v1, v2 offset:4
; HAWAII-NEXT: s_waitcnt vmcnt(0)
; HAWAII-NEXT: v_and_b32_e32 v0, 0x7f, v0
; HAWAII-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; HAWAII-NEXT: v_or_b32_e32 v0, s3, v0
; HAWAII-NEXT: v_bfe_u32 v0, v0, 16, 7
; HAWAII-NEXT: ds_write_b8 v1, v0 offset:6
; HAWAII-NEXT: ds_write_b32 v1, v2
; HAWAII-NEXT: ds_write_b32 v1, v3
; HAWAII-NEXT: s_endpgm
;
; FIJI-LABEL: local_store_i55:
Expand Down
4 changes: 2 additions & 2 deletions llvm/test/CodeGen/AMDGPU/udiv64.ll
Original file line number Diff line number Diff line change
Expand Up @@ -675,7 +675,7 @@ define amdgpu_kernel void @s_test_udiv24_i48(i48 addrspace(1)* %out, i48 %x, i48
; GCN-NEXT: s_load_dword s2, s[0:1], 0xe
; GCN-NEXT: s_load_dword s4, s[0:1], 0xd
; GCN-NEXT: s_load_dword s6, s[0:1], 0xc
; GCN-NEXT: v_cvt_f32_ubyte3_e32 v2, 0xffff
; GCN-NEXT: v_mov_b32_e32 v2, 0x4f800000
; GCN-NEXT: s_mov_b32 s7, 0xf000
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: s_and_b32 s3, s2, 0xffff
Expand All @@ -687,7 +687,7 @@ define amdgpu_kernel void @s_test_udiv24_i48(i48 addrspace(1)* %out, i48 %x, i48
; GCN-NEXT: s_load_dword s0, s[0:1], 0xb
; GCN-NEXT: s_and_b32 s8, s6, 0xffff
; GCN-NEXT: s_mov_b32 s6, -1
; GCN-NEXT: v_mac_f32_e32 v1, 0x4f800000, v2
; GCN-NEXT: v_mac_f32_e32 v1, 0, v2
; GCN-NEXT: v_rcp_f32_e32 v1, v1
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: s_and_b32 s9, s0, 0xff000000
Expand Down
3 changes: 1 addition & 2 deletions llvm/test/CodeGen/AMDGPU/widen-smrd-loads.ll
Original file line number Diff line number Diff line change
Expand Up @@ -234,9 +234,8 @@ define amdgpu_kernel void @widen_v2i8_constant_load(<2 x i8> addrspace(4)* %arg)
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: s_load_dword s0, s[0:1], 0x0
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: s_and_b32 s1, s0, 0xffff
; VI-NEXT: s_add_i32 s1, s0, 12
; VI-NEXT: v_mov_b32_e32 v2, s0
; VI-NEXT: s_add_i32 s1, s1, 12
; VI-NEXT: v_add_u32_sdwa v0, vcc, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1
; VI-NEXT: s_or_b32 s0, s1, 4
; VI-NEXT: v_or_b32_sdwa v2, v0, v1 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
Expand Down
1,234 changes: 715 additions & 519 deletions llvm/test/CodeGen/ARM/aes-erratum-fix.ll

Large diffs are not rendered by default.

37 changes: 18 additions & 19 deletions llvm/test/CodeGen/RISCV/rv32zbp.ll
Original file line number Diff line number Diff line change
Expand Up @@ -1110,29 +1110,28 @@ define i64 @gorc2b_i64(i64 %a) nounwind {
;
; RV32ZBP-LABEL: gorc2b_i64:
; RV32ZBP: # %bb.0:
; RV32ZBP-NEXT: srli a2, a1, 2
; RV32ZBP-NEXT: srli a3, a0, 2
; RV32ZBP-NEXT: lui a4, 209715
; RV32ZBP-NEXT: addi a4, a4, 819
; RV32ZBP-NEXT: and a3, a3, a4
; RV32ZBP-NEXT: or a3, a3, a0
; RV32ZBP-NEXT: or a2, a2, a1
; RV32ZBP-NEXT: orc2.n a1, a1
; RV32ZBP-NEXT: srli a2, a0, 2
; RV32ZBP-NEXT: srli a3, a1, 2
; RV32ZBP-NEXT: or a3, a3, a1
; RV32ZBP-NEXT: or a2, a2, a0
; RV32ZBP-NEXT: orc2.n a0, a0
; RV32ZBP-NEXT: orc2.n a1, a1
; RV32ZBP-NEXT: slli a2, a2, 2
; RV32ZBP-NEXT: slli a3, a3, 2
; RV32ZBP-NEXT: lui a5, 838861
; RV32ZBP-NEXT: addi a5, a5, -820
; RV32ZBP-NEXT: and a3, a3, a5
; RV32ZBP-NEXT: and a2, a2, a5
; RV32ZBP-NEXT: lui a4, 838861
; RV32ZBP-NEXT: addi a4, a4, -820
; RV32ZBP-NEXT: and a3, a3, a4
; RV32ZBP-NEXT: and a2, a2, a4
; RV32ZBP-NEXT: srli a4, a1, 2
; RV32ZBP-NEXT: srli a5, a0, 2
; RV32ZBP-NEXT: srli a6, a1, 2
; RV32ZBP-NEXT: and a6, a6, a4
; RV32ZBP-NEXT: and a4, a5, a4
; RV32ZBP-NEXT: or a0, a4, a0
; RV32ZBP-NEXT: or a1, a6, a1
; RV32ZBP-NEXT: or a1, a1, a2
; RV32ZBP-NEXT: or a0, a0, a3
; RV32ZBP-NEXT: lui a6, 209715
; RV32ZBP-NEXT: addi a6, a6, 819
; RV32ZBP-NEXT: and a5, a5, a6
; RV32ZBP-NEXT: and a4, a4, a6
; RV32ZBP-NEXT: or a1, a4, a1
; RV32ZBP-NEXT: or a0, a5, a0
; RV32ZBP-NEXT: or a0, a0, a2
; RV32ZBP-NEXT: or a1, a1, a3
; RV32ZBP-NEXT: ret
%and1 = shl i64 %a, 2
%shl1 = and i64 %and1, -3689348814741910324
Expand Down
204 changes: 102 additions & 102 deletions llvm/test/CodeGen/Thumb2/mve-laneinterleaving-cost.ll
Original file line number Diff line number Diff line change
Expand Up @@ -6,55 +6,55 @@ define arm_aapcs_vfpcc <4 x i32> @loads_i32(<4 x i32> *%A, <4 x i32> *%B, <4 x i
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: .save {r4, r5, r6, lr}
; CHECK-NEXT: push {r4, r5, r6, lr}
; CHECK-NEXT: .vsave {d9}
; CHECK-NEXT: vpush {d9}
; CHECK-NEXT: .vsave {d8, d9}
; CHECK-NEXT: vpush {d8, d9}
; CHECK-NEXT: vldrw.u32 q1, [r1]
; CHECK-NEXT: vmov.i64 q0, #0xffffffff
; CHECK-NEXT: vmov.f32 s8, s6
; CHECK-NEXT: vmov.f32 s10, s7
; CHECK-NEXT: vmov.f32 s6, s5
; CHECK-NEXT: vand q3, q2, q0
; CHECK-NEXT: vand q0, q1, q0
; CHECK-NEXT: vldrw.u32 q1, [r0]
; CHECK-NEXT: vmov r3, lr, d0
; CHECK-NEXT: vmov.i64 q2, #0xffffffff
; CHECK-NEXT: vmov.f32 s0, s6
; CHECK-NEXT: vmov r4, r1, d6
; CHECK-NEXT: vmov r0, r12, d7
; CHECK-NEXT: vldrw.u32 q3, [r2]
; CHECK-NEXT: vmov.f32 s10, s7
; CHECK-NEXT: vmov.f32 s8, s14
; CHECK-NEXT: vmov.f32 s18, s15
; CHECK-NEXT: vmov.f32 s14, s5
; CHECK-NEXT: vmov r5, s0
; CHECK-NEXT: vmov.f32 s0, s12
; CHECK-NEXT: vmov.f32 s6, s13
; CHECK-NEXT: adds r2, r5, r4
; CHECK-NEXT: vmov r4, s8
; CHECK-NEXT: asr.w r6, r5, #31
; CHECK-NEXT: adcs r1, r6
; CHECK-NEXT: asrl r2, r1, r4
; CHECK-NEXT: vmov r1, s4
; CHECK-NEXT: adds r6, r1, r3
; CHECK-NEXT: vmov r3, s0
; CHECK-NEXT: asr.w r4, r1, #31
; CHECK-NEXT: adc.w r1, r4, lr
; CHECK-NEXT: asrl r6, r1, r3
; CHECK-NEXT: vmov r5, r4, d1
; CHECK-NEXT: vmov r1, s10
; CHECK-NEXT: vmov q0[2], q0[0], r6, r2
; CHECK-NEXT: adds r0, r0, r1
; CHECK-NEXT: vmov.f32 s2, s7
; CHECK-NEXT: vand q0, q0, q2
; CHECK-NEXT: vmov.f32 s6, s5
; CHECK-NEXT: vmov r4, r5, d0
; CHECK-NEXT: vmov r3, r1, d1
; CHECK-NEXT: vldrw.u32 q0, [r0]
; CHECK-NEXT: vmov.f32 s12, s2
; CHECK-NEXT: vmov.f32 s2, s3
; CHECK-NEXT: vmov r0, s12
; CHECK-NEXT: vand q3, q1, q2
; CHECK-NEXT: vldrw.u32 q1, [r2]
; CHECK-NEXT: vmov lr, r12, d7
; CHECK-NEXT: vmov.f32 s16, s6
; CHECK-NEXT: vmov.f32 s18, s7
; CHECK-NEXT: vand q2, q4, q2
; CHECK-NEXT: asrs r2, r0, #31
; CHECK-NEXT: adds r0, r0, r4
; CHECK-NEXT: adcs r5, r2
; CHECK-NEXT: vmov r2, s8
; CHECK-NEXT: asrl r0, r5, r2
; CHECK-NEXT: vmov r2, s2
; CHECK-NEXT: vmov.f32 s2, s1
; CHECK-NEXT: asrs r4, r2, #31
; CHECK-NEXT: adds r2, r2, r3
; CHECK-NEXT: adcs r1, r4
; CHECK-NEXT: vmov r3, s10
; CHECK-NEXT: asrl r2, r1, r3
; CHECK-NEXT: vmov r4, r5, d6
; CHECK-NEXT: vmov r1, s2
; CHECK-NEXT: vmov.f32 s2, s5
; CHECK-NEXT: adds.w r6, r1, lr
; CHECK-NEXT: asr.w r3, r1, #31
; CHECK-NEXT: adc.w r1, r3, r12
; CHECK-NEXT: vmov r3, s18
; CHECK-NEXT: asrl r0, r1, r3
; CHECK-NEXT: vmov r1, s14
; CHECK-NEXT: adds r6, r1, r5
; CHECK-NEXT: asr.w r2, r1, #31
; CHECK-NEXT: adc.w r1, r2, r4
; CHECK-NEXT: vmov r2, s6
; CHECK-NEXT: asrl r6, r1, r2
; CHECK-NEXT: vmov q0[3], q0[1], r6, r0
; CHECK-NEXT: vpop {d9}
; CHECK-NEXT: vmov r3, s2
; CHECK-NEXT: asrl r6, r1, r3
; CHECK-NEXT: vmov r1, s0
; CHECK-NEXT: adds r4, r4, r1
; CHECK-NEXT: asr.w r3, r1, #31
; CHECK-NEXT: adc.w r1, r3, r5
; CHECK-NEXT: vmov r3, s4
; CHECK-NEXT: asrl r4, r1, r3
; CHECK-NEXT: vmov q0[2], q0[0], r4, r0
; CHECK-NEXT: vmov q0[3], q0[1], r6, r2
; CHECK-NEXT: vpop {d8, d9}
; CHECK-NEXT: pop {r4, r5, r6, pc}
entry:
%a = load <4 x i32>, <4 x i32> *%A, align 4
Expand Down Expand Up @@ -142,56 +142,56 @@ define arm_aapcs_vfpcc void @load_store_i32(<4 x i32> *%A, <4 x i32> *%B, <4 x i
; CHECK-NEXT: push {r4, r5, r6, r7, lr}
; CHECK-NEXT: .pad #4
; CHECK-NEXT: sub sp, #4
; CHECK-NEXT: .vsave {d9}
; CHECK-NEXT: vpush {d9}
; CHECK-NEXT: vldrw.u32 q1, [r1]
; CHECK-NEXT: vmov.i64 q0, #0xffffffff
; CHECK-NEXT: vmov.f32 s8, s6
; CHECK-NEXT: vmov.f32 s10, s7
; CHECK-NEXT: vmov.f32 s6, s5
; CHECK-NEXT: vand q3, q2, q0
; CHECK-NEXT: vand q1, q1, q0
; CHECK-NEXT: vldrw.u32 q0, [r0]
; CHECK-NEXT: vmov r4, lr, d2
; CHECK-NEXT: .vsave {d8, d9, d10, d11}
; CHECK-NEXT: vpush {d8, d9, d10, d11}
; CHECK-NEXT: vldrw.u32 q0, [r1]
; CHECK-NEXT: vmov.i64 q4, #0xffffffff
; CHECK-NEXT: vmov.f32 s4, s2
; CHECK-NEXT: vmov r5, r1, d6
; CHECK-NEXT: vmov r0, r12, d7
; CHECK-NEXT: vldrw.u32 q3, [r2]
; CHECK-NEXT: vmov.f32 s10, s3
; CHECK-NEXT: vmov.f32 s8, s14
; CHECK-NEXT: vmov.f32 s18, s15
; CHECK-NEXT: vmov.f32 s14, s1
; CHECK-NEXT: vmov r6, s4
; CHECK-NEXT: vmov.f32 s4, s12
; CHECK-NEXT: vmov.f32 s2, s13
; CHECK-NEXT: vmov.f32 s2, s1
; CHECK-NEXT: vmov.f32 s6, s3
; CHECK-NEXT: vand q2, q0, q4
; CHECK-NEXT: vldrw.u32 q0, [r0]
; CHECK-NEXT: vand q1, q1, q4
; CHECK-NEXT: vmov r5, r1, d3
; CHECK-NEXT: vmov.f32 s12, s2
; CHECK-NEXT: vmov.f32 s2, s3
; CHECK-NEXT: vmov r0, r12, d2
; CHECK-NEXT: vldrw.u32 q1, [r2]
; CHECK-NEXT: vmov r4, lr, d5
; CHECK-NEXT: vmov.f32 s20, s6
; CHECK-NEXT: vmov.f32 s6, s1
; CHECK-NEXT: vmov.f32 s22, s7
; CHECK-NEXT: vand q4, q5, q4
; CHECK-NEXT: vmov r6, s2
; CHECK-NEXT: vmov.f32 s2, s5
; CHECK-NEXT: adds r2, r6, r5
; CHECK-NEXT: vmov r5, s8
; CHECK-NEXT: vmov r5, s18
; CHECK-NEXT: asr.w r7, r6, #31
; CHECK-NEXT: adcs r1, r7
; CHECK-NEXT: asrl r2, r1, r5
; CHECK-NEXT: vmov r7, s4
; CHECK-NEXT: vmov r1, s0
; CHECK-NEXT: vmov r7, s2
; CHECK-NEXT: vmov r1, s6
; CHECK-NEXT: adds r4, r4, r1
; CHECK-NEXT: asr.w r5, r1, #31
; CHECK-NEXT: adc.w r1, r5, lr
; CHECK-NEXT: asrl r4, r1, r7
; CHECK-NEXT: vmov r6, r5, d3
; CHECK-NEXT: vmov r1, s10
; CHECK-NEXT: vmov q1[2], q1[0], r4, r2
; CHECK-NEXT: vmov r6, r5, d4
; CHECK-NEXT: vmov r1, s12
; CHECK-NEXT: adds r0, r0, r1
; CHECK-NEXT: asr.w r7, r1, #31
; CHECK-NEXT: adc.w r1, r7, r12
; CHECK-NEXT: vmov r7, s18
; CHECK-NEXT: vmov r7, s16
; CHECK-NEXT: asrl r0, r1, r7
; CHECK-NEXT: vmov r1, s14
; CHECK-NEXT: vmov r1, s0
; CHECK-NEXT: adds r6, r6, r1
; CHECK-NEXT: asr.w r2, r1, #31
; CHECK-NEXT: adc.w r1, r2, r5
; CHECK-NEXT: vmov r2, s2
; CHECK-NEXT: asrl r6, r1, r2
; CHECK-NEXT: vmov q1[3], q1[1], r6, r0
; CHECK-NEXT: vstrw.32 q1, [r3]
; CHECK-NEXT: vpop {d9}
; CHECK-NEXT: asr.w r7, r1, #31
; CHECK-NEXT: adc.w r1, r7, r5
; CHECK-NEXT: vmov r7, s4
; CHECK-NEXT: asrl r6, r1, r7
; CHECK-NEXT: vmov q0[2], q0[0], r6, r0
; CHECK-NEXT: vmov q0[3], q0[1], r4, r2
; CHECK-NEXT: vstrw.32 q0, [r3]
; CHECK-NEXT: vpop {d8, d9, d10, d11}
; CHECK-NEXT: add sp, #4
; CHECK-NEXT: pop {r4, r5, r6, r7, pc}
entry:
Expand Down Expand Up @@ -276,36 +276,36 @@ entry:
define arm_aapcs_vfpcc void @load_one_store_i32(<4 x i32> *%A, <4 x i32> *%D) {
; CHECK-LABEL: load_one_store_i32:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: .save {r4, r5, r7, lr}
; CHECK-NEXT: push {r4, r5, r7, lr}
; CHECK-NEXT: .save {r4, r5, r6, lr}
; CHECK-NEXT: push {r4, r5, r6, lr}
; CHECK-NEXT: vldrw.u32 q0, [r0]
; CHECK-NEXT: vmov.f32 s4, s2
; CHECK-NEXT: vmov.f32 s2, s3
; CHECK-NEXT: vmov r2, s2
; CHECK-NEXT: vmov.f32 s2, s1
; CHECK-NEXT: adds.w r12, r2, r2
; CHECK-NEXT: asr.w r3, r2, #31
; CHECK-NEXT: adc.w r7, r3, r2, asr #31
; CHECK-NEXT: vmov r3, s4
; CHECK-NEXT: asrl r12, r7, r2
; CHECK-NEXT: adds r0, r3, r3
; CHECK-NEXT: asr.w r5, r3, #31
; CHECK-NEXT: adc.w r5, r5, r3, asr #31
; CHECK-NEXT: asrl r0, r5, r3
; CHECK-NEXT: vmov r3, s0
; CHECK-NEXT: adds r4, r3, r3
; CHECK-NEXT: asr.w r5, r3, #31
; CHECK-NEXT: adc.w r5, r5, r3, asr #31
; CHECK-NEXT: asrl r4, r5, r3
; CHECK-NEXT: vmov q1[2], q1[0], r4, r0
; CHECK-NEXT: vmov r0, s2
; CHECK-NEXT: adc.w r3, r3, r2, asr #31
; CHECK-NEXT: asrl r12, r3, r2
; CHECK-NEXT: vmov r3, s2
; CHECK-NEXT: adds r2, r3, r3
; CHECK-NEXT: asr.w r0, r3, #31
; CHECK-NEXT: adc.w r5, r0, r3, asr #31
; CHECK-NEXT: vmov r0, s4
; CHECK-NEXT: asrl r2, r5, r3
; CHECK-NEXT: adds r4, r0, r0
; CHECK-NEXT: asr.w r2, r0, #31
; CHECK-NEXT: adc.w r3, r2, r0, asr #31
; CHECK-NEXT: asr.w r3, r0, #31
; CHECK-NEXT: adc.w r3, r3, r0, asr #31
; CHECK-NEXT: asrl r4, r3, r0
; CHECK-NEXT: vmov q1[3], q1[1], r4, r12
; CHECK-NEXT: vstrw.32 q1, [r1]
; CHECK-NEXT: pop {r4, r5, r7, pc}
; CHECK-NEXT: vmov r0, s0
; CHECK-NEXT: adds r6, r0, r0
; CHECK-NEXT: asr.w r3, r0, #31
; CHECK-NEXT: adc.w r3, r3, r0, asr #31
; CHECK-NEXT: asrl r6, r3, r0
; CHECK-NEXT: vmov q0[2], q0[0], r6, r4
; CHECK-NEXT: vmov q0[3], q0[1], r2, r12
; CHECK-NEXT: vstrw.32 q0, [r1]
; CHECK-NEXT: pop {r4, r5, r6, pc}
entry:
%a = load <4 x i32>, <4 x i32> *%A, align 4
%sa = sext <4 x i32> %a to <4 x i64>
Expand Down
223 changes: 107 additions & 116 deletions llvm/test/CodeGen/Thumb2/mve-laneinterleaving.ll
Original file line number Diff line number Diff line change
Expand Up @@ -180,44 +180,44 @@ entry:
define arm_aapcs_vfpcc <4 x i32> @ext_add_ashr_trunc_i32(<4 x i32> %a, <4 x i32> %b) {
; CHECK-LABEL: ext_add_ashr_trunc_i32:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: .save {r4, r5, r6, lr}
; CHECK-NEXT: push {r4, r5, r6, lr}
; CHECK-NEXT: .save {r4, r5, r6, r7, lr}
; CHECK-NEXT: push {r4, r5, r6, r7, lr}
; CHECK-NEXT: vmov.f32 s12, s6
; CHECK-NEXT: vmov.i64 q2, #0xffffffff
; CHECK-NEXT: vmov.f32 s6, s5
; CHECK-NEXT: vmov.f32 s14, s7
; CHECK-NEXT: vand q1, q1, q2
; CHECK-NEXT: vmov r2, r3, d2
; CHECK-NEXT: vmov r3, r7, d2
; CHECK-NEXT: vand q3, q3, q2
; CHECK-NEXT: vmov.f32 s4, s2
; CHECK-NEXT: vmov r0, r1, d6
; CHECK-NEXT: vmov.f32 s2, s3
; CHECK-NEXT: vmov.f32 s10, s1
; CHECK-NEXT: vmov r12, lr, d7
; CHECK-NEXT: vmov r4, s4
; CHECK-NEXT: adds r0, r0, r4
; CHECK-NEXT: asr.w r5, r4, #31
; CHECK-NEXT: vmov lr, r12, d7
; CHECK-NEXT: vmov r2, s4
; CHECK-NEXT: asrs r5, r2, #31
; CHECK-NEXT: adds r2, r2, r0
; CHECK-NEXT: vmov r0, s2
; CHECK-NEXT: adcs r1, r5
; CHECK-NEXT: vmov r5, s0
; CHECK-NEXT: lsrl r2, r1, #1
; CHECK-NEXT: asrs r1, r0, #31
; CHECK-NEXT: adds.w r0, r0, lr
; CHECK-NEXT: adc.w r1, r1, r12
; CHECK-NEXT: asrs r4, r5, #31
; CHECK-NEXT: adds r6, r5, r3
; CHECK-NEXT: vmov r3, r5, d3
; CHECK-NEXT: vmov.f32 s6, s1
; CHECK-NEXT: lsrl r0, r1, #1
; CHECK-NEXT: vmov r1, s0
; CHECK-NEXT: adds r2, r2, r1
; CHECK-NEXT: asr.w r4, r1, #31
; CHECK-NEXT: adcs r3, r4
; CHECK-NEXT: lsrl r2, r3, #1
; CHECK-NEXT: vmov r1, r5, d3
; CHECK-NEXT: vmov r3, s2
; CHECK-NEXT: vmov q0[2], q0[0], r2, r0
; CHECK-NEXT: vmov r0, s10
; CHECK-NEXT: adds.w r4, r3, r12
; CHECK-NEXT: asr.w r6, r3, #31
; CHECK-NEXT: adc.w r3, r6, lr
; CHECK-NEXT: asrs r2, r0, #31
; CHECK-NEXT: adds r0, r0, r1
; CHECK-NEXT: adcs r7, r4
; CHECK-NEXT: lsrl r6, r7, #1
; CHECK-NEXT: vmov q0[2], q0[0], r6, r2
; CHECK-NEXT: vmov r1, s6
; CHECK-NEXT: adds r6, r1, r3
; CHECK-NEXT: asr.w r2, r1, #31
; CHECK-NEXT: adc.w r1, r2, r5
; CHECK-NEXT: lsrl r4, r3, #1
; CHECK-NEXT: lsrl r0, r1, #1
; CHECK-NEXT: vmov q0[3], q0[1], r0, r4
; CHECK-NEXT: pop {r4, r5, r6, pc}
; CHECK-NEXT: lsrl r6, r1, #1
; CHECK-NEXT: vmov q0[3], q0[1], r6, r0
; CHECK-NEXT: pop {r4, r5, r6, r7, pc}
entry:
%sa = sext <4 x i32> %a to <4 x i64>
%sb = zext <4 x i32> %b to <4 x i64>
Expand Down Expand Up @@ -328,107 +328,98 @@ entry:
define arm_aapcs_vfpcc <4 x i32> @ext_ops_trunc_i32(<4 x i32> %a, <4 x i32> %b) {
; CHECK-LABEL: ext_ops_trunc_i32:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: .save {r4, r5, r6, r7, r8, r9, lr}
; CHECK-NEXT: push.w {r4, r5, r6, r7, r8, r9, lr}
; CHECK-NEXT: .pad #4
; CHECK-NEXT: sub sp, #4
; CHECK-NEXT: .vsave {d8, d9}
; CHECK-NEXT: vpush {d8, d9}
; CHECK-NEXT: vmov.f32 s16, s2
; CHECK-NEXT: vmov.i64 q3, #0xffffffff
; CHECK-NEXT: .save {r4, r5, r6, r7, r8, r9, r10, lr}
; CHECK-NEXT: push.w {r4, r5, r6, r7, r8, r9, r10, lr}
; CHECK-NEXT: vmov.f32 s8, s2
; CHECK-NEXT: vmov.f32 s2, s3
; CHECK-NEXT: vmov.f32 s8, s6
; CHECK-NEXT: vmov.f32 s10, s7
; CHECK-NEXT: vand q2, q2, q3
; CHECK-NEXT: vmov.f32 s6, s5
; CHECK-NEXT: vmov r1, r7, d4
; CHECK-NEXT: vand q1, q1, q3
; CHECK-NEXT: vmov r2, r12, d5
; CHECK-NEXT: vmov r3, s16
; CHECK-NEXT: vmov r10, s8
; CHECK-NEXT: vmov.f32 s8, s6
; CHECK-NEXT: vmov r6, s2
; CHECK-NEXT: vmov.f32 s2, s1
; CHECK-NEXT: adds r0, r3, r1
; CHECK-NEXT: asr.w r5, r3, #31
; CHECK-NEXT: adcs r5, r7
; CHECK-NEXT: asrl r0, r5, r1
; CHECK-NEXT: subs.w lr, r0, r1
; CHECK-NEXT: asr.w r0, r6, #31
; CHECK-NEXT: sbc.w r8, r5, r7
; CHECK-NEXT: adds r4, r6, r2
; CHECK-NEXT: adc.w r5, r0, r12
; CHECK-NEXT: movs r7, #0
; CHECK-NEXT: asrl r4, r5, r2
; CHECK-NEXT: vmov.f32 s6, s5
; CHECK-NEXT: vmov r2, s8
; CHECK-NEXT: asr.w r0, r10, #31
; CHECK-NEXT: asrs r7, r6, #31
; CHECK-NEXT: adds.w r4, r10, r2
; CHECK-NEXT: adc r3, r0, #0
; CHECK-NEXT: asrl r4, r3, r2
; CHECK-NEXT: subs r0, r4, r2
; CHECK-NEXT: sbc.w r5, r5, r12
; CHECK-NEXT: mov.w r12, #0
; CHECK-NEXT: umull r0, r4, r0, r2
; CHECK-NEXT: mla r5, r5, r2, r4
; CHECK-NEXT: eor.w r4, r3, r1
; CHECK-NEXT: orr.w r4, r4, r3, asr #31
; CHECK-NEXT: cmp r4, #0
; CHECK-NEXT: csetm r4, eq
; CHECK-NEXT: bfi r7, r4, #0, #8
; CHECK-NEXT: eor.w r4, r6, r2
; CHECK-NEXT: orr.w r4, r4, r6, asr #31
; CHECK-NEXT: rsbs r6, r6, #0
; CHECK-NEXT: cmp r4, #0
; CHECK-NEXT: lsll r0, r5, r6
; CHECK-NEXT: csetm r4, eq
; CHECK-NEXT: lsll r0, r5, r2
; CHECK-NEXT: bfi r7, r4, #8, #8
; CHECK-NEXT: rsbs r2, r3, #0
; CHECK-NEXT: sbc lr, r3, #0
; CHECK-NEXT: vmov r3, s10
; CHECK-NEXT: umull r0, r8, r0, r2
; CHECK-NEXT: adds r4, r6, r3
; CHECK-NEXT: eor.w r1, r6, r3
; CHECK-NEXT: adc r5, r7, #0
; CHECK-NEXT: eor.w r7, r10, r2
; CHECK-NEXT: asrl r4, r5, r3
; CHECK-NEXT: orr.w r7, r7, r10, asr #31
; CHECK-NEXT: subs r4, r4, r3
; CHECK-NEXT: orr.w r1, r1, r6, asr #31
; CHECK-NEXT: sbc r5, r5, #0
; CHECK-NEXT: cmp r7, #0
; CHECK-NEXT: umull r4, r12, r4, r3
; CHECK-NEXT: csetm r9, eq
; CHECK-NEXT: movs r7, #0
; CHECK-NEXT: cmp r1, #0
; CHECK-NEXT: bfi r7, r9, #0, #8
; CHECK-NEXT: csetm r1, eq
; CHECK-NEXT: bfi r7, r1, #8, #8
; CHECK-NEXT: mla r5, r5, r3, r12
; CHECK-NEXT: rsbs r1, r6, #0
; CHECK-NEXT: vmsr p0, r7
; CHECK-NEXT: umull r4, r7, lr, r1
; CHECK-NEXT: vmov r3, s0
; CHECK-NEXT: mla r7, r8, r1, r7
; CHECK-NEXT: lsll r4, r7, r2
; CHECK-NEXT: vmov r2, lr, d3
; CHECK-NEXT: lsll r4, r7, r1
; CHECK-NEXT: vmov r1, r7, d2
; CHECK-NEXT: vmov q4[2], q4[0], r4, r0
; CHECK-NEXT: vpsel q2, q4, q2
; CHECK-NEXT: asrs r0, r3, #31
; CHECK-NEXT: adds r4, r3, r1
; CHECK-NEXT: adc.w r5, r0, r7
; CHECK-NEXT: asrl r4, r5, r1
; CHECK-NEXT: subs r0, r4, r1
; CHECK-NEXT: sbc.w r7, r5, r7
; CHECK-NEXT: umull r0, r4, r0, r1
; CHECK-NEXT: mla r9, r7, r1, r4
; CHECK-NEXT: vmov r7, s2
; CHECK-NEXT: adds r6, r7, r2
; CHECK-NEXT: asr.w r4, r7, #31
; CHECK-NEXT: adc.w r5, r4, lr
; CHECK-NEXT: asrl r6, r5, r2
; CHECK-NEXT: subs r4, r6, r2
; CHECK-NEXT: sbc.w r6, r5, lr
; CHECK-NEXT: eor.w r5, r3, r1
; CHECK-NEXT: orr.w r5, r5, r3, asr #31
; CHECK-NEXT: rsbs r3, r3, #0
; CHECK-NEXT: cmp r5, #0
; CHECK-NEXT: lsll r0, r9, r3
; CHECK-NEXT: csetm r5, eq
; CHECK-NEXT: rsbs r3, r7, #0
; CHECK-NEXT: bfi r12, r5, #0, #8
; CHECK-NEXT: eor.w r5, r7, r2
; CHECK-NEXT: orr.w r5, r5, r7, asr #31
; CHECK-NEXT: lsll r0, r9, r1
; CHECK-NEXT: cmp r5, #0
; CHECK-NEXT: csetm r5, eq
; CHECK-NEXT: bfi r12, r5, #8, #8
; CHECK-NEXT: umull r4, r5, r4, r2
; CHECK-NEXT: vmsr p0, r12
; CHECK-NEXT: mla r5, r6, r2, r5
; CHECK-NEXT: mla r7, lr, r2, r8
; CHECK-NEXT: lsll r4, r5, r1
; CHECK-NEXT: rsb.w r1, r10, #0
; CHECK-NEXT: lsll r0, r7, r1
; CHECK-NEXT: vmov lr, s2
; CHECK-NEXT: vmov r1, s6
; CHECK-NEXT: lsll r0, r7, r2
; CHECK-NEXT: lsll r4, r5, r3
; CHECK-NEXT: lsll r4, r5, r2
; CHECK-NEXT: vmov q0[2], q0[0], r0, r4
; CHECK-NEXT: mov.w r12, #0
; CHECK-NEXT: vmov q3[2], q3[0], r0, r4
; CHECK-NEXT: vpsel q2, q3, q2
; CHECK-NEXT: adds.w r2, lr, r1
; CHECK-NEXT: asr.w r0, lr, #31
; CHECK-NEXT: adc r3, r0, #0
; CHECK-NEXT: asrl r2, r3, r1
; CHECK-NEXT: subs r0, r2, r1
; CHECK-NEXT: vmov r2, s0
; CHECK-NEXT: sbc r7, r3, #0
; CHECK-NEXT: vmov r3, s4
; CHECK-NEXT: umull r0, r6, r0, r1
; CHECK-NEXT: asrs r5, r2, #31
; CHECK-NEXT: adds r4, r2, r3
; CHECK-NEXT: adc r5, r5, #0
; CHECK-NEXT: asrl r4, r5, r3
; CHECK-NEXT: subs r4, r4, r3
; CHECK-NEXT: sbc r8, r5, #0
; CHECK-NEXT: mla r5, r7, r1, r6
; CHECK-NEXT: eor.w r6, lr, r1
; CHECK-NEXT: orr.w r6, r6, lr, asr #31
; CHECK-NEXT: eor.w r7, r2, r3
; CHECK-NEXT: cmp r6, #0
; CHECK-NEXT: orr.w r7, r7, r2, asr #31
; CHECK-NEXT: csetm r6, eq
; CHECK-NEXT: cmp r7, #0
; CHECK-NEXT: csetm r7, eq
; CHECK-NEXT: rsb.w lr, lr, #0
; CHECK-NEXT: bfi r12, r7, #0, #8
; CHECK-NEXT: lsll r0, r5, lr
; CHECK-NEXT: bfi r12, r6, #8, #8
; CHECK-NEXT: umull r4, r6, r4, r3
; CHECK-NEXT: lsll r0, r5, r1
; CHECK-NEXT: rsbs r1, r2, #0
; CHECK-NEXT: vmsr p0, r12
; CHECK-NEXT: mla r7, r8, r3, r6
; CHECK-NEXT: lsll r4, r7, r1
; CHECK-NEXT: lsll r4, r7, r3
; CHECK-NEXT: vmov q0[2], q0[0], r4, r0
; CHECK-NEXT: vpsel q0, q0, q1
; CHECK-NEXT: vmov.f32 s1, s2
; CHECK-NEXT: vmov.f32 s2, s8
; CHECK-NEXT: vmov.f32 s3, s10
; CHECK-NEXT: vpop {d8, d9}
; CHECK-NEXT: add sp, #4
; CHECK-NEXT: pop.w {r4, r5, r6, r7, r8, r9, pc}
; CHECK-NEXT: pop.w {r4, r5, r6, r7, r8, r9, r10, pc}
entry:
%sa = sext <4 x i32> %a to <4 x i64>
%sb = zext <4 x i32> %b to <4 x i64>
Expand Down
12 changes: 6 additions & 6 deletions llvm/test/CodeGen/Thumb2/mve-sext-masked-load.ll
Original file line number Diff line number Diff line change
Expand Up @@ -57,19 +57,19 @@ define arm_aapcs_vfpcc <4 x double> @foo_v4i32(<4 x i32>* nocapture readonly %pS
; CHECK-NEXT: vpush {d8, d9, d10, d11}
; CHECK-NEXT: vpt.s32 lt, q0, zr
; CHECK-NEXT: vldrwt.u32 q5, [r0]
; CHECK-NEXT: vmov.f32 s2, s21
; CHECK-NEXT: vmov.f32 s2, s23
; CHECK-NEXT: vmov.f32 s16, s22
; CHECK-NEXT: vmov r0, s2
; CHECK-NEXT: asrs r1, r0, #31
; CHECK-NEXT: bl __aeabi_l2d
; CHECK-NEXT: vmov r2, s20
; CHECK-NEXT: vmov r2, s16
; CHECK-NEXT: vmov d9, r0, r1
; CHECK-NEXT: asrs r3, r2, #31
; CHECK-NEXT: mov r0, r2
; CHECK-NEXT: mov r1, r3
; CHECK-NEXT: bl __aeabi_l2d
; CHECK-NEXT: vmov.f32 s2, s23
; CHECK-NEXT: vmov.f32 s2, s21
; CHECK-NEXT: vmov d8, r0, r1
; CHECK-NEXT: vmov.f32 s20, s22
; CHECK-NEXT: vmov r2, s2
; CHECK-NEXT: asrs r3, r2, #31
; CHECK-NEXT: mov r0, r2
Expand All @@ -82,8 +82,8 @@ define arm_aapcs_vfpcc <4 x double> @foo_v4i32(<4 x i32>* nocapture readonly %pS
; CHECK-NEXT: mov r1, r3
; CHECK-NEXT: bl __aeabi_l2d
; CHECK-NEXT: vmov d10, r0, r1
; CHECK-NEXT: vmov q0, q4
; CHECK-NEXT: vmov q1, q5
; CHECK-NEXT: vmov q1, q4
; CHECK-NEXT: vmov q0, q5
; CHECK-NEXT: vpop {d8, d9, d10, d11}
; CHECK-NEXT: pop {r7, pc}
entry:
Expand Down
34 changes: 17 additions & 17 deletions llvm/test/CodeGen/Thumb2/mve-vabdus.ll
Original file line number Diff line number Diff line change
Expand Up @@ -401,26 +401,26 @@ define void @vabd_loop_s32(i32* nocapture readonly %x, i32* nocapture readonly %
; CHECK-NEXT: subs r4, r4, r6
; CHECK-NEXT: sbc.w r9, r3, r6, asr #31
; CHECK-NEXT: vmov r6, s8
; CHECK-NEXT: vmov r3, s6
; CHECK-NEXT: subs r5, r7, r6
; CHECK-NEXT: asr.w r7, r7, #31
; CHECK-NEXT: vmov q2[2], q2[0], r5, r8
; CHECK-NEXT: asr.w r5, r7, #31
; CHECK-NEXT: sbc.w r5, r5, r6, asr #31
; CHECK-NEXT: vmov r6, s14
; CHECK-NEXT: vmov r7, s6
; CHECK-NEXT: subs r3, r7, r6
; CHECK-NEXT: vmov q2[3], q2[1], r4, r3
; CHECK-NEXT: asr.w r3, r5, #31
; CHECK-NEXT: mov.w r4, #0
; CHECK-NEXT: bfi r4, r3, #0, #4
; CHECK-NEXT: asr.w r3, r9, #31
; CHECK-NEXT: bfi r4, r3, #4, #4
; CHECK-NEXT: asr.w r3, r12, #31
; CHECK-NEXT: bfi r4, r3, #8, #4
; CHECK-NEXT: asr.w r3, r7, #31
; CHECK-NEXT: sbc.w r3, r3, r6, asr #31
; CHECK-NEXT: vmov r5, s14
; CHECK-NEXT: sbc.w r6, r7, r6, asr #31
; CHECK-NEXT: asrs r6, r6, #31
; CHECK-NEXT: subs r7, r3, r5
; CHECK-NEXT: asr.w r3, r3, #31
; CHECK-NEXT: vmov q2[3], q2[1], r4, r7
; CHECK-NEXT: mov.w r7, #0
; CHECK-NEXT: sbc.w r3, r3, r5, asr #31
; CHECK-NEXT: bfi r7, r6, #0, #4
; CHECK-NEXT: asr.w r4, r9, #31
; CHECK-NEXT: asr.w r6, r12, #31
; CHECK-NEXT: bfi r7, r4, #4, #4
; CHECK-NEXT: asrs r3, r3, #31
; CHECK-NEXT: bfi r4, r3, #12, #4
; CHECK-NEXT: vmsr p0, r4
; CHECK-NEXT: bfi r7, r6, #8, #4
; CHECK-NEXT: bfi r7, r3, #12, #4
; CHECK-NEXT: vmsr p0, r7
; CHECK-NEXT: vpst
; CHECK-NEXT: vsubt.i32 q2, q0, q2
; CHECK-NEXT: vstrb.8 q2, [r2], #16
Expand Down
72 changes: 35 additions & 37 deletions llvm/test/CodeGen/Thumb2/mve-vmull-splat.ll
Original file line number Diff line number Diff line change
Expand Up @@ -232,34 +232,33 @@ define arm_aapcs_vfpcc <4 x i64> @sext32_0213_ext0(<8 x i32> %src1, i32 %src2) {
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: .save {r4, r5, r7, lr}
; CHECK-NEXT: push {r4, r5, r7, lr}
; CHECK-NEXT: vmov q1, q0
; CHECK-NEXT: vmov r1, s6
; CHECK-NEXT: vmov.f32 s4, s1
; CHECK-NEXT: vmov.f32 s6, s3
; CHECK-NEXT: vmov r3, s4
; CHECK-NEXT: vmov.f32 s4, s5
; CHECK-NEXT: vmov.f32 s6, s7
; CHECK-NEXT: umull lr, r12, r1, r0
; CHECK-NEXT: vmov r1, s6
; CHECK-NEXT: umull r2, r5, r3, r0
; CHECK-NEXT: vmov q0[2], q0[0], r2, lr
; CHECK-NEXT: umull lr, r12, r1, r0
; CHECK-NEXT: vmov q1[2], q1[0], r2, lr
; CHECK-NEXT: asrs r2, r0, #31
; CHECK-NEXT: mla r4, r1, r2, r12
; CHECK-NEXT: asrs r1, r1, #31
; CHECK-NEXT: mla r5, r3, r2, r5
; CHECK-NEXT: asrs r3, r3, #31
; CHECK-NEXT: mla r1, r1, r0, r4
; CHECK-NEXT: vmov r4, s4
; CHECK-NEXT: mla r3, r3, r0, r5
; CHECK-NEXT: vmov q0[3], q0[1], r3, r1
; CHECK-NEXT: vmov r1, s6
; CHECK-NEXT: umull r5, lr, r4, r0
; CHECK-NEXT: umull r3, r12, r1, r0
; CHECK-NEXT: vmov q1[2], q1[0], r5, r3
; CHECK-NEXT: mla r3, r1, r2, r12
; CHECK-NEXT: vmov q1[3], q1[1], r3, r1
; CHECK-NEXT: vmov r1, s2
; CHECK-NEXT: umull r3, r5, r1, r0
; CHECK-NEXT: mla r5, r1, r2, r5
; CHECK-NEXT: asrs r1, r1, #31
; CHECK-NEXT: mla r2, r4, r2, lr
; CHECK-NEXT: mla r1, r1, r0, r3
; CHECK-NEXT: asrs r3, r4, #31
; CHECK-NEXT: mla r0, r3, r0, r2
; CHECK-NEXT: vmov q1[3], q1[1], r0, r1
; CHECK-NEXT: mla r12, r1, r0, r5
; CHECK-NEXT: vmov r5, s0
; CHECK-NEXT: umull r4, r1, r5, r0
; CHECK-NEXT: mla r1, r5, r2, r1
; CHECK-NEXT: asrs r2, r5, #31
; CHECK-NEXT: vmov q0[2], q0[0], r4, r3
; CHECK-NEXT: mla r0, r2, r0, r1
; CHECK-NEXT: vmov q0[3], q0[1], r0, r12
; CHECK-NEXT: pop {r4, r5, r7, pc}
entry:
%shuf1 = shufflevector <8 x i32> %src1, <8 x i32> undef, <4 x i32> <i32 0, i32 2, i32 1, i32 3>
Expand All @@ -276,34 +275,33 @@ define arm_aapcs_vfpcc <4 x i64> @sext32_ext0_0213(<8 x i32> %src1, i32 %src2) {
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: .save {r4, r5, r7, lr}
; CHECK-NEXT: push {r4, r5, r7, lr}
; CHECK-NEXT: vmov q1, q0
; CHECK-NEXT: vmov.f32 s4, s1
; CHECK-NEXT: asrs r4, r0, #31
; CHECK-NEXT: vmov r1, s6
; CHECK-NEXT: vmov.f32 s6, s3
; CHECK-NEXT: vmov r3, s4
; CHECK-NEXT: vmov.f32 s4, s5
; CHECK-NEXT: vmov.f32 s6, s7
; CHECK-NEXT: umull lr, r12, r0, r1
; CHECK-NEXT: vmov r1, s6
; CHECK-NEXT: umull r2, r5, r0, r3
; CHECK-NEXT: vmov q0[2], q0[0], r2, lr
; CHECK-NEXT: umull lr, r12, r0, r1
; CHECK-NEXT: vmov q1[2], q1[0], r2, lr
; CHECK-NEXT: asrs r2, r1, #31
; CHECK-NEXT: mla r2, r0, r2, r12
; CHECK-NEXT: mla r1, r4, r1, r2
; CHECK-NEXT: asrs r2, r3, #31
; CHECK-NEXT: mla r2, r0, r2, r5
; CHECK-NEXT: vmov r5, s4
; CHECK-NEXT: mla r2, r4, r3, r2
; CHECK-NEXT: vmov q0[3], q0[1], r2, r1
; CHECK-NEXT: vmov r1, s6
; CHECK-NEXT: umull r3, lr, r0, r5
; CHECK-NEXT: umull r2, r12, r0, r1
; CHECK-NEXT: vmov q1[2], q1[0], r3, r2
; CHECK-NEXT: asrs r2, r1, #31
; CHECK-NEXT: mla r2, r0, r2, r12
; CHECK-NEXT: mla r1, r4, r1, r2
; CHECK-NEXT: asrs r2, r5, #31
; CHECK-NEXT: mla r0, r0, r2, lr
; CHECK-NEXT: mla r0, r4, r5, r0
; CHECK-NEXT: vmov q1[3], q1[1], r0, r1
; CHECK-NEXT: vmov q1[3], q1[1], r2, r1
; CHECK-NEXT: vmov r1, s2
; CHECK-NEXT: umull r2, r3, r0, r1
; CHECK-NEXT: asrs r5, r1, #31
; CHECK-NEXT: mla r3, r0, r5, r3
; CHECK-NEXT: mla r12, r4, r1, r3
; CHECK-NEXT: vmov r3, s0
; CHECK-NEXT: umull r5, r1, r0, r3
; CHECK-NEXT: vmov q0[2], q0[0], r5, r2
; CHECK-NEXT: asrs r2, r3, #31
; CHECK-NEXT: mla r0, r0, r2, r1
; CHECK-NEXT: mla r0, r4, r3, r0
; CHECK-NEXT: vmov q0[3], q0[1], r0, r12
; CHECK-NEXT: pop {r4, r5, r7, pc}
entry:
%shuf1 = shufflevector <8 x i32> %src1, <8 x i32> undef, <4 x i32> <i32 0, i32 2, i32 1, i32 3>
Expand Down
17 changes: 7 additions & 10 deletions llvm/test/CodeGen/Thumb2/mve-vst3.ll
Original file line number Diff line number Diff line change
Expand Up @@ -8,21 +8,18 @@ define void @vst3_v2i32(<2 x i32> *%src, <6 x i32> *%dst) {
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: .save {r4, lr}
; CHECK-NEXT: push {r4, lr}
; CHECK-NEXT: ldrd lr, r12, [r0]
; CHECK-NEXT: ldrd r3, r2, [r0, #8]
; CHECK-NEXT: ldrd r12, r3, [r0]
; CHECK-NEXT: ldrd lr, r2, [r0, #8]
; CHECK-NEXT: ldrd r4, r0, [r0, #16]
; CHECK-NEXT: vmov q1[2], q1[0], lr, r3
; CHECK-NEXT: vmov q1[3], q1[1], r12, r2
; CHECK-NEXT: vmov.32 q0[0], r4
; CHECK-NEXT: vmov.f32 s8, s7
; CHECK-NEXT: vmov.32 q0[1], r0
; CHECK-NEXT: vmov.32 q1[1], r3
; CHECK-NEXT: vmov q1[2], q1[0], r12, lr
; CHECK-NEXT: strd r2, r0, [r1, #16]
; CHECK-NEXT: vmov q0[2], q0[0], r4, r0
; CHECK-NEXT: vmov.f32 s8, s4
; CHECK-NEXT: vmov.f32 s9, s6
; CHECK-NEXT: vmov.f32 s10, s0
; CHECK-NEXT: vmov.f32 s11, s5
; CHECK-NEXT: vmov r2, s8
; CHECK-NEXT: vmov.f32 s8, s4
; CHECK-NEXT: vstrw.32 q2, [r1]
; CHECK-NEXT: strd r2, r0, [r1, #16]
; CHECK-NEXT: pop {r4, pc}
entry:
%s1 = getelementptr <2 x i32>, <2 x i32>* %src, i32 0
Expand Down
24 changes: 11 additions & 13 deletions llvm/test/CodeGen/X86/combine-bitreverse.ll
Original file line number Diff line number Diff line change
Expand Up @@ -349,19 +349,17 @@ define i64 @test_bitreverse_shli_bitreverse_i64(i64 %a) nounwind {
; X64-LABEL: test_bitreverse_shli_bitreverse_i64:
; X64: # %bb.0:
; X64-NEXT: bswapq %rdi
; X64-NEXT: movq %rdi, %rax
; X64-NEXT: shrq $4, %rax
; X64-NEXT: movabsq $1085102592571150095, %rcx # imm = 0xF0F0F0F0F0F0F0F
; X64-NEXT: andq %rcx, %rax
; X64-NEXT: andq %rcx, %rdi
; X64-NEXT: shlq $4, %rdi
; X64-NEXT: orq %rax, %rdi
; X64-NEXT: movabsq $3689348814741910323, %rax # imm = 0x3333333333333333
; X64-NEXT: movq %rdi, %rcx
; X64-NEXT: andq %rax, %rcx
; X64-NEXT: shrq $2, %rdi
; X64-NEXT: andq %rax, %rdi
; X64-NEXT: leaq (%rdi,%rcx,4), %rax
; X64-NEXT: movl %edi, %eax
; X64-NEXT: andl $252645135, %eax # imm = 0xF0F0F0F
; X64-NEXT: shll $4, %eax
; X64-NEXT: shrl $4, %edi
; X64-NEXT: andl $252645135, %edi # imm = 0xF0F0F0F
; X64-NEXT: orl %eax, %edi
; X64-NEXT: movl %edi, %eax
; X64-NEXT: andl $858993459, %eax # imm = 0x33333333
; X64-NEXT: shrl $2, %edi
; X64-NEXT: andl $858993459, %edi # imm = 0x33333333
; X64-NEXT: leal (%rdi,%rax,4), %eax
; X64-NEXT: movl %eax, %ecx
; X64-NEXT: andl $357913941, %ecx # imm = 0x15555555
; X64-NEXT: shrl %eax
Expand Down
80 changes: 42 additions & 38 deletions llvm/test/CodeGen/X86/dagcombine-cse.ll
Original file line number Diff line number Diff line change
Expand Up @@ -50,55 +50,59 @@ define i96 @square_high(i96 %x) nounwind {
; X86-NEXT: pushl %ebx
; X86-NEXT: pushl %edi
; X86-NEXT: pushl %esi
; X86-NEXT: subl $8, %esp
; X86-NEXT: movl {{[0-9]+}}(%esp), %edi
; X86-NEXT: movl {{[0-9]+}}(%esp), %esi
; X86-NEXT: movl {{[0-9]+}}(%esp), %ebx
; X86-NEXT: movl %edi, %eax
; X86-NEXT: mull %edi
; X86-NEXT: movl %edx, %ebx
; X86-NEXT: movl %esi, %eax
; X86-NEXT: mull %ebx
; X86-NEXT: movl %edx, %ecx
; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
; X86-NEXT: mull %edi
; X86-NEXT: addl %eax, %ebx
; X86-NEXT: movl %edx, %ebp
; X86-NEXT: adcl $0, %ebp
; X86-NEXT: addl %eax, %ebx
; X86-NEXT: adcl %edx, %ebp
; X86-NEXT: setb %al
; X86-NEXT: movzbl %al, %ecx
; X86-NEXT: movl %esi, %eax
; X86-NEXT: mull {{[0-9]+}}(%esp)
; X86-NEXT: mull %esi
; X86-NEXT: movl %eax, %ebx
; X86-NEXT: addl %ebp, %ebx
; X86-NEXT: adcl %edx, %ecx
; X86-NEXT: movl {{[0-9]+}}(%esp), %esi
; X86-NEXT: movl %esi, %eax
; X86-NEXT: mull %edi
; X86-NEXT: movl %edx, %edi
; X86-NEXT: movl %eax, %ebp
; X86-NEXT: addl %ecx, %ebp
; X86-NEXT: adcl $0, %edi
; X86-NEXT: movl %esi, %eax
; X86-NEXT: mull {{[0-9]+}}(%esp)
; X86-NEXT: movl %eax, %esi
; X86-NEXT: addl %edi, %esi
; X86-NEXT: adcl $0, %edx
; X86-NEXT: addl %ebp, %ebx
; X86-NEXT: adcl %esi, %ecx
; X86-NEXT: setb %al
; X86-NEXT: movzbl %al, %ecx
; X86-NEXT: movl %edx, %edi
; X86-NEXT: adcl $0, %edi
; X86-NEXT: addl %ebp, %ebx
; X86-NEXT: adcl %ecx, %esi
; X86-NEXT: setb %cl
; X86-NEXT: adcl %edx, %edi
; X86-NEXT: addb $255, %cl
; X86-NEXT: movl %edx, %ecx
; X86-NEXT: adcl $0, %ecx
; X86-NEXT: movl %ebx, %eax
; X86-NEXT: mull %ebx
; X86-NEXT: movl %edx, %esi
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-NEXT: mull %ebx
; X86-NEXT: addl %eax, %esi
; X86-NEXT: movl %edx, %ebx
; X86-NEXT: adcl $0, %ebx
; X86-NEXT: addl %eax, %esi
; X86-NEXT: adcl %edx, %ebx
; X86-NEXT: setb {{[-0-9]+}}(%e{{[sb]}}p) ## 1-byte Folded Spill
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-NEXT: mull %eax
; X86-NEXT: addl %ebx, %eax
; X86-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %esi ## 1-byte Folded Reload
; X86-NEXT: adcl %edx, %esi
; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx ## 4-byte Reload
; X86-NEXT: addl %edx, %eax
; X86-NEXT: adcl %ebp, %esi
; X86-NEXT: movl %edi, %ebx
; X86-NEXT: setb %ah
; X86-NEXT: addb $255, %al
; X86-NEXT: adcl %edx, %ecx
; X86-NEXT: movzbl %ah, %ebx
; X86-NEXT: adcl $0, %ebx
; X86-NEXT: adcl $0, %ecx
; X86-NEXT: addl %edx, %eax
; X86-NEXT: adcl %ebp, %esi
; X86-NEXT: adcl %edi, %ebx
; X86-NEXT: adcl $0, %ecx
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-NEXT: mull %eax
; X86-NEXT: addl %eax, %ebx
; X86-NEXT: adcl %edx, %ecx
; X86-NEXT: addl %eax, %edi
; X86-NEXT: adcl %edx, %ebx
; X86-NEXT: movl %esi, %eax
; X86-NEXT: movl %ebx, %edx
; X86-NEXT: addl $8, %esp
; X86-NEXT: movl %edi, %edx
; X86-NEXT: movl %ebx, %ecx
; X86-NEXT: popl %esi
; X86-NEXT: popl %edi
; X86-NEXT: popl %ebx
Expand Down
48 changes: 20 additions & 28 deletions llvm/test/CodeGen/X86/hoist-and-by-const-from-shl-in-eqcmp-zero.ll
Original file line number Diff line number Diff line change
Expand Up @@ -556,18 +556,16 @@ define <4 x i1> @vec_4xi32_nonsplat_eq(<4 x i32> %x, <4 x i32> %y) nounwind {
define <4 x i1> @vec_4xi32_nonsplat_undef0_eq(<4 x i32> %x, <4 x i32> %y) nounwind {
; X86-SSE2-LABEL: vec_4xi32_nonsplat_undef0_eq:
; X86-SSE2: # %bb.0:
; X86-SSE2-NEXT: movl $1, %eax
; X86-SSE2-NEXT: movd %eax, %xmm2
; X86-SSE2-NEXT: pslld $23, %xmm1
; X86-SSE2-NEXT: paddd {{\.?LCPI[0-9]+_[0-9]+}}, %xmm1
; X86-SSE2-NEXT: cvttps2dq %xmm1, %xmm1
; X86-SSE2-NEXT: pmuludq %xmm1, %xmm2
; X86-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3]
; X86-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
; X86-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm1[1,1,3,3]
; X86-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}, %xmm1
; X86-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
; X86-SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
; X86-SSE2-NEXT: pand %xmm2, %xmm0
; X86-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}, %xmm2
; X86-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3]
; X86-SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
; X86-SSE2-NEXT: pand %xmm1, %xmm0
; X86-SSE2-NEXT: pxor %xmm1, %xmm1
; X86-SSE2-NEXT: pcmpeqd %xmm1, %xmm0
; X86-SSE2-NEXT: retl
Expand All @@ -583,18 +581,16 @@ define <4 x i1> @vec_4xi32_nonsplat_undef0_eq(<4 x i32> %x, <4 x i32> %y) nounwi
;
; X64-SSE2-LABEL: vec_4xi32_nonsplat_undef0_eq:
; X64-SSE2: # %bb.0:
; X64-SSE2-NEXT: movl $1, %eax
; X64-SSE2-NEXT: movd %eax, %xmm2
; X64-SSE2-NEXT: pslld $23, %xmm1
; X64-SSE2-NEXT: paddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
; X64-SSE2-NEXT: cvttps2dq %xmm1, %xmm1
; X64-SSE2-NEXT: pmuludq %xmm1, %xmm2
; X64-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3]
; X64-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
; X64-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm1[1,1,3,3]
; X64-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
; X64-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
; X64-SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
; X64-SSE2-NEXT: pand %xmm2, %xmm0
; X64-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2
; X64-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3]
; X64-SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
; X64-SSE2-NEXT: pand %xmm1, %xmm0
; X64-SSE2-NEXT: pxor %xmm1, %xmm1
; X64-SSE2-NEXT: pcmpeqd %xmm1, %xmm0
; X64-SSE2-NEXT: retq
Expand Down Expand Up @@ -654,18 +650,16 @@ define <4 x i1> @vec_4xi32_nonsplat_undef1_eq(<4 x i32> %x, <4 x i32> %y) nounwi
define <4 x i1> @vec_4xi32_nonsplat_undef2_eq(<4 x i32> %x, <4 x i32> %y) nounwind {
; X86-SSE2-LABEL: vec_4xi32_nonsplat_undef2_eq:
; X86-SSE2: # %bb.0:
; X86-SSE2-NEXT: movl $1, %eax
; X86-SSE2-NEXT: movd %eax, %xmm2
; X86-SSE2-NEXT: pslld $23, %xmm1
; X86-SSE2-NEXT: paddd {{\.?LCPI[0-9]+_[0-9]+}}, %xmm1
; X86-SSE2-NEXT: cvttps2dq %xmm1, %xmm1
; X86-SSE2-NEXT: pmuludq %xmm1, %xmm2
; X86-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3]
; X86-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
; X86-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm1[1,1,3,3]
; X86-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}, %xmm1
; X86-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
; X86-SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
; X86-SSE2-NEXT: pand %xmm2, %xmm0
; X86-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}, %xmm2
; X86-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3]
; X86-SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
; X86-SSE2-NEXT: pand %xmm1, %xmm0
; X86-SSE2-NEXT: pxor %xmm1, %xmm1
; X86-SSE2-NEXT: pcmpeqd %xmm1, %xmm0
; X86-SSE2-NEXT: retl
Expand All @@ -681,18 +675,16 @@ define <4 x i1> @vec_4xi32_nonsplat_undef2_eq(<4 x i32> %x, <4 x i32> %y) nounwi
;
; X64-SSE2-LABEL: vec_4xi32_nonsplat_undef2_eq:
; X64-SSE2: # %bb.0:
; X64-SSE2-NEXT: movl $1, %eax
; X64-SSE2-NEXT: movd %eax, %xmm2
; X64-SSE2-NEXT: pslld $23, %xmm1
; X64-SSE2-NEXT: paddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
; X64-SSE2-NEXT: cvttps2dq %xmm1, %xmm1
; X64-SSE2-NEXT: pmuludq %xmm1, %xmm2
; X64-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3]
; X64-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
; X64-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm1[1,1,3,3]
; X64-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
; X64-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
; X64-SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
; X64-SSE2-NEXT: pand %xmm2, %xmm0
; X64-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2
; X64-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3]
; X64-SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
; X64-SSE2-NEXT: pand %xmm1, %xmm0
; X64-SSE2-NEXT: pxor %xmm1, %xmm1
; X64-SSE2-NEXT: pcmpeqd %xmm1, %xmm0
; X64-SSE2-NEXT: retq
Expand Down
17 changes: 8 additions & 9 deletions llvm/test/CodeGen/X86/illegal-bitfield-loadstore.ll
Original file line number Diff line number Diff line change
Expand Up @@ -117,7 +117,7 @@ define void @i56_or(ptr %a) {
; X64-NEXT: movzwl 4(%rdi), %eax
; X64-NEXT: movzbl 6(%rdi), %ecx
; X64-NEXT: movb %cl, 6(%rdi)
; X64-NEXT: # kill: def $ecx killed $ecx killed $rcx def $rcx
; X64-NEXT: # kill: def $ecx killed $ecx def $rcx
; X64-NEXT: shll $16, %ecx
; X64-NEXT: orl %eax, %ecx
; X64-NEXT: shlq $32, %rcx
Expand Down Expand Up @@ -149,7 +149,7 @@ define void @i56_and_or(ptr %a) {
; X64-NEXT: movzwl 4(%rdi), %eax
; X64-NEXT: movzbl 6(%rdi), %ecx
; X64-NEXT: movb %cl, 6(%rdi)
; X64-NEXT: # kill: def $ecx killed $ecx killed $rcx def $rcx
; X64-NEXT: # kill: def $ecx killed $ecx def $rcx
; X64-NEXT: shll $16, %ecx
; X64-NEXT: orl %eax, %ecx
; X64-NEXT: shlq $32, %rcx
Expand Down Expand Up @@ -187,19 +187,18 @@ define void @i56_insert_bit(ptr %a, i1 zeroext %bit) {
; X64-NEXT: movzwl 4(%rdi), %ecx
; X64-NEXT: movzbl 6(%rdi), %edx
; X64-NEXT: movb %dl, 6(%rdi)
; X64-NEXT: # kill: def $edx killed $edx killed $rdx def $rdx
; X64-NEXT: # kill: def $edx killed $edx def $rdx
; X64-NEXT: shll $16, %edx
; X64-NEXT: orl %ecx, %edx
; X64-NEXT: shlq $32, %rdx
; X64-NEXT: movl (%rdi), %ecx
; X64-NEXT: orq %rdx, %rcx
; X64-NEXT: shlq $13, %rax
; X64-NEXT: movabsq $72057594037919743, %rdx # imm = 0xFFFFFFFFFFDFFF
; X64-NEXT: andq %rcx, %rdx
; X64-NEXT: orq %rax, %rdx
; X64-NEXT: movl %edx, (%rdi)
; X64-NEXT: shrq $32, %rdx
; X64-NEXT: movw %dx, 4(%rdi)
; X64-NEXT: andq $-8193, %rcx # imm = 0xDFFF
; X64-NEXT: orq %rax, %rcx
; X64-NEXT: movl %ecx, (%rdi)
; X64-NEXT: shrq $32, %rcx
; X64-NEXT: movw %cx, 4(%rdi)
; X64-NEXT: retq
%extbit = zext i1 %bit to i56
%b = load i56, ptr %a, align 1
Expand Down
537 changes: 273 additions & 264 deletions llvm/test/CodeGen/X86/smul-with-overflow.ll

Large diffs are not rendered by default.

58 changes: 26 additions & 32 deletions llvm/test/CodeGen/X86/smul_fix_sat_constants.ll
Original file line number Diff line number Diff line change
Expand Up @@ -12,14 +12,13 @@ declare { i64, i1 } @llvm.smul.with.overflow.i64(i64, i64)
define i64 @func() nounwind {
; X64-LABEL: func:
; X64: # %bb.0:
; X64-NEXT: movl $2, %ecx
; X64-NEXT: movl $3, %eax
; X64-NEXT: imulq %rcx
; X64-NEXT: cmpq $2, %rdx
; X64-NEXT: movl $2, %eax
; X64-NEXT: negq %rax
; X64-NEXT: movabsq $9223372036854775807, %rax # imm = 0x7FFFFFFFFFFFFFFF
; X64-NEXT: movl $1, %ecx
; X64-NEXT: cmovgeq %rax, %rcx
; X64-NEXT: cmpq $-2, %rdx
; X64-NEXT: movq $-2, %rax
; X64-NEXT: negq %rax
; X64-NEXT: movabsq $-9223372036854775808, %rax # imm = 0x8000000000000000
; X64-NEXT: cmovgeq %rcx, %rax
; X64-NEXT: retq
Expand All @@ -42,16 +41,15 @@ define i64 @func2() nounwind {
define i64 @func3() nounwind {
; X64-LABEL: func3:
; X64: # %bb.0:
; X64-NEXT: movabsq $9223372036854775807, %rcx # imm = 0x7FFFFFFFFFFFFFFF
; X64-NEXT: movl $2, %edx
; X64-NEXT: movq %rcx, %rax
; X64-NEXT: imulq %rdx
; X64-NEXT: cmpq $2, %rdx
; X64-NEXT: movabsq $4611686018427387903, %rsi # imm = 0x3FFFFFFFFFFFFFFF
; X64-NEXT: cmovgeq %rcx, %rsi
; X64-NEXT: cmpq $-2, %rdx
; X64-NEXT: movl $2, %eax
; X64-NEXT: negq %rax
; X64-NEXT: movabsq $9223372036854775807, %rax # imm = 0x7FFFFFFFFFFFFFFF
; X64-NEXT: movabsq $4611686018427387903, %rcx # imm = 0x3FFFFFFFFFFFFFFF
; X64-NEXT: cmovgeq %rax, %rcx
; X64-NEXT: movq $-2, %rax
; X64-NEXT: negq %rax
; X64-NEXT: movabsq $-9223372036854775808, %rax # imm = 0x8000000000000000
; X64-NEXT: cmovgeq %rsi, %rax
; X64-NEXT: cmovgeq %rcx, %rax
; X64-NEXT: retq
%tmp = call i64 @llvm.smul.fix.sat.i64(i64 9223372036854775807, i64 2, i32 2)
ret i64 %tmp
Expand All @@ -60,16 +58,15 @@ define i64 @func3() nounwind {
define i64 @func4() nounwind {
; X64-LABEL: func4:
; X64: # %bb.0:
; X64-NEXT: movabsq $9223372036854775807, %rcx # imm = 0x7FFFFFFFFFFFFFFF
; X64-NEXT: movl $2, %edx
; X64-NEXT: movq %rcx, %rax
; X64-NEXT: imulq %rdx
; X64-NEXT: cmpq $2147483647, %rdx # imm = 0x7FFFFFFF
; X64-NEXT: movl $4294967295, %esi # imm = 0xFFFFFFFF
; X64-NEXT: cmovgq %rcx, %rsi
; X64-NEXT: cmpq $-2147483648, %rdx # imm = 0x80000000
; X64-NEXT: movl $2147483647, %eax # imm = 0x7FFFFFFF
; X64-NEXT: negq %rax
; X64-NEXT: movabsq $9223372036854775807, %rax # imm = 0x7FFFFFFFFFFFFFFF
; X64-NEXT: movl $4294967295, %ecx # imm = 0xFFFFFFFF
; X64-NEXT: cmovgq %rax, %rcx
; X64-NEXT: movq $-2147483648, %rax # imm = 0x80000000
; X64-NEXT: negq %rax
; X64-NEXT: movabsq $-9223372036854775808, %rax # imm = 0x8000000000000000
; X64-NEXT: cmovgeq %rsi, %rax
; X64-NEXT: cmovgeq %rcx, %rax
; X64-NEXT: retq
%tmp = call i64 @llvm.smul.fix.sat.i64(i64 9223372036854775807, i64 2, i32 32)
ret i64 %tmp
Expand All @@ -78,18 +75,15 @@ define i64 @func4() nounwind {
define i64 @func5() nounwind {
; X64-LABEL: func5:
; X64: # %bb.0:
; X64-NEXT: movabsq $9223372036854775807, %rcx # imm = 0x7FFFFFFFFFFFFFFF
; X64-NEXT: movl $2, %edx
; X64-NEXT: movq %rcx, %rax
; X64-NEXT: imulq %rdx
; X64-NEXT: movabsq $4611686018427387903, %rax # imm = 0x3FFFFFFFFFFFFFFF
; X64-NEXT: cmpq %rax, %rdx
; X64-NEXT: movl $1, %esi
; X64-NEXT: cmovgq %rcx, %rsi
; X64-NEXT: negq %rax
; X64-NEXT: movabsq $9223372036854775807, %rax # imm = 0x7FFFFFFFFFFFFFFF
; X64-NEXT: movl $1, %ecx
; X64-NEXT: cmovgq %rax, %rcx
; X64-NEXT: movabsq $-4611686018427387904, %rax # imm = 0xC000000000000000
; X64-NEXT: cmpq %rax, %rdx
; X64-NEXT: negq %rax
; X64-NEXT: movabsq $-9223372036854775808, %rax # imm = 0x8000000000000000
; X64-NEXT: cmovgeq %rsi, %rax
; X64-NEXT: cmovgeq %rcx, %rax
; X64-NEXT: retq
%tmp = call i64 @llvm.smul.fix.sat.i64(i64 9223372036854775807, i64 2, i32 63)
ret i64 %tmp
Expand Down
34 changes: 15 additions & 19 deletions llvm/test/CodeGen/X86/srem-seteq-vec-nonsplat.ll
Original file line number Diff line number Diff line change
Expand Up @@ -558,12 +558,11 @@ define <4 x i32> @test_srem_odd_poweroftwo(<4 x i32> %X) nounwind {
; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; CHECK-SSE2-NEXT: paddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm1 = <1,u,268435456,u>
; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm1 = [1,1,268435456,1]
; CHECK-SSE2-NEXT: pmuludq %xmm0, %xmm1
; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm1[1,3,2,3]
; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm0[1,1,3,3]
; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm4 = [1,1,1,1]
; CHECK-SSE2-NEXT: pmuludq %xmm4, %xmm3
; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3
; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm3[1,3,2,3]
; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1]
; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[0,2,2,3]
Expand All @@ -572,7 +571,7 @@ define <4 x i32> @test_srem_odd_poweroftwo(<4 x i32> %X) nounwind {
; CHECK-SSE2-NEXT: por %xmm2, %xmm0
; CHECK-SSE2-NEXT: pxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; CHECK-SSE2-NEXT: pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; CHECK-SSE2-NEXT: pandn %xmm4, %xmm0
; CHECK-SSE2-NEXT: pandn {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; CHECK-SSE2-NEXT: retq
;
; CHECK-SSE41-LABEL: test_srem_odd_poweroftwo:
Expand Down Expand Up @@ -648,19 +647,18 @@ define <4 x i32> @test_srem_even_poweroftwo(<4 x i32> %X) nounwind {
; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; CHECK-SSE2-NEXT: paddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm1 = <2147483648,u,268435456,u>
; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm1 = [2147483648,2147483648,268435456,2147483648]
; CHECK-SSE2-NEXT: pmuludq %xmm0, %xmm1
; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm1[1,3,2,3]
; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm0[1,1,3,3]
; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm4 = [2147483648,2147483648,2147483648,2147483648]
; CHECK-SSE2-NEXT: pmuludq %xmm4, %xmm3
; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3
; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm3[1,3,2,3]
; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1]
; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[0,2,2,3]
; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm3[0,2,2,3]
; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; CHECK-SSE2-NEXT: por %xmm2, %xmm0
; CHECK-SSE2-NEXT: pxor %xmm4, %xmm0
; CHECK-SSE2-NEXT: pxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; CHECK-SSE2-NEXT: pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; CHECK-SSE2-NEXT: pandn {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; CHECK-SSE2-NEXT: retq
Expand Down Expand Up @@ -1135,27 +1133,26 @@ define <4 x i32> @test_srem_even_INT_MIN(<4 x i32> %X) nounwind {
; CHECK-SSE2-LABEL: test_srem_even_INT_MIN:
; CHECK-SSE2: # %bb.0:
; CHECK-SSE2-NEXT: pxor %xmm2, %xmm2
; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm1 = <3067833783,u,1,u>
; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm1 = [3067833783,3067833783,1,3067833783]
; CHECK-SSE2-NEXT: pmuludq %xmm0, %xmm1
; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm0[1,1,3,3]
; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3
; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,2,2,3]
; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1]
; CHECK-SSE2-NEXT: paddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm3 = <2147483648,u,2,u>
; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm3 = [2147483648,2147483648,2,2147483648]
; CHECK-SSE2-NEXT: pmuludq %xmm1, %xmm3
; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm3[1,3,2,3]
; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm5 = [2147483648,2147483648,2147483648,2147483648]
; CHECK-SSE2-NEXT: pmuludq %xmm5, %xmm1
; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm1[1,3,2,3]
; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm6[0],xmm4[1],xmm6[1]
; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm1[1,3,2,3]
; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm5[0],xmm4[1],xmm5[1]
; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,2,2,3]
; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1]
; CHECK-SSE2-NEXT: por %xmm4, %xmm3
; CHECK-SSE2-NEXT: pxor %xmm5, %xmm3
; CHECK-SSE2-NEXT: pxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3
; CHECK-SSE2-NEXT: pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3
; CHECK-SSE2-NEXT: pcmpeqd %xmm1, %xmm1
; CHECK-SSE2-NEXT: pxor %xmm3, %xmm1
Expand Down Expand Up @@ -1379,12 +1376,11 @@ define <4 x i32> @test_srem_odd_allones_and_poweroftwo(<4 x i32> %X) nounwind {
; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; CHECK-SSE2-NEXT: paddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm1 = <1,u,268435456,u>
; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm1 = [1,1,268435456,1]
; CHECK-SSE2-NEXT: pmuludq %xmm0, %xmm1
; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm1[1,3,2,3]
; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm0[1,1,3,3]
; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm4 = [1,1,1,1]
; CHECK-SSE2-NEXT: pmuludq %xmm4, %xmm3
; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3
; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm3[1,3,2,3]
; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1]
; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[0,2,2,3]
Expand All @@ -1393,7 +1389,7 @@ define <4 x i32> @test_srem_odd_allones_and_poweroftwo(<4 x i32> %X) nounwind {
; CHECK-SSE2-NEXT: por %xmm2, %xmm0
; CHECK-SSE2-NEXT: pxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; CHECK-SSE2-NEXT: pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; CHECK-SSE2-NEXT: pandn %xmm4, %xmm0
; CHECK-SSE2-NEXT: pandn {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; CHECK-SSE2-NEXT: retq
;
; CHECK-SSE41-LABEL: test_srem_odd_allones_and_poweroftwo:
Expand Down
153 changes: 72 additions & 81 deletions llvm/test/CodeGen/X86/urem-seteq-vec-nonsplat.ll
Original file line number Diff line number Diff line change
Expand Up @@ -163,19 +163,18 @@ define <4 x i32> @test_urem_even_allones_eq(<4 x i32> %X) nounwind {
; CHECK-SSE2-LABEL: test_urem_even_allones_eq:
; CHECK-SSE2: # %bb.0:
; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3]
; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm2 = [2147483648,2147483648,2147483648,2147483648]
; CHECK-SSE2-NEXT: pmuludq %xmm2, %xmm1
; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm1[1,3,2,3]
; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm0[1,3,2,3]
; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm3[0],xmm4[1],xmm3[1]
; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,3,2,3]
; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm1[1,3,2,3]
; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1]
; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; CHECK-SSE2-NEXT: por %xmm4, %xmm0
; CHECK-SSE2-NEXT: pxor %xmm2, %xmm0
; CHECK-SSE2-NEXT: por %xmm2, %xmm0
; CHECK-SSE2-NEXT: pxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; CHECK-SSE2-NEXT: pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; CHECK-SSE2-NEXT: pandn {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; CHECK-SSE2-NEXT: retq
Expand Down Expand Up @@ -241,19 +240,18 @@ define <4 x i32> @test_urem_even_allones_ne(<4 x i32> %X) nounwind {
; CHECK-SSE2-LABEL: test_urem_even_allones_ne:
; CHECK-SSE2: # %bb.0:
; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3]
; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm2 = [2147483648,2147483648,2147483648,2147483648]
; CHECK-SSE2-NEXT: pmuludq %xmm2, %xmm1
; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm1[1,3,2,3]
; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm0[1,3,2,3]
; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm3[0],xmm4[1],xmm3[1]
; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,3,2,3]
; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm1[1,3,2,3]
; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1]
; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; CHECK-SSE2-NEXT: por %xmm4, %xmm0
; CHECK-SSE2-NEXT: pxor %xmm2, %xmm0
; CHECK-SSE2-NEXT: por %xmm2, %xmm0
; CHECK-SSE2-NEXT: pxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; CHECK-SSE2-NEXT: pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; CHECK-SSE2-NEXT: psrld $31, %xmm0
; CHECK-SSE2-NEXT: retq
Expand Down Expand Up @@ -479,21 +477,20 @@ define <4 x i32> @test_urem_odd_poweroftwo(<4 x i32> %X) nounwind {
; CHECK-SSE2-LABEL: test_urem_odd_poweroftwo:
; CHECK-SSE2: # %bb.0:
; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3]
; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm2 = [1,1,1,1]
; CHECK-SSE2-NEXT: pmuludq %xmm2, %xmm1
; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm1[1,3,2,3]
; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm0[1,3,2,3]
; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm3[0],xmm4[1],xmm3[1]
; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,3,2,3]
; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm1[1,3,2,3]
; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1]
; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; CHECK-SSE2-NEXT: por %xmm4, %xmm0
; CHECK-SSE2-NEXT: por %xmm2, %xmm0
; CHECK-SSE2-NEXT: pxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; CHECK-SSE2-NEXT: pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; CHECK-SSE2-NEXT: pandn %xmm2, %xmm0
; CHECK-SSE2-NEXT: pandn {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; CHECK-SSE2-NEXT: retq
;
; CHECK-SSE41-LABEL: test_urem_odd_poweroftwo:
Expand Down Expand Up @@ -559,19 +556,18 @@ define <4 x i32> @test_urem_even_poweroftwo(<4 x i32> %X) nounwind {
; CHECK-SSE2-LABEL: test_urem_even_poweroftwo:
; CHECK-SSE2: # %bb.0:
; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3]
; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm2 = [2147483648,2147483648,2147483648,2147483648]
; CHECK-SSE2-NEXT: pmuludq %xmm2, %xmm1
; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm1[1,3,2,3]
; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm0[1,3,2,3]
; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm3[0],xmm4[1],xmm3[1]
; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,3,2,3]
; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm1[1,3,2,3]
; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1]
; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; CHECK-SSE2-NEXT: por %xmm4, %xmm0
; CHECK-SSE2-NEXT: pxor %xmm2, %xmm0
; CHECK-SSE2-NEXT: por %xmm2, %xmm0
; CHECK-SSE2-NEXT: pxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; CHECK-SSE2-NEXT: pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; CHECK-SSE2-NEXT: pandn {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; CHECK-SSE2-NEXT: retq
Expand Down Expand Up @@ -926,21 +922,20 @@ define <4 x i32> @test_urem_odd_INT_MIN(<4 x i32> %X) nounwind {
; CHECK-SSE2-LABEL: test_urem_odd_INT_MIN:
; CHECK-SSE2: # %bb.0:
; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3]
; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm2 = [1,1,1,1]
; CHECK-SSE2-NEXT: pmuludq %xmm2, %xmm1
; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm1[1,3,2,3]
; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm0[1,3,2,3]
; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm3[0],xmm4[1],xmm3[1]
; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,3,2,3]
; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm1[1,3,2,3]
; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1]
; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; CHECK-SSE2-NEXT: por %xmm4, %xmm0
; CHECK-SSE2-NEXT: por %xmm2, %xmm0
; CHECK-SSE2-NEXT: pxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; CHECK-SSE2-NEXT: pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; CHECK-SSE2-NEXT: pandn %xmm2, %xmm0
; CHECK-SSE2-NEXT: pandn {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; CHECK-SSE2-NEXT: retq
;
; CHECK-SSE41-LABEL: test_urem_odd_INT_MIN:
Expand Down Expand Up @@ -1006,19 +1001,18 @@ define <4 x i32> @test_urem_even_INT_MIN(<4 x i32> %X) nounwind {
; CHECK-SSE2-LABEL: test_urem_even_INT_MIN:
; CHECK-SSE2: # %bb.0:
; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3]
; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm2 = [2147483648,2147483648,2147483648,2147483648]
; CHECK-SSE2-NEXT: pmuludq %xmm2, %xmm1
; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm1[1,3,2,3]
; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm0[1,3,2,3]
; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm3[0],xmm4[1],xmm3[1]
; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,3,2,3]
; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm1[1,3,2,3]
; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1]
; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; CHECK-SSE2-NEXT: por %xmm4, %xmm0
; CHECK-SSE2-NEXT: pxor %xmm2, %xmm0
; CHECK-SSE2-NEXT: por %xmm2, %xmm0
; CHECK-SSE2-NEXT: pxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; CHECK-SSE2-NEXT: pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; CHECK-SSE2-NEXT: pandn {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; CHECK-SSE2-NEXT: retq
Expand Down Expand Up @@ -1167,21 +1161,20 @@ define <4 x i32> @test_urem_odd_allones_and_poweroftwo(<4 x i32> %X) nounwind {
; CHECK-SSE2-LABEL: test_urem_odd_allones_and_poweroftwo:
; CHECK-SSE2: # %bb.0:
; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3]
; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm2 = [1,1,1,1]
; CHECK-SSE2-NEXT: pmuludq %xmm2, %xmm1
; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm1[1,3,2,3]
; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm0[1,3,2,3]
; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm3[0],xmm4[1],xmm3[1]
; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,3,2,3]
; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm1[1,3,2,3]
; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1]
; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; CHECK-SSE2-NEXT: por %xmm4, %xmm0
; CHECK-SSE2-NEXT: por %xmm2, %xmm0
; CHECK-SSE2-NEXT: pxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; CHECK-SSE2-NEXT: pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; CHECK-SSE2-NEXT: pandn %xmm2, %xmm0
; CHECK-SSE2-NEXT: pandn {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; CHECK-SSE2-NEXT: retq
;
; CHECK-SSE41-LABEL: test_urem_odd_allones_and_poweroftwo:
Expand Down Expand Up @@ -1842,21 +1835,20 @@ define <4 x i32> @test_urem_odd_allones_and_poweroftwo_and_one(<4 x i32> %X) nou
; CHECK-SSE2-LABEL: test_urem_odd_allones_and_poweroftwo_and_one:
; CHECK-SSE2: # %bb.0:
; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3]
; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm2 = [1,1,1,1]
; CHECK-SSE2-NEXT: pmuludq %xmm2, %xmm1
; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm1[1,3,2,3]
; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm0[1,3,2,3]
; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm3[0],xmm4[1],xmm3[1]
; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,3,2,3]
; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm1[1,3,2,3]
; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1]
; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; CHECK-SSE2-NEXT: por %xmm4, %xmm0
; CHECK-SSE2-NEXT: por %xmm2, %xmm0
; CHECK-SSE2-NEXT: pxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; CHECK-SSE2-NEXT: pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; CHECK-SSE2-NEXT: pandn %xmm2, %xmm0
; CHECK-SSE2-NEXT: pandn {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; CHECK-SSE2-NEXT: retq
;
; CHECK-SSE41-LABEL: test_urem_odd_allones_and_poweroftwo_and_one:
Expand Down Expand Up @@ -1921,21 +1913,20 @@ define <4 x i32> @test_urem_even_allones_and_poweroftwo_and_one(<4 x i32> %X) no
; CHECK-SSE2-LABEL: test_urem_even_allones_and_poweroftwo_and_one:
; CHECK-SSE2: # %bb.0:
; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3]
; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm2 = [1,1,1,1]
; CHECK-SSE2-NEXT: pmuludq %xmm2, %xmm1
; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm1[1,3,2,3]
; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm0[1,3,2,3]
; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm3[0],xmm4[1],xmm3[1]
; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,3,2,3]
; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm1[1,3,2,3]
; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1]
; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; CHECK-SSE2-NEXT: por %xmm4, %xmm0
; CHECK-SSE2-NEXT: por %xmm2, %xmm0
; CHECK-SSE2-NEXT: pxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; CHECK-SSE2-NEXT: pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; CHECK-SSE2-NEXT: pandn %xmm2, %xmm0
; CHECK-SSE2-NEXT: pandn {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; CHECK-SSE2-NEXT: retq
;
; CHECK-SSE41-LABEL: test_urem_even_allones_and_poweroftwo_and_one:
Expand Down