1,582 changes: 791 additions & 791 deletions llvm/test/CodeGen/AArch64/neon-dotreduce.ll

Large diffs are not rendered by default.

126 changes: 63 additions & 63 deletions llvm/test/CodeGen/AArch64/neon-extadd.ll
Original file line number Diff line number Diff line change
Expand Up @@ -459,56 +459,56 @@ define <20 x i32> @v20(<20 x i8> %s0, <20 x i8> %s1) {
; CHECK-NEXT: mov v0.b[1], w1
; CHECK-NEXT: ld1 { v3.b }[1], [x11]
; CHECK-NEXT: ld1 { v1.b }[1], [x9]
; CHECK-NEXT: add x12, sp, #16
; CHECK-NEXT: add x11, sp, #16
; CHECK-NEXT: add x9, sp, #112
; CHECK-NEXT: add x11, sp, #120
; CHECK-NEXT: add x13, sp, #184
; CHECK-NEXT: ld1 { v2.b }[2], [x10]
; CHECK-NEXT: add x12, sp, #120
; CHECK-NEXT: add x14, sp, #32
; CHECK-NEXT: ldr b5, [sp, #64]
; CHECK-NEXT: ld1 { v3.b }[2], [x12]
; CHECK-NEXT: add x12, sp, #184
; CHECK-NEXT: ld1 { v3.b }[2], [x11]
; CHECK-NEXT: ld1 { v1.b }[2], [x9]
; CHECK-NEXT: ldr b5, [sp, #64]
; CHECK-NEXT: mov v0.b[2], w2
; CHECK-NEXT: ldr b4, [sp, #224]
; CHECK-NEXT: add x9, sp, #128
; CHECK-NEXT: ld1 { v2.b }[3], [x12]
; CHECK-NEXT: add x12, sp, #24
; CHECK-NEXT: add x11, sp, #128
; CHECK-NEXT: ld1 { v2.b }[3], [x13]
; CHECK-NEXT: add x13, sp, #24
; CHECK-NEXT: add x10, sp, #136
; CHECK-NEXT: ld1 { v3.b }[3], [x12]
; CHECK-NEXT: ld1 { v1.b }[3], [x11]
; CHECK-NEXT: add x11, sp, #192
; CHECK-NEXT: add x12, sp, #200
; CHECK-NEXT: ld1 { v3.b }[3], [x13]
; CHECK-NEXT: ld1 { v1.b }[3], [x12]
; CHECK-NEXT: add x12, sp, #192
; CHECK-NEXT: add x13, sp, #200
; CHECK-NEXT: add x15, sp, #80
; CHECK-NEXT: add x13, sp, #144
; CHECK-NEXT: add x9, sp, #144
; CHECK-NEXT: mov v0.b[3], w3
; CHECK-NEXT: ld1 { v2.b }[4], [x11]
; CHECK-NEXT: add x11, sp, #232
; CHECK-NEXT: ld1 { v2.b }[4], [x12]
; CHECK-NEXT: add x12, sp, #232
; CHECK-NEXT: ld1 { v3.b }[4], [x14]
; CHECK-NEXT: add x14, sp, #72
; CHECK-NEXT: ld1 { v4.b }[1], [x11]
; CHECK-NEXT: ld1 { v4.b }[1], [x12]
; CHECK-NEXT: ld1 { v5.b }[1], [x14]
; CHECK-NEXT: add x14, sp, #40
; CHECK-NEXT: ld1 { v1.b }[4], [x9]
; CHECK-NEXT: ld1 { v2.b }[5], [x12]
; CHECK-NEXT: add x11, sp, #208
; CHECK-NEXT: add x12, sp, #48
; CHECK-NEXT: ld1 { v1.b }[4], [x11]
; CHECK-NEXT: ld1 { v2.b }[5], [x13]
; CHECK-NEXT: add x12, sp, #208
; CHECK-NEXT: add x13, sp, #48
; CHECK-NEXT: mov v0.b[4], w4
; CHECK-NEXT: ld1 { v3.b }[5], [x14]
; CHECK-NEXT: add x14, sp, #240
; CHECK-NEXT: ld1 { v4.b }[2], [x14]
; CHECK-NEXT: ld1 { v5.b }[2], [x15]
; CHECK-NEXT: ld1 { v1.b }[5], [x10]
; CHECK-NEXT: ld1 { v2.b }[6], [x11]
; CHECK-NEXT: add x9, sp, #216
; CHECK-NEXT: ld1 { v2.b }[6], [x12]
; CHECK-NEXT: add x11, sp, #216
; CHECK-NEXT: add x10, sp, #56
; CHECK-NEXT: ld1 { v3.b }[6], [x12]
; CHECK-NEXT: add x11, sp, #248
; CHECK-NEXT: add x12, sp, #88
; CHECK-NEXT: ld1 { v3.b }[6], [x13]
; CHECK-NEXT: add x12, sp, #248
; CHECK-NEXT: add x13, sp, #88
; CHECK-NEXT: mov v0.b[5], w5
; CHECK-NEXT: ld1 { v4.b }[3], [x11]
; CHECK-NEXT: ld1 { v5.b }[3], [x12]
; CHECK-NEXT: ld1 { v1.b }[6], [x13]
; CHECK-NEXT: ld1 { v2.b }[7], [x9]
; CHECK-NEXT: ld1 { v4.b }[3], [x12]
; CHECK-NEXT: ld1 { v5.b }[3], [x13]
; CHECK-NEXT: ld1 { v1.b }[6], [x9]
; CHECK-NEXT: ld1 { v2.b }[7], [x11]
; CHECK-NEXT: add x9, sp, #152
; CHECK-NEXT: ld1 { v3.b }[7], [x10]
; CHECK-NEXT: uaddl v4.8h, v5.8b, v4.8b
Expand Down Expand Up @@ -545,48 +545,48 @@ define <16 x i32> @i12(<16 x i12> %s0, <16 x i12> %s1) {
; CHECK-NEXT: .cfi_offset w21, -24
; CHECK-NEXT: .cfi_offset w22, -32
; CHECK-NEXT: .cfi_offset w23, -48
; CHECK-NEXT: ldr w10, [sp, #112]
; CHECK-NEXT: ldr w13, [sp, #144]
; CHECK-NEXT: ldr w13, [sp, #112]
; CHECK-NEXT: ldr w14, [sp, #144]
; CHECK-NEXT: fmov s2, w4
; CHECK-NEXT: ldr w15, [sp, #176]
; CHECK-NEXT: ldr w17, [sp, #208]
; CHECK-NEXT: ldr w17, [sp, #176]
; CHECK-NEXT: ldr w19, [sp, #208]
; CHECK-NEXT: fmov s3, w0
; CHECK-NEXT: ldr w18, [sp, #80]
; CHECK-NEXT: ldr w20, [sp, #48]
; CHECK-NEXT: fmov s5, w10
; CHECK-NEXT: fmov s4, w17
; CHECK-NEXT: fmov s6, w15
; CHECK-NEXT: fmov s7, w13
; CHECK-NEXT: fmov s0, w18
; CHECK-NEXT: fmov s1, w20
; CHECK-NEXT: ldr w9, [sp, #120]
; CHECK-NEXT: ldr w12, [sp, #152]
; CHECK-NEXT: ldr w14, [sp, #184]
; CHECK-NEXT: ldr w16, [sp, #216]
; CHECK-NEXT: ldr w21, [sp, #88]
; CHECK-NEXT: ldr w22, [sp, #56]
; CHECK-NEXT: ldr w20, [sp, #80]
; CHECK-NEXT: ldr w21, [sp, #48]
; CHECK-NEXT: fmov s5, w13
; CHECK-NEXT: fmov s4, w19
; CHECK-NEXT: fmov s6, w17
; CHECK-NEXT: fmov s7, w14
; CHECK-NEXT: fmov s0, w20
; CHECK-NEXT: fmov s1, w21
; CHECK-NEXT: ldr w10, [sp, #120]
; CHECK-NEXT: ldr w11, [sp, #152]
; CHECK-NEXT: ldr w12, [sp, #184]
; CHECK-NEXT: ldr w15, [sp, #216]
; CHECK-NEXT: ldr w22, [sp, #88]
; CHECK-NEXT: ldr w23, [sp, #56]
; CHECK-NEXT: mov v2.h[1], w5
; CHECK-NEXT: mov v3.h[1], w1
; CHECK-NEXT: mov v5.h[1], w9
; CHECK-NEXT: mov v4.h[1], w16
; CHECK-NEXT: mov v0.h[1], w21
; CHECK-NEXT: mov v1.h[1], w22
; CHECK-NEXT: mov v6.h[1], w14
; CHECK-NEXT: mov v7.h[1], w12
; CHECK-NEXT: mov v5.h[1], w10
; CHECK-NEXT: mov v4.h[1], w15
; CHECK-NEXT: mov v0.h[1], w22
; CHECK-NEXT: mov v1.h[1], w23
; CHECK-NEXT: mov v6.h[1], w12
; CHECK-NEXT: mov v7.h[1], w11
; CHECK-NEXT: ldr w8, [sp, #128]
; CHECK-NEXT: ldr w11, [sp, #160]
; CHECK-NEXT: ldr w19, [sp, #64]
; CHECK-NEXT: ldr w23, [sp, #96]
; CHECK-NEXT: ldr w9, [sp, #192]
; CHECK-NEXT: ldr w10, [sp, #224]
; CHECK-NEXT: ldr w9, [sp, #160]
; CHECK-NEXT: ldr w16, [sp, #64]
; CHECK-NEXT: ldr w18, [sp, #96]
; CHECK-NEXT: ldr w10, [sp, #192]
; CHECK-NEXT: ldr w11, [sp, #224]
; CHECK-NEXT: mov v2.h[2], w6
; CHECK-NEXT: mov v3.h[2], w2
; CHECK-NEXT: mov v0.h[2], w23
; CHECK-NEXT: mov v1.h[2], w19
; CHECK-NEXT: mov v0.h[2], w18
; CHECK-NEXT: mov v1.h[2], w16
; CHECK-NEXT: mov v5.h[2], w8
; CHECK-NEXT: mov v4.h[2], w10
; CHECK-NEXT: mov v6.h[2], w9
; CHECK-NEXT: mov v7.h[2], w11
; CHECK-NEXT: mov v4.h[2], w11
; CHECK-NEXT: mov v6.h[2], w10
; CHECK-NEXT: mov v7.h[2], w9
; CHECK-NEXT: ldr w12, [sp, #72]
; CHECK-NEXT: ldr w13, [sp, #104]
; CHECK-NEXT: ldr w8, [sp, #136]
Expand Down
8 changes: 4 additions & 4 deletions llvm/test/CodeGen/AArch64/pow.ll
Original file line number Diff line number Diff line change
Expand Up @@ -110,17 +110,17 @@ define <2 x double> @pow_v2f64_one_fourth_not_enough_fmf(<2 x double> %x) nounwi
; CHECK-LABEL: pow_v2f64_one_fourth_not_enough_fmf:
; CHECK: // %bb.0:
; CHECK-NEXT: sub sp, sp, #48
; CHECK-NEXT: str q0, [sp, #16] // 16-byte Folded Spill
; CHECK-NEXT: str q0, [sp] // 16-byte Folded Spill
; CHECK-NEXT: mov d0, v0.d[1]
; CHECK-NEXT: fmov d1, #0.25000000
; CHECK-NEXT: str x30, [sp, #32] // 8-byte Folded Spill
; CHECK-NEXT: bl pow
; CHECK-NEXT: fmov d1, #0.25000000
; CHECK-NEXT: str q0, [sp] // 16-byte Folded Spill
; CHECK-NEXT: ldr q0, [sp, #16] // 16-byte Folded Reload
; CHECK-NEXT: str q0, [sp, #16] // 16-byte Folded Spill
; CHECK-NEXT: ldr q0, [sp] // 16-byte Folded Reload
; CHECK-NEXT: // kill: def $d0 killed $d0 killed $q0
; CHECK-NEXT: bl pow
; CHECK-NEXT: ldr q1, [sp] // 16-byte Folded Reload
; CHECK-NEXT: ldr q1, [sp, #16] // 16-byte Folded Reload
; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0
; CHECK-NEXT: ldr x30, [sp, #32] // 8-byte Folded Reload
; CHECK-NEXT: mov v0.d[1], v1.d[0]
Expand Down
72 changes: 36 additions & 36 deletions llvm/test/CodeGen/AArch64/ragreedy-csr.ll
Original file line number Diff line number Diff line change
Expand Up @@ -35,18 +35,18 @@ define fastcc i32 @prune_match(ptr nocapture readonly %a, ptr nocapture readonly
; CHECK-NEXT: adrp x14, __DefaultRuneLocale@GOTPAGE
; CHECK-NEXT: ldrb w12, [x0, #4]
; CHECK-NEXT: ldrb w13, [x1, #4]
; CHECK-NEXT: ldr x10, [x0, #16]
; CHECK-NEXT: ldr x9, [x1, #16]
; CHECK-NEXT: ldr x9, [x0, #16]
; CHECK-NEXT: ldr x10, [x1, #16]
; CHECK-NEXT: mov x11, xzr
; CHECK-NEXT: Lloh1:
; CHECK-NEXT: ldr x14, [x14, __DefaultRuneLocale@GOTPAGEOFF]
; CHECK-NEXT: ldrsb x8, [x10, x11]
; CHECK-NEXT: ldrsb x8, [x9, x11]
; CHECK-NEXT: tbz x8, #63, LBB0_3
; CHECK-NEXT: LBB0_2: ; %cond.false.i.i
; CHECK-NEXT: stp x10, x0, [sp, #32] ; 16-byte Folded Spill
; CHECK-NEXT: stp x9, x0, [sp, #32] ; 16-byte Folded Spill
; CHECK-NEXT: mov w0, w8
; CHECK-NEXT: mov w1, #32768 ; =0x8000
; CHECK-NEXT: str x9, [sp, #8] ; 8-byte Folded Spill
; CHECK-NEXT: str x10, [sp, #8] ; 8-byte Folded Spill
; CHECK-NEXT: str x11, [sp, #24] ; 8-byte Folded Spill
; CHECK-NEXT: str w12, [sp, #4] ; 4-byte Folded Spill
; CHECK-NEXT: str w13, [sp, #20] ; 4-byte Folded Spill
Expand All @@ -56,10 +56,10 @@ define fastcc i32 @prune_match(ptr nocapture readonly %a, ptr nocapture readonly
; CHECK-NEXT: mov w8, w0
; CHECK-NEXT: Lloh3:
; CHECK-NEXT: ldr x14, [x14, __DefaultRuneLocale@GOTPAGEOFF]
; CHECK-NEXT: ldp x11, x10, [sp, #24] ; 16-byte Folded Reload
; CHECK-NEXT: ldp x11, x9, [sp, #24] ; 16-byte Folded Reload
; CHECK-NEXT: ldr w13, [sp, #20] ; 4-byte Folded Reload
; CHECK-NEXT: ldr w12, [sp, #4] ; 4-byte Folded Reload
; CHECK-NEXT: ldr x9, [sp, #8] ; 8-byte Folded Reload
; CHECK-NEXT: ldr x10, [sp, #8] ; 8-byte Folded Reload
; CHECK-NEXT: ldr x0, [sp, #40] ; 8-byte Folded Reload
; CHECK-NEXT: cbz w8, LBB0_4
; CHECK-NEXT: b LBB0_6
Expand All @@ -69,28 +69,28 @@ define fastcc i32 @prune_match(ptr nocapture readonly %a, ptr nocapture readonly
; CHECK-NEXT: and w8, w8, #0x8000
; CHECK-NEXT: cbnz w8, LBB0_6
; CHECK-NEXT: LBB0_4: ; %lor.rhs
; CHECK-NEXT: ldrsb x8, [x9, x11]
; CHECK-NEXT: ldrsb x8, [x10, x11]
; CHECK-NEXT: tbnz x8, #63, LBB0_8
; CHECK-NEXT: ; %bb.5: ; %cond.true.i.i217
; CHECK-NEXT: add x8, x14, x8, lsl #2
; CHECK-NEXT: ldr w8, [x8, #60]
; CHECK-NEXT: and w8, w8, #0x8000
; CHECK-NEXT: cbz w8, LBB0_9
; CHECK-NEXT: LBB0_6: ; %while.body
; CHECK-NEXT: ldrb w8, [x10, x11]
; CHECK-NEXT: ldrb w15, [x9, x11]
; CHECK-NEXT: ldrb w8, [x9, x11]
; CHECK-NEXT: ldrb w15, [x10, x11]
; CHECK-NEXT: cmp w8, w15
; CHECK-NEXT: b.ne LBB0_42
; CHECK-NEXT: ; %bb.7: ; %if.end17
; CHECK-NEXT: add x11, x11, #1
; CHECK-NEXT: ldrsb x8, [x10, x11]
; CHECK-NEXT: ldrsb x8, [x9, x11]
; CHECK-NEXT: tbz x8, #63, LBB0_3
; CHECK-NEXT: b LBB0_2
; CHECK-NEXT: LBB0_8: ; %cond.false.i.i219
; CHECK-NEXT: stp x10, x0, [sp, #32] ; 16-byte Folded Spill
; CHECK-NEXT: stp x9, x0, [sp, #32] ; 16-byte Folded Spill
; CHECK-NEXT: mov w0, w8
; CHECK-NEXT: mov w1, #32768 ; =0x8000
; CHECK-NEXT: str x9, [sp, #8] ; 8-byte Folded Spill
; CHECK-NEXT: str x10, [sp, #8] ; 8-byte Folded Spill
; CHECK-NEXT: str x11, [sp, #24] ; 8-byte Folded Spill
; CHECK-NEXT: str w12, [sp, #4] ; 4-byte Folded Spill
; CHECK-NEXT: str w13, [sp, #20] ; 4-byte Folded Spill
Expand All @@ -100,27 +100,27 @@ define fastcc i32 @prune_match(ptr nocapture readonly %a, ptr nocapture readonly
; CHECK-NEXT: mov w8, w0
; CHECK-NEXT: Lloh5:
; CHECK-NEXT: ldr x14, [x14, __DefaultRuneLocale@GOTPAGEOFF]
; CHECK-NEXT: ldp x11, x10, [sp, #24] ; 16-byte Folded Reload
; CHECK-NEXT: ldp x11, x9, [sp, #24] ; 16-byte Folded Reload
; CHECK-NEXT: ldr w13, [sp, #20] ; 4-byte Folded Reload
; CHECK-NEXT: ldr w12, [sp, #4] ; 4-byte Folded Reload
; CHECK-NEXT: ldr x9, [sp, #8] ; 8-byte Folded Reload
; CHECK-NEXT: ldr x10, [sp, #8] ; 8-byte Folded Reload
; CHECK-NEXT: ldr x0, [sp, #40] ; 8-byte Folded Reload
; CHECK-NEXT: cbnz w8, LBB0_6
; CHECK-NEXT: LBB0_9: ; %while.end
; CHECK-NEXT: orr w8, w13, w12
; CHECK-NEXT: cbnz w8, LBB0_24
; CHECK-NEXT: ; %bb.10: ; %if.then23
; CHECK-NEXT: ldr x12, [x0, #16]
; CHECK-NEXT: ldrb w8, [x10, x11]
; CHECK-NEXT: ldrb w8, [x9, x11]
; CHECK-NEXT: ldrb w13, [x12]
; CHECK-NEXT: cmp w13, #83
; CHECK-NEXT: b.eq LBB0_19
; CHECK-NEXT: LBB0_11: ; %while.cond59.preheader
; CHECK-NEXT: cbz w8, LBB0_23
; CHECK-NEXT: LBB0_12: ; %land.rhs.preheader
; CHECK-NEXT: add x10, x10, x11
; CHECK-NEXT: add x9, x9, x11
; CHECK-NEXT: add x10, x10, #1
; CHECK-NEXT: add x12, x9, x11
; CHECK-NEXT: add x9, x10, x11
; CHECK-NEXT: add x10, x12, #1
; CHECK-NEXT: LBB0_13: ; %land.rhs
; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1
; CHECK-NEXT: ldrb w11, [x9], #1
Expand Down Expand Up @@ -154,11 +154,11 @@ define fastcc i32 @prune_match(ptr nocapture readonly %a, ptr nocapture readonly
; CHECK-NEXT: cmp w8, #112
; CHECK-NEXT: b.ne LBB0_12
; CHECK-NEXT: ; %bb.21: ; %land.lhs.true35
; CHECK-NEXT: ldrb w13, [x9, x11]
; CHECK-NEXT: ldrb w13, [x10, x11]
; CHECK-NEXT: cmp w13, #112
; CHECK-NEXT: b.ne LBB0_12
; CHECK-NEXT: ; %bb.22: ; %land.lhs.true43
; CHECK-NEXT: sub x12, x10, x12
; CHECK-NEXT: sub x12, x9, x12
; CHECK-NEXT: add x12, x12, x11
; CHECK-NEXT: cmp x12, #1
; CHECK-NEXT: b.ne LBB0_44
Expand All @@ -172,22 +172,22 @@ define fastcc i32 @prune_match(ptr nocapture readonly %a, ptr nocapture readonly
; CHECK-NEXT: cmp w13, #2
; CHECK-NEXT: b.ne LBB0_33
; CHECK-NEXT: ; %bb.26: ; %while.cond95.preheader
; CHECK-NEXT: ldrb w12, [x10, x11]
; CHECK-NEXT: ldrb w12, [x9, x11]
; CHECK-NEXT: cbz w12, LBB0_23
; CHECK-NEXT: ; %bb.27: ; %land.rhs99.preheader
; CHECK-NEXT: mov x8, xzr
; CHECK-NEXT: mov w0, #1 ; =0x1
; CHECK-NEXT: b LBB0_29
; CHECK-NEXT: LBB0_28: ; %if.then117
; CHECK-NEXT: ; in Loop: Header=BB0_29 Depth=1
; CHECK-NEXT: add x12, x10, x8
; CHECK-NEXT: add x12, x9, x8
; CHECK-NEXT: add x8, x8, #1
; CHECK-NEXT: add x12, x12, x11
; CHECK-NEXT: ldrb w12, [x12, #1]
; CHECK-NEXT: cbz w12, LBB0_43
; CHECK-NEXT: LBB0_29: ; %land.rhs99
; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1
; CHECK-NEXT: add x13, x9, x8
; CHECK-NEXT: add x13, x10, x8
; CHECK-NEXT: ldrb w13, [x13, x11]
; CHECK-NEXT: cbz w13, LBB0_23
; CHECK-NEXT: ; %bb.30: ; %while.body104
Expand All @@ -211,35 +211,35 @@ define fastcc i32 @prune_match(ptr nocapture readonly %a, ptr nocapture readonly
; CHECK-NEXT: cmp w12, #2
; CHECK-NEXT: b.ne LBB0_43
; CHECK-NEXT: ; %bb.35: ; %while.cond130.preheader
; CHECK-NEXT: ldrb w12, [x10, x11]
; CHECK-NEXT: cbz w12, LBB0_23
; CHECK-NEXT: ldrb w8, [x9, x11]
; CHECK-NEXT: cbz w8, LBB0_23
; CHECK-NEXT: ; %bb.36: ; %land.rhs134.preheader
; CHECK-NEXT: mov x8, xzr
; CHECK-NEXT: mov x12, xzr
; CHECK-NEXT: mov w0, #1 ; =0x1
; CHECK-NEXT: b LBB0_38
; CHECK-NEXT: LBB0_37: ; %if.then152
; CHECK-NEXT: ; in Loop: Header=BB0_38 Depth=1
; CHECK-NEXT: add x12, x10, x8
; CHECK-NEXT: add x8, x8, #1
; CHECK-NEXT: add x12, x12, x11
; CHECK-NEXT: ldrb w12, [x12, #1]
; CHECK-NEXT: cbz w12, LBB0_43
; CHECK-NEXT: add x8, x9, x12
; CHECK-NEXT: add x12, x12, #1
; CHECK-NEXT: add x8, x8, x11
; CHECK-NEXT: ldrb w8, [x8, #1]
; CHECK-NEXT: cbz w8, LBB0_43
; CHECK-NEXT: LBB0_38: ; %land.rhs134
; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1
; CHECK-NEXT: add x13, x9, x8
; CHECK-NEXT: add x13, x10, x12
; CHECK-NEXT: ldrb w13, [x13, x11]
; CHECK-NEXT: cbz w13, LBB0_23
; CHECK-NEXT: ; %bb.39: ; %while.body139
; CHECK-NEXT: ; in Loop: Header=BB0_38 Depth=1
; CHECK-NEXT: cmp w12, w13
; CHECK-NEXT: cmp w8, w13
; CHECK-NEXT: b.eq LBB0_37
; CHECK-NEXT: ; %bb.40: ; %while.body139
; CHECK-NEXT: ; in Loop: Header=BB0_38 Depth=1
; CHECK-NEXT: cmp w13, #42
; CHECK-NEXT: b.eq LBB0_37
; CHECK-NEXT: ; %bb.41: ; %while.body139
; CHECK-NEXT: ; in Loop: Header=BB0_38 Depth=1
; CHECK-NEXT: cmp w12, #94
; CHECK-NEXT: cmp w8, #94
; CHECK-NEXT: b.eq LBB0_37
; CHECK-NEXT: LBB0_42:
; CHECK-NEXT: mov w0, wzr
Expand All @@ -251,7 +251,7 @@ define fastcc i32 @prune_match(ptr nocapture readonly %a, ptr nocapture readonly
; CHECK-NEXT: cmp x12, #2
; CHECK-NEXT: b.ne LBB0_11
; CHECK-NEXT: ; %bb.45: ; %land.lhs.true52
; CHECK-NEXT: add x12, x10, x11
; CHECK-NEXT: add x12, x9, x11
; CHECK-NEXT: mov w0, #1 ; =0x1
; CHECK-NEXT: ldurb w12, [x12, #-1]
; CHECK-NEXT: cmp w12, #73
Expand Down
248 changes: 127 additions & 121 deletions llvm/test/CodeGen/AArch64/ragreedy-local-interval-cost.ll
Original file line number Diff line number Diff line change
Expand Up @@ -8,14 +8,14 @@
define dso_local void @run_test() local_unnamed_addr uwtable {
; CHECK-LABEL: run_test:
; CHECK: // %bb.0: // %entry
; CHECK-NEXT: sub sp, sp, #176
; CHECK-NEXT: .cfi_def_cfa_offset 176
; CHECK-NEXT: stp d15, d14, [sp, #80] // 16-byte Folded Spill
; CHECK-NEXT: stp d13, d12, [sp, #96] // 16-byte Folded Spill
; CHECK-NEXT: stp d11, d10, [sp, #112] // 16-byte Folded Spill
; CHECK-NEXT: stp d9, d8, [sp, #128] // 16-byte Folded Spill
; CHECK-NEXT: stp x22, x21, [sp, #144] // 16-byte Folded Spill
; CHECK-NEXT: stp x20, x19, [sp, #160] // 16-byte Folded Spill
; CHECK-NEXT: sub sp, sp, #192
; CHECK-NEXT: .cfi_def_cfa_offset 192
; CHECK-NEXT: stp d15, d14, [sp, #96] // 16-byte Folded Spill
; CHECK-NEXT: stp d13, d12, [sp, #112] // 16-byte Folded Spill
; CHECK-NEXT: stp d11, d10, [sp, #128] // 16-byte Folded Spill
; CHECK-NEXT: stp d9, d8, [sp, #144] // 16-byte Folded Spill
; CHECK-NEXT: stp x22, x21, [sp, #160] // 16-byte Folded Spill
; CHECK-NEXT: stp x20, x19, [sp, #176] // 16-byte Folded Spill
; CHECK-NEXT: .cfi_offset w19, -8
; CHECK-NEXT: .cfi_offset w20, -16
; CHECK-NEXT: .cfi_offset w21, -24
Expand All @@ -29,157 +29,159 @@ define dso_local void @run_test() local_unnamed_addr uwtable {
; CHECK-NEXT: .cfi_offset b14, -88
; CHECK-NEXT: .cfi_offset b15, -96
; CHECK-NEXT: movi v1.2d, #0000000000000000
; CHECK-NEXT: // implicit-def: $q6
; CHECK-NEXT: mov x8, xzr
; CHECK-NEXT: mov x9, xzr
; CHECK-NEXT: adrp x10, B+48
; CHECK-NEXT: add x10, x10, :lo12:B+48
; CHECK-NEXT: adrp x11, A
; CHECK-NEXT: add x11, x11, :lo12:A
; CHECK-NEXT: // kill: killed $q6
; CHECK-NEXT: // implicit-def: $q6
; CHECK-NEXT: // implicit-def: $q0
; CHECK-NEXT: // implicit-def: $q2
; CHECK-NEXT: // implicit-def: $q3
; CHECK-NEXT: // implicit-def: $q4
; CHECK-NEXT: // implicit-def: $q5
; CHECK-NEXT: // implicit-def: $q7
; CHECK-NEXT: // implicit-def: $q16
; CHECK-NEXT: // implicit-def: $q17
; CHECK-NEXT: // implicit-def: $q18
; CHECK-NEXT: // implicit-def: $q10
; CHECK-NEXT: // implicit-def: $q19
; CHECK-NEXT: // implicit-def: $q20
; CHECK-NEXT: // implicit-def: $q21
; CHECK-NEXT: // implicit-def: $q22
; CHECK-NEXT: // implicit-def: $q23
; CHECK-NEXT: // implicit-def: $q24
; CHECK-NEXT: // implicit-def: $q25
; CHECK-NEXT: // implicit-def: $q26
; CHECK-NEXT: // implicit-def: $q27
; CHECK-NEXT: // implicit-def: $q26
; CHECK-NEXT: // implicit-def: $q28
; CHECK-NEXT: // implicit-def: $q30
; CHECK-NEXT: // implicit-def: $q15
; CHECK-NEXT: // implicit-def: $q18
; CHECK-NEXT: // implicit-def: $q29
; CHECK-NEXT: // implicit-def: $q31
; CHECK-NEXT: // implicit-def: $q8
; CHECK-NEXT: // implicit-def: $q9
; CHECK-NEXT: // implicit-def: $q10
; CHECK-NEXT: // implicit-def: $q11
; CHECK-NEXT: // implicit-def: $q12
; CHECK-NEXT: // implicit-def: $q13
; CHECK-NEXT: // implicit-def: $q11
; CHECK-NEXT: // kill: killed $q6
; CHECK-NEXT: // implicit-def: $q6
; CHECK-NEXT: // kill: killed $q6
; CHECK-NEXT: .LBB0_1: // %for.cond1.preheader
; CHECK-NEXT: // =>This Inner Loop Header: Depth=1
; CHECK-NEXT: mov x12, xzr
; CHECK-NEXT: ldr q14, [x8]
; CHECK-NEXT: stp q29, q15, [sp] // 32-byte Folded Spill
; CHECK-NEXT: mov x12, xzr
; CHECK-NEXT: str q18, [sp, #32] // 16-byte Folded Spill
; CHECK-NEXT: ldr x14, [x12]
; CHECK-NEXT: ldr q15, [x12]
; CHECK-NEXT: ldr x13, [x12]
; CHECK-NEXT: add x7, x11, x8
; CHECK-NEXT: fmov x12, d14
; CHECK-NEXT: mov x14, v14.d[1]
; CHECK-NEXT: fmov x15, d14
; CHECK-NEXT: mov x16, v14.d[1]
; CHECK-NEXT: ldr q18, [sp, #64] // 16-byte Folded Reload
; CHECK-NEXT: fmov x18, d15
; CHECK-NEXT: mov x13, v15.d[1]
; CHECK-NEXT: ldr x5, [x8]
; CHECK-NEXT: fmov x17, d15
; CHECK-NEXT: ldr q14, [x10], #64
; CHECK-NEXT: mov x16, v15.d[1]
; CHECK-NEXT: ldr x7, [x7, #128]
; CHECK-NEXT: stp q30, q28, [sp, #32] // 32-byte Folded Spill
; CHECK-NEXT: mul x15, x12, x13
; CHECK-NEXT: mov x0, v14.d[1]
; CHECK-NEXT: mul x17, x15, x14
; CHECK-NEXT: mov v6.16b, v0.16b
; CHECK-NEXT: mov v9.16b, v27.16b
; CHECK-NEXT: mov x12, v14.d[1]
; CHECK-NEXT: fmov x4, d14
; CHECK-NEXT: mov v30.16b, v27.16b
; CHECK-NEXT: mov v27.16b, v24.16b
; CHECK-NEXT: mov v24.16b, v21.16b
; CHECK-NEXT: mul x18, x17, x13
; CHECK-NEXT: mov v21.16b, v18.16b
; CHECK-NEXT: mov v18.16b, v7.16b
; CHECK-NEXT: mov v27.16b, v23.16b
; CHECK-NEXT: mul x1, x16, x14
; CHECK-NEXT: mov v23.16b, v19.16b
; CHECK-NEXT: mov v19.16b, v7.16b
; CHECK-NEXT: mov v7.16b, v2.16b
; CHECK-NEXT: mov v28.16b, v25.16b
; CHECK-NEXT: mov v25.16b, v22.16b
; CHECK-NEXT: mul x19, x12, x5
; CHECK-NEXT: mov v22.16b, v19.16b
; CHECK-NEXT: mov v19.16b, v16.16b
; CHECK-NEXT: fmov d15, x15
; CHECK-NEXT: stp q26, q31, [sp] // 32-byte Folded Spill
; CHECK-NEXT: mov v31.16b, v22.16b
; CHECK-NEXT: mul x0, x18, x14
; CHECK-NEXT: mov v26.16b, v10.16b
; CHECK-NEXT: mov v22.16b, v5.16b
; CHECK-NEXT: fmov d15, x17
; CHECK-NEXT: mov v5.16b, v1.16b
; CHECK-NEXT: mov v8.16b, v20.16b
; CHECK-NEXT: mul x2, x13, x14
; CHECK-NEXT: mov v20.16b, v16.16b
; CHECK-NEXT: mov v16.16b, v3.16b
; CHECK-NEXT: mov v29.16b, v26.16b
; CHECK-NEXT: mul x12, x12, x7
; CHECK-NEXT: mov v26.16b, v23.16b
; CHECK-NEXT: mov v23.16b, v20.16b
; CHECK-NEXT: fmov d14, x18
; CHECK-NEXT: mov v20.16b, v17.16b
; CHECK-NEXT: mov v10.16b, v21.16b
; CHECK-NEXT: mov v21.16b, v17.16b
; CHECK-NEXT: mov v17.16b, v4.16b
; CHECK-NEXT: mul x1, x14, x13
; CHECK-NEXT: str q5, [sp, #64] // 16-byte Folded Spill
; CHECK-NEXT: mov v5.16b, v1.16b
; CHECK-NEXT: fmov d1, x19
; CHECK-NEXT: mov v15.d[1], x1
; CHECK-NEXT: mul x3, x12, x14
; CHECK-NEXT: add x8, x8, #8
; CHECK-NEXT: add x9, x9, #1
; CHECK-NEXT: mul x2, x16, x13
; CHECK-NEXT: fmov d14, x0
; CHECK-NEXT: cmp x8, #64
; CHECK-NEXT: fmov d2, x12
; CHECK-NEXT: mul x3, x0, x13
; CHECK-NEXT: mov v15.d[1], x1
; CHECK-NEXT: mul x13, x4, x13
; CHECK-NEXT: add x9, x9, #1
; CHECK-NEXT: mul x14, x4, x14
; CHECK-NEXT: add v18.2d, v18.2d, v15.2d
; CHECK-NEXT: mul x19, x15, x5
; CHECK-NEXT: mov v14.d[1], x2
; CHECK-NEXT: mul x21, x17, x7
; CHECK-NEXT: add v12.2d, v12.2d, v15.2d
; CHECK-NEXT: ldr q15, [sp, #16] // 16-byte Folded Reload
; CHECK-NEXT: mul x6, x14, x5
; CHECK-NEXT: fmov d0, x13
; CHECK-NEXT: add v13.2d, v13.2d, v14.2d
; CHECK-NEXT: add v11.2d, v11.2d, v14.2d
; CHECK-NEXT: mul x14, x14, x7
; CHECK-NEXT: fmov d3, x21
; CHECK-NEXT: mul x18, x4, x7
; CHECK-NEXT: mul x15, x15, x7
; CHECK-NEXT: fmov d0, x14
; CHECK-NEXT: str q18, [sp, #64] // 16-byte Folded Spill
; CHECK-NEXT: ldp q18, q15, [sp, #32] // 32-byte Folded Reload
; CHECK-NEXT: mul x6, x16, x5
; CHECK-NEXT: fmov d1, x19
; CHECK-NEXT: mov v0.d[1], x3
; CHECK-NEXT: mul x16, x16, x7
; CHECK-NEXT: fmov d2, x15
; CHECK-NEXT: add v15.2d, v15.2d, v14.2d
; CHECK-NEXT: mul x21, x18, x7
; CHECK-NEXT: mov v1.d[1], x6
; CHECK-NEXT: mul x20, x16, x7
; CHECK-NEXT: mov v2.d[1], x14
; CHECK-NEXT: mul x22, x0, x7
; CHECK-NEXT: add v10.2d, v10.2d, v0.2d
; CHECK-NEXT: fmov d4, x18
; CHECK-NEXT: add v8.2d, v8.2d, v1.2d
; CHECK-NEXT: mul x13, x17, x5
; CHECK-NEXT: mov v3.d[1], x20
; CHECK-NEXT: add v15.2d, v15.2d, v2.2d
; CHECK-NEXT: mul x0, x4, x7
; CHECK-NEXT: str q15, [sp, #48] // 16-byte Folded Spill
; CHECK-NEXT: add v15.2d, v11.2d, v14.2d
; CHECK-NEXT: mov v2.d[1], x16
; CHECK-NEXT: ldr q11, [sp, #80] // 16-byte Folded Reload
; CHECK-NEXT: mul x20, x13, x7
; CHECK-NEXT: fmov d3, x21
; CHECK-NEXT: add v11.2d, v11.2d, v0.2d
; CHECK-NEXT: add v12.2d, v12.2d, v1.2d
; CHECK-NEXT: mul x22, x12, x7
; CHECK-NEXT: fmov d4, x0
; CHECK-NEXT: add v18.2d, v18.2d, v2.2d
; CHECK-NEXT: mov v2.16b, v7.16b
; CHECK-NEXT: mul x14, x4, x5
; CHECK-NEXT: mov v7.16b, v18.16b
; CHECK-NEXT: mov v18.16b, v21.16b
; CHECK-NEXT: mul x14, x18, x5
; CHECK-NEXT: mov v7.16b, v19.16b
; CHECK-NEXT: mov v19.16b, v23.16b
; CHECK-NEXT: mov v3.d[1], x20
; CHECK-NEXT: mov v23.16b, v27.16b
; CHECK-NEXT: mov v27.16b, v9.16b
; CHECK-NEXT: mul x15, x4, x5
; CHECK-NEXT: add v27.2d, v9.2d, v1.2d
; CHECK-NEXT: str q11, [sp, #80] // 16-byte Folded Spill
; CHECK-NEXT: mov v4.d[1], x22
; CHECK-NEXT: mov v21.16b, v24.16b
; CHECK-NEXT: mov v24.16b, v27.16b
; CHECK-NEXT: mul x12, x16, x5
; CHECK-NEXT: mov v27.16b, v30.16b
; CHECK-NEXT: ldr q30, [sp, #32] // 16-byte Folded Reload
; CHECK-NEXT: fmov d14, x13
; CHECK-NEXT: add v19.2d, v19.2d, v1.2d
; CHECK-NEXT: add v7.2d, v7.2d, v1.2d
; CHECK-NEXT: mul x13, x0, x5
; CHECK-NEXT: mul x13, x13, x5
; CHECK-NEXT: add v23.2d, v23.2d, v1.2d
; CHECK-NEXT: add v1.2d, v5.2d, v1.2d
; CHECK-NEXT: fmov d14, x14
; CHECK-NEXT: add v30.2d, v30.2d, v3.2d
; CHECK-NEXT: mov v3.16b, v16.16b
; CHECK-NEXT: mov v16.16b, v19.16b
; CHECK-NEXT: mov v19.16b, v22.16b
; CHECK-NEXT: mov v22.16b, v25.16b
; CHECK-NEXT: mov v25.16b, v28.16b
; CHECK-NEXT: ldr q28, [sp, #48] // 16-byte Folded Reload
; CHECK-NEXT: fmov d0, x14
; CHECK-NEXT: mov v14.d[1], x12
; CHECK-NEXT: mul x12, x12, x5
; CHECK-NEXT: mov v16.16b, v20.16b
; CHECK-NEXT: mov v5.16b, v22.16b
; CHECK-NEXT: fmov d0, x15
; CHECK-NEXT: add v28.2d, v28.2d, v4.2d
; CHECK-NEXT: mov v4.16b, v17.16b
; CHECK-NEXT: mov v17.16b, v20.16b
; CHECK-NEXT: mov v20.16b, v23.16b
; CHECK-NEXT: mov v23.16b, v26.16b
; CHECK-NEXT: mov v26.16b, v29.16b
; CHECK-NEXT: mov v0.d[1], x13
; CHECK-NEXT: ldr q29, [sp] // 16-byte Folded Reload
; CHECK-NEXT: add v19.2d, v19.2d, v1.2d
; CHECK-NEXT: add v9.2d, v9.2d, v14.2d
; CHECK-NEXT: mov v17.16b, v21.16b
; CHECK-NEXT: mov v21.16b, v10.16b
; CHECK-NEXT: mov v10.16b, v26.16b
; CHECK-NEXT: mov v14.d[1], x13
; CHECK-NEXT: mov v22.16b, v31.16b
; CHECK-NEXT: mov v20.16b, v8.16b
; CHECK-NEXT: ldp q26, q31, [sp] // 32-byte Folded Reload
; CHECK-NEXT: mov v11.16b, v15.16b
; CHECK-NEXT: mov v0.d[1], x12
; CHECK-NEXT: add v13.2d, v13.2d, v14.2d
; CHECK-NEXT: add v31.2d, v31.2d, v14.2d
; CHECK-NEXT: add v27.2d, v27.2d, v14.2d
; CHECK-NEXT: add v26.2d, v26.2d, v1.2d
; CHECK-NEXT: add v23.2d, v23.2d, v1.2d
; CHECK-NEXT: add v1.2d, v5.2d, v1.2d
; CHECK-NEXT: ldr q5, [sp, #64] // 16-byte Folded Reload
; CHECK-NEXT: add v26.2d, v26.2d, v14.2d
; CHECK-NEXT: add v24.2d, v24.2d, v14.2d
; CHECK-NEXT: add v22.2d, v22.2d, v14.2d
; CHECK-NEXT: add v20.2d, v20.2d, v14.2d
; CHECK-NEXT: add v18.2d, v18.2d, v14.2d
; CHECK-NEXT: add v20.2d, v8.2d, v14.2d
; CHECK-NEXT: add v10.2d, v10.2d, v14.2d
; CHECK-NEXT: add v16.2d, v16.2d, v14.2d
; CHECK-NEXT: add v5.2d, v5.2d, v14.2d
; CHECK-NEXT: add v3.2d, v3.2d, v14.2d
Expand All @@ -189,34 +191,38 @@ define dso_local void @run_test() local_unnamed_addr uwtable {
; CHECK-NEXT: add v21.2d, v21.2d, v0.2d
; CHECK-NEXT: add v17.2d, v17.2d, v0.2d
; CHECK-NEXT: add v4.2d, v4.2d, v0.2d
; CHECK-NEXT: add v6.2d, v6.2d, v0.2d
; CHECK-NEXT: add v0.2d, v6.2d, v0.2d
; CHECK-NEXT: b.ne .LBB0_1
; CHECK-NEXT: // %bb.2: // %for.cond.cleanup
; CHECK-NEXT: ldr q6, [sp, #48] // 16-byte Folded Reload
; CHECK-NEXT: adrp x8, C
; CHECK-NEXT: add x8, x8, :lo12:C
; CHECK-NEXT: stp q13, q12, [x8]
; CHECK-NEXT: ldp x20, x19, [sp, #160] // 16-byte Folded Reload
; CHECK-NEXT: stp q11, q10, [x8, #32]
; CHECK-NEXT: ldp x22, x21, [sp, #144] // 16-byte Folded Reload
; CHECK-NEXT: stp q9, q8, [x8, #64]
; CHECK-NEXT: ldp d9, d8, [sp, #128] // 16-byte Folded Reload
; CHECK-NEXT: stp q15, q30, [x8, #144]
; CHECK-NEXT: ldp d11, d10, [sp, #112] // 16-byte Folded Reload
; CHECK-NEXT: stp q31, q29, [x8, #96]
; CHECK-NEXT: ldp d13, d12, [sp, #96] // 16-byte Folded Reload
; CHECK-NEXT: stp q28, q27, [x8, #176]
; CHECK-NEXT: ldp d15, d14, [sp, #80] // 16-byte Folded Reload
; CHECK-NEXT: str q26, [x8, #208]
; CHECK-NEXT: stp q12, q31, [x8, #80]
; CHECK-NEXT: ldp x20, x19, [sp, #176] // 16-byte Folded Reload
; CHECK-NEXT: str q6, [x8]
; CHECK-NEXT: ldr q6, [sp, #64] // 16-byte Folded Reload
; CHECK-NEXT: str q29, [x8, #112]
; CHECK-NEXT: ldp x22, x21, [sp, #160] // 16-byte Folded Reload
; CHECK-NEXT: stp q6, q11, [x8, #16]
; CHECK-NEXT: ldr q6, [sp, #80] // 16-byte Folded Reload
; CHECK-NEXT: stp q18, q30, [x8, #144]
; CHECK-NEXT: ldp d9, d8, [sp, #144] // 16-byte Folded Reload
; CHECK-NEXT: stp q6, q13, [x8, #48]
; CHECK-NEXT: ldp d13, d12, [sp, #112] // 16-byte Folded Reload
; CHECK-NEXT: stp q28, q26, [x8, #176]
; CHECK-NEXT: ldp d15, d14, [sp, #96] // 16-byte Folded Reload
; CHECK-NEXT: stp q19, q10, [x8, #336]
; CHECK-NEXT: ldp d11, d10, [sp, #128] // 16-byte Folded Reload
; CHECK-NEXT: str q27, [x8, #208]
; CHECK-NEXT: stp q25, q24, [x8, #240]
; CHECK-NEXT: stp q23, q22, [x8, #272]
; CHECK-NEXT: stp q21, q20, [x8, #304]
; CHECK-NEXT: stp q19, q18, [x8, #336]
; CHECK-NEXT: stp q17, q16, [x8, #368]
; CHECK-NEXT: stp q7, q5, [x8, #400]
; CHECK-NEXT: stp q4, q3, [x8, #432]
; CHECK-NEXT: stp q1, q2, [x8, #464]
; CHECK-NEXT: str q6, [x8, #496]
; CHECK-NEXT: add sp, sp, #176
; CHECK-NEXT: str q0, [x8, #496]
; CHECK-NEXT: add sp, sp, #192
; CHECK-NEXT: .cfi_def_cfa_offset 0
; CHECK-NEXT: .cfi_restore w19
; CHECK-NEXT: .cfi_restore w20
Expand Down
36 changes: 18 additions & 18 deletions llvm/test/CodeGen/AArch64/sve-fixed-length-masked-scatter.ll
Original file line number Diff line number Diff line change
Expand Up @@ -333,21 +333,21 @@ define void @masked_scatter_v8i32(ptr %a, ptr %b) #0 {
; VBITS_GE_256: // %bb.0:
; VBITS_GE_256-NEXT: ptrue p0.s, vl8
; VBITS_GE_256-NEXT: mov x8, #4 // =0x4
; VBITS_GE_256-NEXT: ptrue p1.d, vl4
; VBITS_GE_256-NEXT: ld1w { z0.s }, p0/z, [x0]
; VBITS_GE_256-NEXT: ld1d { z3.d }, p1/z, [x1, x8, lsl #3]
; VBITS_GE_256-NEXT: ld1d { z4.d }, p1/z, [x1]
; VBITS_GE_256-NEXT: cmpeq p0.s, p0/z, z0.s, #0
; VBITS_GE_256-NEXT: cmpeq p1.s, p0/z, z0.s, #0
; VBITS_GE_256-NEXT: ptrue p0.d, vl4
; VBITS_GE_256-NEXT: uunpklo z2.d, z0.s
; VBITS_GE_256-NEXT: ext z0.b, z0.b, z0.b, #16
; VBITS_GE_256-NEXT: punpklo p2.h, p0.b
; VBITS_GE_256-NEXT: mov z1.s, p0/z, #-1 // =0xffffffffffffffff
; VBITS_GE_256-NEXT: mov z1.s, p1/z, #-1 // =0xffffffffffffffff
; VBITS_GE_256-NEXT: punpklo p2.h, p1.b
; VBITS_GE_256-NEXT: uunpklo z0.d, z0.s
; VBITS_GE_256-NEXT: ext z1.b, z1.b, z1.b, #16
; VBITS_GE_256-NEXT: and p0.b, p2/z, p2.b, p1.b
; VBITS_GE_256-NEXT: ld1d { z3.d }, p0/z, [x1, x8, lsl #3]
; VBITS_GE_256-NEXT: ld1d { z4.d }, p0/z, [x1]
; VBITS_GE_256-NEXT: sunpklo z1.d, z1.s
; VBITS_GE_256-NEXT: st1w { z2.d }, p0, [z4.d]
; VBITS_GE_256-NEXT: cmpne p0.d, p1/z, z1.d, #0
; VBITS_GE_256-NEXT: and p1.b, p2/z, p2.b, p0.b
; VBITS_GE_256-NEXT: cmpne p0.d, p0/z, z1.d, #0
; VBITS_GE_256-NEXT: st1w { z2.d }, p1, [z4.d]
; VBITS_GE_256-NEXT: st1w { z0.d }, p0, [z3.d]
; VBITS_GE_256-NEXT: ret
;
Expand Down Expand Up @@ -711,21 +711,21 @@ define void @masked_scatter_v8f32(ptr %a, ptr %b) #0 {
; VBITS_GE_256: // %bb.0:
; VBITS_GE_256-NEXT: ptrue p0.s, vl8
; VBITS_GE_256-NEXT: mov x8, #4 // =0x4
; VBITS_GE_256-NEXT: ptrue p1.d, vl4
; VBITS_GE_256-NEXT: ld1w { z0.s }, p0/z, [x0]
; VBITS_GE_256-NEXT: ld1d { z3.d }, p1/z, [x1, x8, lsl #3]
; VBITS_GE_256-NEXT: ld1d { z4.d }, p1/z, [x1]
; VBITS_GE_256-NEXT: fcmeq p0.s, p0/z, z0.s, #0.0
; VBITS_GE_256-NEXT: fcmeq p1.s, p0/z, z0.s, #0.0
; VBITS_GE_256-NEXT: ptrue p0.d, vl4
; VBITS_GE_256-NEXT: uunpklo z2.d, z0.s
; VBITS_GE_256-NEXT: ext z0.b, z0.b, z0.b, #16
; VBITS_GE_256-NEXT: mov z1.s, p1/z, #-1 // =0xffffffffffffffff
; VBITS_GE_256-NEXT: punpklo p2.h, p1.b
; VBITS_GE_256-NEXT: uunpklo z0.d, z0.s
; VBITS_GE_256-NEXT: punpklo p2.h, p0.b
; VBITS_GE_256-NEXT: mov z1.s, p0/z, #-1 // =0xffffffffffffffff
; VBITS_GE_256-NEXT: ld1d { z3.d }, p0/z, [x1, x8, lsl #3]
; VBITS_GE_256-NEXT: ld1d { z4.d }, p0/z, [x1]
; VBITS_GE_256-NEXT: ext z1.b, z1.b, z1.b, #16
; VBITS_GE_256-NEXT: and p0.b, p2/z, p2.b, p1.b
; VBITS_GE_256-NEXT: sunpklo z1.d, z1.s
; VBITS_GE_256-NEXT: st1w { z2.d }, p0, [z4.d]
; VBITS_GE_256-NEXT: cmpne p0.d, p1/z, z1.d, #0
; VBITS_GE_256-NEXT: and p1.b, p2/z, p2.b, p0.b
; VBITS_GE_256-NEXT: cmpne p0.d, p0/z, z1.d, #0
; VBITS_GE_256-NEXT: st1w { z2.d }, p1, [z4.d]
; VBITS_GE_256-NEXT: st1w { z0.d }, p0, [z3.d]
; VBITS_GE_256-NEXT: ret
;
Expand Down
52 changes: 26 additions & 26 deletions llvm/test/CodeGen/AArch64/sve-fixed-length-shuffles.ll
Original file line number Diff line number Diff line change
Expand Up @@ -33,75 +33,75 @@ define void @crash_when_lowering_extract_shuffle(ptr %dst, i1 %cond) vscale_rang
; CHECK-NEXT: umov w8, v0.b[8]
; CHECK-NEXT: umov w9, v0.b[9]
; CHECK-NEXT: umov w10, v0.b[1]
; CHECK-NEXT: mov v2.16b, v0.16b
; CHECK-NEXT: mov v1.16b, v0.16b
; CHECK-NEXT: umov w11, v0.b[15]
; CHECK-NEXT: fmov s1, w8
; CHECK-NEXT: fmov s2, w8
; CHECK-NEXT: umov w8, v0.b[10]
; CHECK-NEXT: mov v2.b[1], w10
; CHECK-NEXT: mov v1.b[1], w10
; CHECK-NEXT: umov w10, v0.b[11]
; CHECK-NEXT: mov v1.b[1], w9
; CHECK-NEXT: mov v2.b[1], w9
; CHECK-NEXT: umov w9, v0.b[2]
; CHECK-NEXT: mov v1.b[2], w8
; CHECK-NEXT: mov v2.b[2], w8
; CHECK-NEXT: umov w8, v0.b[3]
; CHECK-NEXT: mov v2.b[2], w9
; CHECK-NEXT: mov v1.b[2], w9
; CHECK-NEXT: umov w9, v0.b[12]
; CHECK-NEXT: mov v1.b[3], w10
; CHECK-NEXT: mov v2.b[3], w10
; CHECK-NEXT: umov w10, v0.b[4]
; CHECK-NEXT: mov v2.b[3], w8
; CHECK-NEXT: mov v1.b[3], w8
; CHECK-NEXT: umov w8, v0.b[13]
; CHECK-NEXT: mov v1.b[4], w9
; CHECK-NEXT: mov v2.b[4], w9
; CHECK-NEXT: umov w9, v0.b[5]
; CHECK-NEXT: mov v2.b[4], w10
; CHECK-NEXT: mov v1.b[4], w10
; CHECK-NEXT: umov w10, v0.b[14]
; CHECK-NEXT: mov v1.b[5], w8
; CHECK-NEXT: mov v2.b[5], w8
; CHECK-NEXT: umov w8, v0.b[6]
; CHECK-NEXT: mov v2.b[5], w9
; CHECK-NEXT: mov v1.b[5], w9
; CHECK-NEXT: umov w9, v0.b[7]
; CHECK-NEXT: ext z0.b, z0.b, z0.b, #16
; CHECK-NEXT: mov v1.b[6], w10
; CHECK-NEXT: mov v2.b[6], w8
; CHECK-NEXT: mov v2.b[6], w10
; CHECK-NEXT: mov v1.b[6], w8
; CHECK-NEXT: ext v3.16b, v0.16b, v0.16b, #8
; CHECK-NEXT: uunpklo z0.h, z0.b
; CHECK-NEXT: mov x8, #16 // =0x10
; CHECK-NEXT: mov x10, #8 // =0x8
; CHECK-NEXT: ld1w { z4.s }, p0/z, [x0, x8, lsl #2]
; CHECK-NEXT: mov v1.b[7], w11
; CHECK-NEXT: mov v2.b[7], w9
; CHECK-NEXT: mov v2.b[7], w11
; CHECK-NEXT: mov v1.b[7], w9
; CHECK-NEXT: uunpklo z3.h, z3.b
; CHECK-NEXT: uunpklo z0.s, z0.h
; CHECK-NEXT: mov x9, #24 // =0x18
; CHECK-NEXT: uunpklo z1.h, z1.b
; CHECK-NEXT: uunpklo z2.h, z2.b
; CHECK-NEXT: uunpklo z1.h, z1.b
; CHECK-NEXT: uunpklo z3.s, z3.h
; CHECK-NEXT: lsl z0.s, z0.s, #31
; CHECK-NEXT: uunpklo z1.s, z1.h
; CHECK-NEXT: uunpklo z2.s, z2.h
; CHECK-NEXT: uunpklo z1.s, z1.h
; CHECK-NEXT: lsl z3.s, z3.s, #31
; CHECK-NEXT: asr z0.s, z0.s, #31
; CHECK-NEXT: asr z3.s, z3.s, #31
; CHECK-NEXT: lsl z1.s, z1.s, #31
; CHECK-NEXT: lsl z2.s, z2.s, #31
; CHECK-NEXT: lsl z1.s, z1.s, #31
; CHECK-NEXT: and z0.s, z0.s, #0x1
; CHECK-NEXT: and z3.s, z3.s, #0x1
; CHECK-NEXT: asr z1.s, z1.s, #31
; CHECK-NEXT: asr z2.s, z2.s, #31
; CHECK-NEXT: asr z1.s, z1.s, #31
; CHECK-NEXT: cmpne p1.s, p0/z, z0.s, #0
; CHECK-NEXT: ld1w { z0.s }, p0/z, [x0, x9, lsl #2]
; CHECK-NEXT: cmpne p2.s, p0/z, z3.s, #0
; CHECK-NEXT: ld1w { z3.s }, p0/z, [x0, x10, lsl #2]
; CHECK-NEXT: and z1.s, z1.s, #0x1
; CHECK-NEXT: and z2.s, z2.s, #0x1
; CHECK-NEXT: and z1.s, z1.s, #0x1
; CHECK-NEXT: mov z4.s, p1/m, #0 // =0x0
; CHECK-NEXT: mov z0.s, p2/m, #0 // =0x0
; CHECK-NEXT: cmpne p3.s, p0/z, z1.s, #0
; CHECK-NEXT: ld1w { z1.s }, p0/z, [x0]
; CHECK-NEXT: cmpne p1.s, p0/z, z2.s, #0
; CHECK-NEXT: cmpne p3.s, p0/z, z2.s, #0
; CHECK-NEXT: ld1w { z2.s }, p0/z, [x0]
; CHECK-NEXT: cmpne p1.s, p0/z, z1.s, #0
; CHECK-NEXT: st1w { z4.s }, p0, [x0, x8, lsl #2]
; CHECK-NEXT: st1w { z0.s }, p0, [x0, x9, lsl #2]
; CHECK-NEXT: mov z3.s, p3/m, #0 // =0x0
; CHECK-NEXT: mov z1.s, p1/m, #0 // =0x0
; CHECK-NEXT: mov z2.s, p1/m, #0 // =0x0
; CHECK-NEXT: st1w { z3.s }, p0, [x0, x10, lsl #2]
; CHECK-NEXT: st1w { z1.s }, p0, [x0]
; CHECK-NEXT: st1w { z2.s }, p0, [x0]
; CHECK-NEXT: .LBB1_2: // %exit
; CHECK-NEXT: ret
%broadcast.splat = shufflevector <32 x i1> zeroinitializer, <32 x i1> zeroinitializer, <32 x i32> zeroinitializer
Expand Down
14 changes: 7 additions & 7 deletions llvm/test/CodeGen/AArch64/sve-int-arith.ll
Original file line number Diff line number Diff line change
Expand Up @@ -770,19 +770,19 @@ define void @mad_in_loop(ptr %dst, ptr %src1, ptr %src2, i32 %n) {
; CHECK-NEXT: b.lt .LBB70_3
; CHECK-NEXT: // %bb.1: // %for.body.preheader
; CHECK-NEXT: mov w9, w3
; CHECK-NEXT: ptrue p0.s
; CHECK-NEXT: ptrue p1.s
; CHECK-NEXT: mov z0.s, #1 // =0x1
; CHECK-NEXT: whilelo p1.s, xzr, x9
; CHECK-NEXT: whilelo p0.s, xzr, x9
; CHECK-NEXT: mov x8, xzr
; CHECK-NEXT: cntw x10
; CHECK-NEXT: .LBB70_2: // %vector.body
; CHECK-NEXT: // =>This Inner Loop Header: Depth=1
; CHECK-NEXT: ld1w { z1.s }, p1/z, [x1, x8, lsl #2]
; CHECK-NEXT: ld1w { z2.s }, p1/z, [x2, x8, lsl #2]
; CHECK-NEXT: mad z1.s, p0/m, z2.s, z0.s
; CHECK-NEXT: st1w { z1.s }, p1, [x0, x8, lsl #2]
; CHECK-NEXT: ld1w { z1.s }, p0/z, [x1, x8, lsl #2]
; CHECK-NEXT: ld1w { z2.s }, p0/z, [x2, x8, lsl #2]
; CHECK-NEXT: mad z1.s, p1/m, z2.s, z0.s
; CHECK-NEXT: st1w { z1.s }, p0, [x0, x8, lsl #2]
; CHECK-NEXT: add x8, x8, x10
; CHECK-NEXT: whilelo p1.s, x8, x9
; CHECK-NEXT: whilelo p0.s, x8, x9
; CHECK-NEXT: b.mi .LBB70_2
; CHECK-NEXT: .LBB70_3: // %for.cond.cleanup
; CHECK-NEXT: ret
Expand Down
120 changes: 60 additions & 60 deletions llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-fp-to-int.ll
Original file line number Diff line number Diff line change
Expand Up @@ -215,44 +215,44 @@ define void @fcvtzu_v16f16_v16i64(ptr %a, ptr %b) {
; CHECK: // %bb.0:
; CHECK-NEXT: sub sp, sp, #128
; CHECK-NEXT: .cfi_def_cfa_offset 128
; CHECK-NEXT: ldp q0, q4, [x0]
; CHECK-NEXT: mov z1.h, z0.h[1]
; CHECK-NEXT: mov z2.h, z0.h[3]
; CHECK-NEXT: mov z3.h, z0.h[2]
; CHECK-NEXT: fcvtzu x8, h0
; CHECK-NEXT: ext z0.b, z0.b, z0.b, #8
; CHECK-NEXT: mov z5.h, z4.h[3]
; CHECK-NEXT: fcvtzu x10, h4
; CHECK-NEXT: fcvtzu x9, h1
; CHECK-NEXT: fcvtzu x11, h2
; CHECK-NEXT: fcvtzu x12, h3
; CHECK-NEXT: mov z1.h, z0.h[1]
; CHECK-NEXT: mov z3.h, z0.h[3]
; CHECK-NEXT: fcvtzu x13, h0
; CHECK-NEXT: mov z0.h, z0.h[2]
; CHECK-NEXT: mov z2.h, z4.h[1]
; CHECK-NEXT: stp x8, x9, [sp, #32]
; CHECK-NEXT: ldp q1, q0, [x0]
; CHECK-NEXT: mov z2.h, z1.h[1]
; CHECK-NEXT: mov z3.h, z1.h[3]
; CHECK-NEXT: mov z4.h, z1.h[2]
; CHECK-NEXT: fcvtzu x8, h1
; CHECK-NEXT: fcvtzu x9, h3
; CHECK-NEXT: ext z1.b, z1.b, z1.b, #8
; CHECK-NEXT: mov z5.h, z0.h[3]
; CHECK-NEXT: fcvtzu x10, h0
; CHECK-NEXT: fcvtzu x9, h2
; CHECK-NEXT: fcvtzu x11, h3
; CHECK-NEXT: fcvtzu x12, h4
; CHECK-NEXT: mov z2.h, z1.h[1]
; CHECK-NEXT: mov z4.h, z1.h[3]
; CHECK-NEXT: fcvtzu x13, h1
; CHECK-NEXT: mov z1.h, z1.h[2]
; CHECK-NEXT: mov z3.h, z0.h[1]
; CHECK-NEXT: stp x8, x9, [sp, #32]
; CHECK-NEXT: fcvtzu x8, h2
; CHECK-NEXT: fcvtzu x9, h4
; CHECK-NEXT: stp x12, x11, [sp, #48]
; CHECK-NEXT: fcvtzu x11, h0
; CHECK-NEXT: mov z1.h, z4.h[2]
; CHECK-NEXT: ext z4.b, z4.b, z4.b, #8
; CHECK-NEXT: fcvtzu x12, h2
; CHECK-NEXT: fcvtzu x11, h1
; CHECK-NEXT: mov z2.h, z0.h[2]
; CHECK-NEXT: ext z0.b, z0.b, z0.b, #8
; CHECK-NEXT: fcvtzu x12, h3
; CHECK-NEXT: stp x13, x8, [sp]
; CHECK-NEXT: fcvtzu x8, h5
; CHECK-NEXT: stp x11, x9, [sp, #16]
; CHECK-NEXT: fcvtzu x9, h1
; CHECK-NEXT: mov z0.h, z4.h[1]
; CHECK-NEXT: mov z1.h, z4.h[3]
; CHECK-NEXT: mov z2.h, z4.h[2]
; CHECK-NEXT: fcvtzu x11, h4
; CHECK-NEXT: fcvtzu x9, h2
; CHECK-NEXT: mov z1.h, z0.h[1]
; CHECK-NEXT: mov z2.h, z0.h[3]
; CHECK-NEXT: fcvtzu x11, h0
; CHECK-NEXT: mov z0.h, z0.h[2]
; CHECK-NEXT: stp x10, x12, [sp, #96]
; CHECK-NEXT: ldp q3, q4, [sp]
; CHECK-NEXT: fcvtzu x10, h0
; CHECK-NEXT: fcvtzu x12, h1
; CHECK-NEXT: fcvtzu x10, h1
; CHECK-NEXT: fcvtzu x12, h2
; CHECK-NEXT: stp x9, x8, [sp, #112]
; CHECK-NEXT: fcvtzu x8, h2
; CHECK-NEXT: fcvtzu x8, h0
; CHECK-NEXT: ldp q0, q1, [sp, #32]
; CHECK-NEXT: ldp q6, q7, [sp, #96]
; CHECK-NEXT: stp x11, x10, [sp, #64]
Expand Down Expand Up @@ -965,44 +965,44 @@ define void @fcvtzs_v16f16_v16i64(ptr %a, ptr %b) {
; CHECK: // %bb.0:
; CHECK-NEXT: sub sp, sp, #128
; CHECK-NEXT: .cfi_def_cfa_offset 128
; CHECK-NEXT: ldp q0, q4, [x0]
; CHECK-NEXT: mov z1.h, z0.h[1]
; CHECK-NEXT: mov z2.h, z0.h[3]
; CHECK-NEXT: mov z3.h, z0.h[2]
; CHECK-NEXT: fcvtzs x8, h0
; CHECK-NEXT: ext z0.b, z0.b, z0.b, #8
; CHECK-NEXT: mov z5.h, z4.h[3]
; CHECK-NEXT: fcvtzs x10, h4
; CHECK-NEXT: fcvtzs x9, h1
; CHECK-NEXT: fcvtzs x11, h2
; CHECK-NEXT: fcvtzs x12, h3
; CHECK-NEXT: mov z1.h, z0.h[1]
; CHECK-NEXT: mov z3.h, z0.h[3]
; CHECK-NEXT: fcvtzs x13, h0
; CHECK-NEXT: mov z0.h, z0.h[2]
; CHECK-NEXT: mov z2.h, z4.h[1]
; CHECK-NEXT: stp x8, x9, [sp, #32]
; CHECK-NEXT: ldp q1, q0, [x0]
; CHECK-NEXT: mov z2.h, z1.h[1]
; CHECK-NEXT: mov z3.h, z1.h[3]
; CHECK-NEXT: mov z4.h, z1.h[2]
; CHECK-NEXT: fcvtzs x8, h1
; CHECK-NEXT: fcvtzs x9, h3
; CHECK-NEXT: ext z1.b, z1.b, z1.b, #8
; CHECK-NEXT: mov z5.h, z0.h[3]
; CHECK-NEXT: fcvtzs x10, h0
; CHECK-NEXT: fcvtzs x9, h2
; CHECK-NEXT: fcvtzs x11, h3
; CHECK-NEXT: fcvtzs x12, h4
; CHECK-NEXT: mov z2.h, z1.h[1]
; CHECK-NEXT: mov z4.h, z1.h[3]
; CHECK-NEXT: fcvtzs x13, h1
; CHECK-NEXT: mov z1.h, z1.h[2]
; CHECK-NEXT: mov z3.h, z0.h[1]
; CHECK-NEXT: stp x8, x9, [sp, #32]
; CHECK-NEXT: fcvtzs x8, h2
; CHECK-NEXT: fcvtzs x9, h4
; CHECK-NEXT: stp x12, x11, [sp, #48]
; CHECK-NEXT: fcvtzs x11, h0
; CHECK-NEXT: mov z1.h, z4.h[2]
; CHECK-NEXT: ext z4.b, z4.b, z4.b, #8
; CHECK-NEXT: fcvtzs x12, h2
; CHECK-NEXT: fcvtzs x11, h1
; CHECK-NEXT: mov z2.h, z0.h[2]
; CHECK-NEXT: ext z0.b, z0.b, z0.b, #8
; CHECK-NEXT: fcvtzs x12, h3
; CHECK-NEXT: stp x13, x8, [sp]
; CHECK-NEXT: fcvtzs x8, h5
; CHECK-NEXT: stp x11, x9, [sp, #16]
; CHECK-NEXT: fcvtzs x9, h1
; CHECK-NEXT: mov z0.h, z4.h[1]
; CHECK-NEXT: mov z1.h, z4.h[3]
; CHECK-NEXT: mov z2.h, z4.h[2]
; CHECK-NEXT: fcvtzs x11, h4
; CHECK-NEXT: fcvtzs x9, h2
; CHECK-NEXT: mov z1.h, z0.h[1]
; CHECK-NEXT: mov z2.h, z0.h[3]
; CHECK-NEXT: fcvtzs x11, h0
; CHECK-NEXT: mov z0.h, z0.h[2]
; CHECK-NEXT: stp x10, x12, [sp, #96]
; CHECK-NEXT: ldp q3, q4, [sp]
; CHECK-NEXT: fcvtzs x10, h0
; CHECK-NEXT: fcvtzs x12, h1
; CHECK-NEXT: fcvtzs x10, h1
; CHECK-NEXT: fcvtzs x12, h2
; CHECK-NEXT: stp x9, x8, [sp, #112]
; CHECK-NEXT: fcvtzs x8, h2
; CHECK-NEXT: fcvtzs x8, h0
; CHECK-NEXT: ldp q0, q1, [sp, #32]
; CHECK-NEXT: ldp q6, q7, [sp, #96]
; CHECK-NEXT: stp x11, x10, [sp, #64]
Expand Down
148 changes: 74 additions & 74 deletions llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-div.ll
Original file line number Diff line number Diff line change
Expand Up @@ -105,55 +105,55 @@ define <16 x i8> @sdiv_v16i8(<16 x i8> %op1, <16 x i8> %op2) {
define void @sdiv_v32i8(ptr %a, ptr %b) {
; CHECK-LABEL: sdiv_v32i8:
; CHECK: // %bb.0:
; CHECK-NEXT: ldp q6, q0, [x0]
; CHECK-NEXT: ldp q6, q2, [x0]
; CHECK-NEXT: ptrue p0.s, vl4
; CHECK-NEXT: ldp q7, q1, [x1]
; CHECK-NEXT: mov z3.d, z0.d
; CHECK-NEXT: ldp q7, q3, [x1]
; CHECK-NEXT: mov z1.d, z2.d
; CHECK-NEXT: mov z16.d, z6.d
; CHECK-NEXT: mov z2.d, z1.d
; CHECK-NEXT: ext z3.b, z3.b, z0.b, #8
; CHECK-NEXT: sunpklo z0.h, z0.b
; CHECK-NEXT: mov z0.d, z3.d
; CHECK-NEXT: ext z1.b, z1.b, z2.b, #8
; CHECK-NEXT: ext z16.b, z16.b, z6.b, #8
; CHECK-NEXT: ext z2.b, z2.b, z1.b, #8
; CHECK-NEXT: sunpklo z1.h, z1.b
; CHECK-NEXT: sunpklo z6.h, z6.b
; CHECK-NEXT: ext z0.b, z0.b, z3.b, #8
; CHECK-NEXT: sunpklo z3.h, z3.b
; CHECK-NEXT: sunpklo z1.h, z1.b
; CHECK-NEXT: sunpklo z16.h, z16.b
; CHECK-NEXT: sunpklo z2.h, z2.b
; CHECK-NEXT: sunpklo z5.s, z3.h
; CHECK-NEXT: ext z3.b, z3.b, z3.b, #8
; CHECK-NEXT: sunpklo z4.h, z0.b
; CHECK-NEXT: sunpklo z5.s, z1.h
; CHECK-NEXT: ext z1.b, z1.b, z1.b, #8
; CHECK-NEXT: sunpklo z18.s, z16.h
; CHECK-NEXT: sunpklo z4.s, z2.h
; CHECK-NEXT: ext z2.b, z2.b, z2.b, #8
; CHECK-NEXT: sunpklo z0.s, z4.h
; CHECK-NEXT: ext z4.b, z4.b, z4.b, #8
; CHECK-NEXT: ext z16.b, z16.b, z16.b, #8
; CHECK-NEXT: sunpklo z3.s, z3.h
; CHECK-NEXT: sunpklo z2.s, z2.h
; CHECK-NEXT: sunpklo z16.s, z16.h
; CHECK-NEXT: sdivr z4.s, p0/m, z4.s, z5.s
; CHECK-NEXT: sunpklo z5.s, z0.h
; CHECK-NEXT: ext z0.b, z0.b, z0.b, #8
; CHECK-NEXT: sunpklo z0.s, z0.h
; CHECK-NEXT: sdivr z2.s, p0/m, z2.s, z3.s
; CHECK-NEXT: sunpklo z3.s, z1.h
; CHECK-NEXT: ext z1.b, z1.b, z1.b, #8
; CHECK-NEXT: sunpklo z1.s, z1.h
; CHECK-NEXT: sdivr z3.s, p0/m, z3.s, z5.s
; CHECK-NEXT: sunpklo z4.s, z4.h
; CHECK-NEXT: sunpklo z16.s, z16.h
; CHECK-NEXT: sdivr z0.s, p0/m, z0.s, z5.s
; CHECK-NEXT: sdiv z1.s, p0/m, z1.s, z4.s
; CHECK-NEXT: sunpklo z4.h, z2.b
; CHECK-NEXT: sunpklo z2.s, z3.h
; CHECK-NEXT: ext z3.b, z3.b, z3.b, #8
; CHECK-NEXT: uzp1 z0.h, z0.h, z0.h
; CHECK-NEXT: sunpklo z5.s, z4.h
; CHECK-NEXT: ext z4.b, z4.b, z4.b, #8
; CHECK-NEXT: sunpklo z3.s, z3.h
; CHECK-NEXT: sunpklo z4.s, z4.h
; CHECK-NEXT: sdivr z2.s, p0/m, z2.s, z5.s
; CHECK-NEXT: mov z5.d, z7.d
; CHECK-NEXT: ext z5.b, z5.b, z7.b, #8
; CHECK-NEXT: sunpklo z7.h, z7.b
; CHECK-NEXT: uzp1 z2.h, z2.h, z2.h
; CHECK-NEXT: uzp1 z1.h, z1.h, z1.h
; CHECK-NEXT: sunpklo z5.h, z5.b
; CHECK-NEXT: sunpklo z17.s, z5.h
; CHECK-NEXT: ext z5.b, z5.b, z5.b, #8
; CHECK-NEXT: sdiv z0.s, p0/m, z0.s, z1.s
; CHECK-NEXT: uzp1 z1.h, z4.h, z4.h
; CHECK-NEXT: sdivr z3.s, p0/m, z3.s, z4.s
; CHECK-NEXT: sunpklo z5.s, z5.h
; CHECK-NEXT: uzp1 z3.h, z3.h, z3.h
; CHECK-NEXT: uzp1 z2.h, z2.h, z2.h
; CHECK-NEXT: sdivr z17.s, p0/m, z17.s, z18.s
; CHECK-NEXT: sunpklo z18.s, z6.h
; CHECK-NEXT: ext z6.b, z6.b, z6.b, #8
; CHECK-NEXT: sunpklo z6.s, z6.h
; CHECK-NEXT: uzp1 z0.h, z0.h, z0.h
; CHECK-NEXT: uzp1 z3.h, z3.h, z3.h
; CHECK-NEXT: sdivr z5.s, p0/m, z5.s, z16.s
; CHECK-NEXT: sunpklo z16.s, z7.h
; CHECK-NEXT: ext z7.b, z7.b, z7.b, #8
Expand All @@ -165,18 +165,18 @@ define void @sdiv_v32i8(ptr %a, ptr %b) {
; CHECK-NEXT: ptrue p0.h, vl4
; CHECK-NEXT: uzp1 z7.h, z16.h, z16.h
; CHECK-NEXT: splice z4.h, p0, z4.h, z5.h
; CHECK-NEXT: splice z1.h, p0, z1.h, z2.h
; CHECK-NEXT: splice z3.h, p0, z3.h, z0.h
; CHECK-NEXT: uzp1 z0.b, z4.b, z4.b
; CHECK-NEXT: uzp1 z1.b, z1.b, z1.b
; CHECK-NEXT: uzp1 z3.b, z3.b, z3.b
; CHECK-NEXT: splice z0.h, p0, z0.h, z1.h
; CHECK-NEXT: splice z2.h, p0, z2.h, z3.h
; CHECK-NEXT: uzp1 z1.b, z4.b, z4.b
; CHECK-NEXT: uzp1 z0.b, z0.b, z0.b
; CHECK-NEXT: uzp1 z2.b, z2.b, z2.b
; CHECK-NEXT: uzp1 z6.h, z6.h, z6.h
; CHECK-NEXT: splice z7.h, p0, z7.h, z6.h
; CHECK-NEXT: ptrue p0.b, vl8
; CHECK-NEXT: uzp1 z2.b, z7.b, z7.b
; CHECK-NEXT: splice z3.b, p0, z3.b, z1.b
; CHECK-NEXT: uzp1 z3.b, z7.b, z7.b
; CHECK-NEXT: splice z2.b, p0, z2.b, z0.b
; CHECK-NEXT: stp q2, q3, [x0]
; CHECK-NEXT: splice z3.b, p0, z3.b, z1.b
; CHECK-NEXT: stp q3, q2, [x0]
; CHECK-NEXT: ret
%op1 = load <32 x i8>, ptr %a
%op2 = load <32 x i8>, ptr %b
Expand Down Expand Up @@ -472,55 +472,55 @@ define <16 x i8> @udiv_v16i8(<16 x i8> %op1, <16 x i8> %op2) {
define void @udiv_v32i8(ptr %a, ptr %b) {
; CHECK-LABEL: udiv_v32i8:
; CHECK: // %bb.0:
; CHECK-NEXT: ldp q6, q0, [x0]
; CHECK-NEXT: ldp q6, q2, [x0]
; CHECK-NEXT: ptrue p0.s, vl4
; CHECK-NEXT: ldp q7, q1, [x1]
; CHECK-NEXT: mov z3.d, z0.d
; CHECK-NEXT: ldp q7, q3, [x1]
; CHECK-NEXT: mov z1.d, z2.d
; CHECK-NEXT: mov z16.d, z6.d
; CHECK-NEXT: mov z2.d, z1.d
; CHECK-NEXT: ext z3.b, z3.b, z0.b, #8
; CHECK-NEXT: uunpklo z0.h, z0.b
; CHECK-NEXT: mov z0.d, z3.d
; CHECK-NEXT: ext z1.b, z1.b, z2.b, #8
; CHECK-NEXT: ext z16.b, z16.b, z6.b, #8
; CHECK-NEXT: ext z2.b, z2.b, z1.b, #8
; CHECK-NEXT: uunpklo z1.h, z1.b
; CHECK-NEXT: uunpklo z6.h, z6.b
; CHECK-NEXT: ext z0.b, z0.b, z3.b, #8
; CHECK-NEXT: uunpklo z3.h, z3.b
; CHECK-NEXT: uunpklo z1.h, z1.b
; CHECK-NEXT: uunpklo z16.h, z16.b
; CHECK-NEXT: uunpklo z2.h, z2.b
; CHECK-NEXT: uunpklo z5.s, z3.h
; CHECK-NEXT: ext z3.b, z3.b, z3.b, #8
; CHECK-NEXT: uunpklo z4.h, z0.b
; CHECK-NEXT: uunpklo z5.s, z1.h
; CHECK-NEXT: ext z1.b, z1.b, z1.b, #8
; CHECK-NEXT: uunpklo z18.s, z16.h
; CHECK-NEXT: uunpklo z4.s, z2.h
; CHECK-NEXT: ext z2.b, z2.b, z2.b, #8
; CHECK-NEXT: uunpklo z0.s, z4.h
; CHECK-NEXT: ext z4.b, z4.b, z4.b, #8
; CHECK-NEXT: ext z16.b, z16.b, z16.b, #8
; CHECK-NEXT: uunpklo z3.s, z3.h
; CHECK-NEXT: uunpklo z2.s, z2.h
; CHECK-NEXT: uunpklo z16.s, z16.h
; CHECK-NEXT: udivr z4.s, p0/m, z4.s, z5.s
; CHECK-NEXT: uunpklo z5.s, z0.h
; CHECK-NEXT: ext z0.b, z0.b, z0.b, #8
; CHECK-NEXT: uunpklo z0.s, z0.h
; CHECK-NEXT: udivr z2.s, p0/m, z2.s, z3.s
; CHECK-NEXT: uunpklo z3.s, z1.h
; CHECK-NEXT: ext z1.b, z1.b, z1.b, #8
; CHECK-NEXT: uunpklo z1.s, z1.h
; CHECK-NEXT: udivr z3.s, p0/m, z3.s, z5.s
; CHECK-NEXT: uunpklo z4.s, z4.h
; CHECK-NEXT: uunpklo z16.s, z16.h
; CHECK-NEXT: udivr z0.s, p0/m, z0.s, z5.s
; CHECK-NEXT: udiv z1.s, p0/m, z1.s, z4.s
; CHECK-NEXT: uunpklo z4.h, z2.b
; CHECK-NEXT: uunpklo z2.s, z3.h
; CHECK-NEXT: ext z3.b, z3.b, z3.b, #8
; CHECK-NEXT: uzp1 z0.h, z0.h, z0.h
; CHECK-NEXT: uunpklo z5.s, z4.h
; CHECK-NEXT: ext z4.b, z4.b, z4.b, #8
; CHECK-NEXT: uunpklo z3.s, z3.h
; CHECK-NEXT: uunpklo z4.s, z4.h
; CHECK-NEXT: udivr z2.s, p0/m, z2.s, z5.s
; CHECK-NEXT: mov z5.d, z7.d
; CHECK-NEXT: ext z5.b, z5.b, z7.b, #8
; CHECK-NEXT: uunpklo z7.h, z7.b
; CHECK-NEXT: uzp1 z2.h, z2.h, z2.h
; CHECK-NEXT: uzp1 z1.h, z1.h, z1.h
; CHECK-NEXT: uunpklo z5.h, z5.b
; CHECK-NEXT: uunpklo z17.s, z5.h
; CHECK-NEXT: ext z5.b, z5.b, z5.b, #8
; CHECK-NEXT: udiv z0.s, p0/m, z0.s, z1.s
; CHECK-NEXT: uzp1 z1.h, z4.h, z4.h
; CHECK-NEXT: udivr z3.s, p0/m, z3.s, z4.s
; CHECK-NEXT: uunpklo z5.s, z5.h
; CHECK-NEXT: uzp1 z3.h, z3.h, z3.h
; CHECK-NEXT: uzp1 z2.h, z2.h, z2.h
; CHECK-NEXT: udivr z17.s, p0/m, z17.s, z18.s
; CHECK-NEXT: uunpklo z18.s, z6.h
; CHECK-NEXT: ext z6.b, z6.b, z6.b, #8
; CHECK-NEXT: uunpklo z6.s, z6.h
; CHECK-NEXT: uzp1 z0.h, z0.h, z0.h
; CHECK-NEXT: uzp1 z3.h, z3.h, z3.h
; CHECK-NEXT: udivr z5.s, p0/m, z5.s, z16.s
; CHECK-NEXT: uunpklo z16.s, z7.h
; CHECK-NEXT: ext z7.b, z7.b, z7.b, #8
Expand All @@ -532,18 +532,18 @@ define void @udiv_v32i8(ptr %a, ptr %b) {
; CHECK-NEXT: ptrue p0.h, vl4
; CHECK-NEXT: uzp1 z7.h, z16.h, z16.h
; CHECK-NEXT: splice z4.h, p0, z4.h, z5.h
; CHECK-NEXT: splice z1.h, p0, z1.h, z2.h
; CHECK-NEXT: splice z3.h, p0, z3.h, z0.h
; CHECK-NEXT: uzp1 z0.b, z4.b, z4.b
; CHECK-NEXT: uzp1 z1.b, z1.b, z1.b
; CHECK-NEXT: uzp1 z3.b, z3.b, z3.b
; CHECK-NEXT: splice z0.h, p0, z0.h, z1.h
; CHECK-NEXT: splice z2.h, p0, z2.h, z3.h
; CHECK-NEXT: uzp1 z1.b, z4.b, z4.b
; CHECK-NEXT: uzp1 z0.b, z0.b, z0.b
; CHECK-NEXT: uzp1 z2.b, z2.b, z2.b
; CHECK-NEXT: uzp1 z6.h, z6.h, z6.h
; CHECK-NEXT: splice z7.h, p0, z7.h, z6.h
; CHECK-NEXT: ptrue p0.b, vl8
; CHECK-NEXT: uzp1 z2.b, z7.b, z7.b
; CHECK-NEXT: splice z3.b, p0, z3.b, z1.b
; CHECK-NEXT: uzp1 z3.b, z7.b, z7.b
; CHECK-NEXT: splice z2.b, p0, z2.b, z0.b
; CHECK-NEXT: stp q2, q3, [x0]
; CHECK-NEXT: splice z3.b, p0, z3.b, z1.b
; CHECK-NEXT: stp q3, q2, [x0]
; CHECK-NEXT: ret
%op1 = load <32 x i8>, ptr %a
%op2 = load <32 x i8>, ptr %b
Expand Down
206 changes: 106 additions & 100 deletions llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-rem.ll
Original file line number Diff line number Diff line change
Expand Up @@ -109,81 +109,84 @@ define <16 x i8> @srem_v16i8(<16 x i8> %op1, <16 x i8> %op2) {
define void @srem_v32i8(ptr %a, ptr %b) {
; CHECK-LABEL: srem_v32i8:
; CHECK: // %bb.0:
; CHECK-NEXT: ldp q16, q0, [x0]
; CHECK-NEXT: ldr q0, [x0, #16]
; CHECK-NEXT: ldr q1, [x1, #16]
; CHECK-NEXT: ptrue p0.s, vl4
; CHECK-NEXT: ldp q17, q1, [x1]
; CHECK-NEXT: ptrue p1.b, vl16
; CHECK-NEXT: mov z3.d, z0.d
; CHECK-NEXT: mov z18.d, z16.d
; CHECK-NEXT: mov z2.d, z1.d
; CHECK-NEXT: ext z3.b, z3.b, z0.b, #8
; CHECK-NEXT: ext z18.b, z18.b, z16.b, #8
; CHECK-NEXT: mov z3.d, z0.d
; CHECK-NEXT: sunpklo z7.h, z1.b
; CHECK-NEXT: sunpklo z16.h, z0.b
; CHECK-NEXT: ext z2.b, z2.b, z1.b, #8
; CHECK-NEXT: ext z3.b, z3.b, z0.b, #8
; CHECK-NEXT: sunpklo z6.s, z7.h
; CHECK-NEXT: ext z7.b, z7.b, z7.b, #8
; CHECK-NEXT: sunpklo z17.s, z16.h
; CHECK-NEXT: ext z16.b, z16.b, z16.b, #8
; CHECK-NEXT: sunpklo z4.h, z2.b
; CHECK-NEXT: sunpklo z3.h, z3.b
; CHECK-NEXT: sunpklo z18.h, z18.b
; CHECK-NEXT: sunpklo z2.h, z2.b
; CHECK-NEXT: sunpklo z7.s, z7.h
; CHECK-NEXT: sunpklo z16.s, z16.h
; CHECK-NEXT: sdivr z6.s, p0/m, z6.s, z17.s
; CHECK-NEXT: sunpklo z2.s, z4.h
; CHECK-NEXT: sunpklo z5.s, z3.h
; CHECK-NEXT: ext z4.b, z4.b, z4.b, #8
; CHECK-NEXT: ext z3.b, z3.b, z3.b, #8
; CHECK-NEXT: sunpklo z4.s, z4.h
; CHECK-NEXT: sunpklo z3.s, z3.h
; CHECK-NEXT: sdivr z2.s, p0/m, z2.s, z5.s
; CHECK-NEXT: movprfx z5, z3
; CHECK-NEXT: sdiv z5.s, p0/m, z5.s, z4.s
; CHECK-NEXT: ldr q3, [x0]
; CHECK-NEXT: ldr q4, [x1]
; CHECK-NEXT: uzp1 z5.h, z5.h, z5.h
; CHECK-NEXT: mov z18.d, z3.d
; CHECK-NEXT: mov z17.d, z4.d
; CHECK-NEXT: uzp1 z6.h, z6.h, z6.h
; CHECK-NEXT: ext z18.b, z18.b, z3.b, #8
; CHECK-NEXT: ext z17.b, z17.b, z4.b, #8
; CHECK-NEXT: sunpklo z18.h, z18.b
; CHECK-NEXT: sunpklo z17.h, z17.b
; CHECK-NEXT: sunpklo z20.s, z18.h
; CHECK-NEXT: sunpklo z4.s, z2.h
; CHECK-NEXT: ext z2.b, z2.b, z2.b, #8
; CHECK-NEXT: ext z18.b, z18.b, z18.b, #8
; CHECK-NEXT: sunpklo z3.s, z3.h
; CHECK-NEXT: sunpklo z2.s, z2.h
; CHECK-NEXT: sunpklo z19.s, z17.h
; CHECK-NEXT: ext z17.b, z17.b, z17.b, #8
; CHECK-NEXT: sdivr z7.s, p0/m, z7.s, z16.s
; CHECK-NEXT: sunpklo z18.s, z18.h
; CHECK-NEXT: sdivr z4.s, p0/m, z4.s, z5.s
; CHECK-NEXT: sunpklo z5.h, z0.b
; CHECK-NEXT: sunpklo z7.s, z5.h
; CHECK-NEXT: ext z5.b, z5.b, z5.b, #8
; CHECK-NEXT: sunpklo z5.s, z5.h
; CHECK-NEXT: sdivr z2.s, p0/m, z2.s, z3.s
; CHECK-NEXT: sunpklo z3.h, z1.b
; CHECK-NEXT: uzp1 z4.h, z4.h, z4.h
; CHECK-NEXT: sunpklo z6.s, z3.h
; CHECK-NEXT: ext z3.b, z3.b, z3.b, #8
; CHECK-NEXT: sunpklo z3.s, z3.h
; CHECK-NEXT: sdivr z6.s, p0/m, z6.s, z7.s
; CHECK-NEXT: mov z7.d, z17.d
; CHECK-NEXT: ext z7.b, z7.b, z17.b, #8
; CHECK-NEXT: uzp1 z2.h, z2.h, z2.h
; CHECK-NEXT: sunpklo z7.h, z7.b
; CHECK-NEXT: sunpklo z19.s, z7.h
; CHECK-NEXT: ext z7.b, z7.b, z7.b, #8
; CHECK-NEXT: sdivr z3.s, p0/m, z3.s, z5.s
; CHECK-NEXT: sunpklo z7.s, z7.h
; CHECK-NEXT: uzp1 z6.h, z6.h, z6.h
; CHECK-NEXT: sunpklo z17.s, z17.h
; CHECK-NEXT: sdivr z19.s, p0/m, z19.s, z20.s
; CHECK-NEXT: sunpklo z20.h, z16.b
; CHECK-NEXT: uzp1 z3.h, z3.h, z3.h
; CHECK-NEXT: sunpklo z20.h, z3.b
; CHECK-NEXT: uzp1 z7.h, z7.h, z7.h
; CHECK-NEXT: sunpklo z22.s, z20.h
; CHECK-NEXT: ext z20.b, z20.b, z20.b, #8
; CHECK-NEXT: sunpklo z20.s, z20.h
; CHECK-NEXT: sdivr z7.s, p0/m, z7.s, z18.s
; CHECK-NEXT: sunpklo z18.h, z17.b
; CHECK-NEXT: uzp1 z5.h, z19.h, z19.h
; CHECK-NEXT: sdivr z17.s, p0/m, z17.s, z18.s
; CHECK-NEXT: sunpklo z18.h, z4.b
; CHECK-NEXT: uzp1 z16.h, z19.h, z19.h
; CHECK-NEXT: sunpklo z21.s, z18.h
; CHECK-NEXT: ext z18.b, z18.b, z18.b, #8
; CHECK-NEXT: sunpklo z18.s, z18.h
; CHECK-NEXT: sdivr z21.s, p0/m, z21.s, z22.s
; CHECK-NEXT: uzp1 z7.h, z7.h, z7.h
; CHECK-NEXT: uzp1 z17.h, z17.h, z17.h
; CHECK-NEXT: sdivr z18.s, p0/m, z18.s, z20.s
; CHECK-NEXT: ptrue p0.h, vl4
; CHECK-NEXT: uzp1 z19.h, z21.h, z21.h
; CHECK-NEXT: splice z5.h, p0, z5.h, z7.h
; CHECK-NEXT: splice z4.h, p0, z4.h, z2.h
; CHECK-NEXT: splice z6.h, p0, z6.h, z3.h
; CHECK-NEXT: uzp1 z2.b, z5.b, z5.b
; CHECK-NEXT: uzp1 z4.b, z4.b, z4.b
; CHECK-NEXT: uzp1 z5.b, z6.b, z6.b
; CHECK-NEXT: splice z2.h, p0, z2.h, z5.h
; CHECK-NEXT: splice z6.h, p0, z6.h, z7.h
; CHECK-NEXT: splice z16.h, p0, z16.h, z17.h
; CHECK-NEXT: uzp1 z2.b, z2.b, z2.b
; CHECK-NEXT: uzp1 z6.b, z6.b, z6.b
; CHECK-NEXT: uzp1 z5.b, z16.b, z16.b
; CHECK-NEXT: uzp1 z18.h, z18.h, z18.h
; CHECK-NEXT: splice z19.h, p0, z19.h, z18.h
; CHECK-NEXT: ptrue p0.b, vl8
; CHECK-NEXT: uzp1 z3.b, z19.b, z19.b
; CHECK-NEXT: splice z5.b, p0, z5.b, z4.b
; CHECK-NEXT: splice z3.b, p0, z3.b, z2.b
; CHECK-NEXT: movprfx z2, z16
; CHECK-NEXT: mls z2.b, p1/m, z3.b, z17.b
; CHECK-NEXT: mls z0.b, p1/m, z5.b, z1.b
; CHECK-NEXT: uzp1 z7.b, z19.b, z19.b
; CHECK-NEXT: splice z6.b, p0, z6.b, z2.b
; CHECK-NEXT: splice z7.b, p0, z7.b, z5.b
; CHECK-NEXT: movprfx z2, z3
; CHECK-NEXT: mls z2.b, p1/m, z7.b, z4.b
; CHECK-NEXT: mls z0.b, p1/m, z6.b, z1.b
; CHECK-NEXT: stp q2, q0, [x0]
; CHECK-NEXT: ret
%op1 = load <32 x i8>, ptr %a
Expand Down Expand Up @@ -495,81 +498,84 @@ define <16 x i8> @urem_v16i8(<16 x i8> %op1, <16 x i8> %op2) {
define void @urem_v32i8(ptr %a, ptr %b) {
; CHECK-LABEL: urem_v32i8:
; CHECK: // %bb.0:
; CHECK-NEXT: ldp q16, q0, [x0]
; CHECK-NEXT: ldr q0, [x0, #16]
; CHECK-NEXT: ldr q1, [x1, #16]
; CHECK-NEXT: ptrue p0.s, vl4
; CHECK-NEXT: ldp q17, q1, [x1]
; CHECK-NEXT: ptrue p1.b, vl16
; CHECK-NEXT: mov z3.d, z0.d
; CHECK-NEXT: mov z18.d, z16.d
; CHECK-NEXT: mov z2.d, z1.d
; CHECK-NEXT: ext z3.b, z3.b, z0.b, #8
; CHECK-NEXT: ext z18.b, z18.b, z16.b, #8
; CHECK-NEXT: mov z3.d, z0.d
; CHECK-NEXT: uunpklo z7.h, z1.b
; CHECK-NEXT: uunpklo z16.h, z0.b
; CHECK-NEXT: ext z2.b, z2.b, z1.b, #8
; CHECK-NEXT: ext z3.b, z3.b, z0.b, #8
; CHECK-NEXT: uunpklo z6.s, z7.h
; CHECK-NEXT: ext z7.b, z7.b, z7.b, #8
; CHECK-NEXT: uunpklo z17.s, z16.h
; CHECK-NEXT: ext z16.b, z16.b, z16.b, #8
; CHECK-NEXT: uunpklo z4.h, z2.b
; CHECK-NEXT: uunpklo z3.h, z3.b
; CHECK-NEXT: uunpklo z18.h, z18.b
; CHECK-NEXT: uunpklo z2.h, z2.b
; CHECK-NEXT: uunpklo z7.s, z7.h
; CHECK-NEXT: uunpklo z16.s, z16.h
; CHECK-NEXT: udivr z6.s, p0/m, z6.s, z17.s
; CHECK-NEXT: uunpklo z2.s, z4.h
; CHECK-NEXT: uunpklo z5.s, z3.h
; CHECK-NEXT: ext z4.b, z4.b, z4.b, #8
; CHECK-NEXT: ext z3.b, z3.b, z3.b, #8
; CHECK-NEXT: uunpklo z4.s, z4.h
; CHECK-NEXT: uunpklo z3.s, z3.h
; CHECK-NEXT: udivr z2.s, p0/m, z2.s, z5.s
; CHECK-NEXT: movprfx z5, z3
; CHECK-NEXT: udiv z5.s, p0/m, z5.s, z4.s
; CHECK-NEXT: ldr q3, [x0]
; CHECK-NEXT: ldr q4, [x1]
; CHECK-NEXT: uzp1 z5.h, z5.h, z5.h
; CHECK-NEXT: mov z18.d, z3.d
; CHECK-NEXT: mov z17.d, z4.d
; CHECK-NEXT: uzp1 z6.h, z6.h, z6.h
; CHECK-NEXT: ext z18.b, z18.b, z3.b, #8
; CHECK-NEXT: ext z17.b, z17.b, z4.b, #8
; CHECK-NEXT: uunpklo z18.h, z18.b
; CHECK-NEXT: uunpklo z17.h, z17.b
; CHECK-NEXT: uunpklo z20.s, z18.h
; CHECK-NEXT: uunpklo z4.s, z2.h
; CHECK-NEXT: ext z2.b, z2.b, z2.b, #8
; CHECK-NEXT: ext z18.b, z18.b, z18.b, #8
; CHECK-NEXT: uunpklo z3.s, z3.h
; CHECK-NEXT: uunpklo z2.s, z2.h
; CHECK-NEXT: uunpklo z19.s, z17.h
; CHECK-NEXT: ext z17.b, z17.b, z17.b, #8
; CHECK-NEXT: udivr z7.s, p0/m, z7.s, z16.s
; CHECK-NEXT: uunpklo z18.s, z18.h
; CHECK-NEXT: udivr z4.s, p0/m, z4.s, z5.s
; CHECK-NEXT: uunpklo z5.h, z0.b
; CHECK-NEXT: uunpklo z7.s, z5.h
; CHECK-NEXT: ext z5.b, z5.b, z5.b, #8
; CHECK-NEXT: uunpklo z5.s, z5.h
; CHECK-NEXT: udivr z2.s, p0/m, z2.s, z3.s
; CHECK-NEXT: uunpklo z3.h, z1.b
; CHECK-NEXT: uzp1 z4.h, z4.h, z4.h
; CHECK-NEXT: uunpklo z6.s, z3.h
; CHECK-NEXT: ext z3.b, z3.b, z3.b, #8
; CHECK-NEXT: uunpklo z3.s, z3.h
; CHECK-NEXT: udivr z6.s, p0/m, z6.s, z7.s
; CHECK-NEXT: mov z7.d, z17.d
; CHECK-NEXT: ext z7.b, z7.b, z17.b, #8
; CHECK-NEXT: uzp1 z2.h, z2.h, z2.h
; CHECK-NEXT: uunpklo z7.h, z7.b
; CHECK-NEXT: uunpklo z19.s, z7.h
; CHECK-NEXT: ext z7.b, z7.b, z7.b, #8
; CHECK-NEXT: udivr z3.s, p0/m, z3.s, z5.s
; CHECK-NEXT: uunpklo z7.s, z7.h
; CHECK-NEXT: uzp1 z6.h, z6.h, z6.h
; CHECK-NEXT: uunpklo z17.s, z17.h
; CHECK-NEXT: udivr z19.s, p0/m, z19.s, z20.s
; CHECK-NEXT: uunpklo z20.h, z16.b
; CHECK-NEXT: uzp1 z3.h, z3.h, z3.h
; CHECK-NEXT: uunpklo z20.h, z3.b
; CHECK-NEXT: uzp1 z7.h, z7.h, z7.h
; CHECK-NEXT: uunpklo z22.s, z20.h
; CHECK-NEXT: ext z20.b, z20.b, z20.b, #8
; CHECK-NEXT: uunpklo z20.s, z20.h
; CHECK-NEXT: udivr z7.s, p0/m, z7.s, z18.s
; CHECK-NEXT: uunpklo z18.h, z17.b
; CHECK-NEXT: uzp1 z5.h, z19.h, z19.h
; CHECK-NEXT: udivr z17.s, p0/m, z17.s, z18.s
; CHECK-NEXT: uunpklo z18.h, z4.b
; CHECK-NEXT: uzp1 z16.h, z19.h, z19.h
; CHECK-NEXT: uunpklo z21.s, z18.h
; CHECK-NEXT: ext z18.b, z18.b, z18.b, #8
; CHECK-NEXT: uunpklo z18.s, z18.h
; CHECK-NEXT: udivr z21.s, p0/m, z21.s, z22.s
; CHECK-NEXT: uzp1 z7.h, z7.h, z7.h
; CHECK-NEXT: uzp1 z17.h, z17.h, z17.h
; CHECK-NEXT: udivr z18.s, p0/m, z18.s, z20.s
; CHECK-NEXT: ptrue p0.h, vl4
; CHECK-NEXT: uzp1 z19.h, z21.h, z21.h
; CHECK-NEXT: splice z5.h, p0, z5.h, z7.h
; CHECK-NEXT: splice z4.h, p0, z4.h, z2.h
; CHECK-NEXT: splice z6.h, p0, z6.h, z3.h
; CHECK-NEXT: uzp1 z2.b, z5.b, z5.b
; CHECK-NEXT: uzp1 z4.b, z4.b, z4.b
; CHECK-NEXT: uzp1 z5.b, z6.b, z6.b
; CHECK-NEXT: splice z2.h, p0, z2.h, z5.h
; CHECK-NEXT: splice z6.h, p0, z6.h, z7.h
; CHECK-NEXT: splice z16.h, p0, z16.h, z17.h
; CHECK-NEXT: uzp1 z2.b, z2.b, z2.b
; CHECK-NEXT: uzp1 z6.b, z6.b, z6.b
; CHECK-NEXT: uzp1 z5.b, z16.b, z16.b
; CHECK-NEXT: uzp1 z18.h, z18.h, z18.h
; CHECK-NEXT: splice z19.h, p0, z19.h, z18.h
; CHECK-NEXT: ptrue p0.b, vl8
; CHECK-NEXT: uzp1 z3.b, z19.b, z19.b
; CHECK-NEXT: splice z5.b, p0, z5.b, z4.b
; CHECK-NEXT: splice z3.b, p0, z3.b, z2.b
; CHECK-NEXT: movprfx z2, z16
; CHECK-NEXT: mls z2.b, p1/m, z3.b, z17.b
; CHECK-NEXT: mls z0.b, p1/m, z5.b, z1.b
; CHECK-NEXT: uzp1 z7.b, z19.b, z19.b
; CHECK-NEXT: splice z6.b, p0, z6.b, z2.b
; CHECK-NEXT: splice z7.b, p0, z7.b, z5.b
; CHECK-NEXT: movprfx z2, z3
; CHECK-NEXT: mls z2.b, p1/m, z7.b, z4.b
; CHECK-NEXT: mls z0.b, p1/m, z6.b, z1.b
; CHECK-NEXT: stp q2, q0, [x0]
; CHECK-NEXT: ret
%op1 = load <32 x i8>, ptr %a
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -80,119 +80,119 @@ define void @zip_v32i16(ptr %a, ptr %b) {
; CHECK: // %bb.0:
; CHECK-NEXT: sub sp, sp, #64
; CHECK-NEXT: .cfi_def_cfa_offset 64
; CHECK-NEXT: ldp q2, q4, [x1]
; CHECK-NEXT: ldp q0, q5, [x0]
; CHECK-NEXT: ldp q1, q6, [x0, #32]
; CHECK-NEXT: mov z16.h, z4.h[7]
; CHECK-NEXT: mov z18.h, z4.h[6]
; CHECK-NEXT: mov z17.h, z5.h[7]
; CHECK-NEXT: ldp q3, q7, [x1, #32]
; CHECK-NEXT: mov z19.h, z5.h[6]
; CHECK-NEXT: ldp q1, q3, [x1]
; CHECK-NEXT: ldp q0, q4, [x0]
; CHECK-NEXT: ldp q2, q6, [x0, #32]
; CHECK-NEXT: mov z16.h, z3.h[7]
; CHECK-NEXT: mov z18.h, z3.h[6]
; CHECK-NEXT: mov z17.h, z4.h[7]
; CHECK-NEXT: ldp q5, q7, [x1, #32]
; CHECK-NEXT: mov z19.h, z4.h[6]
; CHECK-NEXT: fmov w8, s16
; CHECK-NEXT: mov z16.h, z4.h[5]
; CHECK-NEXT: mov z16.h, z3.h[5]
; CHECK-NEXT: fmov w9, s17
; CHECK-NEXT: mov z17.h, z5.h[5]
; CHECK-NEXT: mov z17.h, z4.h[5]
; CHECK-NEXT: mov z20.h, z7.h[6]
; CHECK-NEXT: strh w8, [sp, #30]
; CHECK-NEXT: fmov w8, s18
; CHECK-NEXT: mov z18.h, z4.h[4]
; CHECK-NEXT: mov z18.h, z3.h[4]
; CHECK-NEXT: strh w9, [sp, #28]
; CHECK-NEXT: fmov w9, s19
; CHECK-NEXT: mov z19.h, z7.h[6]
; CHECK-NEXT: zip1 z4.h, z5.h, z4.h
; CHECK-NEXT: mov z19.h, z6.h[7]
; CHECK-NEXT: zip1 z3.h, z4.h, z3.h
; CHECK-NEXT: strh w8, [sp, #26]
; CHECK-NEXT: fmov w8, s16
; CHECK-NEXT: mov z16.h, z5.h[4]
; CHECK-NEXT: mov z16.h, z4.h[4]
; CHECK-NEXT: strh w9, [sp, #24]
; CHECK-NEXT: zip1 z5.h, z6.h, z7.h
; CHECK-NEXT: zip1 z4.h, z6.h, z7.h
; CHECK-NEXT: strh w8, [sp, #22]
; CHECK-NEXT: fmov w8, s17
; CHECK-NEXT: mov z17.h, z2.h[7]
; CHECK-NEXT: mov z17.h, z1.h[7]
; CHECK-NEXT: add z3.h, z3.h, z4.h
; CHECK-NEXT: strh w8, [sp, #20]
; CHECK-NEXT: fmov w8, s18
; CHECK-NEXT: mov z18.h, z0.h[7]
; CHECK-NEXT: strh w8, [sp, #18]
; CHECK-NEXT: fmov w8, s16
; CHECK-NEXT: mov z16.h, z2.h[6]
; CHECK-NEXT: mov z16.h, z1.h[6]
; CHECK-NEXT: strh w8, [sp, #16]
; CHECK-NEXT: fmov w8, s17
; CHECK-NEXT: mov z17.h, z0.h[6]
; CHECK-NEXT: strh w8, [sp, #62]
; CHECK-NEXT: fmov w8, s18
; CHECK-NEXT: mov z18.h, z2.h[5]
; CHECK-NEXT: mov z18.h, z1.h[5]
; CHECK-NEXT: strh w8, [sp, #60]
; CHECK-NEXT: fmov w8, s16
; CHECK-NEXT: mov z16.h, z0.h[5]
; CHECK-NEXT: strh w8, [sp, #58]
; CHECK-NEXT: fmov w8, s17
; CHECK-NEXT: mov z17.h, z2.h[4]
; CHECK-NEXT: mov z17.h, z1.h[4]
; CHECK-NEXT: strh w8, [sp, #56]
; CHECK-NEXT: fmov w8, s18
; CHECK-NEXT: mov z18.h, z0.h[4]
; CHECK-NEXT: zip1 z0.h, z0.h, z2.h
; CHECK-NEXT: zip1 z0.h, z0.h, z1.h
; CHECK-NEXT: zip1 z1.h, z2.h, z5.h
; CHECK-NEXT: strh w8, [sp, #54]
; CHECK-NEXT: fmov w8, s16
; CHECK-NEXT: mov z16.h, z7.h[7]
; CHECK-NEXT: ldr q16, [sp, #16]
; CHECK-NEXT: add z0.h, z0.h, z1.h
; CHECK-NEXT: strh w8, [sp, #52]
; CHECK-NEXT: fmov w8, s17
; CHECK-NEXT: mov z17.h, z6.h[7]
; CHECK-NEXT: strh w8, [sp, #50]
; CHECK-NEXT: fmov w8, s18
; CHECK-NEXT: ldr q18, [sp, #16]
; CHECK-NEXT: mov z18.h, z7.h[7]
; CHECK-NEXT: strh w8, [sp, #48]
; CHECK-NEXT: fmov w8, s16
; CHECK-NEXT: mov z16.h, z6.h[6]
; CHECK-NEXT: ldr q20, [sp, #48]
; CHECK-NEXT: fmov w8, s18
; CHECK-NEXT: mov z18.h, z6.h[6]
; CHECK-NEXT: ldr q17, [sp, #48]
; CHECK-NEXT: strh w8, [sp, #46]
; CHECK-NEXT: fmov w8, s17
; CHECK-NEXT: mov z17.h, z7.h[5]
; CHECK-NEXT: strh w8, [sp, #44]
; CHECK-NEXT: fmov w8, s19
; CHECK-NEXT: mov z19.h, z6.h[5]
; CHECK-NEXT: mov z19.h, z7.h[5]
; CHECK-NEXT: strh w8, [sp, #44]
; CHECK-NEXT: fmov w8, s20
; CHECK-NEXT: mov z20.h, z6.h[5]
; CHECK-NEXT: strh w8, [sp, #42]
; CHECK-NEXT: fmov w8, s16
; CHECK-NEXT: mov z16.h, z7.h[4]
; CHECK-NEXT: fmov w8, s18
; CHECK-NEXT: mov z18.h, z7.h[4]
; CHECK-NEXT: strh w8, [sp, #40]
; CHECK-NEXT: fmov w8, s17
; CHECK-NEXT: mov z17.h, z6.h[4]
; CHECK-NEXT: strh w8, [sp, #38]
; CHECK-NEXT: fmov w8, s19
; CHECK-NEXT: mov z19.h, z3.h[7]
; CHECK-NEXT: mov z19.h, z6.h[4]
; CHECK-NEXT: strh w8, [sp, #38]
; CHECK-NEXT: fmov w8, s20
; CHECK-NEXT: mov z20.h, z5.h[7]
; CHECK-NEXT: strh w8, [sp, #36]
; CHECK-NEXT: fmov w8, s16
; CHECK-NEXT: mov z16.h, z1.h[7]
; CHECK-NEXT: fmov w8, s18
; CHECK-NEXT: mov z18.h, z2.h[7]
; CHECK-NEXT: strh w8, [sp, #34]
; CHECK-NEXT: fmov w8, s17
; CHECK-NEXT: mov z17.h, z3.h[6]
; CHECK-NEXT: strh w8, [sp, #32]
; CHECK-NEXT: fmov w8, s19
; CHECK-NEXT: mov z19.h, z1.h[6]
; CHECK-NEXT: ldr q2, [sp, #32]
; CHECK-NEXT: mov z19.h, z5.h[6]
; CHECK-NEXT: strh w8, [sp, #32]
; CHECK-NEXT: fmov w8, s20
; CHECK-NEXT: mov z20.h, z2.h[6]
; CHECK-NEXT: strh w8, [sp, #14]
; CHECK-NEXT: fmov w8, s16
; CHECK-NEXT: mov z16.h, z3.h[5]
; CHECK-NEXT: add z2.h, z18.h, z2.h
; CHECK-NEXT: fmov w8, s18
; CHECK-NEXT: mov z18.h, z5.h[5]
; CHECK-NEXT: strh w8, [sp, #12]
; CHECK-NEXT: fmov w8, s17
; CHECK-NEXT: mov z17.h, z1.h[5]
; CHECK-NEXT: strh w8, [sp, #10]
; CHECK-NEXT: fmov w8, s19
; CHECK-NEXT: mov z19.h, z3.h[4]
; CHECK-NEXT: fmov w9, s17
; CHECK-NEXT: mov z19.h, z2.h[5]
; CHECK-NEXT: strh w8, [sp, #10]
; CHECK-NEXT: fmov w8, s20
; CHECK-NEXT: mov z20.h, z5.h[4]
; CHECK-NEXT: fmov w9, s19
; CHECK-NEXT: strh w8, [sp, #8]
; CHECK-NEXT: fmov w8, s16
; CHECK-NEXT: mov z16.h, z1.h[4]
; CHECK-NEXT: zip1 z1.h, z1.h, z3.h
; CHECK-NEXT: fmov w8, s18
; CHECK-NEXT: mov z18.h, z2.h[4]
; CHECK-NEXT: strh w9, [sp, #4]
; CHECK-NEXT: add z3.h, z4.h, z5.h
; CHECK-NEXT: ldr q2, [sp, #32]
; CHECK-NEXT: strh w8, [sp, #6]
; CHECK-NEXT: fmov w8, s19
; CHECK-NEXT: add z0.h, z0.h, z1.h
; CHECK-NEXT: fmov w8, s20
; CHECK-NEXT: add z2.h, z16.h, z2.h
; CHECK-NEXT: strh w8, [sp, #2]
; CHECK-NEXT: fmov w8, s16
; CHECK-NEXT: fmov w8, s18
; CHECK-NEXT: strh w8, [sp]
; CHECK-NEXT: ldr q4, [sp]
; CHECK-NEXT: stp q3, q2, [x0, #32]
; CHECK-NEXT: add z1.h, z20.h, z4.h
; CHECK-NEXT: add z1.h, z17.h, z4.h
; CHECK-NEXT: stp q0, q1, [x0]
; CHECK-NEXT: add sp, sp, #64
; CHECK-NEXT: ret
Expand Down Expand Up @@ -956,20 +956,22 @@ define void @uzp_v16i16(ptr %a, ptr %b) #0{
; CHECK: // %bb.0:
; CHECK-NEXT: sub sp, sp, #64
; CHECK-NEXT: .cfi_def_cfa_offset 64
; CHECK-NEXT: ldp q3, q2, [x0]
; CHECK-NEXT: ldp q2, q3, [x0]
; CHECK-NEXT: ldp q0, q1, [x1]
; CHECK-NEXT: mov z4.h, z2.h[6]
; CHECK-NEXT: fmov w8, s2
; CHECK-NEXT: mov z6.h, z2.h[2]
; CHECK-NEXT: mov z5.h, z2.h[4]
; CHECK-NEXT: fmov w9, s3
; CHECK-NEXT: mov z7.h, z3.h[6]
; CHECK-NEXT: mov z4.h, z3.h[6]
; CHECK-NEXT: fmov w8, s3
; CHECK-NEXT: mov z6.h, z3.h[2]
; CHECK-NEXT: mov z5.h, z3.h[4]
; CHECK-NEXT: fmov w9, s2
; CHECK-NEXT: mov z7.h, z2.h[6]
; CHECK-NEXT: mov z17.h, z2.h[7]
; CHECK-NEXT: mov z16.h, z3.h[1]
; CHECK-NEXT: strh w8, [sp, #40]
; CHECK-NEXT: fmov w8, s4
; CHECK-NEXT: mov z4.h, z3.h[4]
; CHECK-NEXT: mov z4.h, z2.h[4]
; CHECK-NEXT: strh w9, [sp, #32]
; CHECK-NEXT: fmov w9, s5
; CHECK-NEXT: mov z5.h, z3.h[2]
; CHECK-NEXT: mov z5.h, z2.h[2]
; CHECK-NEXT: strh w8, [sp, #46]
; CHECK-NEXT: fmov w8, s6
; CHECK-NEXT: mov z6.h, z1.h[2]
Expand All @@ -980,12 +982,13 @@ define void @uzp_v16i16(ptr %a, ptr %b) #0{
; CHECK-NEXT: fmov w8, s4
; CHECK-NEXT: mov z4.h, z1.h[6]
; CHECK-NEXT: strh w9, [sp, #38]
; CHECK-NEXT: fmov w9, s16
; CHECK-NEXT: strh w8, [sp, #36]
; CHECK-NEXT: fmov w8, s5
; CHECK-NEXT: mov z5.h, z1.h[4]
; CHECK-NEXT: strh w9, [sp, #56]
; CHECK-NEXT: strh w8, [sp, #34]
; CHECK-NEXT: fmov w8, s1
; CHECK-NEXT: ldr q16, [sp, #32]
; CHECK-NEXT: strh w8, [sp, #8]
; CHECK-NEXT: fmov w8, s0
; CHECK-NEXT: strh w8, [sp]
Expand All @@ -996,66 +999,63 @@ define void @uzp_v16i16(ptr %a, ptr %b) #0{
; CHECK-NEXT: mov z5.h, z0.h[2]
; CHECK-NEXT: strh w8, [sp, #12]
; CHECK-NEXT: fmov w8, s6
; CHECK-NEXT: mov z6.h, z2.h[7]
; CHECK-NEXT: mov z6.h, z3.h[7]
; CHECK-NEXT: strh w8, [sp, #10]
; CHECK-NEXT: fmov w8, s7
; CHECK-NEXT: mov z7.h, z3.h[7]
; CHECK-NEXT: mov z7.h, z3.h[5]
; CHECK-NEXT: strh w8, [sp, #6]
; CHECK-NEXT: fmov w8, s4
; CHECK-NEXT: mov z4.h, z2.h[5]
; CHECK-NEXT: strh w8, [sp, #4]
; CHECK-NEXT: fmov w8, s5
; CHECK-NEXT: mov z5.h, z2.h[3]
; CHECK-NEXT: mov z2.h, z2.h[1]
; CHECK-NEXT: mov z5.h, z3.h[3]
; CHECK-NEXT: ldr q3, [sp, #32]
; CHECK-NEXT: strh w8, [sp, #2]
; CHECK-NEXT: fmov w8, s6
; CHECK-NEXT: fmov w9, s2
; CHECK-NEXT: mov z2.h, z3.h[1]
; CHECK-NEXT: ldr q6, [sp]
; CHECK-NEXT: mov z6.h, z2.h[5]
; CHECK-NEXT: ldr q4, [sp]
; CHECK-NEXT: strh w8, [sp, #62]
; CHECK-NEXT: fmov w8, s4
; CHECK-NEXT: mov z4.h, z3.h[5]
; CHECK-NEXT: strh w9, [sp, #56]
; CHECK-NEXT: fmov w9, s2
; CHECK-NEXT: mov z2.h, z0.h[7]
; CHECK-NEXT: fmov w8, s7
; CHECK-NEXT: mov z7.h, z1.h[7]
; CHECK-NEXT: strh w8, [sp, #60]
; CHECK-NEXT: fmov w8, s5
; CHECK-NEXT: mov z5.h, z3.h[3]
; CHECK-NEXT: mov z3.h, z1.h[7]
; CHECK-NEXT: strh w9, [sp, #48]
; CHECK-NEXT: mov z5.h, z2.h[3]
; CHECK-NEXT: mov z2.h, z2.h[1]
; CHECK-NEXT: strh w8, [sp, #58]
; CHECK-NEXT: fmov w8, s7
; CHECK-NEXT: fmov w8, s17
; CHECK-NEXT: fmov w9, s2
; CHECK-NEXT: mov z2.h, z0.h[7]
; CHECK-NEXT: strh w8, [sp, #54]
; CHECK-NEXT: fmov w8, s4
; CHECK-NEXT: mov z4.h, z1.h[5]
; CHECK-NEXT: fmov w8, s6
; CHECK-NEXT: mov z6.h, z1.h[5]
; CHECK-NEXT: strh w9, [sp, #48]
; CHECK-NEXT: strh w8, [sp, #52]
; CHECK-NEXT: fmov w8, s5
; CHECK-NEXT: mov z5.h, z1.h[3]
; CHECK-NEXT: mov z1.h, z1.h[1]
; CHECK-NEXT: strh w8, [sp, #50]
; CHECK-NEXT: fmov w8, s3
; CHECK-NEXT: mov z3.h, z0.h[5]
; CHECK-NEXT: fmov w8, s7
; CHECK-NEXT: strh w8, [sp, #30]
; CHECK-NEXT: fmov w8, s4
; CHECK-NEXT: mov z4.h, z0.h[3]
; CHECK-NEXT: mov z0.h, z0.h[1]
; CHECK-NEXT: fmov w8, s6
; CHECK-NEXT: mov z6.h, z0.h[5]
; CHECK-NEXT: strh w8, [sp, #28]
; CHECK-NEXT: fmov w8, s5
; CHECK-NEXT: mov z5.h, z0.h[3]
; CHECK-NEXT: mov z0.h, z0.h[1]
; CHECK-NEXT: strh w8, [sp, #26]
; CHECK-NEXT: fmov w8, s1
; CHECK-NEXT: strh w8, [sp, #24]
; CHECK-NEXT: fmov w8, s2
; CHECK-NEXT: strh w8, [sp, #22]
; CHECK-NEXT: fmov w8, s3
; CHECK-NEXT: fmov w8, s6
; CHECK-NEXT: strh w8, [sp, #20]
; CHECK-NEXT: fmov w8, s4
; CHECK-NEXT: fmov w8, s5
; CHECK-NEXT: strh w8, [sp, #18]
; CHECK-NEXT: fmov w8, s0
; CHECK-NEXT: ldr q0, [sp, #48]
; CHECK-NEXT: add z0.h, z16.h, z0.h
; CHECK-NEXT: add z0.h, z3.h, z0.h
; CHECK-NEXT: strh w8, [sp, #16]
; CHECK-NEXT: ldr q1, [sp, #16]
; CHECK-NEXT: add z1.h, z6.h, z1.h
; CHECK-NEXT: add z1.h, z4.h, z1.h
; CHECK-NEXT: stp q0, q1, [x0]
; CHECK-NEXT: add sp, sp, #64
; CHECK-NEXT: ret
Expand Down Expand Up @@ -1133,45 +1133,45 @@ define void @uzp_v8i16(ptr %a, ptr %b) #0{
; CHECK: // %bb.0:
; CHECK-NEXT: sub sp, sp, #32
; CHECK-NEXT: .cfi_def_cfa_offset 32
; CHECK-NEXT: ldr q0, [x1]
; CHECK-NEXT: ldr q1, [x0]
; CHECK-NEXT: mov z2.h, z0.h[6]
; CHECK-NEXT: fmov w8, s0
; CHECK-NEXT: mov z4.h, z0.h[2]
; CHECK-NEXT: mov z6.h, z1.h[4]
; CHECK-NEXT: mov z3.h, z0.h[4]
; CHECK-NEXT: fmov w9, s1
; CHECK-NEXT: mov z5.h, z1.h[6]
; CHECK-NEXT: ldr q1, [x1]
; CHECK-NEXT: ldr q0, [x0]
; CHECK-NEXT: mov z2.h, z1.h[6]
; CHECK-NEXT: fmov w8, s1
; CHECK-NEXT: mov z4.h, z1.h[2]
; CHECK-NEXT: mov z6.h, z0.h[4]
; CHECK-NEXT: mov z3.h, z1.h[4]
; CHECK-NEXT: fmov w9, s0
; CHECK-NEXT: mov z5.h, z0.h[6]
; CHECK-NEXT: strh w8, [sp, #8]
; CHECK-NEXT: fmov w8, s2
; CHECK-NEXT: mov z2.h, z1.h[2]
; CHECK-NEXT: mov z2.h, z0.h[2]
; CHECK-NEXT: strh w9, [sp]
; CHECK-NEXT: fmov w9, s3
; CHECK-NEXT: mov z3.h, z0.h[7]
; CHECK-NEXT: mov z3.h, z1.h[7]
; CHECK-NEXT: strh w8, [sp, #14]
; CHECK-NEXT: fmov w8, s4
; CHECK-NEXT: mov z4.h, z0.h[5]
; CHECK-NEXT: mov z4.h, z1.h[5]
; CHECK-NEXT: strh w9, [sp, #12]
; CHECK-NEXT: fmov w9, s5
; CHECK-NEXT: mov z5.h, z0.h[3]
; CHECK-NEXT: mov z0.h, z0.h[1]
; CHECK-NEXT: mov z5.h, z1.h[3]
; CHECK-NEXT: mov z1.h, z1.h[1]
; CHECK-NEXT: strh w8, [sp, #10]
; CHECK-NEXT: fmov w8, s6
; CHECK-NEXT: strh w9, [sp, #6]
; CHECK-NEXT: fmov w9, s0
; CHECK-NEXT: mov z0.h, z1.h[1]
; CHECK-NEXT: fmov w9, s1
; CHECK-NEXT: strh w8, [sp, #4]
; CHECK-NEXT: fmov w8, s2
; CHECK-NEXT: mov z2.h, z1.h[7]
; CHECK-NEXT: mov z2.h, z0.h[7]
; CHECK-NEXT: strh w9, [sp, #24]
; CHECK-NEXT: strh w8, [sp, #2]
; CHECK-NEXT: fmov w8, s3
; CHECK-NEXT: strh w8, [sp, #30]
; CHECK-NEXT: fmov w8, s4
; CHECK-NEXT: mov z4.h, z1.h[5]
; CHECK-NEXT: mov z4.h, z0.h[5]
; CHECK-NEXT: strh w8, [sp, #28]
; CHECK-NEXT: fmov w8, s5
; CHECK-NEXT: mov z5.h, z1.h[3]
; CHECK-NEXT: mov z5.h, z0.h[3]
; CHECK-NEXT: mov z0.h, z0.h[1]
; CHECK-NEXT: strh w8, [sp, #26]
; CHECK-NEXT: fmov w8, s2
; CHECK-NEXT: strh w8, [sp, #22]
Expand Down
39 changes: 19 additions & 20 deletions llvm/test/CodeGen/AArch64/swifterror.ll
Original file line number Diff line number Diff line change
Expand Up @@ -1618,7 +1618,7 @@ define swiftcc { i64, i64, i64, i64, i64, i64, i64, i64 } @params_and_return_in_
; CHECK-APPLE-LABEL: params_and_return_in_reg:
; CHECK-APPLE: ; %bb.0:
; CHECK-APPLE-NEXT: sub sp, sp, #128
; CHECK-APPLE-NEXT: stp x21, x28, [sp, #24] ; 16-byte Folded Spill
; CHECK-APPLE-NEXT: stp x20, x28, [sp, #24] ; 16-byte Folded Spill
; CHECK-APPLE-NEXT: stp x27, x26, [sp, #48] ; 16-byte Folded Spill
; CHECK-APPLE-NEXT: stp x25, x24, [sp, #64] ; 16-byte Folded Spill
; CHECK-APPLE-NEXT: stp x23, x22, [sp, #80] ; 16-byte Folded Spill
Expand All @@ -1637,8 +1637,8 @@ define swiftcc { i64, i64, i64, i64, i64, i64, i64, i64 } @params_and_return_in_
; CHECK-APPLE-NEXT: .cfi_offset w26, -72
; CHECK-APPLE-NEXT: .cfi_offset w27, -80
; CHECK-APPLE-NEXT: .cfi_offset w28, -96
; CHECK-APPLE-NEXT: str x20, [sp, #8] ; 8-byte Folded Spill
; CHECK-APPLE-NEXT: mov x23, x7
; CHECK-APPLE-NEXT: mov x23, x21
; CHECK-APPLE-NEXT: str x7, [sp, #16] ; 8-byte Folded Spill
; CHECK-APPLE-NEXT: mov x24, x6
; CHECK-APPLE-NEXT: mov x25, x5
; CHECK-APPLE-NEXT: mov x26, x4
Expand All @@ -1657,26 +1657,25 @@ define swiftcc { i64, i64, i64, i64, i64, i64, i64, i64 } @params_and_return_in_
; CHECK-APPLE-NEXT: mov x20, xzr
; CHECK-APPLE-NEXT: mov x21, xzr
; CHECK-APPLE-NEXT: bl _params_in_reg2
; CHECK-APPLE-NEXT: str x21, [sp, #16] ; 8-byte Folded Spill
; CHECK-APPLE-NEXT: str x21, [sp, #8] ; 8-byte Folded Spill
; CHECK-APPLE-NEXT: mov x0, x22
; CHECK-APPLE-NEXT: mov x1, x19
; CHECK-APPLE-NEXT: mov x2, x28
; CHECK-APPLE-NEXT: mov x3, x27
; CHECK-APPLE-NEXT: mov x4, x26
; CHECK-APPLE-NEXT: mov x5, x25
; CHECK-APPLE-NEXT: mov x6, x24
; CHECK-APPLE-NEXT: mov x7, x23
; CHECK-APPLE-NEXT: ldr x20, [sp, #8] ; 8-byte Folded Reload
; CHECK-APPLE-NEXT: ldr x21, [sp, #24] ; 8-byte Folded Reload
; CHECK-APPLE-NEXT: ldp x7, x20, [sp, #16] ; 16-byte Folded Reload
; CHECK-APPLE-NEXT: mov x21, x23
; CHECK-APPLE-NEXT: bl _params_and_return_in_reg2
; CHECK-APPLE-NEXT: mov x19, x0
; CHECK-APPLE-NEXT: mov x22, x1
; CHECK-APPLE-NEXT: mov x23, x2
; CHECK-APPLE-NEXT: mov x24, x3
; CHECK-APPLE-NEXT: mov x25, x4
; CHECK-APPLE-NEXT: mov x26, x5
; CHECK-APPLE-NEXT: mov x27, x6
; CHECK-APPLE-NEXT: mov x28, x7
; CHECK-APPLE-NEXT: mov x24, x2
; CHECK-APPLE-NEXT: mov x25, x3
; CHECK-APPLE-NEXT: mov x26, x4
; CHECK-APPLE-NEXT: mov x27, x5
; CHECK-APPLE-NEXT: mov x28, x6
; CHECK-APPLE-NEXT: mov x23, x7
; CHECK-APPLE-NEXT: str x21, [sp, #24] ; 8-byte Folded Spill
; CHECK-APPLE-NEXT: mov w0, #1 ; =0x1
; CHECK-APPLE-NEXT: mov w1, #2 ; =0x2
Expand All @@ -1687,16 +1686,16 @@ define swiftcc { i64, i64, i64, i64, i64, i64, i64, i64 } @params_and_return_in_
; CHECK-APPLE-NEXT: mov w6, #7 ; =0x7
; CHECK-APPLE-NEXT: mov w7, #8 ; =0x8
; CHECK-APPLE-NEXT: mov x20, xzr
; CHECK-APPLE-NEXT: ldr x21, [sp, #16] ; 8-byte Folded Reload
; CHECK-APPLE-NEXT: ldr x21, [sp, #8] ; 8-byte Folded Reload
; CHECK-APPLE-NEXT: bl _params_in_reg2
; CHECK-APPLE-NEXT: mov x0, x19
; CHECK-APPLE-NEXT: mov x1, x22
; CHECK-APPLE-NEXT: mov x2, x23
; CHECK-APPLE-NEXT: mov x3, x24
; CHECK-APPLE-NEXT: mov x4, x25
; CHECK-APPLE-NEXT: mov x5, x26
; CHECK-APPLE-NEXT: mov x6, x27
; CHECK-APPLE-NEXT: mov x7, x28
; CHECK-APPLE-NEXT: mov x2, x24
; CHECK-APPLE-NEXT: mov x3, x25
; CHECK-APPLE-NEXT: mov x4, x26
; CHECK-APPLE-NEXT: mov x5, x27
; CHECK-APPLE-NEXT: mov x6, x28
; CHECK-APPLE-NEXT: mov x7, x23
; CHECK-APPLE-NEXT: ldp x21, x28, [sp, #24] ; 16-byte Folded Reload
; CHECK-APPLE-NEXT: ldp x29, x30, [sp, #112] ; 16-byte Folded Reload
; CHECK-APPLE-NEXT: ldp x20, x19, [sp, #96] ; 16-byte Folded Reload
Expand Down
8 changes: 4 additions & 4 deletions llvm/test/CodeGen/AArch64/vec-libcalls.ll
Original file line number Diff line number Diff line change
Expand Up @@ -52,15 +52,15 @@ define <2 x float> @sin_v2f32(<2 x float> %x) nounwind {
; CHECK: // %bb.0:
; CHECK-NEXT: sub sp, sp, #48
; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0
; CHECK-NEXT: str q0, [sp, #16] // 16-byte Folded Spill
; CHECK-NEXT: str q0, [sp] // 16-byte Folded Spill
; CHECK-NEXT: mov s0, v0.s[1]
; CHECK-NEXT: str x30, [sp, #32] // 8-byte Folded Spill
; CHECK-NEXT: bl sinf
; CHECK-NEXT: str d0, [sp] // 16-byte Folded Spill
; CHECK-NEXT: ldr q0, [sp, #16] // 16-byte Folded Reload
; CHECK-NEXT: str d0, [sp, #16] // 16-byte Folded Spill
; CHECK-NEXT: ldr q0, [sp] // 16-byte Folded Reload
; CHECK-NEXT: // kill: def $s0 killed $s0 killed $q0
; CHECK-NEXT: bl sinf
; CHECK-NEXT: ldr q1, [sp] // 16-byte Folded Reload
; CHECK-NEXT: ldr q1, [sp, #16] // 16-byte Folded Reload
; CHECK-NEXT: // kill: def $s0 killed $s0 def $q0
; CHECK-NEXT: ldr x30, [sp, #32] // 8-byte Folded Reload
; CHECK-NEXT: mov v0.s[1], v1.s[0]
Expand Down
54 changes: 27 additions & 27 deletions llvm/test/CodeGen/AArch64/vector-fcopysign.ll
Original file line number Diff line number Diff line change
Expand Up @@ -405,60 +405,60 @@ define <8 x half> @test_copysign_v8f16_v8f32(<8 x half> %a, <8 x float> %b) #0 {
; NOFP16-LABEL: test_copysign_v8f16_v8f32:
; NOFP16: ; %bb.0:
; NOFP16-NEXT: fcvtn v1.4h, v1.4s
; NOFP16-NEXT: mov h3, v0[1]
; NOFP16-NEXT: mov h4, v0[1]
; NOFP16-NEXT: fcvt s6, h0
; NOFP16-NEXT: mvni.4s v5, #128, lsl #24
; NOFP16-NEXT: mvni.4s v3, #128, lsl #24
; NOFP16-NEXT: mov h7, v0[2]
; NOFP16-NEXT: fcvtn v2.4h, v2.4s
; NOFP16-NEXT: mov h4, v1[1]
; NOFP16-NEXT: mov h5, v1[1]
; NOFP16-NEXT: fcvt s16, h1
; NOFP16-NEXT: fcvt s3, h3
; NOFP16-NEXT: fcvt s4, h4
; NOFP16-NEXT: mov h17, v1[2]
; NOFP16-NEXT: mov h1, v1[3]
; NOFP16-NEXT: fcvt s7, h7
; NOFP16-NEXT: fcvt s4, h4
; NOFP16-NEXT: bif.16b v6, v16, v5
; NOFP16-NEXT: fcvt s5, h5
; NOFP16-NEXT: bif.16b v6, v16, v3
; NOFP16-NEXT: mov h16, v0[3]
; NOFP16-NEXT: fcvt s17, h17
; NOFP16-NEXT: fcvt s18, h1
; NOFP16-NEXT: bif.16b v3, v4, v5
; NOFP16-NEXT: bif.16b v4, v5, v3
; NOFP16-NEXT: fcvt h1, s6
; NOFP16-NEXT: mov.16b v6, v5
; NOFP16-NEXT: mov h4, v0[4]
; NOFP16-NEXT: mov.16b v6, v3
; NOFP16-NEXT: mov h5, v0[4]
; NOFP16-NEXT: fcvt s16, h16
; NOFP16-NEXT: bsl.16b v6, v7, v17
; NOFP16-NEXT: mov h7, v0[5]
; NOFP16-NEXT: mov h17, v2[1]
; NOFP16-NEXT: fcvt h3, s3
; NOFP16-NEXT: fcvt s4, h4
; NOFP16-NEXT: bif.16b v16, v18, v5
; NOFP16-NEXT: fcvt h4, s4
; NOFP16-NEXT: fcvt s5, h5
; NOFP16-NEXT: bif.16b v16, v18, v3
; NOFP16-NEXT: fcvt h6, s6
; NOFP16-NEXT: fcvt s7, h7
; NOFP16-NEXT: fcvt s17, h17
; NOFP16-NEXT: mov.h v1[1], v3[0]
; NOFP16-NEXT: fcvt s3, h2
; NOFP16-NEXT: bif.16b v7, v17, v5
; NOFP16-NEXT: bit.16b v3, v4, v5
; NOFP16-NEXT: fcvt h4, s16
; NOFP16-NEXT: mov.h v1[1], v4[0]
; NOFP16-NEXT: fcvt s4, h2
; NOFP16-NEXT: bif.16b v7, v17, v3
; NOFP16-NEXT: bit.16b v4, v5, v3
; NOFP16-NEXT: fcvt h5, s16
; NOFP16-NEXT: mov.h v1[2], v6[0]
; NOFP16-NEXT: mov h6, v0[6]
; NOFP16-NEXT: mov h16, v2[2]
; NOFP16-NEXT: mov h0, v0[7]
; NOFP16-NEXT: mov h2, v2[3]
; NOFP16-NEXT: mov.h v1[3], v4[0]
; NOFP16-NEXT: fcvt h3, s3
; NOFP16-NEXT: fcvt s4, h6
; NOFP16-NEXT: mov.h v1[3], v5[0]
; NOFP16-NEXT: fcvt h4, s4
; NOFP16-NEXT: fcvt s5, h6
; NOFP16-NEXT: fcvt s6, h16
; NOFP16-NEXT: fcvt s0, h0
; NOFP16-NEXT: fcvt s2, h2
; NOFP16-NEXT: mov.h v1[4], v3[0]
; NOFP16-NEXT: fcvt h3, s7
; NOFP16-NEXT: bif.16b v4, v6, v5
; NOFP16-NEXT: bif.16b v0, v2, v5
; NOFP16-NEXT: mov.h v1[5], v3[0]
; NOFP16-NEXT: fcvt h3, s4
; NOFP16-NEXT: mov.h v1[4], v4[0]
; NOFP16-NEXT: fcvt h4, s7
; NOFP16-NEXT: bif.16b v5, v6, v3
; NOFP16-NEXT: bif.16b v0, v2, v3
; NOFP16-NEXT: mov.h v1[5], v4[0]
; NOFP16-NEXT: fcvt h4, s5
; NOFP16-NEXT: fcvt h0, s0
; NOFP16-NEXT: mov.h v1[6], v3[0]
; NOFP16-NEXT: mov.h v1[6], v4[0]
; NOFP16-NEXT: mov.h v1[7], v0[0]
; NOFP16-NEXT: mov.16b v0, v1
; NOFP16-NEXT: ret
Expand Down
100 changes: 50 additions & 50 deletions llvm/test/CodeGen/AMDGPU/GlobalISel/atomic_optimizations_mul_one.ll
Original file line number Diff line number Diff line change
Expand Up @@ -81,21 +81,21 @@ define amdgpu_cs void @atomic_add_and_format(<4 x i32> inreg %arg) {
;
; GCN-LABEL: atomic_add_and_format:
; GCN: ; %bb.0: ; %.entry
; GCN-NEXT: s_mov_b64 s[4:5], exec
; GCN-NEXT: s_mov_b32 s6, s5
; GCN-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s4, 0
; GCN-NEXT: v_mbcnt_hi_u32_b32_e32 v0, s6, v0
; GCN-NEXT: s_mov_b64 s[6:7], exec
; GCN-NEXT: s_mov_b32 s4, s7
; GCN-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s6, 0
; GCN-NEXT: v_mbcnt_hi_u32_b32_e32 v0, s4, v0
; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
; GCN-NEXT: ; implicit-def: $vgpr1
; GCN-NEXT: s_and_saveexec_b64 s[6:7], vcc
; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc
; GCN-NEXT: s_cbranch_execz .LBB1_2
; GCN-NEXT: ; %bb.1:
; GCN-NEXT: s_bcnt1_i32_b64 s4, s[4:5]
; GCN-NEXT: v_mov_b32_e32 v1, s4
; GCN-NEXT: s_bcnt1_i32_b64 s6, s[6:7]
; GCN-NEXT: v_mov_b32_e32 v1, s6
; GCN-NEXT: v_mov_b32_e32 v2, 0
; GCN-NEXT: buffer_atomic_add v1, v2, s[0:3], 0 idxen glc
; GCN-NEXT: .LBB1_2:
; GCN-NEXT: s_or_b64 exec, exec, s[6:7]
; GCN-NEXT: s_or_b64 exec, exec, s[4:5]
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: v_readfirstlane_b32 s4, v1
; GCN-NEXT: v_add_i32_e32 v4, vcc, s4, v0
Expand Down Expand Up @@ -179,21 +179,21 @@ define amdgpu_cs void @atomic_sub_and_format(<4 x i32> inreg %arg) {
;
; GCN-LABEL: atomic_sub_and_format:
; GCN: ; %bb.0: ; %.entry
; GCN-NEXT: s_mov_b64 s[4:5], exec
; GCN-NEXT: s_mov_b32 s6, s5
; GCN-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s4, 0
; GCN-NEXT: v_mbcnt_hi_u32_b32_e32 v0, s6, v0
; GCN-NEXT: s_mov_b64 s[6:7], exec
; GCN-NEXT: s_mov_b32 s4, s7
; GCN-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s6, 0
; GCN-NEXT: v_mbcnt_hi_u32_b32_e32 v0, s4, v0
; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
; GCN-NEXT: ; implicit-def: $vgpr1
; GCN-NEXT: s_and_saveexec_b64 s[6:7], vcc
; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc
; GCN-NEXT: s_cbranch_execz .LBB3_2
; GCN-NEXT: ; %bb.1:
; GCN-NEXT: s_bcnt1_i32_b64 s4, s[4:5]
; GCN-NEXT: v_mov_b32_e32 v1, s4
; GCN-NEXT: s_bcnt1_i32_b64 s6, s[6:7]
; GCN-NEXT: v_mov_b32_e32 v1, s6
; GCN-NEXT: v_mov_b32_e32 v2, 0
; GCN-NEXT: buffer_atomic_sub v1, v2, s[0:3], 0 idxen glc
; GCN-NEXT: .LBB3_2:
; GCN-NEXT: s_or_b64 exec, exec, s[6:7]
; GCN-NEXT: s_or_b64 exec, exec, s[4:5]
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: v_readfirstlane_b32 s4, v1
; GCN-NEXT: v_sub_i32_e32 v4, vcc, s4, v0
Expand Down Expand Up @@ -281,22 +281,22 @@ define amdgpu_cs void @atomic_xor_and_format(<4 x i32> inreg %arg) {
;
; GCN-LABEL: atomic_xor_and_format:
; GCN: ; %bb.0: ; %.entry
; GCN-NEXT: s_mov_b64 s[4:5], exec
; GCN-NEXT: s_mov_b32 s6, s5
; GCN-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s4, 0
; GCN-NEXT: v_mbcnt_hi_u32_b32_e32 v0, s6, v0
; GCN-NEXT: s_mov_b64 s[6:7], exec
; GCN-NEXT: s_mov_b32 s4, s7
; GCN-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s6, 0
; GCN-NEXT: v_mbcnt_hi_u32_b32_e32 v0, s4, v0
; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
; GCN-NEXT: ; implicit-def: $vgpr1
; GCN-NEXT: s_and_saveexec_b64 s[6:7], vcc
; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc
; GCN-NEXT: s_cbranch_execz .LBB5_2
; GCN-NEXT: ; %bb.1:
; GCN-NEXT: s_bcnt1_i32_b64 s4, s[4:5]
; GCN-NEXT: s_and_b32 s4, s4, 1
; GCN-NEXT: v_mov_b32_e32 v1, s4
; GCN-NEXT: s_bcnt1_i32_b64 s6, s[6:7]
; GCN-NEXT: s_and_b32 s6, s6, 1
; GCN-NEXT: v_mov_b32_e32 v1, s6
; GCN-NEXT: v_mov_b32_e32 v2, 0
; GCN-NEXT: buffer_atomic_xor v1, v2, s[0:3], 0 idxen glc
; GCN-NEXT: .LBB5_2:
; GCN-NEXT: s_or_b64 exec, exec, s[6:7]
; GCN-NEXT: s_or_b64 exec, exec, s[4:5]
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: v_readfirstlane_b32 s4, v1
; GCN-NEXT: v_and_b32_e32 v0, 1, v0
Expand Down Expand Up @@ -383,21 +383,21 @@ define amdgpu_cs void @atomic_ptr_add_and_format(ptr addrspace(8) inreg %arg) {
;
; GCN-LABEL: atomic_ptr_add_and_format:
; GCN: ; %bb.0: ; %.entry
; GCN-NEXT: s_mov_b64 s[4:5], exec
; GCN-NEXT: s_mov_b32 s6, s5
; GCN-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s4, 0
; GCN-NEXT: v_mbcnt_hi_u32_b32_e32 v0, s6, v0
; GCN-NEXT: s_mov_b64 s[6:7], exec
; GCN-NEXT: s_mov_b32 s4, s7
; GCN-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s6, 0
; GCN-NEXT: v_mbcnt_hi_u32_b32_e32 v0, s4, v0
; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
; GCN-NEXT: ; implicit-def: $vgpr1
; GCN-NEXT: s_and_saveexec_b64 s[6:7], vcc
; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc
; GCN-NEXT: s_cbranch_execz .LBB7_2
; GCN-NEXT: ; %bb.1:
; GCN-NEXT: s_bcnt1_i32_b64 s4, s[4:5]
; GCN-NEXT: v_mov_b32_e32 v1, s4
; GCN-NEXT: s_bcnt1_i32_b64 s6, s[6:7]
; GCN-NEXT: v_mov_b32_e32 v1, s6
; GCN-NEXT: v_mov_b32_e32 v2, 0
; GCN-NEXT: buffer_atomic_add v1, v2, s[0:3], 0 idxen glc
; GCN-NEXT: .LBB7_2:
; GCN-NEXT: s_or_b64 exec, exec, s[6:7]
; GCN-NEXT: s_or_b64 exec, exec, s[4:5]
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: v_readfirstlane_b32 s4, v1
; GCN-NEXT: v_add_i32_e32 v4, vcc, s4, v0
Expand Down Expand Up @@ -485,21 +485,21 @@ define amdgpu_cs void @atomic_ptr_sub_and_format(ptr addrspace(8) inreg %arg) {
;
; GCN-LABEL: atomic_ptr_sub_and_format:
; GCN: ; %bb.0: ; %.entry
; GCN-NEXT: s_mov_b64 s[4:5], exec
; GCN-NEXT: s_mov_b32 s6, s5
; GCN-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s4, 0
; GCN-NEXT: v_mbcnt_hi_u32_b32_e32 v0, s6, v0
; GCN-NEXT: s_mov_b64 s[6:7], exec
; GCN-NEXT: s_mov_b32 s4, s7
; GCN-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s6, 0
; GCN-NEXT: v_mbcnt_hi_u32_b32_e32 v0, s4, v0
; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
; GCN-NEXT: ; implicit-def: $vgpr1
; GCN-NEXT: s_and_saveexec_b64 s[6:7], vcc
; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc
; GCN-NEXT: s_cbranch_execz .LBB9_2
; GCN-NEXT: ; %bb.1:
; GCN-NEXT: s_bcnt1_i32_b64 s4, s[4:5]
; GCN-NEXT: v_mov_b32_e32 v1, s4
; GCN-NEXT: s_bcnt1_i32_b64 s6, s[6:7]
; GCN-NEXT: v_mov_b32_e32 v1, s6
; GCN-NEXT: v_mov_b32_e32 v2, 0
; GCN-NEXT: buffer_atomic_sub v1, v2, s[0:3], 0 idxen glc
; GCN-NEXT: .LBB9_2:
; GCN-NEXT: s_or_b64 exec, exec, s[6:7]
; GCN-NEXT: s_or_b64 exec, exec, s[4:5]
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: v_readfirstlane_b32 s4, v1
; GCN-NEXT: v_sub_i32_e32 v4, vcc, s4, v0
Expand Down Expand Up @@ -591,22 +591,22 @@ define amdgpu_cs void @atomic_ptr_xor_and_format(ptr addrspace(8) inreg %arg) {
;
; GCN-LABEL: atomic_ptr_xor_and_format:
; GCN: ; %bb.0: ; %.entry
; GCN-NEXT: s_mov_b64 s[4:5], exec
; GCN-NEXT: s_mov_b32 s6, s5
; GCN-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s4, 0
; GCN-NEXT: v_mbcnt_hi_u32_b32_e32 v0, s6, v0
; GCN-NEXT: s_mov_b64 s[6:7], exec
; GCN-NEXT: s_mov_b32 s4, s7
; GCN-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s6, 0
; GCN-NEXT: v_mbcnt_hi_u32_b32_e32 v0, s4, v0
; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
; GCN-NEXT: ; implicit-def: $vgpr1
; GCN-NEXT: s_and_saveexec_b64 s[6:7], vcc
; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc
; GCN-NEXT: s_cbranch_execz .LBB11_2
; GCN-NEXT: ; %bb.1:
; GCN-NEXT: s_bcnt1_i32_b64 s4, s[4:5]
; GCN-NEXT: s_and_b32 s4, s4, 1
; GCN-NEXT: v_mov_b32_e32 v1, s4
; GCN-NEXT: s_bcnt1_i32_b64 s6, s[6:7]
; GCN-NEXT: s_and_b32 s6, s6, 1
; GCN-NEXT: v_mov_b32_e32 v1, s6
; GCN-NEXT: v_mov_b32_e32 v2, 0
; GCN-NEXT: buffer_atomic_xor v1, v2, s[0:3], 0 idxen glc
; GCN-NEXT: .LBB11_2:
; GCN-NEXT: s_or_b64 exec, exec, s[6:7]
; GCN-NEXT: s_or_b64 exec, exec, s[4:5]
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: v_readfirstlane_b32 s4, v1
; GCN-NEXT: v_and_b32_e32 v0, 1, v0
Expand Down
Loading