425 changes: 425 additions & 0 deletions llvm/test/CodeGen/AArch64/sink-and-fold.ll
Original file line number Diff line number Diff line change
@@ -0,0 +1,425 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s | FileCheck %s
target triple = "aarch64-linux"

; Variadic external function. The tests below call it with differing argument
; lists so that a computed pointer gets a use as a call argument.
declare i32 @use(...)

; f0: pointer plus a constant offset (%p + 2 x i32), computed in the entry
; block and used as a call argument on one path and as a load address on the
; other. The expected output keeps a single `add x0, x1, #8` in the entry block
; and both paths consume x0.
define i32 @f0(i1 %c1, ptr %p) nounwind {
; CHECK-LABEL: f0:
; CHECK: // %bb.0: // %entry
; CHECK-NEXT: mov w8, w0
; CHECK-NEXT: add x0, x1, #8
; CHECK-NEXT: tbz w8, #0, .LBB0_2
; CHECK-NEXT: // %bb.1: // %if.then
; CHECK-NEXT: str x30, [sp, #-16]! // 8-byte Folded Spill
; CHECK-NEXT: bl use
; CHECK-NEXT: ldr x30, [sp], #16 // 8-byte Folded Reload
; CHECK-NEXT: ret
; CHECK-NEXT: .LBB0_2: // %if.else
; CHECK-NEXT: ldr w0, [x0]
; CHECK-NEXT: ret
entry:
%a = getelementptr i32, ptr %p, i32 2
br i1 %c1, label %if.then, label %if.else

if.then:
%v0 = call i32 @use(ptr %a)
br label %exit

if.else:
%v1 = load i32, ptr %a
br label %exit

exit:
%v = phi i32 [%v0, %if.then], [%v1, %if.else]
ret i32 %v
}

; f1: same shape as a constant-offset test, but the offset is a variable byte
; index (%p + %i). The pointer is used by a call on one path and by a load on
; the other. The expected output has one `add x0, x1, x2` in the entry block.
define i32 @f1(i1 %c1, ptr %p, i64 %i) nounwind {
; CHECK-LABEL: f1:
; CHECK: // %bb.0: // %entry
; CHECK-NEXT: mov w8, w0
; CHECK-NEXT: add x0, x1, x2
; CHECK-NEXT: tbz w8, #0, .LBB1_2
; CHECK-NEXT: // %bb.1: // %if.then
; CHECK-NEXT: str x30, [sp, #-16]! // 8-byte Folded Spill
; CHECK-NEXT: bl use
; CHECK-NEXT: ldr x30, [sp], #16 // 8-byte Folded Reload
; CHECK-NEXT: ret
; CHECK-NEXT: .LBB1_2: // %if.else
; CHECK-NEXT: ldr w0, [x0]
; CHECK-NEXT: ret
entry:
%a = getelementptr i8, ptr %p, i64 %i
br i1 %c1, label %if.then, label %if.else

if.then:
%v0 = call i32 @use(ptr %a)
br label %exit

if.else:
%v1 = load i32, ptr %a
br label %exit

exit:
%v = phi i32 [%v0, %if.then], [%v1, %if.else]
ret i32 %v
}

; Address calculation too slow.
; %S is eight i32 values (32 bytes), so indexing it by %i needs a shift by 5,
; as seen in the expected `add x1, x1, x2, lsl #5`. Both paths pass the
; resulting pointer to @use (as the second argument on the if.else path). The
; expected output computes the address once in the entry block.
; NOTE(review): "too slow" presumably means a shift of 5 is not treated as
; cheap under +alu-lsl-fast - confirm against the AArch64 feature definition.
%S = type {i32, [7 x i32] }
define i32 @f2(i1 %c1, ptr %p, i64 %i) nounwind "target-features"="+alu-lsl-fast" {
; CHECK-LABEL: f2:
; CHECK: // %bb.0: // %entry
; CHECK-NEXT: str x30, [sp, #-16]! // 8-byte Folded Spill
; CHECK-NEXT: add x1, x1, x2, lsl #5
; CHECK-NEXT: tbz w0, #0, .LBB2_2
; CHECK-NEXT: // %bb.1: // %if.then
; CHECK-NEXT: mov x0, x1
; CHECK-NEXT: bl use
; CHECK-NEXT: ldr x30, [sp], #16 // 8-byte Folded Reload
; CHECK-NEXT: ret
; CHECK-NEXT: .LBB2_2: // %if.else
; CHECK-NEXT: mov w0, #1 // =0x1
; CHECK-NEXT: bl use
; CHECK-NEXT: ldr x30, [sp], #16 // 8-byte Folded Reload
; CHECK-NEXT: ret
entry:
%a = getelementptr %S, ptr %p, i64 %i
br i1 %c1, label %if.then, label %if.else

if.then:
%v0 = call i32 @use(ptr %a)
br label %exit

if.else:
%v1 = call i32 @use(i32 1, ptr %a)
br label %exit

exit:
%v = phi i32 [%v0, %if.then], [%v1, %if.else]
ret i32 %v
}

; Address calculation cheap enough on some cores.
; The pointer is %p + %i x i32, i.e. a shift by 2 (`add x0, x1, x2, lsl #2` in
; the expected output). It is used by a call on one path and by a load on the
; other. The function is compiled with +alu-lsl-fast.
; NOTE(review): the current assertions still show the add in the entry block
; with both paths consuming x0 - confirm this is the intended baseline.
define i32 @f3(i1 %c1, ptr %p, i64 %i) nounwind "target-features"="+alu-lsl-fast" {
; CHECK-LABEL: f3:
; CHECK: // %bb.0: // %entry
; CHECK-NEXT: mov w8, w0
; CHECK-NEXT: add x0, x1, x2, lsl #2
; CHECK-NEXT: tbz w8, #0, .LBB3_2
; CHECK-NEXT: // %bb.1: // %if.then
; CHECK-NEXT: str x30, [sp, #-16]! // 8-byte Folded Spill
; CHECK-NEXT: bl use
; CHECK-NEXT: ldr x30, [sp], #16 // 8-byte Folded Reload
; CHECK-NEXT: ret
; CHECK-NEXT: .LBB3_2: // %if.else
; CHECK-NEXT: ldr w0, [x0]
; CHECK-NEXT: ret
entry:
%a = getelementptr i32, ptr %p, i64 %i
br i1 %c1, label %if.then, label %if.else

if.then:
%v0 = call i32 @use(ptr %a)
br label %exit

if.else:
%v1 = load i32, ptr %a
br label %exit

exit:
%v = phi i32 [%v0, %if.then], [%v1, %if.else]
ret i32 %v
}

; f4: two nested loops over the i32 array %a.
;   - Outer loop LI (index %i) computes %ai.ptr = &a[i]; the value is not
;     recomputed inside the inner loop.
;   - Inner loop LJ (index %j) loads a[j]; if it is negative it calls
;     @use(%ai.ptr, %j), otherwise it loads a[i]. Either result is stored back
;     to a[j].
; Both loop exit tests compare the pre-increment index (%i / %j) against %n.
; The expected output materialises &a[i] once per outer iteration
; (`add x22, x20, x23, lsl #2`) and accesses a[j] with register-offset
; addressing (`[x20, x21, lsl #2]`).
define void @f4(ptr %a, i64 %n) nounwind "target-features"="+alu-lsl-fast,+addr-lsl-fast" {
; CHECK-LABEL: f4:
; CHECK: // %bb.0: // %entry
; CHECK-NEXT: cmp x1, #1
; CHECK-NEXT: b.lt .LBB4_9
; CHECK-NEXT: // %bb.1: // %LI.preheader
; CHECK-NEXT: str x30, [sp, #-64]! // 8-byte Folded Spill
; CHECK-NEXT: stp x24, x23, [sp, #16] // 16-byte Folded Spill
; CHECK-NEXT: mov x23, xzr
; CHECK-NEXT: stp x20, x19, [sp, #48] // 16-byte Folded Spill
; CHECK-NEXT: mov x19, x1
; CHECK-NEXT: mov x20, x0
; CHECK-NEXT: stp x22, x21, [sp, #32] // 16-byte Folded Spill
; CHECK-NEXT: b .LBB4_3
; CHECK-NEXT: .LBB4_2: // %LI.latch
; CHECK-NEXT: // in Loop: Header=BB4_3 Depth=1
; CHECK-NEXT: cmp x23, x19
; CHECK-NEXT: mov x23, x24
; CHECK-NEXT: b.ge .LBB4_8
; CHECK-NEXT: .LBB4_3: // %LI
; CHECK-NEXT: // =>This Loop Header: Depth=1
; CHECK-NEXT: // Child Loop BB4_6 Depth 2
; CHECK-NEXT: add x22, x20, x23, lsl #2
; CHECK-NEXT: mov x21, xzr
; CHECK-NEXT: add x24, x23, #1
; CHECK-NEXT: b .LBB4_6
; CHECK-NEXT: .LBB4_4: // %if.else
; CHECK-NEXT: // in Loop: Header=BB4_6 Depth=2
; CHECK-NEXT: ldr w0, [x22]
; CHECK-NEXT: .LBB4_5: // %LJ.latch
; CHECK-NEXT: // in Loop: Header=BB4_6 Depth=2
; CHECK-NEXT: add x8, x21, #1
; CHECK-NEXT: str w0, [x20, x21, lsl #2]
; CHECK-NEXT: sub x9, x8, #1
; CHECK-NEXT: mov x21, x8
; CHECK-NEXT: cmp x9, x19
; CHECK-NEXT: b.ge .LBB4_2
; CHECK-NEXT: .LBB4_6: // %LJ
; CHECK-NEXT: // Parent Loop BB4_3 Depth=1
; CHECK-NEXT: // => This Inner Loop Header: Depth=2
; CHECK-NEXT: ldr w8, [x20, x21, lsl #2]
; CHECK-NEXT: tbz w8, #31, .LBB4_4
; CHECK-NEXT: // %bb.7: // %if.then
; CHECK-NEXT: // in Loop: Header=BB4_6 Depth=2
; CHECK-NEXT: mov x0, x22
; CHECK-NEXT: mov x1, x21
; CHECK-NEXT: bl use
; CHECK-NEXT: b .LBB4_5
; CHECK-NEXT: .LBB4_8:
; CHECK-NEXT: ldp x20, x19, [sp, #48] // 16-byte Folded Reload
; CHECK-NEXT: ldp x22, x21, [sp, #32] // 16-byte Folded Reload
; CHECK-NEXT: ldp x24, x23, [sp, #16] // 16-byte Folded Reload
; CHECK-NEXT: ldr x30, [sp], #64 // 8-byte Folded Reload
; CHECK-NEXT: .LBB4_9: // %exit
; CHECK-NEXT: ret
entry:
%c0 = icmp slt i64 %n, 1
br i1 %c0, label %exit, label %LI

LI:
%i = phi i64 [0, %entry], [%i.next, %LI.latch]
%i.next = add i64 %i, 1
%ai.ptr = getelementptr i32, ptr %a, i64 %i
br label %LJ

LJ:
%j = phi i64 [0, %LI], [%j.next, %LJ.latch]
%j.next = add i64 %j, 1
%aj.ptr = getelementptr i32, ptr %a, i64 %j
%aj = load i32, ptr %aj.ptr
%c1 = icmp slt i32 %aj, 0
br i1 %c1, label %if.then, label %if.else

if.then:
%v = call i32 @use(ptr %ai.ptr, i64 %j)
store i32 %v, ptr %aj.ptr
br label %LJ.latch

if.else:
%ai = load i32, ptr %ai.ptr
store i32 %ai, ptr %aj.ptr
br label %LJ.latch

LJ.latch:
%c2 = icmp slt i64 %j, %n
br i1 %c2, label %LJ, label %LI.latch

LI.latch:
%c3 = icmp slt i64 %i, %n
br i1 %c3, label %LI, label %exit

exit:
ret void
}

%T = type { i32, i32, i32 }

; f5: single loop over an array of %T (three i32 fields, 12 bytes each).
;   - %p = &a[k].field1 is computed once in the entry block, before the loop.
;   - Each iteration loads a[i].field2; if it is negative it calls
;     @use(%p, %i), otherwise it loads from %p. The result is stored back to
;     a[i].field2.
; The exit test compares the pre-increment %i against %n.
; The expected output builds %p in the preheader (`smaddl` by 12, then
; `add ..., #4`) and steps through a[i].field2 with a post-indexed store
; (`str w0, [x21], #12`).
define void @f5(ptr %a, i32 %n, i32 %k) nounwind {
; CHECK-LABEL: f5:
; CHECK: // %bb.0: // %entry
; CHECK-NEXT: cmp w1, #1
; CHECK-NEXT: b.lt .LBB5_7
; CHECK-NEXT: // %bb.1: // %L.preheader
; CHECK-NEXT: str x30, [sp, #-48]! // 8-byte Folded Spill
; CHECK-NEXT: mov w8, #12 // =0xc
; CHECK-NEXT: stp x22, x21, [sp, #16] // 16-byte Folded Spill
; CHECK-NEXT: add x21, x0, #8
; CHECK-NEXT: smaddl x8, w2, w8, x0
; CHECK-NEXT: stp x20, x19, [sp, #32] // 16-byte Folded Spill
; CHECK-NEXT: mov w19, w1
; CHECK-NEXT: mov w22, #-1 // =0xffffffff
; CHECK-NEXT: add x20, x8, #4
; CHECK-NEXT: b .LBB5_4
; CHECK-NEXT: .LBB5_2: // %if.else
; CHECK-NEXT: // in Loop: Header=BB5_4 Depth=1
; CHECK-NEXT: ldr w0, [x20]
; CHECK-NEXT: .LBB5_3: // %L.latch
; CHECK-NEXT: // in Loop: Header=BB5_4 Depth=1
; CHECK-NEXT: add w22, w22, #1
; CHECK-NEXT: str w0, [x21], #12
; CHECK-NEXT: cmp w22, w19
; CHECK-NEXT: b.ge .LBB5_6
; CHECK-NEXT: .LBB5_4: // %L
; CHECK-NEXT: // =>This Inner Loop Header: Depth=1
; CHECK-NEXT: ldr w8, [x21]
; CHECK-NEXT: tbz w8, #31, .LBB5_2
; CHECK-NEXT: // %bb.5: // %if.then
; CHECK-NEXT: // in Loop: Header=BB5_4 Depth=1
; CHECK-NEXT: add w1, w22, #1
; CHECK-NEXT: mov x0, x20
; CHECK-NEXT: bl use
; CHECK-NEXT: b .LBB5_3
; CHECK-NEXT: .LBB5_6:
; CHECK-NEXT: ldp x20, x19, [sp, #32] // 16-byte Folded Reload
; CHECK-NEXT: ldp x22, x21, [sp, #16] // 16-byte Folded Reload
; CHECK-NEXT: ldr x30, [sp], #48 // 8-byte Folded Reload
; CHECK-NEXT: .LBB5_7: // %exit
; CHECK-NEXT: ret
entry:
%p = getelementptr %T, ptr %a, i32 %k, i32 1
%c0 = icmp slt i32 %n, 1
br i1 %c0, label %exit, label %L

L:
%i = phi i32 [0, %entry], [%i.next, %L.latch]
%i.next = add i32 %i, 1
%ai.ptr = getelementptr %T, ptr %a, i32 %i, i32 2
%ai = load i32, ptr %ai.ptr
%c1 = icmp slt i32 %ai, 0
br i1 %c1, label %if.then, label %if.else

if.then:
%u.0 = call i32 @use(ptr %p, i32 %i)
br label %L.latch

if.else:
%u.1 = load i32, ptr %p
br label %L.latch

L.latch:
%u = phi i32 [%u.0, %if.then], [%u.1, %if.else]
store i32 %u, ptr %ai.ptr
%c2 = icmp slt i32 %i, %n
br i1 %c2, label %L, label %exit

exit:
ret void
}

; f6: the i32 index is sign-extended to i64 in the entry block, and each branch
; then forms its own (identical) i32 GEP from it - one feeding a store, the
; other a load. The expected output performs `sxtw x8, w2` once in the entry
; block and uses `[x1, x8, lsl #2]` register-offset addressing on both paths.
define i32 @f6(i1 %c, ptr %a, i32 %i) {
; CHECK-LABEL: f6:
; CHECK: // %bb.0: // %entry
; CHECK-NEXT: // kill: def $w2 killed $w2 def $x2
; CHECK-NEXT: sxtw x8, w2
; CHECK-NEXT: tbz w0, #0, .LBB6_2
; CHECK-NEXT: // %bb.1: // %if.then
; CHECK-NEXT: mov w0, wzr
; CHECK-NEXT: str wzr, [x1, x8, lsl #2]
; CHECK-NEXT: ret
; CHECK-NEXT: .LBB6_2: // %if.else
; CHECK-NEXT: ldr w0, [x1, x8, lsl #2]
; CHECK-NEXT: ret
entry:
%j = sext i32 %i to i64
br i1 %c, label %if.then, label %if.else

if.then:
%p0 = getelementptr i32, ptr %a, i64 %j
store i32 0, ptr %p0
br label %exit

if.else:
%p1 = getelementptr i32, ptr %a, i64 %j
%v0 = load i32, ptr %p1
br label %exit

exit:
%v = phi i32 [0, %if.then], [%v0, %if.else]
ret i32 %v
}

; f7: the i32 index is zero-extended to i64 in the entry block, and each branch
; forms its own (identical) i8 GEP from it - one feeding a byte store, the
; other a byte load. The expected output does the extension once in the entry
; block (`mov w8, w2`) and uses `[x1, x8]` register-offset addressing on both
; paths.
define i8 @f7(i1 %c, ptr %a, i32 %i) {
; CHECK-LABEL: f7:
; CHECK: // %bb.0: // %entry
; CHECK-NEXT: mov w8, w2
; CHECK-NEXT: tbz w0, #0, .LBB7_2
; CHECK-NEXT: // %bb.1: // %if.then
; CHECK-NEXT: mov w0, wzr
; CHECK-NEXT: strb wzr, [x1, x8]
; CHECK-NEXT: ret
; CHECK-NEXT: .LBB7_2: // %if.else
; CHECK-NEXT: ldrb w0, [x1, x8]
; CHECK-NEXT: ret
entry:
%j = zext i32 %i to i64
br i1 %c, label %if.then, label %if.else

if.then:
%p0 = getelementptr i8, ptr %a, i64 %j
store i8 0, ptr %p0
br label %exit

if.else:
%p1 = getelementptr i8, ptr %a, i64 %j
%v0 = load i8, ptr %p1
br label %exit

exit:
%v = phi i8 [0, %if.then], [%v0, %if.else]
ret i8 %v
}

; f8: the GEP takes the i32 index directly (no explicit extension instruction)
; and is computed once in the entry block, then shared by a store on one path
; and a load on the other. The expected output keeps the full address
; computation in the entry block (`add x8, x1, w2, sxtw #2`) and both paths
; access plain `[x8]`.
define i32 @f8(i1 %c, ptr %a, i32 %i) {
; CHECK-LABEL: f8:
; CHECK: // %bb.0: // %entry
; CHECK-NEXT: add x8, x1, w2, sxtw #2
; CHECK-NEXT: tbz w0, #0, .LBB8_2
; CHECK-NEXT: // %bb.1: // %if.then
; CHECK-NEXT: mov w0, wzr
; CHECK-NEXT: str wzr, [x8]
; CHECK-NEXT: ret
; CHECK-NEXT: .LBB8_2: // %if.else
; CHECK-NEXT: ldr w0, [x8]
; CHECK-NEXT: ret
entry:
%p = getelementptr i32, ptr %a, i32 %i
br i1 %c, label %if.then, label %if.else

if.then:
store i32 0, ptr %p
br label %exit

if.else:
%v0 = load i32, ptr %p
br label %exit

exit:
%v = phi i32 [0, %if.then], [%v0, %if.else]
ret i32 %v
}

; f9: the i32 index is zero-extended and the i64 GEP is formed in the entry
; block; the resulting pointer is shared by a store on one path and a load on
; the other. The expected output does only the extension in the entry block
; (`mov w8, w2`) and uses `[x1, x8, lsl #3]` register-offset addressing on
; both paths.
define i64 @f9(i1 %c, ptr %a, i32 %i) {
; CHECK-LABEL: f9:
; CHECK: // %bb.0: // %entry
; CHECK-NEXT: mov w8, w2
; CHECK-NEXT: tbz w0, #0, .LBB9_2
; CHECK-NEXT: // %bb.1: // %if.then
; CHECK-NEXT: mov x0, xzr
; CHECK-NEXT: str xzr, [x1, x8, lsl #3]
; CHECK-NEXT: ret
; CHECK-NEXT: .LBB9_2: // %if.else
; CHECK-NEXT: ldr x0, [x1, x8, lsl #3]
; CHECK-NEXT: ret
entry:
%j = zext i32 %i to i64
%p = getelementptr i64, ptr %a, i64 %j
br i1 %c, label %if.then, label %if.else

if.then:
store i64 0, ptr %p
br label %exit

if.else:
%v0 = load i64, ptr %p
br label %exit

exit:
%v = phi i64 [0, %if.then], [%v0, %if.else]
ret i64 %v
}
38 changes: 28 additions & 10 deletions llvm/test/CodeGen/AArch64/swift-async-win.ll
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 2
; RUN: llc -mtriple aarch64-unknown-windows -swift-async-fp=never -filetype asm -o - %s | FileCheck %s

; ModuleID = '_Concurrency.ll'
Expand All @@ -10,8 +11,35 @@ target triple = "aarch64-unknown-windows-msvc19.32.31302"
; Function Attrs: argmemonly nofree nosync nounwind willreturn
declare void @llvm.lifetime.end.p0(i64 immarg, ptr nocapture) #0

; NOTE: we do not see the canonical windows frame setup due to the `nounwind`
; attribute on the function.

; Function Attrs: nounwind
define hidden swifttailcc void @"$ss23withCheckedContinuation8function_xSS_yScCyxs5NeverOGXEtYalFTQ0_"(ptr nocapture readonly %0) #1 {
; CHECK-LABEL: $ss23withCheckedContinuation8function_xSS_yScCyxs5NeverOGXEtYalFTQ0_:
; CHECK: // %bb.0: // %entryresume.0
; CHECK-NEXT: sub sp, sp, #48
; CHECK-NEXT: stp x30, x29, [sp, #24] // 16-byte Folded Spill
; CHECK-NEXT: add x29, sp, #24
; CHECK-NEXT: str x19, [sp, #40] // 8-byte Folded Spill
; CHECK-NEXT: sub x8, x29, #8
; CHECK-NEXT: adrp x19, __imp_swift_task_dealloc
; CHECK-NEXT: str xzr, [sp, #16]
; CHECK-NEXT: ldr x9, [x0]
; CHECK-NEXT: str x9, [x8]
; CHECK-NEXT: ldr x20, [x0]
; CHECK-NEXT: ldp x22, x0, [x9, #16]
; CHECK-NEXT: str x20, [x8]
; CHECK-NEXT: ldr x19, [x19, :lo12:__imp_swift_task_dealloc]
; CHECK-NEXT: blr x19
; CHECK-NEXT: mov x0, x22
; CHECK-NEXT: blr x19
; CHECK-NEXT: ldp x30, x29, [sp, #24] // 16-byte Folded Reload
; CHECK-NEXT: mov x0, x20
; CHECK-NEXT: ldr x1, [x20, #8]
; CHECK-NEXT: ldr x19, [sp, #40] // 8-byte Folded Reload
; CHECK-NEXT: add sp, sp, #48
; CHECK-NEXT: br x1
entryresume.0:
%1 = load ptr, ptr %0, align 8
%2 = tail call ptr @llvm.swift.async.context.addr() #4
Expand All @@ -31,16 +59,6 @@ entryresume.0:
ret void
}

; NOTE: we do not see the canonical windows frame setup due to the `nounwind`
; attribute on the function.

; CHECK: sub sp, sp, #48
; CHECK: stp x30, x29, [sp, #24]
; CHECK: add x29, sp, #24
; CHECK: str x19, [sp, #40]
; CHECK: sub x8, x29, #8
; CHECK: ldr x9, [x0]
; CHECK: str x9, [x8]

; Function Attrs: nounwind readnone
declare ptr @llvm.swift.async.context.addr() #2
Expand Down