| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,323 @@ | ||
| //=- ARMScheduleA57WriteRes.td - ARM Cortex-A57 Write Res ---*- tablegen -*-=// | ||
| // | ||
| // The LLVM Compiler Infrastructure | ||
| // | ||
| // This file is distributed under the University of Illinois Open Source | ||
| // License. See LICENSE.TXT for details. | ||
| // | ||
| //===----------------------------------------------------------------------===// | ||
| // | ||
| // Contains all of the Cortex-A57 specific SchedWriteRes types. The approach | ||
| // below is to define a generic SchedWriteRes for every combination of | ||
| // latency and microOps. The naming convention is to use a prefix, one field | ||
| // for latency, and one or more microOp count/type designators. | ||
| // Prefix: A57Write | ||
| // Latency: #cyc | ||
| // MicroOp Count/Types: #(B|I|M|L|S|X|W|V) | ||
| // | ||
| // e.g. A57Write_6cyc_1I_6S_4V means the total latency is 6 and there are | ||
| // 11 micro-ops to be issued as follows: one to I pipe, six to S pipes and | ||
| // four to V pipes. | ||
| // | ||
| //===----------------------------------------------------------------------===// | ||
|
|
||
| //===----------------------------------------------------------------------===// | ||
| // Define Generic 1 micro-op types | ||
|
|
||
| // Single micro-op write resources. Consecutive records that share a latency | ||
| // are grouped under one top-level 'let'; the long-latency records also set | ||
| // ResourceCycles to the same value as their latency. | ||
| let Latency = 5 in { | ||
|   def A57Write_5cyc_1M : SchedWriteRes<[A57UnitM]>; | ||
|   def A57Write_5cyc_1V : SchedWriteRes<[A57UnitV]>; | ||
|   def A57Write_5cyc_1W : SchedWriteRes<[A57UnitW]>; | ||
| } | ||
| def A57Write_10cyc_1V : SchedWriteRes<[A57UnitV]> { | ||
|   let Latency = 10; | ||
| } | ||
| def A57Write_17cyc_1W : SchedWriteRes<[A57UnitW]> { | ||
|   let Latency = 17; | ||
|   let ResourceCycles = [17]; | ||
| } | ||
| def A57Write_18cyc_1X : SchedWriteRes<[A57UnitX]> { | ||
|   let Latency = 18; | ||
|   let ResourceCycles = [18]; | ||
| } | ||
| def A57Write_19cyc_1M : SchedWriteRes<[A57UnitM]> { | ||
|   let Latency = 19; | ||
|   let ResourceCycles = [19]; | ||
| } | ||
| def A57Write_20cyc_1M : SchedWriteRes<[A57UnitM]> { | ||
|   let Latency = 20; | ||
|   let ResourceCycles = [20]; | ||
| } | ||
| let Latency = 1 in { | ||
|   def A57Write_1cyc_1B : SchedWriteRes<[A57UnitB]>; | ||
|   def A57Write_1cyc_1I : SchedWriteRes<[A57UnitI]>; | ||
| } | ||
| def A57Write_2cyc_1I : SchedWriteRes<[A57UnitI]> { | ||
|   let Latency = 2; | ||
| } | ||
| def A57Write_3cyc_1I : SchedWriteRes<[A57UnitI]> { | ||
|   let Latency = 3; | ||
| } | ||
| def A57Write_1cyc_1S : SchedWriteRes<[A57UnitS]> { | ||
|   let Latency = 1; | ||
| } | ||
| def A57Write_2cyc_1S : SchedWriteRes<[A57UnitS]> { | ||
|   let Latency = 2; | ||
| } | ||
| def A57Write_3cyc_1S : SchedWriteRes<[A57UnitS]> { | ||
|   let Latency = 3; | ||
| } | ||
| def A57Write_2cyc_1M : SchedWriteRes<[A57UnitM]> { | ||
|   let Latency = 2; | ||
| } | ||
| let Latency = 32, ResourceCycles = [32] in { | ||
|   def A57Write_32cyc_1W : SchedWriteRes<[A57UnitW]>; | ||
|   def A57Write_32cyc_1X : SchedWriteRes<[A57UnitX]>; | ||
| } | ||
| def A57Write_35cyc_1M : SchedWriteRes<[A57UnitM]> { | ||
|   let Latency = 35; | ||
|   let ResourceCycles = [35]; | ||
| } | ||
| let Latency = 3 in { | ||
|   def A57Write_3cyc_1M : SchedWriteRes<[A57UnitM]>; | ||
|   def A57Write_3cyc_1V : SchedWriteRes<[A57UnitV]>; | ||
|   def A57Write_3cyc_1W : SchedWriteRes<[A57UnitW]>; | ||
|   def A57Write_3cyc_1X : SchedWriteRes<[A57UnitX]>; | ||
| } | ||
|
|
||
| // A57Write_3cyc_1L - A57Write_20cyc_1L | ||
| // One record per latency value in the range 3-20, each a single micro-op on | ||
| // A57UnitL; the loop value is pasted into the record name. | ||
| foreach Lat = 3-20 in { | ||
|   def A57Write_#Lat#cyc_1L : SchedWriteRes<[A57UnitL]> { let Latency = Lat; } | ||
| } | ||
|
|
||
| // A57Write_4cyc_1S - A57Write_16cyc_1S | ||
| // One record per latency value in the range 4-16, each a single micro-op on | ||
| // A57UnitS; the loop value is pasted into the record name. | ||
| foreach Lat = 4-16 in { | ||
|   def A57Write_#Lat#cyc_1S : SchedWriteRes<[A57UnitS]> { let Latency = Lat; } | ||
| } | ||
|
|
||
| // One micro-op with a 4-cycle latency. The resource list names A57UnitM so | ||
| // that it agrees with the "_1M" suffix required by the naming scheme in the | ||
| // file header (listing A57UnitL here would model the wrong unit). | ||
| def A57Write_4cyc_1M : SchedWriteRes<[A57UnitM]> { let Latency = 4; } | ||
| // Further single micro-op write resources; consecutive records with the same | ||
| // latency share one top-level 'let'. | ||
| let Latency = 4 in { | ||
|   def A57Write_4cyc_1X : SchedWriteRes<[A57UnitX]>; | ||
|   def A57Write_4cyc_1W : SchedWriteRes<[A57UnitW]>; | ||
| } | ||
| def A57Write_5cyc_1X : SchedWriteRes<[A57UnitX]> { | ||
|   let Latency = 5; | ||
| } | ||
| let Latency = 6 in { | ||
|   def A57Write_6cyc_1X : SchedWriteRes<[A57UnitX]>; | ||
|   def A57Write_6cyc_1W : SchedWriteRes<[A57UnitW]>; | ||
| } | ||
| def A57Write_8cyc_1V : SchedWriteRes<[A57UnitV]> { | ||
|   let Latency = 8; | ||
| } | ||
| def A57Write_9cyc_1V : SchedWriteRes<[A57UnitV]> { | ||
|   let Latency = 9; | ||
| } | ||
| let Latency = 6 in { | ||
|   def A57Write_6cyc_1M : SchedWriteRes<[A57UnitM]>; | ||
|   def A57Write_6cyc_1V : SchedWriteRes<[A57UnitV]>; | ||
| } | ||
|
|
||
|
|
||
| //===----------------------------------------------------------------------===// | ||
| // Define Generic 2 micro-op types | ||
|
|
||
| // Two micro-op write resources. NumMicroOps = 2 is applied to every record in | ||
| // this group by the enclosing 'let'; each record lists its two units and sets | ||
| // its own latency. | ||
| let NumMicroOps = 2 in { | ||
|   def A57Write_64cyc_2X : SchedWriteRes<[A57UnitX, A57UnitX]> { | ||
|     let Latency = 64; | ||
|     let ResourceCycles = [32, 32]; | ||
|   } | ||
|   def A57Write_6cyc_1I_1L : SchedWriteRes<[A57UnitI, A57UnitL]> { let Latency = 6; } | ||
|   def A57Write_6cyc_1V_1X : SchedWriteRes<[A57UnitV, A57UnitX]> { let Latency = 6; } | ||
|   def A57Write_7cyc_1V_1X : SchedWriteRes<[A57UnitV, A57UnitX]> { let Latency = 7; } | ||
|   def A57Write_8cyc_1L_1V : SchedWriteRes<[A57UnitL, A57UnitV]> { let Latency = 8; } | ||
|   def A57Write_9cyc_1L_1V : SchedWriteRes<[A57UnitL, A57UnitV]> { let Latency = 9; } | ||
|   def A57Write_9cyc_2V : SchedWriteRes<[A57UnitV, A57UnitV]> { let Latency = 9; } | ||
|   def A57Write_8cyc_2X : SchedWriteRes<[A57UnitX, A57UnitX]> { let Latency = 8; } | ||
|   def A57Write_6cyc_2L : SchedWriteRes<[A57UnitL, A57UnitL]> { let Latency = 6; } | ||
|   def A57Write_6cyc_2V : SchedWriteRes<[A57UnitV, A57UnitV]> { let Latency = 6; } | ||
|   def A57Write_6cyc_2W : SchedWriteRes<[A57UnitW, A57UnitW]> { let Latency = 6; } | ||
|   def A57Write_5cyc_1I_1L : SchedWriteRes<[A57UnitI, A57UnitL]> { let Latency = 5; } | ||
|   def A57Write_5cyc_1I_1M : SchedWriteRes<[A57UnitI, A57UnitM]> { let Latency = 5; } | ||
|   def A57Write_5cyc_2V : SchedWriteRes<[A57UnitV, A57UnitV]> { let Latency = 5; } | ||
|   def A57Write_5cyc_2X : SchedWriteRes<[A57UnitX, A57UnitX]> { let Latency = 5; } | ||
|   def A57Write_10cyc_1L_1V : SchedWriteRes<[A57UnitL, A57UnitV]> { let Latency = 10; } | ||
|   def A57Write_10cyc_2V : SchedWriteRes<[A57UnitV, A57UnitV]> { let Latency = 10; } | ||
|   def A57Write_1cyc_1B_1I : SchedWriteRes<[A57UnitB, A57UnitI]> { let Latency = 1; } | ||
|   def A57Write_1cyc_1I_1S : SchedWriteRes<[A57UnitI, A57UnitS]> { let Latency = 1; } | ||
|   def A57Write_1cyc_1S_1I : SchedWriteRes<[A57UnitS, A57UnitI]> { let Latency = 1; } | ||
|   def A57Write_2cyc_1S_1I : SchedWriteRes<[A57UnitS, A57UnitI]> { let Latency = 2; } | ||
|   def A57Write_3cyc_1S_1I : SchedWriteRes<[A57UnitS, A57UnitI]> { let Latency = 3; } | ||
|   def A57Write_1cyc_1S_1M : SchedWriteRes<[A57UnitS, A57UnitM]> { let Latency = 1; } | ||
|   def A57Write_2cyc_1B_1I : SchedWriteRes<[A57UnitB, A57UnitI]> { let Latency = 2; } | ||
|   def A57Write_3cyc_1B_1I : SchedWriteRes<[A57UnitB, A57UnitI]> { let Latency = 3; } | ||
| } | ||
| // Two micro-ops with a 6-cycle latency: one on A57UnitB and one on A57UnitL, | ||
| // matching the "_1B_1L" suffix required by the naming scheme in the file | ||
| // header (listing A57UnitI as the second unit would contradict the name). | ||
| def A57Write_6cyc_1B_1L : SchedWriteRes<[A57UnitB, | ||
| A57UnitL]> { | ||
| let Latency = 6; | ||
| let NumMicroOps = 2; | ||
| } | ||
| // More two micro-op write resources; NumMicroOps = 2 comes from the enclosing | ||
| // 'let', latency (and ResourceCycles where present) from each record. | ||
| let NumMicroOps = 2 in { | ||
|   def A57Write_2cyc_1I_1M : SchedWriteRes<[A57UnitI, A57UnitM]> { let Latency = 2; } | ||
|   def A57Write_2cyc_2S : SchedWriteRes<[A57UnitS, A57UnitS]> { let Latency = 2; } | ||
|   def A57Write_2cyc_2V : SchedWriteRes<[A57UnitV, A57UnitV]> { let Latency = 2; } | ||
|   def A57Write_36cyc_2X : SchedWriteRes<[A57UnitX, A57UnitX]> { | ||
|     let Latency = 36; | ||
|     let ResourceCycles = [18, 18]; | ||
|   } | ||
|   def A57Write_3cyc_1I_1M : SchedWriteRes<[A57UnitI, A57UnitM]> { let Latency = 3; } | ||
|   def A57Write_4cyc_1I_1M : SchedWriteRes<[A57UnitI, A57UnitM]> { let Latency = 4; } | ||
| } | ||
|
|
||
| // A57Write_3cyc_1L_1I - A57Write_20cyc_1L_1I | ||
| // One two-micro-op record (A57UnitL + A57UnitI) per latency value in 3-20. | ||
| foreach Lat = 3-20 in { | ||
|   def A57Write_#Lat#cyc_1L_1I : SchedWriteRes<[A57UnitL, A57UnitI]> { | ||
|     let Latency = Lat; | ||
|     let NumMicroOps = 2; | ||
|   } | ||
| } | ||
|
|
||
| // Two micro-op write resources with 3- and 4-cycle latencies; NumMicroOps = 2 | ||
| // is supplied by the enclosing 'let'. | ||
| let NumMicroOps = 2 in { | ||
|   def A57Write_3cyc_1I_1S : SchedWriteRes<[A57UnitI, A57UnitS]> { let Latency = 3; } | ||
|   def A57Write_3cyc_1S_1V : SchedWriteRes<[A57UnitS, A57UnitV]> { let Latency = 3; } | ||
|   def A57Write_4cyc_1S_1V : SchedWriteRes<[A57UnitS, A57UnitV]> { let Latency = 4; } | ||
|   def A57Write_3cyc_2V : SchedWriteRes<[A57UnitV, A57UnitV]> { let Latency = 3; } | ||
| } | ||
|
|
||
| // A57Write_4cyc_1S_1I - A57Write_16cyc_1S_1I | ||
| // One two-micro-op record (A57UnitS + A57UnitI) per latency value in 4-16. | ||
| foreach Lat = 4-16 in { | ||
|   def A57Write_#Lat#cyc_1S_1I : SchedWriteRes<[A57UnitS, A57UnitI]> { | ||
|     let Latency = Lat; | ||
|     let NumMicroOps = 2; | ||
|   } | ||
| } | ||
|
|
||
| // Two micro-ops, both on A57UnitX, with a 4-cycle latency. | ||
| def A57Write_4cyc_2X : SchedWriteRes<[A57UnitX, A57UnitX]> { let Latency = 4; let NumMicroOps = 2; } | ||
|
|
||
|
|
||
| //===----------------------------------------------------------------------===// | ||
| // Define Generic 3 micro-op types | ||
|
|
||
| // Three micro-op write resources. NumMicroOps = 3 is applied to every record | ||
| // by the enclosing 'let'; each record lists its three units and its latency. | ||
| let NumMicroOps = 3 in { | ||
|   def A57Write_10cyc_3V : SchedWriteRes<[A57UnitV, A57UnitV, A57UnitV]> { let Latency = 10; } | ||
|   def A57Write_2cyc_1I_2S : SchedWriteRes<[A57UnitI, A57UnitS, A57UnitS]> { let Latency = 2; } | ||
|   def A57Write_3cyc_1I_1S_1V : SchedWriteRes<[A57UnitI, A57UnitS, A57UnitV]> { let Latency = 3; } | ||
|   def A57Write_3cyc_1S_1V_1I : SchedWriteRes<[A57UnitS, A57UnitV, A57UnitI]> { let Latency = 3; } | ||
|   def A57Write_4cyc_1S_1V_1I : SchedWriteRes<[A57UnitS, A57UnitV, A57UnitI]> { let Latency = 4; } | ||
|   def A57Write_4cyc_1I_1L_1M : SchedWriteRes<[A57UnitI, A57UnitL, A57UnitM]> { let Latency = 4; } | ||
|   def A57Write_8cyc_1L_1V_1I : SchedWriteRes<[A57UnitL, A57UnitV, A57UnitI]> { let Latency = 8; } | ||
|   def A57Write_9cyc_1L_1V_1I : SchedWriteRes<[A57UnitL, A57UnitV, A57UnitI]> { let Latency = 9; } | ||
| } |
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,81 @@ | ||
| ; REQUIRES: asserts | ||
| ; RUN: llc < %s -mtriple=armv8r-eabi -mcpu=cortex-a57 -enable-misched -verify-misched -debug-only=machine-scheduler -o - 2>&1 > /dev/null | FileCheck %s | ||
|
|
||
| ; Check the latency for ALU shifted operand variants. | ||
| ; | ||
| ; CHECK: ********** MI Scheduling ********** | ||
| ; CHECK: foo:BB#0 entry | ||
|
|
||
| ; ALU, basic - 1 cyc I0/I1 | ||
| ; CHECK: EORrr | ||
| ; CHECK: rdefs left | ||
| ; CHECK-NEXT: Latency : 1 | ||
|
|
||
| ; ALU, shift by immed - 2 cyc M | ||
| ; CHECK: ADDrsi | ||
| ; CHECK: rdefs left | ||
| ; CHECK-NEXT: Latency : 2 | ||
|
|
||
| ; ALU, shift by register, unconditional - 2 cyc M | ||
| ; CHECK: RSBrsr | ||
| ; CHECK: rdefs left | ||
| ; CHECK-NEXT: Latency : 2 | ||
|
|
||
| ; ALU, shift by register, conditional - 2 cyc I0/I1 | ||
| ; CHECK: ANDrsr | ||
| ; CHECK: rdefs left | ||
| ; CHECK-NEXT: Latency : 2 | ||
|
|
||
| ; Checking scheduling units | ||
|
|
||
| ; CHECK: ** ScheduleDAGMILive::schedule picking next node | ||
| ; Skipping COPY | ||
| ; CHECK: ** ScheduleDAGMILive::schedule picking next node | ||
| ; CHECK: Scheduling | ||
| ; CHECK-SAME: ANDrsr | ||
| ; CHECK: Ready | ||
| ; CHECK-NEXT: A57UnitI | ||
|
|
||
| ; CHECK: ** ScheduleDAGMILive::schedule picking next node | ||
| ; CHECK: Scheduling | ||
| ; CHECK-SAME: CMPri | ||
| ; CHECK: Ready | ||
| ; CHECK-NEXT: A57UnitI | ||
|
|
||
| ; CHECK: ** ScheduleDAGMILive::schedule picking next node | ||
| ; CHECK: Scheduling | ||
| ; CHECK-SAME: RSBrsr | ||
| ; CHECK: Ready | ||
| ; CHECK-NEXT: A57UnitM | ||
|
|
||
| ; CHECK: ** ScheduleDAGMILive::schedule picking next node | ||
| ; CHECK: Scheduling | ||
| ; CHECK-SAME: ADDrsi | ||
| ; CHECK: Ready | ||
| ; CHECK-NEXT: A57UnitM | ||
|
|
||
| ; CHECK: ** ScheduleDAGMILive::schedule picking next node | ||
| ; CHECK: Scheduling | ||
| ; CHECK-SAME: EORrr | ||
| ; CHECK: Ready | ||
| ; CHECK-NEXT: A57UnitI | ||
|
|
||
|
|
||
| target datalayout = "e-m:e-p:32:32-i64:64-v128:64:128-a:0:32-n32-S64" | ||
| target triple = "armv8r-arm-none-eabi" | ||
|
|
||
| ; Function Attrs: norecurse nounwind readnone | ||
| ; Single dependent chain: every operation below consumes the result of the | ||
| ; previous one. The directives at the top of this file expect the chain to | ||
| ; show up as EORrr, ADDrsi, RSBrsr and ANDrsr. | ||
| define hidden i32 @foo(i32 %a, i32 %b, i32 %c, i32 %d) local_unnamed_addr #0 { | ||
| entry: | ||
| %xor = xor i32 %a, %b | ||
| ; Shift by the constant 2 feeding an add (the shift-by-immediate case). | ||
| %xor_shl = shl i32 %xor, 2 | ||
| %add = add i32 %xor_shl, %d | ||
| ; Shift by a register amount feeding a sub (the shift-by-register case). | ||
| %add_ashr = ashr i32 %add, %a | ||
| %sub = sub i32 %add_ashr, %a | ||
| ; Shift by a register amount feeding an and whose result is only used through | ||
| ; the select on %pred - presumably what yields the conditional ANDrsr. | ||
| %sub_lshr_pred = lshr i32 %sub, %c | ||
| %pred = icmp sgt i32 %a, 4 | ||
| %and = and i32 %sub_lshr_pred, %b | ||
| %rv = select i1 %pred, i32 %and, i32 %d | ||
| ret i32 %rv | ||
| } | ||
|
|
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,53 @@ | ||
| ; REQUIRES: asserts | ||
| ; RUN: llc < %s -mtriple=armv8r-eabi -mcpu=cortex-a57 -enable-misched -verify-misched -debug-only=machine-scheduler -o - 2>&1 > /dev/null | FileCheck %s --check-prefix=CHECK --check-prefix=A57_SCHED | ||
| ; RUN: llc < %s -mtriple=armv8r-eabi -mcpu=generic -enable-misched -verify-misched -debug-only=machine-scheduler -o - 2>&1 > /dev/null | FileCheck %s --check-prefix=CHECK --check-prefix=GENERIC | ||
|
|
||
| ; Check the latency for instructions for both generic and cortex-a57. | ||
| ; SDIV should be scheduled at the beginning of the block (20 cycles on the independent M unit). | ||
| ; | ||
| ; CHECK: ********** MI Scheduling ********** | ||
| ; CHECK: foo:BB#0 entry | ||
|
|
||
| ; GENERIC: SDIV | ||
| ; GENERIC: Latency : 1 | ||
| ; GENERIC: EORrr | ||
| ; GENERIC: Latency : 1 | ||
| ; GENERIC: LDRi12 | ||
| ; GENERIC: Latency : 4 | ||
| ; GENERIC: ADDrr | ||
| ; GENERIC: Latency : 1 | ||
| ; GENERIC: SUBrr | ||
| ; GENERIC: Latency : 1 | ||
|
|
||
| ; A57_SCHED: SDIV | ||
| ; A57_SCHED: Latency : 20 | ||
| ; A57_SCHED: EORrr | ||
| ; A57_SCHED: Latency : 1 | ||
| ; A57_SCHED: LDRi12 | ||
| ; A57_SCHED: Latency : 4 | ||
| ; A57_SCHED: ADDrr | ||
| ; A57_SCHED: Latency : 1 | ||
| ; A57_SCHED: SUBrr | ||
| ; A57_SCHED: Latency : 1 | ||
|
|
||
| ; CHECK: ** Final schedule for BB#0 *** | ||
| ; GENERIC: LDRi12 | ||
| ; GENERIC: SDIV | ||
| ; A57_SCHED: SDIV | ||
| ; A57_SCHED: LDRi12 | ||
| ; CHECK: ********** INTERVALS ********** | ||
|
|
||
| target datalayout = "e-m:e-p:32:32-i64:64-v128:64:128-a:0:32-n32-S64" | ||
| target triple = "armv8r-arm-none-eabi" | ||
|
|
||
| ; Function Attrs: norecurse nounwind readnone | ||
| ; Mixes an xor, a load, an add, an sdiv and a sub. The sdiv depends only on | ||
| ; the arguments %a and %b, so it is independent of the xor/load/add chain; | ||
| ; the final sub is the only consumer of both. | ||
| define hidden i32 @foo(i32 %a, i32 %b, i32 %c, i32* %d) local_unnamed_addr #0 { | ||
| entry: | ||
| %xor = xor i32 %c, %b | ||
| %ld = load i32, i32* %d | ||
| %add = add nsw i32 %xor, %ld | ||
| %div = sdiv i32 %a, %b | ||
| %sub = sub i32 %div, %add | ||
| ret i32 %sub | ||
| } | ||
|
|
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,37 @@ | ||
| ; REQUIRES: asserts | ||
| ; RUN: llc < %s -mtriple=armv8r-eabi -mcpu=cortex-a57 -misched-postra -enable-misched -verify-misched -debug-only=machine-scheduler -o - 2>&1 > /dev/null | FileCheck %s | ||
| ; | ||
|
|
||
| @a = global i32 0, align 4 | ||
| @b = global i32 0, align 4 | ||
| @c = global i32 0, align 4 | ||
|
|
||
| ; CHECK: ********** MI Scheduling ********** | ||
| ; The second, post-RA scheduling pass is needed to see the LDM instruction formed by combining the single loads. | ||
| ; CHECK: ********** MI Scheduling ********** | ||
| ; CHECK: LDMIA_UPD | ||
| ; CHECK: rdefs left | ||
| ; CHECK-NEXT: Latency : 4 | ||
| ; CHECK: Successors: | ||
| ; CHECK: data | ||
| ; CHECK-SAME: Latency=1 | ||
| ; CHECK-NEXT: data | ||
| ; CHECK-SAME: Latency=3 | ||
| ; CHECK-NEXT: data | ||
| ; CHECK-SAME: Latency=3 | ||
| ; CHECK-NEXT: data | ||
| ; CHECK-SAME: Latency=4 | ||
| ; Loads the three i32 globals @a, @b and @c, then multiplies the address three | ||
| ; words past @a with the three loaded values. The directives above expect the | ||
| ; loads to appear as one LDMIA_UPD; using the address past the loads | ||
| ; presumably gives the written-back base register a consumer. | ||
| define i32 @bar(i32 %a1, i32 %b1, i32 %c1) minsize optsize { | ||
| %1 = load i32, i32* @a, align 4 | ||
| %2 = load i32, i32* @b, align 4 | ||
| %3 = load i32, i32* @c, align 4 | ||
| 
| 
||
| ; Address of the word following the three loaded words. | ||
| %ptr_after = getelementptr i32, i32* @a, i32 3 | ||
| 
| 
||
| %ptr_val = ptrtoint i32* %ptr_after to i32 | ||
| %mul1 = mul i32 %ptr_val, %1 | ||
| %mul2 = mul i32 %mul1, %2 | ||
| %mul3 = mul i32 %mul2, %3 | ||
| ret i32 %mul3 | ||
| } | ||
|
|
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,28 @@ | ||
| ; REQUIRES: asserts | ||
| ; RUN: llc < %s -mtriple=armv8r-eabi -mcpu=cortex-a57 -misched-postra -enable-misched -verify-misched -debug-only=machine-scheduler -o - 2>&1 > /dev/null | FileCheck %s | ||
|
|
||
| ; CHECK: ********** MI Scheduling ********** | ||
| ; The second, post-RA scheduling pass is needed to see the LDM instruction formed by combining the single loads. | ||
| ; CHECK: ********** MI Scheduling ********** | ||
| ; CHECK: LDMIA | ||
| ; CHECK: rdefs left | ||
| ; CHECK-NEXT: Latency : 3 | ||
| ; CHECK: Successors: | ||
| ; CHECK: data | ||
| ; CHECK-SAME: Latency=3 | ||
| ; CHECK-NEXT: data | ||
| ; CHECK-SAME: Latency=3 | ||
|
|
||
| ; Loads three consecutive i32 words starting at %a and returns their product. | ||
| ; The directives above expect the loads to appear as a single LDMIA. | ||
| define i32 @foo(i32* %a) nounwind optsize { | ||
| entry: | ||
| %b = getelementptr i32, i32* %a, i32 1 | ||
| %c = getelementptr i32, i32* %a, i32 2 | ||
| %0 = load i32, i32* %a, align 4 | ||
| %1 = load i32, i32* %b, align 4 | ||
| %2 = load i32, i32* %c, align 4 | ||
| 
| 
||
| %mul1 = mul i32 %0, %1 | ||
| %mul2 = mul i32 %mul1, %2 | ||
| ret i32 %mul2 | ||
| } | ||
|
|
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,36 @@ | ||
| ; REQUIRES: asserts | ||
| ; RUN: llc < %s -mtriple=armv8r-eabi -mcpu=cortex-a57 -misched-postra -enable-misched -verify-misched -debug-only=machine-scheduler -o - 2>&1 > /dev/null | FileCheck %s | ||
| ; N=3 STMIA_UPD should have latency 2cyc and writeback latency 1cyc | ||
|
|
||
| ; CHECK: ********** MI Scheduling ********** | ||
| ; The second, post-RA scheduling pass is needed to see the STM instruction formed by combining the single stores. | ||
| ; CHECK: ********** MI Scheduling ********** | ||
| ; CHECK: schedule starting | ||
| ; CHECK: STMIA_UPD | ||
| ; CHECK: rdefs left | ||
| ; CHECK-NEXT: Latency : 2 | ||
| ; CHECK: Successors | ||
| ; CHECK: data | ||
| ; CHECK-SAME: Latency=1 | ||
|
|
||
| ; Stores %v0, %v1 and %v2 to three consecutive i32 slots starting at %addr, | ||
| ; then multiplies the address just past the third slot with the stored values. | ||
| ; The directives above expect the stores to appear as one STMIA_UPD; the use | ||
| ; of the address past the stores presumably gives the written-back base | ||
| ; register a consumer. | ||
| define i32 @bar(i32 %v0, i32 %v1, i32 %v2, i32* %addr) { | ||
| 
| 
||
| %addr.1 = getelementptr i32, i32* %addr, i32 0 | ||
| store i32 %v0, i32* %addr.1 | ||
| 
| 
||
| %addr.2 = getelementptr i32, i32* %addr, i32 1 | ||
| store i32 %v1, i32* %addr.2 | ||
| 
| 
||
| %addr.3 = getelementptr i32, i32* %addr, i32 2 | ||
| store i32 %v2, i32* %addr.3 | ||
| 
| 
||
| ; Address of the word following the three stored words. | ||
| %ptr_after = getelementptr i32, i32* %addr, i32 3 | ||
| %val = ptrtoint i32* %ptr_after to i32 | ||
| 
| 
||
| %rv1 = mul i32 %val, %v0 | ||
| %rv2 = mul i32 %rv1, %v1 | ||
| %rv3 = mul i32 %rv2, %v2 | ||
| 
| 
||
| ret i32 %rv3 | ||
| } | ||
|
|
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,29 @@ | ||
| ; REQUIRES: asserts | ||
| ; RUN: llc < %s -mtriple=armv8r-eabi -mcpu=cortex-a57 -misched-postra -enable-misched -verify-misched -debug-only=machine-scheduler -o - 2>&1 > /dev/null | FileCheck %s | ||
| ; N=3 STMIB should have latency 2cyc | ||
|
|
||
| ; CHECK: ********** MI Scheduling ********** | ||
| ; The second, post-RA scheduling pass is needed to see the STM instruction formed by combining the single stores. | ||
| ; CHECK: ********** MI Scheduling ********** | ||
| ; CHECK: schedule starting | ||
| ; CHECK: STMIB | ||
| ; CHECK: rdefs left | ||
| ; CHECK-NEXT: Latency : 2 | ||
|
|
||
| ; Stores three i32 values to the consecutive slots %addr+1, %addr+2 and | ||
| ; %addr+3 (the first slot is one word above the base), then returns | ||
| ; %v0 + %v1. The directives above expect the stores to appear as one STMIB. | ||
| define i32 @test_stm(i32 %v0, i32 %v1, i32* %addr) { | ||
| 
| 
||
| %addr.1 = getelementptr i32, i32* %addr, i32 1 | ||
| store i32 %v0, i32* %addr.1 | ||
| 
| 
||
| %addr.2 = getelementptr i32, i32* %addr, i32 2 | ||
| store i32 %v1, i32* %addr.2 | ||
| 
| 
||
| ; The third stored value is the base address itself, converted to an integer. | ||
| %addr.3 = getelementptr i32, i32* %addr, i32 3 | ||
| %val = ptrtoint i32* %addr to i32 | ||
| store i32 %val, i32* %addr.3 | ||
| 
| 
||
| %rv = add i32 %v0, %v1 | ||
| 
| 
||
| ret i32 %rv | ||
| } | ||
|
|
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,77 @@ | ||
| ; REQUIRES: asserts | ||
| ; RUN: llc < %s -mtriple=armv8r-eabi -mcpu=cortex-a57 -enable-misched -verify-misched -debug-only=machine-scheduler -o - 2>&1 > /dev/null | FileCheck %s | ||
| ; Check latencies of vmul/vfma accumulate chains. | ||
|
|
||
| ; Scalar form: three float multiplies summed by two adds. The directives | ||
| ; inside the body expect one VMULS followed by two VMLAS, and pin the latency | ||
| ; of each as well as the latency on the data edge to its successor. | ||
| define float @Test1(float %f1, float %f2, float %f3, float %f4, float %f5, float %f6) { | ||
| ; CHECK: ********** MI Scheduling ********** | ||
| ; CHECK: Test1:BB#0 | ||
| 
| 
||
| ; CHECK: VMULS | ||
| ; > VMULS common latency = 5 | ||
| ; CHECK: Latency : 5 | ||
| ; CHECK: Successors: | ||
| ; CHECK: data | ||
| ; > VMULS read-advanced latency to VMLAS = 0 | ||
| ; CHECK-SAME: Latency=0 | ||
| 
| 
||
| ; CHECK: VMLAS | ||
| ; > VMLAS common latency = 9 | ||
| ; CHECK: Latency : 9 | ||
| ; CHECK: Successors: | ||
| ; CHECK: data | ||
| ; > VMLAS read-advanced latency to the next VMLAS = 4 | ||
| ; CHECK-SAME: Latency=4 | ||
| 
| 
||
| ; CHECK: VMLAS | ||
| ; CHECK: Latency : 9 | ||
| ; CHECK: Successors: | ||
| ; CHECK: data | ||
| ; > VMLAS not-optimized latency to VMOVRS = 9 | ||
| ; CHECK-SAME: Latency=9 | ||
| 
| 
||
| ; f1 * f2 + f3 * f4 + f5 * f6 ==> VMULS, VMLAS, VMLAS | ||
| %mul1 = fmul float %f1, %f2 | ||
| %mul2 = fmul float %f3, %f4 | ||
| %mul3 = fmul float %f5, %f6 | ||
| %add1 = fadd float %mul1, %mul2 | ||
| %add2 = fadd float %add1, %mul3 | ||
| ret float %add2 | ||
| } | ||
|
|
||
| ; ASIMD form | ||
| ; Vector (<2 x float>) form of the same multiply/accumulate chain. The | ||
| ; directives inside the body expect one VMULfd followed by two VMLAfd, and pin | ||
| ; the latency of each as well as the latency on the data edge to its successor. | ||
| define <2 x float> @Test2(<2 x float> %f1, <2 x float> %f2, <2 x float> %f3, <2 x float> %f4, <2 x float> %f5, <2 x float> %f6) { | ||
| ; CHECK: ********** MI Scheduling ********** | ||
| ; CHECK: Test2:BB#0 | ||
| 
| 
||
| ; CHECK: VMULfd | ||
| ; > VMULfd common latency = 5 | ||
| ; CHECK: Latency : 5 | ||
| ; CHECK: Successors: | ||
| ; CHECK: data | ||
| ; > VMULfd read-advanced latency to VMLAfd = 0 | ||
| ; CHECK-SAME: Latency=0 | ||
| 
| 
||
| ; CHECK: VMLAfd | ||
| ; > VMLAfd common latency = 9 | ||
| ; CHECK: Latency : 9 | ||
| ; CHECK: Successors: | ||
| ; CHECK: data | ||
| ; > VMLAfd read-advanced latency to the next VMLAfd = 4 | ||
| ; CHECK-SAME: Latency=4 | ||
| 
| 
||
| ; CHECK: VMLAfd | ||
| ; CHECK: Latency : 9 | ||
| ; CHECK: Successors: | ||
| ; CHECK: data | ||
| ; > VMLAfd not-optimized latency to VMOVRRD = 9 | ||
| ; CHECK-SAME: Latency=9 | ||
| 
| 
||
| ; f1 * f2 + f3 * f4 + f5 * f6 ==> VMULfd, VMLAfd, VMLAfd | ||
| %mul1 = fmul <2 x float> %f1, %f2 | ||
| %mul2 = fmul <2 x float> %f3, %f4 | ||
| %mul3 = fmul <2 x float> %f5, %f6 | ||
| %add1 = fadd <2 x float> %mul1, %mul2 | ||
| %add2 = fadd <2 x float> %add1, %mul3 | ||
| ret <2 x float> %add2 | ||
| } | ||
|
|
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,50 @@ | ||
| ; REQUIRES: asserts | ||
| ; RUN: llc < %s -mtriple=armv8r-eabi -mcpu=cortex-a57 -misched-postra -enable-misched -verify-misched -debug-only=machine-scheduler -o - 2>&1 > /dev/null | FileCheck %s | ||
| ; | ||
|
|
||
| @a = global double 0.0, align 4 | ||
| @b = global double 0.0, align 4 | ||
| @c = global double 0.0, align 4 | ||
|
|
||
| ; CHECK: ********** MI Scheduling ********** | ||
| ; The second, post-RA scheduling pass is needed to see the VLDM instruction formed by combining the single loads. | ||
| ; CHECK: ********** MI Scheduling ********** | ||
| ; CHECK: VLDMDIA_UPD | ||
| ; CHECK: rdefs left | ||
| ; CHECK-NEXT: Latency : 6 | ||
| ; CHECK: Successors: | ||
| ; CHECK: data | ||
| ; CHECK-SAME: Latency=1 | ||
| ; CHECK-NEXT: data | ||
| ; CHECK-SAME: Latency=1 | ||
| ; CHECK-NEXT: data | ||
| ; CHECK-SAME: Latency=5 | ||
| ; CHECK-NEXT: data | ||
| ; CHECK-SAME: Latency=5 | ||
| ; CHECK-NEXT: data | ||
| ; CHECK-SAME: Latency=6 | ||
| ; Loads the three double globals @a, @b and @c, stores the address three | ||
| ; doubles past @a through %iptr, and multiplies that address with the loaded | ||
| ; values converted to i32. The directives above expect the loads to appear as | ||
| ; one VLDMDIA_UPD; the uses of the address past the loads presumably give the | ||
| ; written-back base register consumers. | ||
| define i32 @bar(i32* %iptr) minsize optsize { | ||
| %1 = load double, double* @a, align 8 | ||
| %2 = load double, double* @b, align 8 | ||
| %3 = load double, double* @c, align 8 | ||
| 
| 
||
| ; Address of the double following the three loaded doubles. | ||
| %ptr_after = getelementptr double, double* @a, i32 3 | ||
| 
| 
||
| %ptr_new_ival = ptrtoint double* %ptr_after to i32 | ||
| ; NOTE(review): %ptr_new has no use in this function. | ||
| %ptr_new = inttoptr i32 %ptr_new_ival to i32* | ||
| 
| 
||
| store i32 %ptr_new_ival, i32* %iptr, align 8 | ||
| 
| 
||
| %v1 = fptoui double %1 to i32 | ||
| 
| 
||
| %mul1 = mul i32 %ptr_new_ival, %v1 | ||
| 
| 
||
| %v2 = fptoui double %2 to i32 | ||
| %v3 = fptoui double %3 to i32 | ||
| 
| 
||
| %mul2 = mul i32 %mul1, %v2 | ||
| %mul3 = mul i32 %mul2, %v3 | ||
| 
| 
||
| ret i32 %mul3 | ||
| } | ||
|
|
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,30 @@ | ||
| ; REQUIRES: asserts | ||
| ; RUN: llc < %s -mtriple=armv8r-eabi -mcpu=cortex-a57 -misched-postra -enable-misched -verify-misched -debug-only=machine-scheduler -o - 2>&1 > /dev/null | FileCheck %s | ||
|
|
||
| ; CHECK: ********** MI Scheduling ********** | ||
| ; The second, post-RA scheduling pass is needed to see the VLDM instruction formed by combining the single loads. | ||
| ; CHECK: ********** MI Scheduling ********** | ||
| ; CHECK: VLDMDIA | ||
| ; CHECK: rdefs left | ||
| ; CHECK-NEXT: Latency : 6 | ||
| ; CHECK: Successors: | ||
| ; CHECK: data | ||
| ; CHECK-SAME: Latency=5 | ||
| ; CHECK-NEXT: data | ||
| ; CHECK-SAME: Latency=5 | ||
| ; CHECK-NEXT: data | ||
| ; CHECK-SAME: Latency=6 | ||
|
|
||
| ; Loads three consecutive doubles starting at %a and returns their product. | ||
| ; The directives above expect the loads to appear as a single VLDMDIA. | ||
| define double @foo(double* %a) nounwind optsize { | ||
| entry: | ||
| %b = getelementptr double, double* %a, i32 1 | ||
| %c = getelementptr double, double* %a, i32 2 | ||
| %0 = load double, double* %a, align 4 | ||
| %1 = load double, double* %b, align 4 | ||
| %2 = load double, double* %c, align 4 | ||
| 
| 
||
| %mul1 = fmul double %0, %1 | ||
| %mul2 = fmul double %mul1, %2 | ||
| ret double %mul2 | ||
| } | ||
|
|
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,43 @@ | ||
| ; REQUIRES: asserts | ||
| ; RUN: llc < %s -mtriple=armv8r-eabi -mcpu=cortex-a57 -misched-postra -enable-misched -verify-misched -debug-only=machine-scheduler -o - 2>&1 > /dev/null | FileCheck %s | ||
|
|
||
| ; CHECK: ********** MI Scheduling ********** | ||
| ; The second, post-RA scheduling pass is needed to see the VSTM instruction formed by combining the single stores. | ||
| ; CHECK: ********** MI Scheduling ********** | ||
| ; CHECK: schedule starting | ||
| ; CHECK: VSTMDIA_UPD | ||
| ; CHECK: rdefs left | ||
| ; CHECK-NEXT: Latency : 4 | ||
| ; CHECK: Successors: | ||
| ; CHECK: data | ||
| ; CHECK-SAME: Latency=1 | ||
|
|
||
| @a = global double 0.0, align 4 | ||
| @b = global double 0.0, align 4 | ||
| @c = global double 0.0, align 4 | ||
|
|
||
| ; Copies three consecutive doubles from %vptr into the globals @a, @b and @c, | ||
| ; stores the address three doubles past @a through %iptr, and returns that | ||
| ; address multiplied by %iv1. The directives above expect the three double | ||
| ; stores to appear as one VSTMDIA_UPD; the uses of the address past the stores | ||
| ; presumably give the written-back base register consumers. | ||
| define i32 @bar(double* %vptr, i32 %iv1, i32* %iptr) minsize { | ||
| 
| 
||
| %vp2 = getelementptr double, double* %vptr, i32 1 | ||
| %vp3 = getelementptr double, double* %vptr, i32 2 | ||
| 
| 
||
| %v1 = load double, double* %vptr, align 8 | ||
| %v2 = load double, double* %vp2, align 8 | ||
| %v3 = load double, double* %vp3, align 8 | ||
| 
| 
||
| store double %v1, double* @a, align 8 | ||
| store double %v2, double* @b, align 8 | ||
| store double %v3, double* @c, align 8 | ||
| 
| 
||
| ; Address of the double following the three stored doubles. | ||
| %ptr_after = getelementptr double, double* @a, i32 3 | ||
| 
| 
||
| %ptr_new_ival = ptrtoint double* %ptr_after to i32 | ||
| ; NOTE(review): %ptr_new has no use in this function. | ||
| %ptr_new = inttoptr i32 %ptr_new_ival to i32* | ||
| 
| 
||
| store i32 %ptr_new_ival, i32* %iptr, align 8 | ||
| 
| 
||
| %mul1 = mul i32 %ptr_new_ival, %iv1 | ||
| 
| 
||
| ret i32 %mul1 | ||
| } | ||
|
|
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,23 @@ | ||
| ; REQUIRES: asserts | ||
| ; RUN: llc < %s -mtriple=armv8r-eabi -mcpu=cortex-a57 -misched-postra -enable-misched -verify-misched -debug-only=machine-scheduler -o - 2>&1 > /dev/null | FileCheck %s | ||
|
|
||
| ; CHECK: ********** MI Scheduling ********** | ||
| ; The second, post-RA scheduling pass is needed to see the VSTM instruction formed by combining the single stores. | ||
| ; CHECK: ********** MI Scheduling ********** | ||
| ; CHECK: schedule starting | ||
| ; CHECK: VSTMDIA | ||
| ; CHECK: rdefs left | ||
| ; CHECK-NEXT: Latency : 2 | ||
|
|
||
| %bigVec = type [2 x double] | ||
|
|
||
| @var = global %bigVec zeroinitializer | ||
|
|
||
| ; Copies one %bigVec value ([2 x double]) from %ptr into the global @var with | ||
| ; a single aggregate load and store. The directives above expect the store | ||
| ; side to appear as a VSTMDIA. | ||
| define void @bar(%bigVec* %ptr) { | ||
| 
| 
||
| %tmp = load %bigVec, %bigVec* %ptr | ||
| store %bigVec %tmp, %bigVec* @var | ||
| 
| 
||
| ret void | ||
| } | ||
|
|