1,471 changes: 1,471 additions & 0 deletions llvm/lib/Target/ARM/ARMScheduleA57.td

Large diffs are not rendered by default.

323 changes: 323 additions & 0 deletions llvm/lib/Target/ARM/ARMScheduleA57WriteRes.td
Original file line number Diff line number Diff line change
@@ -0,0 +1,323 @@
//=- ARMScheduleA57WriteRes.td - ARM Cortex-A57 Write Res ---*- tablegen -*-=//
//
// The LLVM Compiler Infrastructure
//
// This file is distributed under the University of Illinois Open Source
// License. See LICENSE.TXT for details.
//
//===----------------------------------------------------------------------===//
//
// Contains all of the Cortex-A57 specific SchedWriteRes types. The approach
// below is to define a generic SchedWriteRes for every combination of
// latency and microOps. The naming convention is to use a prefix, one field
// for latency, and one or more microOp count/type designators.
// Prefix: A57Write
// Latency: #cyc
// MicroOp Count/Types: #(B|I|M|L|S|X|W|V)
//
// e.g. A57Write_6cyc_1I_6S_4V means the total latency is 6 and there are
// 11 micro-ops to be issued as follows: one to I pipe, six to S pipes and
// four to V pipes.
//
//===----------------------------------------------------------------------===//

//===----------------------------------------------------------------------===//
// Define Generic 1 micro-op types

// Single micro-op writes with 5 and 10 cycle latency.
let Latency = 5 in {
  def A57Write_5cyc_1M : SchedWriteRes<[A57UnitM]>;
  def A57Write_5cyc_1V : SchedWriteRes<[A57UnitV]>;
  def A57Write_5cyc_1W : SchedWriteRes<[A57UnitW]>;
}
let Latency = 10 in
def A57Write_10cyc_1V : SchedWriteRes<[A57UnitV]>;

// Long-latency single micro-op writes. ResourceCycles is set to the same
// value as Latency for each of these entries.
let Latency = 17, ResourceCycles = [17] in
def A57Write_17cyc_1W : SchedWriteRes<[A57UnitW]>;
let Latency = 18, ResourceCycles = [18] in
def A57Write_18cyc_1X : SchedWriteRes<[A57UnitX]>;
let Latency = 19, ResourceCycles = [19] in
def A57Write_19cyc_1M : SchedWriteRes<[A57UnitM]>;
let Latency = 20, ResourceCycles = [20] in
def A57Write_20cyc_1M : SchedWriteRes<[A57UnitM]>;
// Single micro-op writes with 1 to 3 cycle latency on the B, I, S and M pipes.
let Latency = 1 in {
  def A57Write_1cyc_1B : SchedWriteRes<[A57UnitB]>;
  def A57Write_1cyc_1I : SchedWriteRes<[A57UnitI]>;
}
let Latency = 2 in
def A57Write_2cyc_1I : SchedWriteRes<[A57UnitI]>;
let Latency = 3 in
def A57Write_3cyc_1I : SchedWriteRes<[A57UnitI]>;
let Latency = 1 in
def A57Write_1cyc_1S : SchedWriteRes<[A57UnitS]>;
let Latency = 2 in
def A57Write_2cyc_1S : SchedWriteRes<[A57UnitS]>;
let Latency = 3 in
def A57Write_3cyc_1S : SchedWriteRes<[A57UnitS]>;
let Latency = 2 in
def A57Write_2cyc_1M : SchedWriteRes<[A57UnitM]>;

// 32 and 35 cycle writes; ResourceCycles matches Latency for each entry.
let Latency = 32, ResourceCycles = [32] in {
  def A57Write_32cyc_1W : SchedWriteRes<[A57UnitW]>;
  def A57Write_32cyc_1X : SchedWriteRes<[A57UnitX]>;
}
let Latency = 35, ResourceCycles = [35] in
def A57Write_35cyc_1M : SchedWriteRes<[A57UnitM]>;

// 3 cycle writes on the M, V, W and X pipes.
let Latency = 3 in {
  def A57Write_3cyc_1M : SchedWriteRes<[A57UnitM]>;
  def A57Write_3cyc_1V : SchedWriteRes<[A57UnitV]>;
  def A57Write_3cyc_1W : SchedWriteRes<[A57UnitW]>;
  def A57Write_3cyc_1X : SchedWriteRes<[A57UnitX]>;
}

// Generates A57Write_3cyc_1L through A57Write_20cyc_1L: one micro-op on the
// L pipe, with Latency equal to the cycle count embedded in the name.
foreach N = 3-20 in {
  def A57Write_#N#cyc_1L : SchedWriteRes<[A57UnitL]> { let Latency = N; }
}

// Generates A57Write_4cyc_1S through A57Write_16cyc_1S: one micro-op on the
// S pipe, with Latency equal to the cycle count embedded in the name.
foreach N = 4-16 in {
  def A57Write_#N#cyc_1S : SchedWriteRes<[A57UnitS]> { let Latency = N; }
}

// 4 cycle latency, one micro-op on the M pipe. The resource must be A57UnitM
// to match the "_1M" designator in the name (it was previously A57UnitL).
def A57Write_4cyc_1M : SchedWriteRes<[A57UnitM]> { let Latency = 4; }
// Remaining single micro-op writes (4 to 9 cycle latency).
let Latency = 4 in {
  def A57Write_4cyc_1X : SchedWriteRes<[A57UnitX]>;
  def A57Write_4cyc_1W : SchedWriteRes<[A57UnitW]>;
}
let Latency = 5 in
def A57Write_5cyc_1X : SchedWriteRes<[A57UnitX]>;
let Latency = 6 in {
  def A57Write_6cyc_1X : SchedWriteRes<[A57UnitX]>;
  def A57Write_6cyc_1W : SchedWriteRes<[A57UnitW]>;
}
let Latency = 8 in
def A57Write_8cyc_1V : SchedWriteRes<[A57UnitV]>;
let Latency = 9 in
def A57Write_9cyc_1V : SchedWriteRes<[A57UnitV]>;
let Latency = 6 in {
  def A57Write_6cyc_1M : SchedWriteRes<[A57UnitM]>;
  def A57Write_6cyc_1V : SchedWriteRes<[A57UnitV]>;
}


//===----------------------------------------------------------------------===//
// Define Generic 2 micro-op types

// Two micro-op writes. NumMicroOps is factored out for the whole group; each
// def lists its two pipes and sets only its own latency.
let NumMicroOps = 2 in {
  // ResourceCycles is [32, 32] for this entry.
  def A57Write_64cyc_2X : SchedWriteRes<[A57UnitX, A57UnitX]> {
    let Latency = 64;
    let ResourceCycles = [32, 32];
  }
  def A57Write_6cyc_1I_1L : SchedWriteRes<[A57UnitI, A57UnitL]> {
    let Latency = 6;
  }
  def A57Write_6cyc_1V_1X : SchedWriteRes<[A57UnitV, A57UnitX]> {
    let Latency = 6;
  }
  def A57Write_7cyc_1V_1X : SchedWriteRes<[A57UnitV, A57UnitX]> {
    let Latency = 7;
  }
  def A57Write_8cyc_1L_1V : SchedWriteRes<[A57UnitL, A57UnitV]> {
    let Latency = 8;
  }
  def A57Write_9cyc_1L_1V : SchedWriteRes<[A57UnitL, A57UnitV]> {
    let Latency = 9;
  }
  def A57Write_9cyc_2V : SchedWriteRes<[A57UnitV, A57UnitV]> {
    let Latency = 9;
  }
  def A57Write_8cyc_2X : SchedWriteRes<[A57UnitX, A57UnitX]> {
    let Latency = 8;
  }
  def A57Write_6cyc_2L : SchedWriteRes<[A57UnitL, A57UnitL]> {
    let Latency = 6;
  }
  def A57Write_6cyc_2V : SchedWriteRes<[A57UnitV, A57UnitV]> {
    let Latency = 6;
  }
  def A57Write_6cyc_2W : SchedWriteRes<[A57UnitW, A57UnitW]> {
    let Latency = 6;
  }
  def A57Write_5cyc_1I_1L : SchedWriteRes<[A57UnitI, A57UnitL]> {
    let Latency = 5;
  }
  def A57Write_5cyc_1I_1M : SchedWriteRes<[A57UnitI, A57UnitM]> {
    let Latency = 5;
  }
  def A57Write_5cyc_2V : SchedWriteRes<[A57UnitV, A57UnitV]> {
    let Latency = 5;
  }
  def A57Write_5cyc_2X : SchedWriteRes<[A57UnitX, A57UnitX]> {
    let Latency = 5;
  }
  def A57Write_10cyc_1L_1V : SchedWriteRes<[A57UnitL, A57UnitV]> {
    let Latency = 10;
  }
  def A57Write_10cyc_2V : SchedWriteRes<[A57UnitV, A57UnitV]> {
    let Latency = 10;
  }
}
// Two micro-op writes with 1 to 3 cycle latency on the B, I, S and M pipes.
// The order of the pipes in the resource list follows the order in the name.
let NumMicroOps = 2 in {
  def A57Write_1cyc_1B_1I : SchedWriteRes<[A57UnitB, A57UnitI]> {
    let Latency = 1;
  }
  def A57Write_1cyc_1I_1S : SchedWriteRes<[A57UnitI, A57UnitS]> {
    let Latency = 1;
  }
  def A57Write_1cyc_1S_1I : SchedWriteRes<[A57UnitS, A57UnitI]> {
    let Latency = 1;
  }
  def A57Write_2cyc_1S_1I : SchedWriteRes<[A57UnitS, A57UnitI]> {
    let Latency = 2;
  }
  def A57Write_3cyc_1S_1I : SchedWriteRes<[A57UnitS, A57UnitI]> {
    let Latency = 3;
  }
  def A57Write_1cyc_1S_1M : SchedWriteRes<[A57UnitS, A57UnitM]> {
    let Latency = 1;
  }
  def A57Write_2cyc_1B_1I : SchedWriteRes<[A57UnitB, A57UnitI]> {
    let Latency = 2;
  }
  def A57Write_3cyc_1B_1I : SchedWriteRes<[A57UnitB, A57UnitI]> {
    let Latency = 3;
  }
}
// 6 cycle latency, two micro-ops: one on the B pipe and one on the L pipe.
// The second resource must be A57UnitL to match the "_1L" designator in the
// name (it was previously A57UnitI).
def A57Write_6cyc_1B_1L : SchedWriteRes<[A57UnitB,
                                         A57UnitL]> {
  let Latency = 6;
  let NumMicroOps = 2;
}
// Two micro-op writes: I+M pairs, 2S/2V pairs and the 36 cycle 2X entry.
let NumMicroOps = 2 in {
  def A57Write_2cyc_1I_1M : SchedWriteRes<[A57UnitI, A57UnitM]> {
    let Latency = 2;
  }
  def A57Write_2cyc_2S : SchedWriteRes<[A57UnitS, A57UnitS]> {
    let Latency = 2;
  }
  def A57Write_2cyc_2V : SchedWriteRes<[A57UnitV, A57UnitV]> {
    let Latency = 2;
  }
  // ResourceCycles is [18, 18] for this entry.
  def A57Write_36cyc_2X : SchedWriteRes<[A57UnitX, A57UnitX]> {
    let Latency = 36;
    let ResourceCycles = [18, 18];
  }
  def A57Write_3cyc_1I_1M : SchedWriteRes<[A57UnitI, A57UnitM]> {
    let Latency = 3;
  }
  def A57Write_4cyc_1I_1M : SchedWriteRes<[A57UnitI, A57UnitM]> {
    let Latency = 4;
  }
}

// Generates A57Write_3cyc_1L_1I through A57Write_20cyc_1L_1I: two micro-ops
// (L pipe, then I pipe) with Latency equal to the cycle count in the name.
foreach N = 3-20 in {
  def A57Write_#N#cyc_1L_1I : SchedWriteRes<[A57UnitL, A57UnitI]> {
    let Latency = N;
    let NumMicroOps = 2;
  }
}

// Two micro-op writes with 3 and 4 cycle latency on the I, S and V pipes.
let NumMicroOps = 2 in {
  def A57Write_3cyc_1I_1S : SchedWriteRes<[A57UnitI, A57UnitS]> {
    let Latency = 3;
  }
  def A57Write_3cyc_1S_1V : SchedWriteRes<[A57UnitS, A57UnitV]> {
    let Latency = 3;
  }
  def A57Write_4cyc_1S_1V : SchedWriteRes<[A57UnitS, A57UnitV]> {
    let Latency = 4;
  }
  def A57Write_3cyc_2V : SchedWriteRes<[A57UnitV, A57UnitV]> {
    let Latency = 3;
  }
}

// Generates A57Write_4cyc_1S_1I through A57Write_16cyc_1S_1I: two micro-ops
// (S pipe, then I pipe) with Latency equal to the cycle count in the name.
foreach N = 4-16 in {
  def A57Write_#N#cyc_1S_1I : SchedWriteRes<[A57UnitS, A57UnitI]> {
    let Latency = N;
    let NumMicroOps = 2;
  }
}

// 4 cycle latency, two micro-ops, both on the X pipe.
let Latency = 4, NumMicroOps = 2 in
def A57Write_4cyc_2X : SchedWriteRes<[A57UnitX, A57UnitX]>;


//===----------------------------------------------------------------------===//
// Define Generic 3 micro-op types

// Three micro-op writes. NumMicroOps is factored out for the whole group; each
// def lists its three pipes in name order and sets only its own latency.
let NumMicroOps = 3 in {
  def A57Write_10cyc_3V : SchedWriteRes<[A57UnitV, A57UnitV, A57UnitV]> {
    let Latency = 10;
  }
  def A57Write_2cyc_1I_2S : SchedWriteRes<[A57UnitI, A57UnitS, A57UnitS]> {
    let Latency = 2;
  }
  def A57Write_3cyc_1I_1S_1V : SchedWriteRes<[A57UnitI, A57UnitS, A57UnitV]> {
    let Latency = 3;
  }
  def A57Write_3cyc_1S_1V_1I : SchedWriteRes<[A57UnitS, A57UnitV, A57UnitI]> {
    let Latency = 3;
  }
  def A57Write_4cyc_1S_1V_1I : SchedWriteRes<[A57UnitS, A57UnitV, A57UnitI]> {
    let Latency = 4;
  }
  def A57Write_4cyc_1I_1L_1M : SchedWriteRes<[A57UnitI, A57UnitL, A57UnitM]> {
    let Latency = 4;
  }
  def A57Write_8cyc_1L_1V_1I : SchedWriteRes<[A57UnitL, A57UnitV, A57UnitI]> {
    let Latency = 8;
  }
  def A57Write_9cyc_1L_1V_1I : SchedWriteRes<[A57UnitL, A57UnitV, A57UnitI]> {
    let Latency = 9;
  }
}
5 changes: 5 additions & 0 deletions llvm/lib/Target/ARM/ARMSubtarget.h
Original file line number Diff line number Diff line change
Expand Up @@ -234,6 +234,10 @@ class ARMSubtarget : public ARMGenSubtargetInfo {
/// CPSR setting instruction.
bool AvoidCPSRPartialUpdate = false;

/// CheapPredicableCPSRDef - If true, disable +1 predication cost
/// for instructions updating CPSR. Enabled for Cortex-A57.
bool CheapPredicableCPSRDef = false;

/// AvoidMOVsShifterOperand - If true, codegen should avoid using flag setting
/// movs with shifter operand (i.e. asr, lsl, lsr).
bool AvoidMOVsShifterOperand = false;
Expand Down Expand Up @@ -543,6 +547,7 @@ class ARMSubtarget : public ARMGenSubtargetInfo {
bool nonpipelinedVFP() const { return NonpipelinedVFP; }
bool prefers32BitThumb() const { return Pref32BitThumb; }
bool avoidCPSRPartialUpdate() const { return AvoidCPSRPartialUpdate; }
bool cheapPredicableCPSRDef() const { return CheapPredicableCPSRDef; }
bool avoidMOVsShifterOperand() const { return AvoidMOVsShifterOperand; }
bool hasRetAddrStack() const { return HasRetAddrStack; }
bool hasMPExtension() const { return HasMPExtension; }
Expand Down
81 changes: 81 additions & 0 deletions llvm/test/CodeGen/ARM/cortex-a57-misched-alu.ll
Original file line number Diff line number Diff line change
@@ -0,0 +1,81 @@
; REQUIRES: asserts
; RUN: llc < %s -mtriple=armv8r-eabi -mcpu=cortex-a57 -enable-misched -verify-misched -debug-only=machine-scheduler -o - 2>&1 > /dev/null | FileCheck %s

; Check the latency for ALU shifted operand variants.
;
; CHECK: ********** MI Scheduling **********
; CHECK: foo:BB#0 entry

; ALU, basic - 1 cyc I0/I1
; CHECK: EORrr
; CHECK: rdefs left
; CHECK-NEXT: Latency : 1

; ALU, shift by immed - 2 cyc M
; CHECK: ADDrsi
; CHECK: rdefs left
; CHECK-NEXT: Latency : 2

; ALU, shift by register, unconditional - 2 cyc M
; CHECK: RSBrsr
; CHECK: rdefs left
; CHECK-NEXT: Latency : 2

; ALU, shift by register, conditional - 2 cyc I0/I1
; CHECK: ANDrsr
; CHECK: rdefs left
; CHECK-NEXT: Latency : 2

; Checking scheduling units

; CHECK: ** ScheduleDAGMILive::schedule picking next node
; Skipping COPY
; CHECK: ** ScheduleDAGMILive::schedule picking next node
; CHECK: Scheduling
; CHECK-SAME: ANDrsr
; CHECK: Ready
; CHECK-NEXT: A57UnitI

; CHECK: ** ScheduleDAGMILive::schedule picking next node
; CHECK: Scheduling
; CHECK-SAME: CMPri
; CHECK: Ready
; CHECK-NEXT: A57UnitI

; CHECK: ** ScheduleDAGMILive::schedule picking next node
; CHECK: Scheduling
; CHECK-SAME: RSBrsr
; CHECK: Ready
; CHECK-NEXT: A57UnitM

; CHECK: ** ScheduleDAGMILive::schedule picking next node
; CHECK: Scheduling
; CHECK-SAME: ADDrsi
; CHECK: Ready
; CHECK-NEXT: A57UnitM

; CHECK: ** ScheduleDAGMILive::schedule picking next node
; CHECK: Scheduling
; CHECK-SAME: EORrr
; CHECK: Ready
; CHECK-NEXT: A57UnitI


target datalayout = "e-m:e-p:32:32-i64:64-v128:64:128-a:0:32-n32-S64"
target triple = "armv8r-arm-none-eabi"

; Function Attrs: norecurse nounwind readnone
; Builds one dependent chain of integer ALU operations whose second operand is
; either plain, shifted by an immediate, or shifted by a register, and ends
; with a select on %a > 4. The expectations earlier in this file look for
; EORrr, ADDrsi, RSBrsr, ANDrsr and CMPri in the scheduler debug output.
define hidden i32 @foo(i32 %a, i32 %b, i32 %c, i32 %d) local_unnamed_addr #0 {
entry:
%xor = xor i32 %a, %b
; Shift by the immediate 2; the result feeds the add below.
%xor_shl = shl i32 %xor, 2
%add = add i32 %xor_shl, %d
; Shift by the register value %a; the result feeds the sub below.
%add_ashr = ashr i32 %add, %a
%sub = sub i32 %add_ashr, %a
; Shift by the register value %c; the result feeds the and below.
%sub_lshr_pred = lshr i32 %sub, %c
%pred = icmp sgt i32 %a, 4
%and = and i32 %sub_lshr_pred, %b
; Returns %and when %a > 4, otherwise %d.
%rv = select i1 %pred, i32 %and, i32 %d
ret i32 %rv
}

53 changes: 53 additions & 0 deletions llvm/test/CodeGen/ARM/cortex-a57-misched-basic.ll
Original file line number Diff line number Diff line change
@@ -0,0 +1,53 @@
; REQUIRES: asserts
; RUN: llc < %s -mtriple=armv8r-eabi -mcpu=cortex-a57 -enable-misched -verify-misched -debug-only=machine-scheduler -o - 2>&1 > /dev/null | FileCheck %s --check-prefix=CHECK --check-prefix=A57_SCHED
; RUN: llc < %s -mtriple=armv8r-eabi -mcpu=generic -enable-misched -verify-misched -debug-only=machine-scheduler -o - 2>&1 > /dev/null | FileCheck %s --check-prefix=CHECK --check-prefix=GENERIC

; Check the latency for instructions for both generic and cortex-a57.
; SDIV should be scheduled at the beginning of the block (20 cyc of independent M unit).
;
; CHECK: ********** MI Scheduling **********
; CHECK: foo:BB#0 entry

; GENERIC: SDIV
; GENERIC: Latency : 1
; GENERIC: EORrr
; GENERIC: Latency : 1
; GENERIC: LDRi12
; GENERIC: Latency : 4
; GENERIC: ADDrr
; GENERIC: Latency : 1
; GENERIC: SUBrr
; GENERIC: Latency : 1

; A57_SCHED: SDIV
; A57_SCHED: Latency : 20
; A57_SCHED: EORrr
; A57_SCHED: Latency : 1
; A57_SCHED: LDRi12
; A57_SCHED: Latency : 4
; A57_SCHED: ADDrr
; A57_SCHED: Latency : 1
; A57_SCHED: SUBrr
; A57_SCHED: Latency : 1

; CHECK: ** Final schedule for BB#0 ***
; GENERIC: LDRi12
; GENERIC: SDIV
; A57_SCHED: SDIV
; A57_SCHED: LDRi12
; CHECK: ********** INTERVALS **********

target datalayout = "e-m:e-p:32:32-i64:64-v128:64:128-a:0:32-n32-S64"
target triple = "armv8r-arm-none-eabi"

; Function Attrs: norecurse nounwind readnone
; Computes (%a / %b) - ((%c ^ %b) + *%d). The sdiv does not depend on the
; xor/load/add chain; the expectations above look for SDIV ahead of LDRi12 in
; the final cortex-a57 schedule and for the opposite order with -mcpu=generic.
define hidden i32 @foo(i32 %a, i32 %b, i32 %c, i32* %d) local_unnamed_addr #0 {
entry:
%xor = xor i32 %c, %b
%ld = load i32, i32* %d
%add = add nsw i32 %xor, %ld
%div = sdiv i32 %a, %b
%sub = sub i32 %div, %add
ret i32 %sub
}

37 changes: 37 additions & 0 deletions llvm/test/CodeGen/ARM/cortex-a57-misched-ldm-wrback.ll
Original file line number Diff line number Diff line change
@@ -0,0 +1,37 @@
; REQUIRES: asserts
; RUN: llc < %s -mtriple=armv8r-eabi -mcpu=cortex-a57 -misched-postra -enable-misched -verify-misched -debug-only=machine-scheduler -o - 2>&1 > /dev/null | FileCheck %s
;

@a = global i32 0, align 4
@b = global i32 0, align 4
@c = global i32 0, align 4

; CHECK: ********** MI Scheduling **********
; We need second, post-ra scheduling to have LDM instruction combined from single-loads
; CHECK: ********** MI Scheduling **********
; CHECK: LDMIA_UPD
; CHECK: rdefs left
; CHECK-NEXT: Latency : 4
; CHECK: Successors:
; CHECK: data
; CHECK-SAME: Latency=1
; CHECK-NEXT: data
; CHECK-SAME: Latency=3
; CHECK-NEXT: data
; CHECK-SAME: Latency=3
; CHECK-NEXT: data
; CHECK-SAME: Latency=4
; Loads the three i32 globals @a, @b and @c, takes the address three i32
; elements past @a as an integer, and returns the product of that address and
; the three loaded values. The expectations above look for an LDMIA_UPD in the
; second scheduling region.
; NOTE(review): the use of %ptr_after presumably gives the write-back result a
; consumer - confirm against the generated code.
define i32 @bar(i32 %a1, i32 %b1, i32 %c1) minsize optsize {
%1 = load i32, i32* @a, align 4
%2 = load i32, i32* @b, align 4
%3 = load i32, i32* @c, align 4

%ptr_after = getelementptr i32, i32* @a, i32 3

%ptr_val = ptrtoint i32* %ptr_after to i32
%mul1 = mul i32 %ptr_val, %1
%mul2 = mul i32 %mul1, %2
%mul3 = mul i32 %mul2, %3
ret i32 %mul3
}

28 changes: 28 additions & 0 deletions llvm/test/CodeGen/ARM/cortex-a57-misched-ldm.ll
Original file line number Diff line number Diff line change
@@ -0,0 +1,28 @@
; REQUIRES: asserts
; RUN: llc < %s -mtriple=armv8r-eabi -mcpu=cortex-a57 -misched-postra -enable-misched -verify-misched -debug-only=machine-scheduler -o - 2>&1 > /dev/null | FileCheck %s

; CHECK: ********** MI Scheduling **********
; We need second, post-ra scheduling to have LDM instruction combined from single-loads
; CHECK: ********** MI Scheduling **********
; CHECK: LDMIA
; CHECK: rdefs left
; CHECK-NEXT: Latency : 3
; CHECK: Successors:
; CHECK: data
; CHECK-SAME: Latency=3
; CHECK-NEXT: data
; CHECK-SAME: Latency=3

; Loads three consecutive i32 values starting at %a and returns their product.
; The expectations above look for a single LDMIA in the second scheduling
; region.
define i32 @foo(i32* %a) nounwind optsize {
entry:
%b = getelementptr i32, i32* %a, i32 1
%c = getelementptr i32, i32* %a, i32 2
%0 = load i32, i32* %a, align 4
%1 = load i32, i32* %b, align 4
%2 = load i32, i32* %c, align 4

%mul1 = mul i32 %0, %1
%mul2 = mul i32 %mul1, %2
ret i32 %mul2
}

36 changes: 36 additions & 0 deletions llvm/test/CodeGen/ARM/cortex-a57-misched-stm-wrback.ll
Original file line number Diff line number Diff line change
@@ -0,0 +1,36 @@
; REQUIRES: asserts
; RUN: llc < %s -mtriple=armv8r-eabi -mcpu=cortex-a57 -misched-postra -enable-misched -verify-misched -debug-only=machine-scheduler -o - 2>&1 > /dev/null | FileCheck %s
; N=3 STMIA_UPD should have latency 2cyc and writeback latency 1cyc

; CHECK: ********** MI Scheduling **********
; We need second, post-ra scheduling to have STM instruction combined from single-stores
; CHECK: ********** MI Scheduling **********
; CHECK: schedule starting
; CHECK: STMIA_UPD
; CHECK: rdefs left
; CHECK-NEXT: Latency : 2
; CHECK: Successors
; CHECK: data
; CHECK-SAME: Latency=1

; Stores %v0, %v1 and %v2 to three consecutive i32 slots starting at %addr,
; then returns the product of the address three elements past %addr (as an
; integer) and the three stored values. The expectations above look for an
; STMIA_UPD in the second scheduling region.
define i32 @bar(i32 %v0, i32 %v1, i32 %v2, i32* %addr) {

%addr.1 = getelementptr i32, i32* %addr, i32 0
store i32 %v0, i32* %addr.1

%addr.2 = getelementptr i32, i32* %addr, i32 1
store i32 %v1, i32* %addr.2

%addr.3 = getelementptr i32, i32* %addr, i32 2
store i32 %v2, i32* %addr.3

; Address just past the last stored element, used as an integer below.
%ptr_after = getelementptr i32, i32* %addr, i32 3
%val = ptrtoint i32* %ptr_after to i32

%rv1 = mul i32 %val, %v0
%rv2 = mul i32 %rv1, %v1
%rv3 = mul i32 %rv2, %v2

ret i32 %rv3
}

29 changes: 29 additions & 0 deletions llvm/test/CodeGen/ARM/cortex-a57-misched-stm.ll
Original file line number Diff line number Diff line change
@@ -0,0 +1,29 @@
; REQUIRES: asserts
; RUN: llc < %s -mtriple=armv8r-eabi -mcpu=cortex-a57 -misched-postra -enable-misched -verify-misched -debug-only=machine-scheduler -o - 2>&1 > /dev/null | FileCheck %s
; N=3 STMIB should have latency 2cyc

; CHECK: ********** MI Scheduling **********
; We need second, post-ra scheduling to have STM instruction combined from single-stores
; CHECK: ********** MI Scheduling **********
; CHECK: schedule starting
; CHECK: STMIB
; CHECK: rdefs left
; CHECK-NEXT: Latency : 2

; Stores %v0, %v1 and the integer value of %addr to element offsets 1, 2 and 3
; from %addr, then returns %v0 + %v1. The expectations above look for an STMIB
; in the second scheduling region.
define i32 @test_stm(i32 %v0, i32 %v1, i32* %addr) {

%addr.1 = getelementptr i32, i32* %addr, i32 1
store i32 %v0, i32* %addr.1

%addr.2 = getelementptr i32, i32* %addr, i32 2
store i32 %v1, i32* %addr.2

%addr.3 = getelementptr i32, i32* %addr, i32 3
; The third stored value is the base address itself, converted to i32.
%val = ptrtoint i32* %addr to i32
store i32 %val, i32* %addr.3

%rv = add i32 %v0, %v1

ret i32 %rv
}

77 changes: 77 additions & 0 deletions llvm/test/CodeGen/ARM/cortex-a57-misched-vfma.ll
Original file line number Diff line number Diff line change
@@ -0,0 +1,77 @@
; REQUIRES: asserts
; RUN: llc < %s -mtriple=armv8r-eabi -mcpu=cortex-a57 -enable-misched -verify-misched -debug-only=machine-scheduler -o - 2>&1 > /dev/null | FileCheck %s
; Check latencies of vmul/vfma accumulate chains.

; Scalar form: computes f1 * f2 + f3 * f4 + f5 * f6 on floats. The expectations
; inside the body look for one VMULS followed by two VMLAS in the scheduler
; dump, with data-edge latencies of 0 (VMULS to VMLAS), 4 (VMLAS to VMLAS) and
; 9 (last VMLAS to its consumer).
define float @Test1(float %f1, float %f2, float %f3, float %f4, float %f5, float %f6) {
; CHECK: ********** MI Scheduling **********
; CHECK: Test1:BB#0

; CHECK: VMULS
; > VMULS common latency = 5
; CHECK: Latency : 5
; CHECK: Successors:
; CHECK: data
; > VMULS read-advanced latency to VMLAS = 0
; CHECK-SAME: Latency=0

; CHECK: VMLAS
; > VMLAS common latency = 9
; CHECK: Latency : 9
; CHECK: Successors:
; CHECK: data
; > VMLAS read-advanced latency to the next VMLAS = 4
; CHECK-SAME: Latency=4

; CHECK: VMLAS
; CHECK: Latency : 9
; CHECK: Successors:
; CHECK: data
; > VMLAS not-optimized latency to VMOVRS = 9
; CHECK-SAME: Latency=9

; f1 * f2 + f3 * f4 + f5 * f6 ==> VMULS, VMLAS, VMLAS
%mul1 = fmul float %f1, %f2
%mul2 = fmul float %f3, %f4
%mul3 = fmul float %f5, %f6
%add1 = fadd float %mul1, %mul2
%add2 = fadd float %add1, %mul3
ret float %add2
}

; ASIMD form: the same multiply-accumulate expression as the scalar test, on
; <2 x float> vectors. The expectations inside the body look for one VMULfd
; followed by two VMLAfd, with data-edge latencies of 0, 4 and 9.
define <2 x float> @Test2(<2 x float> %f1, <2 x float> %f2, <2 x float> %f3, <2 x float> %f4, <2 x float> %f5, <2 x float> %f6) {
; CHECK: ********** MI Scheduling **********
; CHECK: Test2:BB#0

; CHECK: VMULfd
; > VMULfd common latency = 5
; CHECK: Latency : 5
; CHECK: Successors:
; CHECK: data
; > VMULfd read-advanced latency to VMLAfd = 0
; CHECK-SAME: Latency=0

; CHECK: VMLAfd
; > VMLAfd common latency = 9
; CHECK: Latency : 9
; CHECK: Successors:
; CHECK: data
; > VMLAfd read-advanced latency to the next VMLAfd = 4
; CHECK-SAME: Latency=4

; CHECK: VMLAfd
; CHECK: Latency : 9
; CHECK: Successors:
; CHECK: data
; > VMLAfd not-optimized latency to VMOVRRD = 9
; CHECK-SAME: Latency=9

; f1 * f2 + f3 * f4 + f5 * f6 ==> VMULfd, VMLAfd, VMLAfd
%mul1 = fmul <2 x float> %f1, %f2
%mul2 = fmul <2 x float> %f3, %f4
%mul3 = fmul <2 x float> %f5, %f6
%add1 = fadd <2 x float> %mul1, %mul2
%add2 = fadd <2 x float> %add1, %mul3
ret <2 x float> %add2
}

50 changes: 50 additions & 0 deletions llvm/test/CodeGen/ARM/cortex-a57-misched-vldm-wrback.ll
Original file line number Diff line number Diff line change
@@ -0,0 +1,50 @@
; REQUIRES: asserts
; RUN: llc < %s -mtriple=armv8r-eabi -mcpu=cortex-a57 -misched-postra -enable-misched -verify-misched -debug-only=machine-scheduler -o - 2>&1 > /dev/null | FileCheck %s
;

@a = global double 0.0, align 4
@b = global double 0.0, align 4
@c = global double 0.0, align 4

; CHECK: ********** MI Scheduling **********
; We need second, post-ra scheduling to have VLDM instruction combined from single-loads
; CHECK: ********** MI Scheduling **********
; CHECK: VLDMDIA_UPD
; CHECK: rdefs left
; CHECK-NEXT: Latency : 6
; CHECK: Successors:
; CHECK: data
; CHECK-SAME: Latency=1
; CHECK-NEXT: data
; CHECK-SAME: Latency=1
; CHECK-NEXT: data
; CHECK-SAME: Latency=5
; CHECK-NEXT: data
; CHECK-SAME: Latency=5
; CHECK-NEXT: data
; CHECK-SAME: Latency=6
; Loads the three double globals @a, @b and @c, stores the integer address
; three doubles past @a through %iptr, and returns the product of that address
; and the three loaded values converted to i32. The expectations above look
; for a VLDMDIA_UPD in the second scheduling region.
; NOTE(review): the globals are declared with align 4 but loaded with align 8
; - confirm this is intended.
define i32 @bar(i32* %iptr) minsize optsize {
%1 = load double, double* @a, align 8
%2 = load double, double* @b, align 8
%3 = load double, double* @c, align 8

%ptr_after = getelementptr double, double* @a, i32 3

%ptr_new_ival = ptrtoint double* %ptr_after to i32
; %ptr_new is not referenced again in this function.
%ptr_new = inttoptr i32 %ptr_new_ival to i32*

store i32 %ptr_new_ival, i32* %iptr, align 8

%v1 = fptoui double %1 to i32

%mul1 = mul i32 %ptr_new_ival, %v1

%v2 = fptoui double %2 to i32
%v3 = fptoui double %3 to i32

%mul2 = mul i32 %mul1, %v2
%mul3 = mul i32 %mul2, %v3

ret i32 %mul3
}

30 changes: 30 additions & 0 deletions llvm/test/CodeGen/ARM/cortex-a57-misched-vldm.ll
Original file line number Diff line number Diff line change
@@ -0,0 +1,30 @@
; REQUIRES: asserts
; RUN: llc < %s -mtriple=armv8r-eabi -mcpu=cortex-a57 -misched-postra -enable-misched -verify-misched -debug-only=machine-scheduler -o - 2>&1 > /dev/null | FileCheck %s

; CHECK: ********** MI Scheduling **********
; We need second, post-ra scheduling to have VLDM instruction combined from single-loads
; CHECK: ********** MI Scheduling **********
; CHECK: VLDMDIA
; CHECK: rdefs left
; CHECK-NEXT: Latency : 6
; CHECK: Successors:
; CHECK: data
; CHECK-SAME: Latency=5
; CHECK-NEXT: data
; CHECK-SAME: Latency=5
; CHECK-NEXT: data
; CHECK-SAME: Latency=6

; Loads three consecutive doubles starting at %a and returns their product.
; The expectations above look for a single VLDMDIA in the second scheduling
; region.
define double @foo(double* %a) nounwind optsize {
entry:
%b = getelementptr double, double* %a, i32 1
%c = getelementptr double, double* %a, i32 2
%0 = load double, double* %a, align 4
%1 = load double, double* %b, align 4
%2 = load double, double* %c, align 4

%mul1 = fmul double %0, %1
%mul2 = fmul double %mul1, %2
ret double %mul2
}

43 changes: 43 additions & 0 deletions llvm/test/CodeGen/ARM/cortex-a57-misched-vstm-wrback.ll
Original file line number Diff line number Diff line change
@@ -0,0 +1,43 @@
; REQUIRES: asserts
; RUN: llc < %s -mtriple=armv8r-eabi -mcpu=cortex-a57 -misched-postra -enable-misched -verify-misched -debug-only=machine-scheduler -o - 2>&1 > /dev/null | FileCheck %s

; CHECK: ********** MI Scheduling **********
; We need second, post-ra scheduling to have VSTM instruction combined from single-stores
; CHECK: ********** MI Scheduling **********
; CHECK: schedule starting
; CHECK: VSTMDIA_UPD
; CHECK: rdefs left
; CHECK-NEXT: Latency : 4
; CHECK: Successors:
; CHECK: data
; CHECK-SAME: Latency=1

@a = global double 0.0, align 4
@b = global double 0.0, align 4
@c = global double 0.0, align 4

; Copies three consecutive doubles from %vptr into the globals @a, @b and @c,
; stores the integer address three doubles past @a through %iptr, and returns
; that address multiplied by %iv1. The expectations above look for a
; VSTMDIA_UPD in the second scheduling region.
; NOTE(review): the globals are declared with align 4 but stored to with
; align 8 - confirm this is intended.
define i32 @bar(double* %vptr, i32 %iv1, i32* %iptr) minsize {

%vp2 = getelementptr double, double* %vptr, i32 1
%vp3 = getelementptr double, double* %vptr, i32 2

%v1 = load double, double* %vptr, align 8
%v2 = load double, double* %vp2, align 8
%v3 = load double, double* %vp3, align 8

store double %v1, double* @a, align 8
store double %v2, double* @b, align 8
store double %v3, double* @c, align 8

%ptr_after = getelementptr double, double* @a, i32 3

%ptr_new_ival = ptrtoint double* %ptr_after to i32
; %ptr_new is not referenced again in this function.
%ptr_new = inttoptr i32 %ptr_new_ival to i32*

store i32 %ptr_new_ival, i32* %iptr, align 8

%mul1 = mul i32 %ptr_new_ival, %iv1

ret i32 %mul1
}

23 changes: 23 additions & 0 deletions llvm/test/CodeGen/ARM/cortex-a57-misched-vstm.ll
Original file line number Diff line number Diff line change
@@ -0,0 +1,23 @@
; REQUIRES: asserts
; RUN: llc < %s -mtriple=armv8r-eabi -mcpu=cortex-a57 -misched-postra -enable-misched -verify-misched -debug-only=machine-scheduler -o - 2>&1 > /dev/null | FileCheck %s

; CHECK: ********** MI Scheduling **********
; We need second, post-ra scheduling to have VSTM instruction combined from single-stores
; CHECK: ********** MI Scheduling **********
; CHECK: schedule starting
; CHECK: VSTMDIA
; CHECK: rdefs left
; CHECK-NEXT: Latency : 2

%bigVec = type [2 x double]

@var = global %bigVec zeroinitializer

; Copies one %bigVec ([2 x double]) aggregate from %ptr into the global @var.
; The expectations above look for a VSTMDIA with latency 2 in the second
; scheduling region.
define void @bar(%bigVec* %ptr) {

%tmp = load %bigVec, %bigVec* %ptr
store %bigVec %tmp, %bigVec* @var

ret void
}