diff --git a/llvm/lib/Target/PowerPC/PPCLoopInstrFormPrep.cpp b/llvm/lib/Target/PowerPC/PPCLoopInstrFormPrep.cpp
index ccbaea88d2f1d3..9c83f0a9248252 100644
--- a/llvm/lib/Target/PowerPC/PPCLoopInstrFormPrep.cpp
+++ b/llvm/lib/Target/PowerPC/PPCLoopInstrFormPrep.cpp
@@ -81,10 +81,8 @@ using namespace llvm;
 
-// By default, we limit this to creating 16 common bases out of loops per
-// function. 16 is a little over half of the allocatable register set.
 static cl::opt<unsigned> MaxVarsPrep("ppc-formprep-max-vars",
-    cl::Hidden, cl::init(16),
+    cl::Hidden, cl::init(24),
     cl::desc("Potential common base number threshold per function for PPC loop "
              "prep"));
 
@@ -94,8 +92,7 @@ static cl::opt<bool> PreferUpdateForm("ppc-formprep-prefer-update",
 
 // Sum of following 3 per loop thresholds for all loops can not be larger
 // than MaxVarsPrep.
-// By default, we limit this to creating 9 PHIs for one loop.
-// 9 and 3 for each kind prep are exterimental values on Power9.
+// Now the thresholds for each kind of prep are experimental values on Power9.
 static cl::opt<unsigned> MaxVarsUpdateForm("ppc-preinc-prep-max-vars",
     cl::Hidden, cl::init(3),
     cl::desc("Potential PHI threshold per loop for PPC loop prep of update "
@@ -106,7 +103,7 @@ static cl::opt<unsigned> MaxVarsDSForm("ppc-dsprep-max-vars",
     cl::desc("Potential PHI threshold per loop for PPC loop prep of DS form"));
 
 static cl::opt<unsigned> MaxVarsDQForm("ppc-dqprep-max-vars",
-    cl::Hidden, cl::init(3),
+    cl::Hidden, cl::init(8),
     cl::desc("Potential PHI threshold per loop for PPC loop prep of DQ form"));
 
diff --git a/llvm/test/CodeGen/PowerPC/more-dq-form-prepare.ll b/llvm/test/CodeGen/PowerPC/more-dq-form-prepare.ll
new file mode 100644
index 00000000000000..93e1eb54298956
--- /dev/null
+++ b/llvm/test/CodeGen/PowerPC/more-dq-form-prepare.ll
@@ -0,0 +1,637 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -verify-machineinstrs -mcpu=pwr10 < %s | FileCheck %s
+
+target datalayout = "e-m:e-i64:64-p:64:64-n32:64-v256:256:256-v512:512:512"
+target triple = "powerpc64le-unknown-linux-gnu"
+
+%_elem_type_of_a = type <{ double }>
+%_elem_type_of_x = type <{ double }>
+
+define void @foo(i32* %.m, i32* %.n, [0 x %_elem_type_of_a]* %.a, [0 x %_elem_type_of_x]* %.x, i32* %.l, <2 x double>* %.vy01, <2 x double>* %.vy02, <2 x double>* %.vy03, <2 x double>* %.vy04, <2 x double>* %.vy05, <2 x double>* %.vy06, <2 x double>* %.vy07, <2 x double>* %.vy08, <2 x double>* %.vy09, <2 x double>* %.vy0a, <2 x double>* %.vy0b, <2 x double>* %.vy0c, <2 x double>* %.vy21, <2 x double>* %.vy22, <2 x double>* %.vy23, <2 x double>* %.vy24, <2 x double>* %.vy25, <2 x double>* %.vy26, <2 x double>* %.vy27, <2 x double>* %.vy28, <2 x double>* %.vy29, <2 x double>* %.vy2a, <2 x double>* %.vy2b, <2 x double>* %.vy2c) {
+; CHECK-LABEL: foo:
+; CHECK:         .localentry foo, 1
+; CHECK-NEXT:  # %bb.0: # %entry
+; CHECK-NEXT:    stdu 1, -448(1)
+; CHECK-NEXT:    .cfi_def_cfa_offset 448
+; CHECK-NEXT:    .cfi_offset r14, -256
+; CHECK-NEXT:    .cfi_offset r15, -248
+; CHECK-NEXT:    .cfi_offset r16, -240
+; CHECK-NEXT:    .cfi_offset r17, -232
+; CHECK-NEXT:    .cfi_offset r18, -224
+; CHECK-NEXT:    .cfi_offset r19, -216
+; CHECK-NEXT:    .cfi_offset r20, -208
+; CHECK-NEXT:    .cfi_offset r21, -200
+; CHECK-NEXT:    .cfi_offset r22, -192
+; CHECK-NEXT:    .cfi_offset r23, -184
+; CHECK-NEXT:    .cfi_offset r24, -176
+; CHECK-NEXT:    .cfi_offset r25, -168
+; CHECK-NEXT:    .cfi_offset r26, -160
+; CHECK-NEXT:    .cfi_offset r27, -152
+; CHECK-NEXT:    .cfi_offset r28, -144
+; CHECK-NEXT:    .cfi_offset r29, -136
+; CHECK-NEXT:    .cfi_offset r30, -128
+;
CHECK-NEXT: .cfi_offset r31, -120 +; CHECK-NEXT: .cfi_offset f18, -112 +; CHECK-NEXT: .cfi_offset f19, -104 +; CHECK-NEXT: .cfi_offset f20, -96 +; CHECK-NEXT: .cfi_offset f21, -88 +; CHECK-NEXT: .cfi_offset f22, -80 +; CHECK-NEXT: .cfi_offset f23, -72 +; CHECK-NEXT: .cfi_offset f24, -64 +; CHECK-NEXT: .cfi_offset f25, -56 +; CHECK-NEXT: .cfi_offset f26, -48 +; CHECK-NEXT: .cfi_offset f27, -40 +; CHECK-NEXT: .cfi_offset f28, -32 +; CHECK-NEXT: .cfi_offset f29, -24 +; CHECK-NEXT: .cfi_offset f30, -16 +; CHECK-NEXT: .cfi_offset f31, -8 +; CHECK-NEXT: lwz 4, 0(4) +; CHECK-NEXT: std 14, 192(1) # 8-byte Folded Spill +; CHECK-NEXT: std 15, 200(1) # 8-byte Folded Spill +; CHECK-NEXT: cmpwi 4, 1 +; CHECK-NEXT: std 16, 208(1) # 8-byte Folded Spill +; CHECK-NEXT: std 17, 216(1) # 8-byte Folded Spill +; CHECK-NEXT: std 18, 224(1) # 8-byte Folded Spill +; CHECK-NEXT: std 19, 232(1) # 8-byte Folded Spill +; CHECK-NEXT: std 20, 240(1) # 8-byte Folded Spill +; CHECK-NEXT: std 21, 248(1) # 8-byte Folded Spill +; CHECK-NEXT: std 22, 256(1) # 8-byte Folded Spill +; CHECK-NEXT: std 23, 264(1) # 8-byte Folded Spill +; CHECK-NEXT: std 24, 272(1) # 8-byte Folded Spill +; CHECK-NEXT: std 25, 280(1) # 8-byte Folded Spill +; CHECK-NEXT: std 26, 288(1) # 8-byte Folded Spill +; CHECK-NEXT: std 27, 296(1) # 8-byte Folded Spill +; CHECK-NEXT: std 28, 304(1) # 8-byte Folded Spill +; CHECK-NEXT: std 29, 312(1) # 8-byte Folded Spill +; CHECK-NEXT: std 30, 320(1) # 8-byte Folded Spill +; CHECK-NEXT: std 31, 328(1) # 8-byte Folded Spill +; CHECK-NEXT: stfd 18, 336(1) # 8-byte Folded Spill +; CHECK-NEXT: stfd 19, 344(1) # 8-byte Folded Spill +; CHECK-NEXT: stfd 20, 352(1) # 8-byte Folded Spill +; CHECK-NEXT: stfd 21, 360(1) # 8-byte Folded Spill +; CHECK-NEXT: stfd 22, 368(1) # 8-byte Folded Spill +; CHECK-NEXT: stfd 23, 376(1) # 8-byte Folded Spill +; CHECK-NEXT: stfd 24, 384(1) # 8-byte Folded Spill +; CHECK-NEXT: stfd 25, 392(1) # 8-byte Folded Spill +; CHECK-NEXT: stfd 26, 400(1) # 8-byte Folded Spill +; CHECK-NEXT: stfd 27, 408(1) # 8-byte Folded Spill +; CHECK-NEXT: stfd 28, 416(1) # 8-byte Folded Spill +; CHECK-NEXT: stfd 29, 424(1) # 8-byte Folded Spill +; CHECK-NEXT: stfd 30, 432(1) # 8-byte Folded Spill +; CHECK-NEXT: stfd 31, 440(1) # 8-byte Folded Spill +; CHECK-NEXT: blt 0, .LBB0_7 +; CHECK-NEXT: # %bb.1: # %_loop_1_do_.lr.ph +; CHECK-NEXT: lwz 3, 0(3) +; CHECK-NEXT: cmpwi 3, 1 +; CHECK-NEXT: blt 0, .LBB0_7 +; CHECK-NEXT: # %bb.2: # %_loop_1_do_.preheader +; CHECK-NEXT: addi 3, 3, 1 +; CHECK-NEXT: mr 24, 5 +; CHECK-NEXT: li 5, 9 +; CHECK-NEXT: mr 11, 7 +; CHECK-NEXT: ld 12, 640(1) +; CHECK-NEXT: std 9, 176(1) # 8-byte Folded Spill +; CHECK-NEXT: std 10, 184(1) # 8-byte Folded Spill +; CHECK-NEXT: mr 7, 6 +; CHECK-NEXT: ld 6, 544(1) +; CHECK-NEXT: lxv 1, 0(9) +; CHECK-NEXT: ld 9, 648(1) +; CHECK-NEXT: ld 29, 688(1) +; CHECK-NEXT: ld 28, 680(1) +; CHECK-NEXT: ld 2, 632(1) +; CHECK-NEXT: ld 26, 624(1) +; CHECK-NEXT: lxv 0, 0(10) +; CHECK-NEXT: cmpldi 3, 9 +; CHECK-NEXT: lxv 4, 0(8) +; CHECK-NEXT: ld 30, 664(1) +; CHECK-NEXT: ld 10, 704(1) +; CHECK-NEXT: ld 27, 672(1) +; CHECK-NEXT: ld 25, 616(1) +; CHECK-NEXT: ld 23, 608(1) +; CHECK-NEXT: ld 22, 600(1) +; CHECK-NEXT: ld 21, 592(1) +; CHECK-NEXT: ld 19, 584(1) +; CHECK-NEXT: ld 17, 576(1) +; CHECK-NEXT: iselgt 3, 3, 5 +; CHECK-NEXT: ld 5, 656(1) +; CHECK-NEXT: addi 3, 3, -2 +; CHECK-NEXT: lwa 20, 0(11) +; CHECK-NEXT: lxv 13, 0(12) +; CHECK-NEXT: std 6, 128(1) # 8-byte Folded Spill +; CHECK-NEXT: std 27, 136(1) # 8-byte Folded Spill +; CHECK-NEXT: lxv 2, 0(6) +; 
CHECK-NEXT: ld 6, 696(1) +; CHECK-NEXT: lxv 34, 0(2) +; CHECK-NEXT: lxv 7, 0(29) +; CHECK-NEXT: lxv 39, 0(17) +; CHECK-NEXT: lxv 38, 0(19) +; CHECK-NEXT: lxv 33, 0(21) +; CHECK-NEXT: lxv 32, 0(22) +; CHECK-NEXT: lxv 37, 0(23) +; CHECK-NEXT: lxv 36, 0(25) +; CHECK-NEXT: lxv 35, 0(26) +; CHECK-NEXT: lxv 11, 0(9) +; CHECK-NEXT: lxv 12, 0(30) +; CHECK-NEXT: rldicl 3, 3, 61, 3 +; CHECK-NEXT: addi 0, 3, 1 +; CHECK-NEXT: ld 3, 560(1) +; CHECK-NEXT: sldi 11, 20, 2 +; CHECK-NEXT: lxv 9, 0(5) +; CHECK-NEXT: lxv 10, 0(27) +; CHECK-NEXT: lxv 8, 0(28) +; CHECK-NEXT: lxv 6, 0(6) +; CHECK-NEXT: lxv 5, 0(10) +; CHECK-NEXT: lxv 3, 0(3) +; CHECK-NEXT: std 3, 96(1) # 8-byte Folded Spill +; CHECK-NEXT: std 12, 104(1) # 8-byte Folded Spill +; CHECK-NEXT: sldi 3, 20, 4 +; CHECK-NEXT: add 12, 20, 11 +; CHECK-NEXT: std 8, 168(1) # 8-byte Folded Spill +; CHECK-NEXT: std 6, 160(1) # 8-byte Folded Spill +; CHECK-NEXT: ld 8, 552(1) +; CHECK-NEXT: sldi 18, 20, 1 +; CHECK-NEXT: lxv 41, 0(8) +; CHECK-NEXT: add 3, 3, 24 +; CHECK-NEXT: addi 16, 3, 32 +; CHECK-NEXT: sldi 3, 20, 3 +; CHECK-NEXT: std 9, 112(1) # 8-byte Folded Spill +; CHECK-NEXT: std 5, 120(1) # 8-byte Folded Spill +; CHECK-NEXT: sldi 5, 12, 3 +; CHECK-NEXT: std 26, 80(1) # 8-byte Folded Spill +; CHECK-NEXT: std 2, 88(1) # 8-byte Folded Spill +; CHECK-NEXT: add 2, 24, 5 +; CHECK-NEXT: mr 9, 30 +; CHECK-NEXT: li 26, 1 +; CHECK-NEXT: add 3, 3, 24 +; CHECK-NEXT: addi 31, 3, 32 +; CHECK-NEXT: ld 3, 568(1) +; CHECK-NEXT: std 28, 144(1) # 8-byte Folded Spill +; CHECK-NEXT: std 29, 152(1) # 8-byte Folded Spill +; CHECK-NEXT: sldi 5, 20, 5 +; CHECK-NEXT: add 29, 20, 18 +; CHECK-NEXT: std 23, 64(1) # 8-byte Folded Spill +; CHECK-NEXT: std 25, 72(1) # 8-byte Folded Spill +; CHECK-NEXT: mulli 27, 20, 48 +; CHECK-NEXT: add 30, 24, 5 +; CHECK-NEXT: li 25, 0 +; CHECK-NEXT: lxv 40, 0(3) +; CHECK-NEXT: mulli 23, 20, 6 +; CHECK-NEXT: sldi 5, 29, 3 +; CHECK-NEXT: add 28, 24, 5 +; CHECK-NEXT: mr 5, 24 +; CHECK-NEXT: std 17, 32(1) # 8-byte Folded Spill +; CHECK-NEXT: std 19, 40(1) # 8-byte Folded Spill +; CHECK-NEXT: std 21, 48(1) # 8-byte Folded Spill +; CHECK-NEXT: std 22, 56(1) # 8-byte Folded Spill +; CHECK-NEXT: .p2align 4 +; CHECK-NEXT: .LBB0_3: # %_loop_2_do_.lr.ph +; CHECK-NEXT: # =>This Loop Header: Depth=1 +; CHECK-NEXT: # Child Loop BB0_4 Depth 2 +; CHECK-NEXT: maddld 6, 23, 25, 12 +; CHECK-NEXT: maddld 21, 23, 25, 11 +; CHECK-NEXT: mtctr 0 +; CHECK-NEXT: sldi 6, 6, 3 +; CHECK-NEXT: add 22, 24, 6 +; CHECK-NEXT: sldi 6, 21, 3 +; CHECK-NEXT: add 21, 24, 6 +; CHECK-NEXT: maddld 6, 23, 25, 29 +; CHECK-NEXT: sldi 6, 6, 3 +; CHECK-NEXT: add 19, 24, 6 +; CHECK-NEXT: maddld 6, 23, 25, 18 +; CHECK-NEXT: sldi 6, 6, 3 +; CHECK-NEXT: add 17, 24, 6 +; CHECK-NEXT: maddld 6, 23, 25, 20 +; CHECK-NEXT: sldi 6, 6, 3 +; CHECK-NEXT: add 15, 24, 6 +; CHECK-NEXT: mulld 6, 23, 25 +; CHECK-NEXT: sldi 6, 6, 3 +; CHECK-NEXT: add 14, 24, 6 +; CHECK-NEXT: mr 6, 7 +; CHECK-NEXT: .p2align 5 +; CHECK-NEXT: .LBB0_4: # %_loop_2_do_ +; CHECK-NEXT: # Parent Loop BB0_3 Depth=1 +; CHECK-NEXT: # => This Inner Loop Header: Depth=2 +; CHECK-NEXT: lxvp 42, 0(6) +; CHECK-NEXT: lxvp 44, 0(14) +; CHECK-NEXT: lxvp 46, 0(15) +; CHECK-NEXT: lxvp 48, 0(17) +; CHECK-NEXT: lxvp 50, 0(19) +; CHECK-NEXT: lxvp 30, 0(21) +; CHECK-NEXT: lxvp 28, 0(22) +; CHECK-NEXT: lxvp 26, 32(6) +; CHECK-NEXT: lxvp 24, 32(14) +; CHECK-NEXT: lxvp 22, 32(15) +; CHECK-NEXT: lxvp 20, 32(17) +; CHECK-NEXT: lxvp 18, 32(19) +; CHECK-NEXT: addi 6, 6, 64 +; CHECK-NEXT: addi 14, 14, 64 +; CHECK-NEXT: addi 15, 15, 64 +; CHECK-NEXT: addi 17, 17, 
64 +; CHECK-NEXT: addi 19, 19, 64 +; CHECK-NEXT: xvmaddadp 4, 45, 43 +; CHECK-NEXT: xvmaddadp 1, 47, 43 +; CHECK-NEXT: xvmaddadp 0, 49, 43 +; CHECK-NEXT: xvmaddadp 2, 51, 43 +; CHECK-NEXT: xvmaddadp 41, 31, 43 +; CHECK-NEXT: xvmaddadp 3, 29, 43 +; CHECK-NEXT: xvmaddadp 40, 44, 42 +; CHECK-NEXT: xvmaddadp 39, 46, 42 +; CHECK-NEXT: xvmaddadp 38, 48, 42 +; CHECK-NEXT: xvmaddadp 33, 50, 42 +; CHECK-NEXT: xvmaddadp 32, 30, 42 +; CHECK-NEXT: xvmaddadp 37, 28, 42 +; CHECK-NEXT: lxvp 42, 32(21) +; CHECK-NEXT: lxvp 44, 32(22) +; CHECK-NEXT: addi 21, 21, 64 +; CHECK-NEXT: addi 22, 22, 64 +; CHECK-NEXT: xvmaddadp 36, 25, 27 +; CHECK-NEXT: xvmaddadp 35, 23, 27 +; CHECK-NEXT: xvmaddadp 34, 21, 27 +; CHECK-NEXT: xvmaddadp 13, 19, 27 +; CHECK-NEXT: xvmaddadp 12, 24, 26 +; CHECK-NEXT: xvmaddadp 10, 22, 26 +; CHECK-NEXT: xvmaddadp 8, 20, 26 +; CHECK-NEXT: xvmaddadp 7, 18, 26 +; CHECK-NEXT: xvmaddadp 11, 43, 27 +; CHECK-NEXT: xvmaddadp 9, 45, 27 +; CHECK-NEXT: xvmaddadp 6, 42, 26 +; CHECK-NEXT: xvmaddadp 5, 44, 26 +; CHECK-NEXT: bdnz .LBB0_4 +; CHECK-NEXT: # %bb.5: # %_loop_2_endl_ +; CHECK-NEXT: # +; CHECK-NEXT: addi 26, 26, 6 +; CHECK-NEXT: add 5, 5, 27 +; CHECK-NEXT: add 31, 31, 27 +; CHECK-NEXT: add 2, 2, 27 +; CHECK-NEXT: add 16, 16, 27 +; CHECK-NEXT: add 30, 30, 27 +; CHECK-NEXT: add 28, 28, 27 +; CHECK-NEXT: addi 25, 25, 1 +; CHECK-NEXT: cmpld 26, 4 +; CHECK-NEXT: ble 0, .LBB0_3 +; CHECK-NEXT: # %bb.6: # %_loop_1_loopHeader_._return_bb_crit_edge.loopexit +; CHECK-NEXT: ld 4, 168(1) # 8-byte Folded Reload +; CHECK-NEXT: stxv 4, 0(4) +; CHECK-NEXT: ld 4, 176(1) # 8-byte Folded Reload +; CHECK-NEXT: stxv 1, 0(4) +; CHECK-NEXT: ld 4, 184(1) # 8-byte Folded Reload +; CHECK-NEXT: stxv 0, 0(4) +; CHECK-NEXT: ld 4, 128(1) # 8-byte Folded Reload +; CHECK-NEXT: stxv 2, 0(4) +; CHECK-NEXT: ld 4, 96(1) # 8-byte Folded Reload +; CHECK-NEXT: stxv 41, 0(8) +; CHECK-NEXT: stxv 3, 0(4) +; CHECK-NEXT: stxv 40, 0(3) +; CHECK-NEXT: ld 3, 32(1) # 8-byte Folded Reload +; CHECK-NEXT: stxv 39, 0(3) +; CHECK-NEXT: ld 3, 40(1) # 8-byte Folded Reload +; CHECK-NEXT: stxv 38, 0(3) +; CHECK-NEXT: ld 3, 48(1) # 8-byte Folded Reload +; CHECK-NEXT: stxv 33, 0(3) +; CHECK-NEXT: ld 3, 56(1) # 8-byte Folded Reload +; CHECK-NEXT: stxv 32, 0(3) +; CHECK-NEXT: ld 3, 64(1) # 8-byte Folded Reload +; CHECK-NEXT: stxv 37, 0(3) +; CHECK-NEXT: ld 3, 72(1) # 8-byte Folded Reload +; CHECK-NEXT: stxv 36, 0(3) +; CHECK-NEXT: ld 3, 80(1) # 8-byte Folded Reload +; CHECK-NEXT: stxv 35, 0(3) +; CHECK-NEXT: ld 3, 88(1) # 8-byte Folded Reload +; CHECK-NEXT: stxv 34, 0(3) +; CHECK-NEXT: ld 3, 104(1) # 8-byte Folded Reload +; CHECK-NEXT: stxv 13, 0(3) +; CHECK-NEXT: ld 3, 112(1) # 8-byte Folded Reload +; CHECK-NEXT: stxv 11, 0(3) +; CHECK-NEXT: ld 3, 120(1) # 8-byte Folded Reload +; CHECK-NEXT: stxv 9, 0(3) +; CHECK-NEXT: ld 3, 136(1) # 8-byte Folded Reload +; CHECK-NEXT: stxv 12, 0(9) +; CHECK-NEXT: stxv 10, 0(3) +; CHECK-NEXT: ld 3, 144(1) # 8-byte Folded Reload +; CHECK-NEXT: stxv 8, 0(3) +; CHECK-NEXT: ld 3, 152(1) # 8-byte Folded Reload +; CHECK-NEXT: stxv 7, 0(3) +; CHECK-NEXT: ld 3, 160(1) # 8-byte Folded Reload +; CHECK-NEXT: stxv 6, 0(3) +; CHECK-NEXT: stxv 5, 0(10) +; CHECK-NEXT: .LBB0_7: # %_return_bb +; CHECK-NEXT: lfd 31, 440(1) # 8-byte Folded Reload +; CHECK-NEXT: lfd 30, 432(1) # 8-byte Folded Reload +; CHECK-NEXT: ld 31, 328(1) # 8-byte Folded Reload +; CHECK-NEXT: ld 30, 320(1) # 8-byte Folded Reload +; CHECK-NEXT: ld 29, 312(1) # 8-byte Folded Reload +; CHECK-NEXT: ld 28, 304(1) # 8-byte Folded Reload +; CHECK-NEXT: ld 27, 296(1) # 
8-byte Folded Reload +; CHECK-NEXT: ld 26, 288(1) # 8-byte Folded Reload +; CHECK-NEXT: ld 25, 280(1) # 8-byte Folded Reload +; CHECK-NEXT: lfd 29, 424(1) # 8-byte Folded Reload +; CHECK-NEXT: ld 24, 272(1) # 8-byte Folded Reload +; CHECK-NEXT: ld 23, 264(1) # 8-byte Folded Reload +; CHECK-NEXT: ld 22, 256(1) # 8-byte Folded Reload +; CHECK-NEXT: lfd 28, 416(1) # 8-byte Folded Reload +; CHECK-NEXT: ld 21, 248(1) # 8-byte Folded Reload +; CHECK-NEXT: ld 20, 240(1) # 8-byte Folded Reload +; CHECK-NEXT: ld 19, 232(1) # 8-byte Folded Reload +; CHECK-NEXT: lfd 27, 408(1) # 8-byte Folded Reload +; CHECK-NEXT: ld 18, 224(1) # 8-byte Folded Reload +; CHECK-NEXT: ld 17, 216(1) # 8-byte Folded Reload +; CHECK-NEXT: ld 16, 208(1) # 8-byte Folded Reload +; CHECK-NEXT: lfd 26, 400(1) # 8-byte Folded Reload +; CHECK-NEXT: ld 15, 200(1) # 8-byte Folded Reload +; CHECK-NEXT: ld 14, 192(1) # 8-byte Folded Reload +; CHECK-NEXT: lfd 25, 392(1) # 8-byte Folded Reload +; CHECK-NEXT: lfd 24, 384(1) # 8-byte Folded Reload +; CHECK-NEXT: lfd 23, 376(1) # 8-byte Folded Reload +; CHECK-NEXT: lfd 22, 368(1) # 8-byte Folded Reload +; CHECK-NEXT: lfd 21, 360(1) # 8-byte Folded Reload +; CHECK-NEXT: lfd 20, 352(1) # 8-byte Folded Reload +; CHECK-NEXT: lfd 19, 344(1) # 8-byte Folded Reload +; CHECK-NEXT: lfd 18, 336(1) # 8-byte Folded Reload +; CHECK-NEXT: addi 1, 1, 448 +; CHECK-NEXT: blr +entry: + %_val_l_ = load i32, i32* %.l, align 4 + %_conv = sext i32 %_val_l_ to i64 + %_mult_tmp = shl nsw i64 %_conv, 3 + %_sub_tmp4 = sub nuw nsw i64 -8, %_mult_tmp + %_val_n_ = load i32, i32* %.n, align 4 + %_leq_tmp.not116 = icmp slt i32 %_val_n_, 1 + br i1 %_leq_tmp.not116, label %_return_bb, label %_loop_1_do_.lr.ph + +_loop_1_do_.lr.ph: ; preds = %entry + %_val_m_ = load i32, i32* %.m, align 4 + %_leq_tmp6.not114 = icmp slt i32 %_val_m_, 1 + br i1 %_leq_tmp6.not114, label %_return_bb, label %_loop_1_do_.preheader + +_loop_1_do_.preheader: ; preds = %_loop_1_do_.lr.ph + %x_rvo_based_addr_112 = getelementptr inbounds [0 x %_elem_type_of_x], [0 x %_elem_type_of_x]* %.x, i64 0, i64 -1 + %a_byte_ptr_ = bitcast [0 x %_elem_type_of_a]* %.a to i8* + %a_rvo_based_addr_ = getelementptr inbounds i8, i8* %a_byte_ptr_, i64 %_sub_tmp4 + %.vy01.promoted = load <2 x double>, <2 x double>* %.vy01, align 16 + %.vy02.promoted = load <2 x double>, <2 x double>* %.vy02, align 16 + %.vy03.promoted = load <2 x double>, <2 x double>* %.vy03, align 16 + %.vy04.promoted = load <2 x double>, <2 x double>* %.vy04, align 16 + %.vy05.promoted = load <2 x double>, <2 x double>* %.vy05, align 16 + %.vy06.promoted = load <2 x double>, <2 x double>* %.vy06, align 16 + %.vy07.promoted = load <2 x double>, <2 x double>* %.vy07, align 16 + %.vy08.promoted = load <2 x double>, <2 x double>* %.vy08, align 16 + %.vy09.promoted = load <2 x double>, <2 x double>* %.vy09, align 16 + %.vy0a.promoted = load <2 x double>, <2 x double>* %.vy0a, align 16 + %.vy0b.promoted = load <2 x double>, <2 x double>* %.vy0b, align 16 + %.vy0c.promoted = load <2 x double>, <2 x double>* %.vy0c, align 16 + %.vy21.promoted = load <2 x double>, <2 x double>* %.vy21, align 16 + %.vy22.promoted = load <2 x double>, <2 x double>* %.vy22, align 16 + %.vy23.promoted = load <2 x double>, <2 x double>* %.vy23, align 16 + %.vy24.promoted = load <2 x double>, <2 x double>* %.vy24, align 16 + %.vy25.promoted = load <2 x double>, <2 x double>* %.vy25, align 16 + %.vy26.promoted = load <2 x double>, <2 x double>* %.vy26, align 16 + %.vy27.promoted = load <2 x double>, <2 x double>* %.vy27, align 16 + 
%.vy28.promoted = load <2 x double>, <2 x double>* %.vy28, align 16 + %.vy29.promoted = load <2 x double>, <2 x double>* %.vy29, align 16 + %.vy2a.promoted = load <2 x double>, <2 x double>* %.vy2a, align 16 + %.vy2b.promoted = load <2 x double>, <2 x double>* %.vy2b, align 16 + %.vy2c.promoted = load <2 x double>, <2 x double>* %.vy2c, align 16 + %0 = zext i32 %_val_m_ to i64 + %1 = zext i32 %_val_n_ to i64 + br label %_loop_2_do_.lr.ph + +_loop_2_do_.lr.ph: ; preds = %_loop_2_endl_, %_loop_1_do_.preheader + %indvars.iv212 = phi i64 [ %indvars.iv.next213, %_loop_2_endl_ ], [ 1, %_loop_1_do_.preheader ] + %2 = phi <2 x double> [ %142, %_loop_2_endl_ ], [ %.vy2c.promoted, %_loop_1_do_.preheader ] + %3 = phi <2 x double> [ %140, %_loop_2_endl_ ], [ %.vy2b.promoted, %_loop_1_do_.preheader ] + %4 = phi <2 x double> [ %138, %_loop_2_endl_ ], [ %.vy2a.promoted, %_loop_1_do_.preheader ] + %5 = phi <2 x double> [ %136, %_loop_2_endl_ ], [ %.vy29.promoted, %_loop_1_do_.preheader ] + %6 = phi <2 x double> [ %134, %_loop_2_endl_ ], [ %.vy28.promoted, %_loop_1_do_.preheader ] + %7 = phi <2 x double> [ %132, %_loop_2_endl_ ], [ %.vy27.promoted, %_loop_1_do_.preheader ] + %8 = phi <2 x double> [ %129, %_loop_2_endl_ ], [ %.vy26.promoted, %_loop_1_do_.preheader ] + %9 = phi <2 x double> [ %127, %_loop_2_endl_ ], [ %.vy25.promoted, %_loop_1_do_.preheader ] + %10 = phi <2 x double> [ %125, %_loop_2_endl_ ], [ %.vy24.promoted, %_loop_1_do_.preheader ] + %11 = phi <2 x double> [ %123, %_loop_2_endl_ ], [ %.vy23.promoted, %_loop_1_do_.preheader ] + %12 = phi <2 x double> [ %121, %_loop_2_endl_ ], [ %.vy22.promoted, %_loop_1_do_.preheader ] + %13 = phi <2 x double> [ %119, %_loop_2_endl_ ], [ %.vy21.promoted, %_loop_1_do_.preheader ] + %14 = phi <2 x double> [ %116, %_loop_2_endl_ ], [ %.vy0c.promoted, %_loop_1_do_.preheader ] + %15 = phi <2 x double> [ %114, %_loop_2_endl_ ], [ %.vy0b.promoted, %_loop_1_do_.preheader ] + %16 = phi <2 x double> [ %112, %_loop_2_endl_ ], [ %.vy0a.promoted, %_loop_1_do_.preheader ] + %17 = phi <2 x double> [ %110, %_loop_2_endl_ ], [ %.vy09.promoted, %_loop_1_do_.preheader ] + %18 = phi <2 x double> [ %108, %_loop_2_endl_ ], [ %.vy08.promoted, %_loop_1_do_.preheader ] + %19 = phi <2 x double> [ %106, %_loop_2_endl_ ], [ %.vy07.promoted, %_loop_1_do_.preheader ] + %20 = phi <2 x double> [ %81, %_loop_2_endl_ ], [ %.vy06.promoted, %_loop_1_do_.preheader ] + %21 = phi <2 x double> [ %79, %_loop_2_endl_ ], [ %.vy05.promoted, %_loop_1_do_.preheader ] + %22 = phi <2 x double> [ %77, %_loop_2_endl_ ], [ %.vy04.promoted, %_loop_1_do_.preheader ] + %23 = phi <2 x double> [ %75, %_loop_2_endl_ ], [ %.vy03.promoted, %_loop_1_do_.preheader ] + %24 = phi <2 x double> [ %73, %_loop_2_endl_ ], [ %.vy02.promoted, %_loop_1_do_.preheader ] + %25 = phi <2 x double> [ %71, %_loop_2_endl_ ], [ %.vy01.promoted, %_loop_1_do_.preheader ] + %_ix_x_len10 = mul i64 %_mult_tmp, %indvars.iv212 + %a_ix_dim_0_ = getelementptr inbounds i8, i8* %a_rvo_based_addr_, i64 %_ix_x_len10 + %26 = add nuw nsw i64 %indvars.iv212, 1 + %_ix_x_len24 = mul i64 %_mult_tmp, %26 + %a_ix_dim_0_25 = getelementptr inbounds i8, i8* %a_rvo_based_addr_, i64 %_ix_x_len24 + %27 = add nuw nsw i64 %indvars.iv212, 2 + %_ix_x_len40 = mul i64 %_mult_tmp, %27 + %a_ix_dim_0_41 = getelementptr inbounds i8, i8* %a_rvo_based_addr_, i64 %_ix_x_len40 + %28 = add nuw nsw i64 %indvars.iv212, 3 + %_ix_x_len56 = mul i64 %_mult_tmp, %28 + %a_ix_dim_0_57 = getelementptr inbounds i8, i8* %a_rvo_based_addr_, i64 %_ix_x_len56 + %29 = add nuw nsw i64 
%indvars.iv212, 4 + %_ix_x_len72 = mul i64 %_mult_tmp, %29 + %a_ix_dim_0_73 = getelementptr inbounds i8, i8* %a_rvo_based_addr_, i64 %_ix_x_len72 + %30 = add nuw nsw i64 %indvars.iv212, 5 + %_ix_x_len88 = mul i64 %_mult_tmp, %30 + %a_ix_dim_0_89 = getelementptr inbounds i8, i8* %a_rvo_based_addr_, i64 %_ix_x_len88 + br label %_loop_2_do_ + +_loop_2_do_: ; preds = %_loop_2_do_.lr.ph, %_loop_2_do_ + %indvars.iv = phi i64 [ 1, %_loop_2_do_.lr.ph ], [ %indvars.iv.next, %_loop_2_do_ ] + %31 = phi <2 x double> [ %2, %_loop_2_do_.lr.ph ], [ %142, %_loop_2_do_ ] + %32 = phi <2 x double> [ %3, %_loop_2_do_.lr.ph ], [ %140, %_loop_2_do_ ] + %33 = phi <2 x double> [ %4, %_loop_2_do_.lr.ph ], [ %138, %_loop_2_do_ ] + %34 = phi <2 x double> [ %5, %_loop_2_do_.lr.ph ], [ %136, %_loop_2_do_ ] + %35 = phi <2 x double> [ %6, %_loop_2_do_.lr.ph ], [ %134, %_loop_2_do_ ] + %36 = phi <2 x double> [ %7, %_loop_2_do_.lr.ph ], [ %132, %_loop_2_do_ ] + %37 = phi <2 x double> [ %8, %_loop_2_do_.lr.ph ], [ %129, %_loop_2_do_ ] + %38 = phi <2 x double> [ %9, %_loop_2_do_.lr.ph ], [ %127, %_loop_2_do_ ] + %39 = phi <2 x double> [ %10, %_loop_2_do_.lr.ph ], [ %125, %_loop_2_do_ ] + %40 = phi <2 x double> [ %11, %_loop_2_do_.lr.ph ], [ %123, %_loop_2_do_ ] + %41 = phi <2 x double> [ %12, %_loop_2_do_.lr.ph ], [ %121, %_loop_2_do_ ] + %42 = phi <2 x double> [ %13, %_loop_2_do_.lr.ph ], [ %119, %_loop_2_do_ ] + %43 = phi <2 x double> [ %14, %_loop_2_do_.lr.ph ], [ %116, %_loop_2_do_ ] + %44 = phi <2 x double> [ %15, %_loop_2_do_.lr.ph ], [ %114, %_loop_2_do_ ] + %45 = phi <2 x double> [ %16, %_loop_2_do_.lr.ph ], [ %112, %_loop_2_do_ ] + %46 = phi <2 x double> [ %17, %_loop_2_do_.lr.ph ], [ %110, %_loop_2_do_ ] + %47 = phi <2 x double> [ %18, %_loop_2_do_.lr.ph ], [ %108, %_loop_2_do_ ] + %48 = phi <2 x double> [ %19, %_loop_2_do_.lr.ph ], [ %106, %_loop_2_do_ ] + %49 = phi <2 x double> [ %20, %_loop_2_do_.lr.ph ], [ %81, %_loop_2_do_ ] + %50 = phi <2 x double> [ %21, %_loop_2_do_.lr.ph ], [ %79, %_loop_2_do_ ] + %51 = phi <2 x double> [ %22, %_loop_2_do_.lr.ph ], [ %77, %_loop_2_do_ ] + %52 = phi <2 x double> [ %23, %_loop_2_do_.lr.ph ], [ %75, %_loop_2_do_ ] + %53 = phi <2 x double> [ %24, %_loop_2_do_.lr.ph ], [ %73, %_loop_2_do_ ] + %54 = phi <2 x double> [ %25, %_loop_2_do_.lr.ph ], [ %71, %_loop_2_do_ ] + %_ix_x_len = shl nuw nsw i64 %indvars.iv, 3 + %x_ix_dim_0_113 = getelementptr inbounds %_elem_type_of_x, %_elem_type_of_x* %x_rvo_based_addr_112, i64 %indvars.iv + %x_ix_dim_0_ = bitcast %_elem_type_of_x* %x_ix_dim_0_113 to i8* + %55 = tail call <256 x i1> @llvm.ppc.mma.lxvp(i8* nonnull %x_ix_dim_0_) + %a_ix_dim_1_ = getelementptr inbounds i8, i8* %a_ix_dim_0_, i64 %_ix_x_len + %56 = tail call <256 x i1> @llvm.ppc.mma.lxvp(i8* nonnull %a_ix_dim_1_) + %a_ix_dim_1_29 = getelementptr inbounds i8, i8* %a_ix_dim_0_25, i64 %_ix_x_len + %57 = tail call <256 x i1> @llvm.ppc.mma.lxvp(i8* nonnull %a_ix_dim_1_29) + %a_ix_dim_1_45 = getelementptr inbounds i8, i8* %a_ix_dim_0_41, i64 %_ix_x_len + %58 = tail call <256 x i1> @llvm.ppc.mma.lxvp(i8* nonnull %a_ix_dim_1_45) + %a_ix_dim_1_61 = getelementptr inbounds i8, i8* %a_ix_dim_0_57, i64 %_ix_x_len + %59 = tail call <256 x i1> @llvm.ppc.mma.lxvp(i8* nonnull %a_ix_dim_1_61) + %a_ix_dim_1_77 = getelementptr inbounds i8, i8* %a_ix_dim_0_73, i64 %_ix_x_len + %60 = tail call <256 x i1> @llvm.ppc.mma.lxvp(i8* nonnull %a_ix_dim_1_77) + %a_ix_dim_1_93 = getelementptr inbounds i8, i8* %a_ix_dim_0_89, i64 %_ix_x_len + %61 = tail call <256 x i1> @llvm.ppc.mma.lxvp(i8* nonnull 
%a_ix_dim_1_93) + %62 = tail call { <16 x i8>, <16 x i8> } @llvm.ppc.mma.disassemble.pair(<256 x i1> %55) + %.fca.0.extract35 = extractvalue { <16 x i8>, <16 x i8> } %62, 0 + %.fca.1.extract36 = extractvalue { <16 x i8>, <16 x i8> } %62, 1 + %63 = tail call { <16 x i8>, <16 x i8> } @llvm.ppc.mma.disassemble.pair(<256 x i1> %56) + %.fca.0.extract29 = extractvalue { <16 x i8>, <16 x i8> } %63, 0 + %.fca.1.extract30 = extractvalue { <16 x i8>, <16 x i8> } %63, 1 + %64 = tail call { <16 x i8>, <16 x i8> } @llvm.ppc.mma.disassemble.pair(<256 x i1> %57) + %.fca.0.extract23 = extractvalue { <16 x i8>, <16 x i8> } %64, 0 + %.fca.1.extract24 = extractvalue { <16 x i8>, <16 x i8> } %64, 1 + %65 = tail call { <16 x i8>, <16 x i8> } @llvm.ppc.mma.disassemble.pair(<256 x i1> %58) + %.fca.0.extract17 = extractvalue { <16 x i8>, <16 x i8> } %65, 0 + %.fca.1.extract18 = extractvalue { <16 x i8>, <16 x i8> } %65, 1 + %66 = tail call { <16 x i8>, <16 x i8> } @llvm.ppc.mma.disassemble.pair(<256 x i1> %59) + %.fca.0.extract11 = extractvalue { <16 x i8>, <16 x i8> } %66, 0 + %.fca.1.extract12 = extractvalue { <16 x i8>, <16 x i8> } %66, 1 + %67 = tail call { <16 x i8>, <16 x i8> } @llvm.ppc.mma.disassemble.pair(<256 x i1> %60) + %.fca.0.extract5 = extractvalue { <16 x i8>, <16 x i8> } %67, 0 + %.fca.1.extract6 = extractvalue { <16 x i8>, <16 x i8> } %67, 1 + %68 = tail call { <16 x i8>, <16 x i8> } @llvm.ppc.mma.disassemble.pair(<256 x i1> %61) + %.fca.0.extract = extractvalue { <16 x i8>, <16 x i8> } %68, 0 + %.fca.1.extract = extractvalue { <16 x i8>, <16 x i8> } %68, 1 + %69 = bitcast <16 x i8> %.fca.0.extract29 to <2 x double> + %70 = bitcast <16 x i8> %.fca.0.extract35 to <2 x double> + %71 = tail call contract <2 x double> @llvm.fma.v2f64(<2 x double> %69, <2 x double> %70, <2 x double> %54) + %72 = bitcast <16 x i8> %.fca.0.extract23 to <2 x double> + %73 = tail call contract <2 x double> @llvm.fma.v2f64(<2 x double> %72, <2 x double> %70, <2 x double> %53) + %74 = bitcast <16 x i8> %.fca.0.extract17 to <2 x double> + %75 = tail call contract <2 x double> @llvm.fma.v2f64(<2 x double> %74, <2 x double> %70, <2 x double> %52) + %76 = bitcast <16 x i8> %.fca.0.extract11 to <2 x double> + %77 = tail call contract <2 x double> @llvm.fma.v2f64(<2 x double> %76, <2 x double> %70, <2 x double> %51) + %78 = bitcast <16 x i8> %.fca.0.extract5 to <2 x double> + %79 = tail call contract <2 x double> @llvm.fma.v2f64(<2 x double> %78, <2 x double> %70, <2 x double> %50) + %80 = bitcast <16 x i8> %.fca.0.extract to <2 x double> + %81 = tail call contract <2 x double> @llvm.fma.v2f64(<2 x double> %80, <2 x double> %70, <2 x double> %49) + %82 = getelementptr %_elem_type_of_x, %_elem_type_of_x* %x_ix_dim_0_113, i64 4 + %83 = bitcast %_elem_type_of_x* %82 to i8* + %84 = tail call <256 x i1> @llvm.ppc.mma.lxvp(i8* %83) + %85 = getelementptr i8, i8* %a_ix_dim_1_, i64 32 + %86 = tail call <256 x i1> @llvm.ppc.mma.lxvp(i8* %85) + %87 = getelementptr i8, i8* %a_ix_dim_1_29, i64 32 + %88 = tail call <256 x i1> @llvm.ppc.mma.lxvp(i8* %87) + %89 = getelementptr i8, i8* %a_ix_dim_1_45, i64 32 + %90 = tail call <256 x i1> @llvm.ppc.mma.lxvp(i8* %89) + %91 = getelementptr i8, i8* %a_ix_dim_1_61, i64 32 + %92 = tail call <256 x i1> @llvm.ppc.mma.lxvp(i8* %91) + %93 = getelementptr i8, i8* %a_ix_dim_1_77, i64 32 + %94 = tail call <256 x i1> @llvm.ppc.mma.lxvp(i8* %93) + %95 = getelementptr i8, i8* %a_ix_dim_1_93, i64 32 + %96 = tail call <256 x i1> @llvm.ppc.mma.lxvp(i8* %95) + %97 = tail call { <16 x i8>, <16 x i8> } 
@llvm.ppc.mma.disassemble.pair(<256 x i1> %84) + %.fca.0.extract37 = extractvalue { <16 x i8>, <16 x i8> } %97, 0 + %.fca.1.extract39 = extractvalue { <16 x i8>, <16 x i8> } %97, 1 + %98 = tail call { <16 x i8>, <16 x i8> } @llvm.ppc.mma.disassemble.pair(<256 x i1> %86) + %.fca.0.extract31 = extractvalue { <16 x i8>, <16 x i8> } %98, 0 + %.fca.1.extract33 = extractvalue { <16 x i8>, <16 x i8> } %98, 1 + %99 = tail call { <16 x i8>, <16 x i8> } @llvm.ppc.mma.disassemble.pair(<256 x i1> %88) + %.fca.0.extract25 = extractvalue { <16 x i8>, <16 x i8> } %99, 0 + %.fca.1.extract27 = extractvalue { <16 x i8>, <16 x i8> } %99, 1 + %100 = tail call { <16 x i8>, <16 x i8> } @llvm.ppc.mma.disassemble.pair(<256 x i1> %90) + %.fca.0.extract19 = extractvalue { <16 x i8>, <16 x i8> } %100, 0 + %.fca.1.extract21 = extractvalue { <16 x i8>, <16 x i8> } %100, 1 + %101 = tail call { <16 x i8>, <16 x i8> } @llvm.ppc.mma.disassemble.pair(<256 x i1> %92) + %.fca.0.extract13 = extractvalue { <16 x i8>, <16 x i8> } %101, 0 + %.fca.1.extract15 = extractvalue { <16 x i8>, <16 x i8> } %101, 1 + %102 = tail call { <16 x i8>, <16 x i8> } @llvm.ppc.mma.disassemble.pair(<256 x i1> %94) + %.fca.0.extract7 = extractvalue { <16 x i8>, <16 x i8> } %102, 0 + %.fca.1.extract9 = extractvalue { <16 x i8>, <16 x i8> } %102, 1 + %103 = tail call { <16 x i8>, <16 x i8> } @llvm.ppc.mma.disassemble.pair(<256 x i1> %96) + %.fca.0.extract1 = extractvalue { <16 x i8>, <16 x i8> } %103, 0 + %.fca.1.extract3 = extractvalue { <16 x i8>, <16 x i8> } %103, 1 + %104 = bitcast <16 x i8> %.fca.1.extract30 to <2 x double> + %105 = bitcast <16 x i8> %.fca.1.extract36 to <2 x double> + %106 = tail call contract <2 x double> @llvm.fma.v2f64(<2 x double> %104, <2 x double> %105, <2 x double> %48) + %107 = bitcast <16 x i8> %.fca.1.extract24 to <2 x double> + %108 = tail call contract <2 x double> @llvm.fma.v2f64(<2 x double> %107, <2 x double> %105, <2 x double> %47) + %109 = bitcast <16 x i8> %.fca.1.extract18 to <2 x double> + %110 = tail call contract <2 x double> @llvm.fma.v2f64(<2 x double> %109, <2 x double> %105, <2 x double> %46) + %111 = bitcast <16 x i8> %.fca.1.extract12 to <2 x double> + %112 = tail call contract <2 x double> @llvm.fma.v2f64(<2 x double> %111, <2 x double> %105, <2 x double> %45) + %113 = bitcast <16 x i8> %.fca.1.extract6 to <2 x double> + %114 = tail call contract <2 x double> @llvm.fma.v2f64(<2 x double> %113, <2 x double> %105, <2 x double> %44) + %115 = bitcast <16 x i8> %.fca.1.extract to <2 x double> + %116 = tail call contract <2 x double> @llvm.fma.v2f64(<2 x double> %115, <2 x double> %105, <2 x double> %43) + %117 = bitcast <16 x i8> %.fca.0.extract31 to <2 x double> + %118 = bitcast <16 x i8> %.fca.0.extract37 to <2 x double> + %119 = tail call contract <2 x double> @llvm.fma.v2f64(<2 x double> %117, <2 x double> %118, <2 x double> %42) + %120 = bitcast <16 x i8> %.fca.0.extract25 to <2 x double> + %121 = tail call contract <2 x double> @llvm.fma.v2f64(<2 x double> %120, <2 x double> %118, <2 x double> %41) + %122 = bitcast <16 x i8> %.fca.0.extract19 to <2 x double> + %123 = tail call contract <2 x double> @llvm.fma.v2f64(<2 x double> %122, <2 x double> %118, <2 x double> %40) + %124 = bitcast <16 x i8> %.fca.0.extract13 to <2 x double> + %125 = tail call contract <2 x double> @llvm.fma.v2f64(<2 x double> %124, <2 x double> %118, <2 x double> %39) + %126 = bitcast <16 x i8> %.fca.0.extract7 to <2 x double> + %127 = tail call contract <2 x double> @llvm.fma.v2f64(<2 x double> %126, <2 x double> %118, <2 x 
double> %38) + %128 = bitcast <16 x i8> %.fca.0.extract1 to <2 x double> + %129 = tail call contract <2 x double> @llvm.fma.v2f64(<2 x double> %128, <2 x double> %118, <2 x double> %37) + %130 = bitcast <16 x i8> %.fca.1.extract33 to <2 x double> + %131 = bitcast <16 x i8> %.fca.1.extract39 to <2 x double> + %132 = tail call contract <2 x double> @llvm.fma.v2f64(<2 x double> %130, <2 x double> %131, <2 x double> %36) + %133 = bitcast <16 x i8> %.fca.1.extract27 to <2 x double> + %134 = tail call contract <2 x double> @llvm.fma.v2f64(<2 x double> %133, <2 x double> %131, <2 x double> %35) + %135 = bitcast <16 x i8> %.fca.1.extract21 to <2 x double> + %136 = tail call contract <2 x double> @llvm.fma.v2f64(<2 x double> %135, <2 x double> %131, <2 x double> %34) + %137 = bitcast <16 x i8> %.fca.1.extract15 to <2 x double> + %138 = tail call contract <2 x double> @llvm.fma.v2f64(<2 x double> %137, <2 x double> %131, <2 x double> %33) + %139 = bitcast <16 x i8> %.fca.1.extract9 to <2 x double> + %140 = tail call contract <2 x double> @llvm.fma.v2f64(<2 x double> %139, <2 x double> %131, <2 x double> %32) + %141 = bitcast <16 x i8> %.fca.1.extract3 to <2 x double> + %142 = tail call contract <2 x double> @llvm.fma.v2f64(<2 x double> %141, <2 x double> %131, <2 x double> %31) + %indvars.iv.next = add nuw nsw i64 %indvars.iv, 8 + %_leq_tmp6.not = icmp ugt i64 %indvars.iv.next, %0 + br i1 %_leq_tmp6.not, label %_loop_2_endl_, label %_loop_2_do_ + +_loop_2_endl_: ; preds = %_loop_2_do_ + %indvars.iv.next213 = add nuw nsw i64 %indvars.iv212, 6 + %_leq_tmp.not = icmp ugt i64 %indvars.iv.next213, %1 + br i1 %_leq_tmp.not, label %_loop_1_loopHeader_._return_bb_crit_edge.loopexit, label %_loop_2_do_.lr.ph + +_loop_1_loopHeader_._return_bb_crit_edge.loopexit: ; preds = %_loop_2_endl_ + store <2 x double> %71, <2 x double>* %.vy01, align 16 + store <2 x double> %73, <2 x double>* %.vy02, align 16 + store <2 x double> %75, <2 x double>* %.vy03, align 16 + store <2 x double> %77, <2 x double>* %.vy04, align 16 + store <2 x double> %79, <2 x double>* %.vy05, align 16 + store <2 x double> %81, <2 x double>* %.vy06, align 16 + store <2 x double> %106, <2 x double>* %.vy07, align 16 + store <2 x double> %108, <2 x double>* %.vy08, align 16 + store <2 x double> %110, <2 x double>* %.vy09, align 16 + store <2 x double> %112, <2 x double>* %.vy0a, align 16 + store <2 x double> %114, <2 x double>* %.vy0b, align 16 + store <2 x double> %116, <2 x double>* %.vy0c, align 16 + store <2 x double> %119, <2 x double>* %.vy21, align 16 + store <2 x double> %121, <2 x double>* %.vy22, align 16 + store <2 x double> %123, <2 x double>* %.vy23, align 16 + store <2 x double> %125, <2 x double>* %.vy24, align 16 + store <2 x double> %127, <2 x double>* %.vy25, align 16 + store <2 x double> %129, <2 x double>* %.vy26, align 16 + store <2 x double> %132, <2 x double>* %.vy27, align 16 + store <2 x double> %134, <2 x double>* %.vy28, align 16 + store <2 x double> %136, <2 x double>* %.vy29, align 16 + store <2 x double> %138, <2 x double>* %.vy2a, align 16 + store <2 x double> %140, <2 x double>* %.vy2b, align 16 + store <2 x double> %142, <2 x double>* %.vy2c, align 16 + br label %_return_bb + +_return_bb: ; preds = %_loop_1_do_.lr.ph, %_loop_1_loopHeader_._return_bb_crit_edge.loopexit, %entry + ret void +} + +declare <256 x i1> @llvm.ppc.mma.lxvp(i8*) +declare { <16 x i8>, <16 x i8> } @llvm.ppc.mma.disassemble.pair(<256 x i1>) +declare <2 x double> @llvm.fma.v2f64(<2 x double>, <2 x double>, <2 x double>) +