@@ -393,70 +393,69 @@ entry:
393
393
define <32 x i1 > @whilewr_32_expand3 (ptr %a , ptr %b ) {
394
394
; CHECK-LABEL: whilewr_32_expand3:
395
395
; CHECK: // %bb.0: // %entry
396
- ; CHECK-NEXT: sub x10, x1, x0
396
+ ; CHECK-NEXT: subs x10, x1, x0
397
397
; CHECK-NEXT: index z0.d, #0, #1
398
- ; CHECK-NEXT: sub x9, x10, #61
399
- ; CHECK-NEXT: subs x11, x10, #64
400
- ; CHECK-NEXT: add x12, x10, #3
401
- ; CHECK-NEXT: csel x9, x9, x11, mi
398
+ ; CHECK-NEXT: add x9, x10, #3
399
+ ; CHECK-NEXT: sub x12, x10, #61
400
+ ; CHECK-NEXT: csel x9, x9, x10, mi
402
401
; CHECK-NEXT: asr x11, x9, #2
403
- ; CHECK-NEXT: mov z1.d, z0.d
404
402
; CHECK-NEXT: mov z2.d, z0.d
405
403
; CHECK-NEXT: mov z3.d, z0.d
406
- ; CHECK-NEXT: cmp x11, #1
407
404
; CHECK-NEXT: mov z4.d, z0.d
405
+ ; CHECK-NEXT: cmp x11, #1
406
+ ; CHECK-NEXT: dup v1.2d, x11
408
407
; CHECK-NEXT: mov z5.d, z0.d
409
408
; CHECK-NEXT: cset w9, lt
410
- ; CHECK-NEXT: cmp x10, #0
409
+ ; CHECK-NEXT: subs x10, x10, #64
411
410
; CHECK-NEXT: mov z6.d, z0.d
412
411
; CHECK-NEXT: csel x10, x12, x10, mi
413
- ; CHECK-NEXT: dup v7.2d, x11
414
- ; CHECK-NEXT: add z1 .d, z1 .d, #12 // =0xc
412
+ ; CHECK-NEXT: mov z7.d, z0.d
413
+ ; CHECK-NEXT: add z2 .d, z2 .d, #12 // =0xc
415
414
; CHECK-NEXT: asr x10, x10, #2
416
- ; CHECK-NEXT: add z2 .d, z2 .d, #10 // =0xa
417
- ; CHECK-NEXT: add z3 .d, z3 .d, #8 // =0x8
418
- ; CHECK-NEXT: add z4 .d, z4 .d, #6 // =0x6
419
- ; CHECK-NEXT: add z5 .d, z5 .d, #4 // =0x4
420
- ; CHECK-NEXT: add z6.d, z6.d, #2 // =0x2
415
+ ; CHECK-NEXT: add z3 .d, z3 .d, #10 // =0xa
416
+ ; CHECK-NEXT: add z4 .d, z4 .d, #8 // =0x8
417
+ ; CHECK-NEXT: add z5 .d, z5 .d, #6 // =0x6
418
+ ; CHECK-NEXT: add z6 .d, z6 .d, #4 // =0x4
419
+ ; CHECK-NEXT: cmhi v17.2d, v1.2d, v0.2d
421
420
; CHECK-NEXT: dup v16.2d, x10
422
- ; CHECK-NEXT: cmhi v17.2d, v7.2d, v0.2d
423
- ; CHECK-NEXT: cmhi v19.2d, v7 .2d, v1 .2d
424
- ; CHECK-NEXT: cmhi v20.2d, v7 .2d, v2 .2d
425
- ; CHECK-NEXT: cmhi v21.2d, v7 .2d, v3 .2d
421
+ ; CHECK-NEXT: add z7.d, z7.d, #2 // =0x2
422
+ ; CHECK-NEXT: cmhi v19.2d, v1 .2d, v2 .2d
423
+ ; CHECK-NEXT: cmhi v20.2d, v1 .2d, v3 .2d
424
+ ; CHECK-NEXT: cmhi v21.2d, v1 .2d, v4 .2d
426
425
; CHECK-NEXT: cmp x10, #1
427
- ; CHECK-NEXT: cmhi v22.2d, v7 .2d, v4 .2d
426
+ ; CHECK-NEXT: cmhi v22.2d, v1 .2d, v5 .2d
428
427
; CHECK-NEXT: cset w10, lt
429
428
; CHECK-NEXT: cmhi v18.2d, v16.2d, v0.2d
430
429
; CHECK-NEXT: add z0.d, z0.d, #14 // =0xe
431
- ; CHECK-NEXT: cmhi v1.2d, v16.2d, v1.2d
432
430
; CHECK-NEXT: cmhi v2.2d, v16.2d, v2.2d
433
431
; CHECK-NEXT: cmhi v3.2d, v16.2d, v3.2d
434
432
; CHECK-NEXT: cmhi v4.2d, v16.2d, v4.2d
435
- ; CHECK-NEXT: cmhi v23.2d, v16.2d, v5.2d
436
- ; CHECK-NEXT: cmhi v24.2d, v16.2d, v6.2d
437
- ; CHECK-NEXT: cmhi v5.2d, v7.2d, v5.2d
433
+ ; CHECK-NEXT: cmhi v5.2d, v16.2d, v5.2d
434
+ ; CHECK-NEXT: cmhi v23.2d, v16.2d, v6.2d
435
+ ; CHECK-NEXT: cmhi v24.2d, v16.2d, v7.2d
436
+ ; CHECK-NEXT: cmhi v6.2d, v1.2d, v6.2d
438
437
; CHECK-NEXT: cmhi v16.2d, v16.2d, v0.2d
439
- ; CHECK-NEXT: cmhi v6 .2d, v7 .2d, v6 .2d
440
- ; CHECK-NEXT: cmhi v0.2d, v7 .2d, v0.2d
441
- ; CHECK-NEXT: uzp1 v7 .4s, v21.4s, v20.4s
442
- ; CHECK-NEXT: uzp1 v2 .4s, v3 .4s, v2 .4s
443
- ; CHECK-NEXT: uzp1 v3 .4s, v23.4s, v4 .4s
444
- ; CHECK-NEXT: uzp1 v4 .4s, v18.4s, v24.4s
445
- ; CHECK-NEXT: uzp1 v5 .4s, v5 .4s, v22.4s
446
- ; CHECK-NEXT: uzp1 v1 .4s, v1 .4s, v16.4s
447
- ; CHECK-NEXT: uzp1 v6 .4s, v17.4s, v6 .4s
438
+ ; CHECK-NEXT: cmhi v7 .2d, v1 .2d, v7 .2d
439
+ ; CHECK-NEXT: cmhi v0.2d, v1 .2d, v0.2d
440
+ ; CHECK-NEXT: uzp1 v1 .4s, v21.4s, v20.4s
441
+ ; CHECK-NEXT: uzp1 v3 .4s, v4 .4s, v3 .4s
442
+ ; CHECK-NEXT: uzp1 v4 .4s, v23.4s, v5 .4s
443
+ ; CHECK-NEXT: uzp1 v5 .4s, v18.4s, v24.4s
444
+ ; CHECK-NEXT: uzp1 v6 .4s, v6 .4s, v22.4s
445
+ ; CHECK-NEXT: uzp1 v2 .4s, v2 .4s, v16.4s
446
+ ; CHECK-NEXT: uzp1 v7 .4s, v17.4s, v7 .4s
448
447
; CHECK-NEXT: uzp1 v0.4s, v19.4s, v0.4s
449
- ; CHECK-NEXT: uzp1 v3 .8h, v4 .8h, v3 .8h
450
- ; CHECK-NEXT: uzp1 v1 .8h, v2 .8h, v1 .8h
451
- ; CHECK-NEXT: uzp1 v2 .8h, v6 .8h, v5 .8h
452
- ; CHECK-NEXT: uzp1 v0.8h, v7 .8h, v0.8h
453
- ; CHECK-NEXT: uzp1 v1.16b, v3 .16b, v1 .16b
454
- ; CHECK-NEXT: uzp1 v0.16b, v2 .16b, v0.16b
455
- ; CHECK-NEXT: dup v3 .16b, w10
456
- ; CHECK-NEXT: dup v2 .16b, w9
448
+ ; CHECK-NEXT: uzp1 v4 .8h, v5 .8h, v4 .8h
449
+ ; CHECK-NEXT: uzp1 v2 .8h, v3 .8h, v2 .8h
450
+ ; CHECK-NEXT: uzp1 v3 .8h, v7 .8h, v6 .8h
451
+ ; CHECK-NEXT: uzp1 v0.8h, v1 .8h, v0.8h
452
+ ; CHECK-NEXT: uzp1 v1.16b, v4 .16b, v2 .16b
453
+ ; CHECK-NEXT: uzp1 v0.16b, v3 .16b, v0.16b
454
+ ; CHECK-NEXT: dup v2 .16b, w10
455
+ ; CHECK-NEXT: dup v3 .16b, w9
457
456
; CHECK-NEXT: adrp x9, .LCPI14_0
458
- ; CHECK-NEXT: orr v1.16b, v1.16b, v3 .16b
459
- ; CHECK-NEXT: orr v0.16b, v0.16b, v2 .16b
457
+ ; CHECK-NEXT: orr v1.16b, v1.16b, v2 .16b
458
+ ; CHECK-NEXT: orr v0.16b, v0.16b, v3 .16b
460
459
; CHECK-NEXT: ldr q2, [x9, :lo12:.LCPI14_0]
461
460
; CHECK-NEXT: shl v1.16b, v1.16b, #7
462
461
; CHECK-NEXT: shl v0.16b, v0.16b, #7
@@ -470,8 +469,8 @@ define <32 x i1> @whilewr_32_expand3(ptr %a, ptr %b) {
470
469
; CHECK-NEXT: zip1 v0.16b, v0.16b, v3.16b
471
470
; CHECK-NEXT: addv h1, v1.8h
472
471
; CHECK-NEXT: addv h0, v0.8h
473
- ; CHECK-NEXT: str h1, [x8]
474
- ; CHECK-NEXT: str h0, [x8, #2 ]
472
+ ; CHECK-NEXT: str h1, [x8, #2 ]
473
+ ; CHECK-NEXT: str h0, [x8]
475
474
; CHECK-NEXT: ret
476
475
entry:
477
476
%0 = call <32 x i1 > @llvm.loop.dependence.war.mask.v32i1 (ptr %a , ptr %b , i64 4 )
@@ -587,70 +586,69 @@ entry:
587
586
define <32 x i1 > @whilewr_64_expand4 (ptr %a , ptr %b ) {
588
587
; CHECK-LABEL: whilewr_64_expand4:
589
588
; CHECK: // %bb.0: // %entry
590
- ; CHECK-NEXT: sub x10, x1, x0
589
+ ; CHECK-NEXT: subs x10, x1, x0
591
590
; CHECK-NEXT: index z0.d, #0, #1
592
- ; CHECK-NEXT: sub x9, x10, #121
593
- ; CHECK-NEXT: subs x11, x10, #128
594
- ; CHECK-NEXT: add x12, x10, #7
595
- ; CHECK-NEXT: csel x9, x9, x11, mi
591
+ ; CHECK-NEXT: add x9, x10, #7
592
+ ; CHECK-NEXT: sub x12, x10, #121
593
+ ; CHECK-NEXT: csel x9, x9, x10, mi
596
594
; CHECK-NEXT: asr x11, x9, #3
597
- ; CHECK-NEXT: mov z1.d, z0.d
598
595
; CHECK-NEXT: mov z2.d, z0.d
599
596
; CHECK-NEXT: mov z3.d, z0.d
600
- ; CHECK-NEXT: cmp x11, #1
601
597
; CHECK-NEXT: mov z4.d, z0.d
598
+ ; CHECK-NEXT: cmp x11, #1
599
+ ; CHECK-NEXT: dup v1.2d, x11
602
600
; CHECK-NEXT: mov z5.d, z0.d
603
601
; CHECK-NEXT: cset w9, lt
604
- ; CHECK-NEXT: cmp x10, #0
602
+ ; CHECK-NEXT: subs x10, x10, #128
605
603
; CHECK-NEXT: mov z6.d, z0.d
606
604
; CHECK-NEXT: csel x10, x12, x10, mi
607
- ; CHECK-NEXT: dup v7.2d, x11
608
- ; CHECK-NEXT: add z1 .d, z1 .d, #12 // =0xc
605
+ ; CHECK-NEXT: mov z7.d, z0.d
606
+ ; CHECK-NEXT: add z2 .d, z2 .d, #12 // =0xc
609
607
; CHECK-NEXT: asr x10, x10, #3
610
- ; CHECK-NEXT: add z2 .d, z2 .d, #10 // =0xa
611
- ; CHECK-NEXT: add z3 .d, z3 .d, #8 // =0x8
612
- ; CHECK-NEXT: add z4 .d, z4 .d, #6 // =0x6
613
- ; CHECK-NEXT: add z5 .d, z5 .d, #4 // =0x4
614
- ; CHECK-NEXT: add z6.d, z6.d, #2 // =0x2
608
+ ; CHECK-NEXT: add z3 .d, z3 .d, #10 // =0xa
609
+ ; CHECK-NEXT: add z4 .d, z4 .d, #8 // =0x8
610
+ ; CHECK-NEXT: add z5 .d, z5 .d, #6 // =0x6
611
+ ; CHECK-NEXT: add z6 .d, z6 .d, #4 // =0x4
612
+ ; CHECK-NEXT: cmhi v17.2d, v1.2d, v0.2d
615
613
; CHECK-NEXT: dup v16.2d, x10
616
- ; CHECK-NEXT: cmhi v17.2d, v7.2d, v0.2d
617
- ; CHECK-NEXT: cmhi v19.2d, v7 .2d, v1 .2d
618
- ; CHECK-NEXT: cmhi v20.2d, v7 .2d, v2 .2d
619
- ; CHECK-NEXT: cmhi v21.2d, v7 .2d, v3 .2d
614
+ ; CHECK-NEXT: add z7.d, z7.d, #2 // =0x2
615
+ ; CHECK-NEXT: cmhi v19.2d, v1 .2d, v2 .2d
616
+ ; CHECK-NEXT: cmhi v20.2d, v1 .2d, v3 .2d
617
+ ; CHECK-NEXT: cmhi v21.2d, v1 .2d, v4 .2d
620
618
; CHECK-NEXT: cmp x10, #1
621
- ; CHECK-NEXT: cmhi v22.2d, v7 .2d, v4 .2d
619
+ ; CHECK-NEXT: cmhi v22.2d, v1 .2d, v5 .2d
622
620
; CHECK-NEXT: cset w10, lt
623
621
; CHECK-NEXT: cmhi v18.2d, v16.2d, v0.2d
624
622
; CHECK-NEXT: add z0.d, z0.d, #14 // =0xe
625
- ; CHECK-NEXT: cmhi v1.2d, v16.2d, v1.2d
626
623
; CHECK-NEXT: cmhi v2.2d, v16.2d, v2.2d
627
624
; CHECK-NEXT: cmhi v3.2d, v16.2d, v3.2d
628
625
; CHECK-NEXT: cmhi v4.2d, v16.2d, v4.2d
629
- ; CHECK-NEXT: cmhi v23.2d, v16.2d, v5.2d
630
- ; CHECK-NEXT: cmhi v24.2d, v16.2d, v6.2d
631
- ; CHECK-NEXT: cmhi v5.2d, v7.2d, v5.2d
626
+ ; CHECK-NEXT: cmhi v5.2d, v16.2d, v5.2d
627
+ ; CHECK-NEXT: cmhi v23.2d, v16.2d, v6.2d
628
+ ; CHECK-NEXT: cmhi v24.2d, v16.2d, v7.2d
629
+ ; CHECK-NEXT: cmhi v6.2d, v1.2d, v6.2d
632
630
; CHECK-NEXT: cmhi v16.2d, v16.2d, v0.2d
633
- ; CHECK-NEXT: cmhi v6 .2d, v7 .2d, v6 .2d
634
- ; CHECK-NEXT: cmhi v0.2d, v7 .2d, v0.2d
635
- ; CHECK-NEXT: uzp1 v7 .4s, v21.4s, v20.4s
636
- ; CHECK-NEXT: uzp1 v2 .4s, v3 .4s, v2 .4s
637
- ; CHECK-NEXT: uzp1 v3 .4s, v23.4s, v4 .4s
638
- ; CHECK-NEXT: uzp1 v4 .4s, v18.4s, v24.4s
639
- ; CHECK-NEXT: uzp1 v5 .4s, v5 .4s, v22.4s
640
- ; CHECK-NEXT: uzp1 v1 .4s, v1 .4s, v16.4s
641
- ; CHECK-NEXT: uzp1 v6 .4s, v17.4s, v6 .4s
631
+ ; CHECK-NEXT: cmhi v7 .2d, v1 .2d, v7 .2d
632
+ ; CHECK-NEXT: cmhi v0.2d, v1 .2d, v0.2d
633
+ ; CHECK-NEXT: uzp1 v1 .4s, v21.4s, v20.4s
634
+ ; CHECK-NEXT: uzp1 v3 .4s, v4 .4s, v3 .4s
635
+ ; CHECK-NEXT: uzp1 v4 .4s, v23.4s, v5 .4s
636
+ ; CHECK-NEXT: uzp1 v5 .4s, v18.4s, v24.4s
637
+ ; CHECK-NEXT: uzp1 v6 .4s, v6 .4s, v22.4s
638
+ ; CHECK-NEXT: uzp1 v2 .4s, v2 .4s, v16.4s
639
+ ; CHECK-NEXT: uzp1 v7 .4s, v17.4s, v7 .4s
642
640
; CHECK-NEXT: uzp1 v0.4s, v19.4s, v0.4s
643
- ; CHECK-NEXT: uzp1 v3 .8h, v4 .8h, v3 .8h
644
- ; CHECK-NEXT: uzp1 v1 .8h, v2 .8h, v1 .8h
645
- ; CHECK-NEXT: uzp1 v2 .8h, v6 .8h, v5 .8h
646
- ; CHECK-NEXT: uzp1 v0.8h, v7 .8h, v0.8h
647
- ; CHECK-NEXT: uzp1 v1.16b, v3 .16b, v1 .16b
648
- ; CHECK-NEXT: uzp1 v0.16b, v2 .16b, v0.16b
649
- ; CHECK-NEXT: dup v3 .16b, w10
650
- ; CHECK-NEXT: dup v2 .16b, w9
641
+ ; CHECK-NEXT: uzp1 v4 .8h, v5 .8h, v4 .8h
642
+ ; CHECK-NEXT: uzp1 v2 .8h, v3 .8h, v2 .8h
643
+ ; CHECK-NEXT: uzp1 v3 .8h, v7 .8h, v6 .8h
644
+ ; CHECK-NEXT: uzp1 v0.8h, v1 .8h, v0.8h
645
+ ; CHECK-NEXT: uzp1 v1.16b, v4 .16b, v2 .16b
646
+ ; CHECK-NEXT: uzp1 v0.16b, v3 .16b, v0.16b
647
+ ; CHECK-NEXT: dup v2 .16b, w10
648
+ ; CHECK-NEXT: dup v3 .16b, w9
651
649
; CHECK-NEXT: adrp x9, .LCPI18_0
652
- ; CHECK-NEXT: orr v1.16b, v1.16b, v3 .16b
653
- ; CHECK-NEXT: orr v0.16b, v0.16b, v2 .16b
650
+ ; CHECK-NEXT: orr v1.16b, v1.16b, v2 .16b
651
+ ; CHECK-NEXT: orr v0.16b, v0.16b, v3 .16b
654
652
; CHECK-NEXT: ldr q2, [x9, :lo12:.LCPI18_0]
655
653
; CHECK-NEXT: shl v1.16b, v1.16b, #7
656
654
; CHECK-NEXT: shl v0.16b, v0.16b, #7
@@ -664,8 +662,8 @@ define <32 x i1> @whilewr_64_expand4(ptr %a, ptr %b) {
664
662
; CHECK-NEXT: zip1 v0.16b, v0.16b, v3.16b
665
663
; CHECK-NEXT: addv h1, v1.8h
666
664
; CHECK-NEXT: addv h0, v0.8h
667
- ; CHECK-NEXT: str h1, [x8]
668
- ; CHECK-NEXT: str h0, [x8, #2 ]
665
+ ; CHECK-NEXT: str h1, [x8, #2 ]
666
+ ; CHECK-NEXT: str h0, [x8]
669
667
; CHECK-NEXT: ret
670
668
entry:
671
669
%0 = call <32 x i1 > @llvm.loop.dependence.war.mask.v32i1 (ptr %a , ptr %b , i64 8 )
0 commit comments