Skip to content

[AArch64] Inefficient codegen in shuffle-tbl34.ll #56647

@deadalnix

Description

@deadalnix

The current codegen:

; Builds an <8 x i8> result one lane at a time from four inputs: %a and %c
; (<8 x i8>) and %b and %d (<16 x i8>), using an extractelement/insertelement
; chain. The CHECK lines below record the code currently generated for it:
; two loads from .LCPI14_0 / .LCPI14_1 whose values are used as the index
; operands of two tbl lookups, followed by a trn1/trn2 pair to merge the
; two partial results.
define <8 x i8> @insert4_v8i8(<8 x i8> %a, <16 x i8> %b, <8 x i8> %c, <16 x i8> %d) {
; CHECK-LABEL: insert4_v8i8:
; CHECK:       // %bb.0:
; CHECK-NEXT:    adrp x8, .LCPI14_0
; CHECK-NEXT:    adrp x9, .LCPI14_1
; CHECK-NEXT:    // kill: def $d0 killed $d0 def $q0
; CHECK-NEXT:    mov v4.16b, v3.16b
; CHECK-NEXT:    // kill: def $d2 killed $d2 def $q2
; CHECK-NEXT:    mov v0.d[1], v2.d[0]
; CHECK-NEXT:    mov v3.16b, v1.16b
; CHECK-NEXT:    ldr d1, [x8, :lo12:.LCPI14_0]
; CHECK-NEXT:    ldr q2, [x9, :lo12:.LCPI14_1]
; CHECK-NEXT:    tbl v0.8b, { v0.16b }, v1.8b
; CHECK-NEXT:    tbl v1.16b, { v3.16b, v4.16b }, v2.16b
; CHECK-NEXT:    trn1 v0.4h, v1.4h, v0.4h
; CHECK-NEXT:    trn2 v0.4h, v0.4h, v1.4h
; CHECK-NEXT:    ret
  ; Source lanes, in result order: a[4], c[0], b[15], d[11], c[6], a[3], d[8], b[12].
  %e1 = extractelement <8 x i8> %a, i32 4
  %e2 = extractelement <8 x i8> %c, i32 0
  %e3 = extractelement <16 x i8> %b, i32 15
  %e4 = extractelement <16 x i8> %d, i32 11
  %e5 = extractelement <8 x i8> %c, i32 6
  %e6 = extractelement <8 x i8> %a, i32 3
  %e7 = extractelement <16 x i8> %d, i32 8
  %e8 = extractelement <16 x i8> %b, i32 12
  ; Place %e1..%e8 into result lanes 0..7; the chain starts from undef, and
  ; every lane is overwritten by the time %i8 is formed.
  %i1 = insertelement <8 x i8> undef, i8 %e1, i32 0
  %i2 = insertelement <8 x i8> %i1, i8 %e2, i32 1
  %i3 = insertelement <8 x i8> %i2, i8 %e3, i32 2
  %i4 = insertelement <8 x i8> %i3, i8 %e4, i32 3
  %i5 = insertelement <8 x i8> %i4, i8 %e5, i32 4
  %i6 = insertelement <8 x i8> %i5, i8 %e6, i32 5
  %i7 = insertelement <8 x i8> %i6, i8 %e7, i32 6
  %i8 = insertelement <8 x i8> %i7, i8 %e8, i32 7
  ret <8 x i8> %i8
}

I think we should expect this instead:

; Same IR as the current-codegen listing: an <8 x i8> assembled lane by lane
; from %a, %c (<8 x i8>) and %b, %d (<16 x i8>). The CHECK lines here are the
; proposed output: a dup of v0 lane 4 into v4, seven lane-to-lane mov
; instructions filling v4 lanes 1..7, and an fmov of d4 into d0. No .LCPI
; loads and no tbl/trn instructions appear in this sequence.
define <8 x i8> @insert4_v8i8(<8 x i8> %a, <16 x i8> %b, <8 x i8> %c, <16 x i8> %d) {
; CHECK-LABEL: insert4_v8i8:
; CHECK:       // %bb.0:
; CHECK-NEXT:    // kill: def $d0 killed $d0 def $q0
; CHECK-NEXT:    dup v4.8b, v0.b[4]
; CHECK-NEXT:    // kill: def $d2 killed $d2 def $q2
; CHECK-NEXT:    mov v4.b[1], v2.b[0]
; CHECK-NEXT:    mov v4.b[2], v1.b[15]
; CHECK-NEXT:    mov v4.b[3], v3.b[11]
; CHECK-NEXT:    mov v4.b[4], v2.b[6]
; CHECK-NEXT:    mov v4.b[5], v0.b[3]
; CHECK-NEXT:    mov v4.b[6], v3.b[8]
; CHECK-NEXT:    mov v4.b[7], v1.b[12]
; CHECK-NEXT:    fmov d0, d4
; CHECK-NEXT:    ret
  ; Source lanes, in result order: a[4], c[0], b[15], d[11], c[6], a[3], d[8], b[12].
  ; With %a..%d in v0..v3, these line up one-to-one with the dup/mov lane
  ; operands in the CHECK lines above.
  %e1 = extractelement <8 x i8> %a, i32 4
  %e2 = extractelement <8 x i8> %c, i32 0
  %e3 = extractelement <16 x i8> %b, i32 15
  %e4 = extractelement <16 x i8> %d, i32 11
  %e5 = extractelement <8 x i8> %c, i32 6
  %e6 = extractelement <8 x i8> %a, i32 3
  %e7 = extractelement <16 x i8> %d, i32 8
  %e8 = extractelement <16 x i8> %b, i32 12
  ; Place %e1..%e8 into result lanes 0..7; the chain starts from undef, and
  ; every lane is overwritten by the time %i8 is formed.
  %i1 = insertelement <8 x i8> undef, i8 %e1, i32 0
  %i2 = insertelement <8 x i8> %i1, i8 %e2, i32 1
  %i3 = insertelement <8 x i8> %i2, i8 %e3, i32 2
  %i4 = insertelement <8 x i8> %i3, i8 %e4, i32 3
  %i5 = insertelement <8 x i8> %i4, i8 %e5, i32 4
  %i6 = insertelement <8 x i8> %i5, i8 %e6, i32 5
  %i7 = insertelement <8 x i8> %i6, i8 %e7, i32 6
  %i8 = insertelement <8 x i8> %i7, i8 %e8, i32 7
  ret <8 x i8> %i8
}

Metadata

Metadata

Assignees

No one assigned

    Type

    No type

    Projects

    No projects

    Milestone

    No milestone

    Relationships

    None yet

    Development

    No branches or pull requests

    Issue actions