-
Notifications
You must be signed in to change notification settings - Fork 14.5k
Open
Labels
Description
The current codegen:
; insert4_v8i8: build an <8 x i8> one byte at a time from four source vectors.
;
;   result = { a[4], c[0], b[15], d[11], c[6], a[3], d[8], b[12] }
;
; %a and %c are 64-bit vectors, %b and %d are 128-bit vectors.
;
; The assertions below record the output llc currently produces for AArch64:
; %a and %c are packed into one register and looked up with a single-register
; TBL, %b and %d are looked up with a two-register TBL, both driven by masks
; loaded from the constant pool, and the two partial results are then
; interleaved with TRN1/TRN2.
;
; The insertelement chain starts from poison rather than the deprecated undef.
; Every lane is overwritten before the return, so the placeholder never
; reaches the result.
; NOTE(review): re-run update_llc_test_checks.py to confirm the recorded
; output is unchanged with the poison placeholder.
define <8 x i8> @insert4_v8i8(<8 x i8> %a, <16 x i8> %b, <8 x i8> %c, <16 x i8> %d) {
; CHECK-LABEL: insert4_v8i8:
; CHECK: // %bb.0:
; CHECK-NEXT: adrp x8, .LCPI14_0
; CHECK-NEXT: adrp x9, .LCPI14_1
; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0
; CHECK-NEXT: mov v4.16b, v3.16b
; CHECK-NEXT: // kill: def $d2 killed $d2 def $q2
; CHECK-NEXT: mov v0.d[1], v2.d[0]
; CHECK-NEXT: mov v3.16b, v1.16b
; CHECK-NEXT: ldr d1, [x8, :lo12:.LCPI14_0]
; CHECK-NEXT: ldr q2, [x9, :lo12:.LCPI14_1]
; CHECK-NEXT: tbl v0.8b, { v0.16b }, v1.8b
; CHECK-NEXT: tbl v1.16b, { v3.16b, v4.16b }, v2.16b
; CHECK-NEXT: trn1 v0.4h, v1.4h, v0.4h
; CHECK-NEXT: trn2 v0.4h, v0.4h, v1.4h
; CHECK-NEXT: ret
  ; Pull out the eight source bytes, in result-lane order.
  %e1 = extractelement <8 x i8> %a, i32 4
  %e2 = extractelement <8 x i8> %c, i32 0
  %e3 = extractelement <16 x i8> %b, i32 15
  %e4 = extractelement <16 x i8> %d, i32 11
  %e5 = extractelement <8 x i8> %c, i32 6
  %e6 = extractelement <8 x i8> %a, i32 3
  %e7 = extractelement <16 x i8> %d, i32 8
  %e8 = extractelement <16 x i8> %b, i32 12
  ; Assemble the result lane by lane; lanes 0..7 are all written.
  %i1 = insertelement <8 x i8> poison, i8 %e1, i32 0
  %i2 = insertelement <8 x i8> %i1, i8 %e2, i32 1
  %i3 = insertelement <8 x i8> %i2, i8 %e3, i32 2
  %i4 = insertelement <8 x i8> %i3, i8 %e4, i32 3
  %i5 = insertelement <8 x i8> %i4, i8 %e5, i32 4
  %i6 = insertelement <8 x i8> %i5, i8 %e6, i32 5
  %i7 = insertelement <8 x i8> %i6, i8 %e7, i32 6
  %i8 = insertelement <8 x i8> %i7, i8 %e8, i32 7
  ret <8 x i8> %i8
}
I think we should expect this instead:
; insert4_v8i8: build an <8 x i8> one byte at a time from four source vectors.
;
;   result = { a[4], c[0], b[15], d[11], c[6], a[3], d[8], b[12] }
;
; %a and %c are 64-bit vectors, %b and %d are 128-bit vectors.
;
; The assertions below are the proposed AArch64 output, not what llc emits
; today: one DUP places a[4] in lane 0 (the other lanes are don't-care at
; that point), seven lane-to-lane MOVs fill lanes 1..7 directly from the
; argument registers (v0 = a, v1 = b, v2 = c, v3 = d), and FMOV copies the
; assembled vector into the return register. No constant-pool masks are used.
;
; The insertelement chain starts from poison rather than the deprecated undef.
; Every lane is overwritten before the return, so the placeholder never
; reaches the result.
define <8 x i8> @insert4_v8i8(<8 x i8> %a, <16 x i8> %b, <8 x i8> %c, <16 x i8> %d) {
; CHECK-LABEL: insert4_v8i8:
; CHECK: // %bb.0:
; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0
; CHECK-NEXT: dup v4.8b, v0.b[4]
; CHECK-NEXT: // kill: def $d2 killed $d2 def $q2
; CHECK-NEXT: mov v4.b[1], v2.b[0]
; CHECK-NEXT: mov v4.b[2], v1.b[15]
; CHECK-NEXT: mov v4.b[3], v3.b[11]
; CHECK-NEXT: mov v4.b[4], v2.b[6]
; CHECK-NEXT: mov v4.b[5], v0.b[3]
; CHECK-NEXT: mov v4.b[6], v3.b[8]
; CHECK-NEXT: mov v4.b[7], v1.b[12]
; CHECK-NEXT: fmov d0, d4
; CHECK-NEXT: ret
  ; Pull out the eight source bytes, in result-lane order.
  %e1 = extractelement <8 x i8> %a, i32 4
  %e2 = extractelement <8 x i8> %c, i32 0
  %e3 = extractelement <16 x i8> %b, i32 15
  %e4 = extractelement <16 x i8> %d, i32 11
  %e5 = extractelement <8 x i8> %c, i32 6
  %e6 = extractelement <8 x i8> %a, i32 3
  %e7 = extractelement <16 x i8> %d, i32 8
  %e8 = extractelement <16 x i8> %b, i32 12
  ; Assemble the result lane by lane; lanes 0..7 are all written.
  %i1 = insertelement <8 x i8> poison, i8 %e1, i32 0
  %i2 = insertelement <8 x i8> %i1, i8 %e2, i32 1
  %i3 = insertelement <8 x i8> %i2, i8 %e3, i32 2
  %i4 = insertelement <8 x i8> %i3, i8 %e4, i32 3
  %i5 = insertelement <8 x i8> %i4, i8 %e5, i32 4
  %i6 = insertelement <8 x i8> %i5, i8 %e6, i32 5
  %i7 = insertelement <8 x i8> %i6, i8 %e7, i32 6
  %i8 = insertelement <8 x i8> %i7, i8 %e8, i32 7
  ret <8 x i8> %i8
}