[x86] add test for funnel shift in loop with cross-block splat variable; NFC
rotateright committed May 11, 2020
1 parent ba89828 commit b75795c
1 changed file: llvm/test/CodeGen/X86/vector-fshl-128.ll (298 additions, 0 deletions)
@@ -2136,6 +2136,304 @@ define <16 x i8> @splatvar_funnnel_v16i8(<16 x i8> %x, <16 x i8> %y, <16 x i8> %
ret <16 x i8> %res
}

; CGP should allow a cross-block splat shift amount to be seen in SDAG.
; PR37426 - https://bugs.llvm.org/show_bug.cgi?id=37426
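; Editorial note, a hedged sketch of the intent (not part of the original test
; comments): the splat shift amount is built in %entry but only used in %loop.
; SelectionDAG operates on one basic block at a time, so unless CodeGenPrepare
; sinks the insertelement/shufflevector splat into %loop, the in-loop funnel
; shift sees an opaque cross-block value and lowers as a generic variable
; shift (the pmuludq-based expansions in the SSE runs below).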

define void @sink_splatvar(i32* %p, i32 %shift_amt) {
; SSE2-LABEL: sink_splatvar:
; SSE2: # %bb.0: # %entry
; SSE2-NEXT: movd %esi, %xmm0
; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
; SSE2-NEXT: movq $-1024, %rax # imm = 0xFC00
; SSE2-NEXT: pand {{.*}}(%rip), %xmm0
; SSE2-NEXT: pslld $23, %xmm0
; SSE2-NEXT: paddd {{.*}}(%rip), %xmm0
; SSE2-NEXT: cvttps2dq %xmm0, %xmm0
; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3]
; SSE2-NEXT: .p2align 4, 0x90
; SSE2-NEXT: .LBB8_1: # %loop
; SSE2-NEXT: # =>This Inner Loop Header: Depth=1
; SSE2-NEXT: movdqu 1024(%rdi,%rax), %xmm2
; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm2[1,1,3,3]
; SSE2-NEXT: pmuludq %xmm0, %xmm2
; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm2[1,3,2,3]
; SSE2-NEXT: pmuludq %xmm1, %xmm3
; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm3[1,3,2,3]
; SSE2-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm5[0],xmm4[1],xmm5[1]
; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3]
; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,2,2,3]
; SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1]
; SSE2-NEXT: por %xmm4, %xmm2
; SSE2-NEXT: movdqu %xmm2, 1024(%rdi,%rax)
; SSE2-NEXT: addq $16, %rax
; SSE2-NEXT: jne .LBB8_1
; SSE2-NEXT: # %bb.2: # %end
; SSE2-NEXT: retq
;
; SSE41-LABEL: sink_splatvar:
; SSE41: # %bb.0: # %entry
; SSE41-NEXT: movd %esi, %xmm0
; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
; SSE41-NEXT: movq $-1024, %rax # imm = 0xFC00
; SSE41-NEXT: pand {{.*}}(%rip), %xmm0
; SSE41-NEXT: pslld $23, %xmm0
; SSE41-NEXT: paddd {{.*}}(%rip), %xmm0
; SSE41-NEXT: cvttps2dq %xmm0, %xmm0
; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3]
; SSE41-NEXT: .p2align 4, 0x90
; SSE41-NEXT: .LBB8_1: # %loop
; SSE41-NEXT: # =>This Inner Loop Header: Depth=1
; SSE41-NEXT: movdqu 1024(%rdi,%rax), %xmm2
; SSE41-NEXT: pshufd {{.*#+}} xmm3 = xmm2[1,1,3,3]
; SSE41-NEXT: pmuludq %xmm1, %xmm3
; SSE41-NEXT: pmuludq %xmm0, %xmm2
; SSE41-NEXT: pshufd {{.*#+}} xmm4 = xmm2[1,1,3,3]
; SSE41-NEXT: pblendw {{.*#+}} xmm4 = xmm4[0,1],xmm3[2,3],xmm4[4,5],xmm3[6,7]
; SSE41-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,0,2,2]
; SSE41-NEXT: pblendw {{.*#+}} xmm3 = xmm2[0,1],xmm3[2,3],xmm2[4,5],xmm3[6,7]
; SSE41-NEXT: por %xmm4, %xmm3
; SSE41-NEXT: movdqu %xmm3, 1024(%rdi,%rax)
; SSE41-NEXT: addq $16, %rax
; SSE41-NEXT: jne .LBB8_1
; SSE41-NEXT: # %bb.2: # %end
; SSE41-NEXT: retq
;
; AVX1-LABEL: sink_splatvar:
; AVX1: # %bb.0: # %entry
; AVX1-NEXT: vmovd %esi, %xmm0
; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
; AVX1-NEXT: movq $-1024, %rax # imm = 0xFC00
; AVX1-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0
; AVX1-NEXT: vpslld $23, %xmm0, %xmm0
; AVX1-NEXT: vpaddd {{.*}}(%rip), %xmm0, %xmm0
; AVX1-NEXT: vcvttps2dq %xmm0, %xmm0
; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,3,3]
; AVX1-NEXT: .p2align 4, 0x90
; AVX1-NEXT: .LBB8_1: # %loop
; AVX1-NEXT: # =>This Inner Loop Header: Depth=1
; AVX1-NEXT: vmovdqu 1024(%rdi,%rax), %xmm2
; AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm2[1,1,3,3]
; AVX1-NEXT: vpmuludq %xmm1, %xmm3, %xmm3
; AVX1-NEXT: vpmuludq %xmm0, %xmm2, %xmm2
; AVX1-NEXT: vpshufd {{.*#+}} xmm4 = xmm2[1,1,3,3]
; AVX1-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0,1],xmm3[2,3],xmm4[4,5],xmm3[6,7]
; AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[0,0,2,2]
; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1],xmm3[2,3],xmm2[4,5],xmm3[6,7]
; AVX1-NEXT: vpor %xmm4, %xmm2, %xmm2
; AVX1-NEXT: vmovdqu %xmm2, 1024(%rdi,%rax)
; AVX1-NEXT: addq $16, %rax
; AVX1-NEXT: jne .LBB8_1
; AVX1-NEXT: # %bb.2: # %end
; AVX1-NEXT: retq
;
; AVX2-LABEL: sink_splatvar:
; AVX2: # %bb.0: # %entry
; AVX2-NEXT: vmovd %esi, %xmm0
; AVX2-NEXT: vpbroadcastd %xmm0, %xmm0
; AVX2-NEXT: movq $-1024, %rax # imm = 0xFC00
; AVX2-NEXT: vpbroadcastd {{.*#+}} xmm1 = [31,31,31,31]
; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0
; AVX2-NEXT: vpbroadcastd {{.*#+}} xmm1 = [32,32,32,32]
; AVX2-NEXT: vpsubd %xmm0, %xmm1, %xmm1
; AVX2-NEXT: .p2align 4, 0x90
; AVX2-NEXT: .LBB8_1: # %loop
; AVX2-NEXT: # =>This Inner Loop Header: Depth=1
; AVX2-NEXT: vmovdqu 1024(%rdi,%rax), %xmm2
; AVX2-NEXT: vpsllvd %xmm0, %xmm2, %xmm3
; AVX2-NEXT: vpsrlvd %xmm1, %xmm2, %xmm2
; AVX2-NEXT: vpor %xmm2, %xmm3, %xmm2
; AVX2-NEXT: vmovdqu %xmm2, 1024(%rdi,%rax)
; AVX2-NEXT: addq $16, %rax
; AVX2-NEXT: jne .LBB8_1
; AVX2-NEXT: # %bb.2: # %end
; AVX2-NEXT: retq
;
; AVX512F-LABEL: sink_splatvar:
; AVX512F: # %bb.0: # %entry
; AVX512F-NEXT: vmovd %esi, %xmm0
; AVX512F-NEXT: vpbroadcastd %xmm0, %xmm0
; AVX512F-NEXT: movq $-1024, %rax # imm = 0xFC00
; AVX512F-NEXT: .p2align 4, 0x90
; AVX512F-NEXT: .LBB8_1: # %loop
; AVX512F-NEXT: # =>This Inner Loop Header: Depth=1
; AVX512F-NEXT: vmovdqu 1024(%rdi,%rax), %xmm1
; AVX512F-NEXT: vprolvd %zmm0, %zmm1, %zmm1
; AVX512F-NEXT: vmovdqu %xmm1, 1024(%rdi,%rax)
; AVX512F-NEXT: addq $16, %rax
; AVX512F-NEXT: jne .LBB8_1
; AVX512F-NEXT: # %bb.2: # %end
; AVX512F-NEXT: vzeroupper
; AVX512F-NEXT: retq
;
; AVX512VL-LABEL: sink_splatvar:
; AVX512VL: # %bb.0: # %entry
; AVX512VL-NEXT: vpbroadcastd %esi, %xmm0
; AVX512VL-NEXT: movq $-1024, %rax # imm = 0xFC00
; AVX512VL-NEXT: .p2align 4, 0x90
; AVX512VL-NEXT: .LBB8_1: # %loop
; AVX512VL-NEXT: # =>This Inner Loop Header: Depth=1
; AVX512VL-NEXT: vmovdqu 1024(%rdi,%rax), %xmm1
; AVX512VL-NEXT: vprolvd %xmm0, %xmm1, %xmm1
; AVX512VL-NEXT: vmovdqu %xmm1, 1024(%rdi,%rax)
; AVX512VL-NEXT: addq $16, %rax
; AVX512VL-NEXT: jne .LBB8_1
; AVX512VL-NEXT: # %bb.2: # %end
; AVX512VL-NEXT: retq
;
; AVX512BW-LABEL: sink_splatvar:
; AVX512BW: # %bb.0: # %entry
; AVX512BW-NEXT: vmovd %esi, %xmm0
; AVX512BW-NEXT: vpbroadcastd %xmm0, %xmm0
; AVX512BW-NEXT: movq $-1024, %rax # imm = 0xFC00
; AVX512BW-NEXT: .p2align 4, 0x90
; AVX512BW-NEXT: .LBB8_1: # %loop
; AVX512BW-NEXT: # =>This Inner Loop Header: Depth=1
; AVX512BW-NEXT: vmovdqu 1024(%rdi,%rax), %xmm1
; AVX512BW-NEXT: vprolvd %zmm0, %zmm1, %zmm1
; AVX512BW-NEXT: vmovdqu %xmm1, 1024(%rdi,%rax)
; AVX512BW-NEXT: addq $16, %rax
; AVX512BW-NEXT: jne .LBB8_1
; AVX512BW-NEXT: # %bb.2: # %end
; AVX512BW-NEXT: vzeroupper
; AVX512BW-NEXT: retq
;
; AVX512VBMI2-LABEL: sink_splatvar:
; AVX512VBMI2: # %bb.0: # %entry
; AVX512VBMI2-NEXT: vmovd %esi, %xmm0
; AVX512VBMI2-NEXT: vpbroadcastd %xmm0, %xmm0
; AVX512VBMI2-NEXT: movq $-1024, %rax # imm = 0xFC00
; AVX512VBMI2-NEXT: .p2align 4, 0x90
; AVX512VBMI2-NEXT: .LBB8_1: # %loop
; AVX512VBMI2-NEXT: # =>This Inner Loop Header: Depth=1
; AVX512VBMI2-NEXT: vmovdqu 1024(%rdi,%rax), %xmm1
; AVX512VBMI2-NEXT: vprolvd %zmm0, %zmm1, %zmm1
; AVX512VBMI2-NEXT: vmovdqu %xmm1, 1024(%rdi,%rax)
; AVX512VBMI2-NEXT: addq $16, %rax
; AVX512VBMI2-NEXT: jne .LBB8_1
; AVX512VBMI2-NEXT: # %bb.2: # %end
; AVX512VBMI2-NEXT: vzeroupper
; AVX512VBMI2-NEXT: retq
;
; AVX512VLBW-LABEL: sink_splatvar:
; AVX512VLBW: # %bb.0: # %entry
; AVX512VLBW-NEXT: vpbroadcastd %esi, %xmm0
; AVX512VLBW-NEXT: movq $-1024, %rax # imm = 0xFC00
; AVX512VLBW-NEXT: .p2align 4, 0x90
; AVX512VLBW-NEXT: .LBB8_1: # %loop
; AVX512VLBW-NEXT: # =>This Inner Loop Header: Depth=1
; AVX512VLBW-NEXT: vmovdqu 1024(%rdi,%rax), %xmm1
; AVX512VLBW-NEXT: vprolvd %xmm0, %xmm1, %xmm1
; AVX512VLBW-NEXT: vmovdqu %xmm1, 1024(%rdi,%rax)
; AVX512VLBW-NEXT: addq $16, %rax
; AVX512VLBW-NEXT: jne .LBB8_1
; AVX512VLBW-NEXT: # %bb.2: # %end
; AVX512VLBW-NEXT: retq
;
; AVX512VLVBMI2-LABEL: sink_splatvar:
; AVX512VLVBMI2: # %bb.0: # %entry
; AVX512VLVBMI2-NEXT: vpbroadcastd %esi, %xmm0
; AVX512VLVBMI2-NEXT: movq $-1024, %rax # imm = 0xFC00
; AVX512VLVBMI2-NEXT: .p2align 4, 0x90
; AVX512VLVBMI2-NEXT: .LBB8_1: # %loop
; AVX512VLVBMI2-NEXT: # =>This Inner Loop Header: Depth=1
; AVX512VLVBMI2-NEXT: vmovdqu 1024(%rdi,%rax), %xmm1
; AVX512VLVBMI2-NEXT: vprolvd %xmm0, %xmm1, %xmm1
; AVX512VLVBMI2-NEXT: vmovdqu %xmm1, 1024(%rdi,%rax)
; AVX512VLVBMI2-NEXT: addq $16, %rax
; AVX512VLVBMI2-NEXT: jne .LBB8_1
; AVX512VLVBMI2-NEXT: # %bb.2: # %end
; AVX512VLVBMI2-NEXT: retq
;
; XOPAVX1-LABEL: sink_splatvar:
; XOPAVX1: # %bb.0: # %entry
; XOPAVX1-NEXT: vmovd %esi, %xmm0
; XOPAVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
; XOPAVX1-NEXT: movq $-1024, %rax # imm = 0xFC00
; XOPAVX1-NEXT: .p2align 4, 0x90
; XOPAVX1-NEXT: .LBB8_1: # %loop
; XOPAVX1-NEXT: # =>This Inner Loop Header: Depth=1
; XOPAVX1-NEXT: vprotd %xmm0, 1024(%rdi,%rax), %xmm1
; XOPAVX1-NEXT: vmovdqu %xmm1, 1024(%rdi,%rax)
; XOPAVX1-NEXT: addq $16, %rax
; XOPAVX1-NEXT: jne .LBB8_1
; XOPAVX1-NEXT: # %bb.2: # %end
; XOPAVX1-NEXT: retq
;
; XOPAVX2-LABEL: sink_splatvar:
; XOPAVX2: # %bb.0: # %entry
; XOPAVX2-NEXT: vmovd %esi, %xmm0
; XOPAVX2-NEXT: vpbroadcastd %xmm0, %xmm0
; XOPAVX2-NEXT: movq $-1024, %rax # imm = 0xFC00
; XOPAVX2-NEXT: .p2align 4, 0x90
; XOPAVX2-NEXT: .LBB8_1: # %loop
; XOPAVX2-NEXT: # =>This Inner Loop Header: Depth=1
; XOPAVX2-NEXT: vprotd %xmm0, 1024(%rdi,%rax), %xmm1
; XOPAVX2-NEXT: vmovdqu %xmm1, 1024(%rdi,%rax)
; XOPAVX2-NEXT: addq $16, %rax
; XOPAVX2-NEXT: jne .LBB8_1
; XOPAVX2-NEXT: # %bb.2: # %end
; XOPAVX2-NEXT: retq
;
; X32-SSE-LABEL: sink_splatvar:
; X32-SSE: # %bb.0: # %entry
; X32-SSE-NEXT: pushl %esi
; X32-SSE-NEXT: .cfi_def_cfa_offset 8
; X32-SSE-NEXT: .cfi_offset %esi, -8
; X32-SSE-NEXT: movl {{[0-9]+}}(%esp), %eax
; X32-SSE-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
; X32-SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
; X32-SSE-NEXT: xorl %ecx, %ecx
; X32-SSE-NEXT: pand {{\.LCPI.*}}, %xmm0
; X32-SSE-NEXT: pslld $23, %xmm0
; X32-SSE-NEXT: paddd {{\.LCPI.*}}, %xmm0
; X32-SSE-NEXT: cvttps2dq %xmm0, %xmm0
; X32-SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3]
; X32-SSE-NEXT: xorl %edx, %edx
; X32-SSE-NEXT: .p2align 4, 0x90
; X32-SSE-NEXT: .LBB8_1: # %loop
; X32-SSE-NEXT: # =>This Inner Loop Header: Depth=1
; X32-SSE-NEXT: movdqu (%eax,%ecx,4), %xmm2
; X32-SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm2[1,1,3,3]
; X32-SSE-NEXT: pmuludq %xmm0, %xmm2
; X32-SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm2[1,3,2,3]
; X32-SSE-NEXT: pmuludq %xmm1, %xmm3
; X32-SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm3[1,3,2,3]
; X32-SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm5[0],xmm4[1],xmm5[1]
; X32-SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3]
; X32-SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,2,2,3]
; X32-SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1]
; X32-SSE-NEXT: por %xmm4, %xmm2
; X32-SSE-NEXT: movdqu %xmm2, (%eax,%ecx,4)
; X32-SSE-NEXT: addl $4, %ecx
; X32-SSE-NEXT: adcl $0, %edx
; X32-SSE-NEXT: movl %ecx, %esi
; X32-SSE-NEXT: xorl $256, %esi # imm = 0x100
; X32-SSE-NEXT: orl %edx, %esi
; X32-SSE-NEXT: jne .LBB8_1
; X32-SSE-NEXT: # %bb.2: # %end
; X32-SSE-NEXT: popl %esi
; X32-SSE-NEXT: .cfi_def_cfa_offset 4
; X32-SSE-NEXT: retl
entry:
%ins = insertelement <4 x i32> undef, i32 %shift_amt, i32 0
%splat = shufflevector <4 x i32> %ins, <4 x i32> undef, <4 x i32> zeroinitializer
br label %loop

loop:
%index = phi i64 [ 0, %entry ], [ %inc, %loop ]
%addr = getelementptr inbounds i32, i32* %p, i64 %index
%addr_vec = bitcast i32* %addr to <4 x i32>*
%x = load <4 x i32>, <4 x i32>* %addr_vec, align 4
%fsh = call <4 x i32> @llvm.fshl.v4i32(<4 x i32> %x, <4 x i32> %x, <4 x i32> %splat)
store <4 x i32> %fsh, <4 x i32>* %addr_vec, align 4
%inc = add i64 %index, 4
%iv = icmp eq i64 %inc, 256
br i1 %iv, label %end, label %loop

end:
ret void
}
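; Editorial note, a hedged reading (not part of the original test comments):
; fshl(x, x, s) with matching value operands is a rotate-left by s modulo the
; element width, which is why targets with native vector rotates (AVX512
; vprolvd, XOP vprotd) collapse the loop body to a single instruction. The
; loop steps %index by 4 elements until it reaches 256, i.e. 256 x i32 = 1024
; bytes, matching the negative counter (movq $-1024, %rax) and the
; 1024(%rdi,%rax) addressing in the 64-bit runs above.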

;
; Constant Shifts
;
