From 10c95a59264137ef0c0a5303f2985f7773ea85e4 Mon Sep 17 00:00:00 2001 From: Simon Pilgrim Date: Mon, 17 Nov 2025 12:03:45 +0000 Subject: [PATCH] [X86] bittest-big-integer.ll - add BLSR style pattern test Test using CTTZ to determine the lowest set bit, clear it and return the index Shows failure to use RMW pattern on the load-btr-store due to additional (but non-interference) uses of the load. --- llvm/test/CodeGen/X86/bittest-big-integer.ll | 615 +++++++++++++++++++ 1 file changed, 615 insertions(+) diff --git a/llvm/test/CodeGen/X86/bittest-big-integer.ll b/llvm/test/CodeGen/X86/bittest-big-integer.ll index 9b7569ff8b29f..b85a20b9d6b6e 100644 --- a/llvm/test/CodeGen/X86/bittest-big-integer.ll +++ b/llvm/test/CodeGen/X86/bittest-big-integer.ll @@ -1488,3 +1488,618 @@ define i1 @sequence_i128(ptr %word, i32 %pos0, i32 %pos1, i32 %pos2) nounwind { store i128 %res2, ptr %word ret i1 %cmp1 } + +define i32 @blsr_u512(ptr %word) nounwind { +; X86-LABEL: blsr_u512: +; X86: # %bb.0: +; X86-NEXT: pushl %ebp +; X86-NEXT: movl %esp, %ebp +; X86-NEXT: pushl %ebx +; X86-NEXT: pushl %edi +; X86-NEXT: pushl %esi +; X86-NEXT: andl $-16, %esp +; X86-NEXT: subl $240, %esp +; X86-NEXT: movl 8(%ebp), %ebx +; X86-NEXT: movl 12(%ebx), %esi +; X86-NEXT: movl 28(%ebx), %eax +; X86-NEXT: movl 60(%ebx), %ecx +; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: orl %ecx, %eax +; X86-NEXT: movl 44(%ebx), %edx +; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl %esi, %ecx +; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: orl %edx, %ecx +; X86-NEXT: orl %eax, %ecx +; X86-NEXT: movl 20(%ebx), %edx +; X86-NEXT: movl 52(%ebx), %eax +; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: orl %eax, %edx +; X86-NEXT: movl 4(%ebx), %edi +; X86-NEXT: movl 36(%ebx), %esi +; X86-NEXT: movl %edi, %eax +; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: orl %esi, %eax +; X86-NEXT: orl %edx, %eax +; X86-NEXT: orl %ecx, %eax +; X86-NEXT: movl 24(%ebx), %edx +; X86-NEXT: movl 56(%ebx), %ecx +; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: orl %ecx, %edx +; X86-NEXT: movl 8(%ebx), %ecx +; X86-NEXT: movl 40(%ebx), %esi +; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: orl %esi, %ecx +; X86-NEXT: orl %edx, %ecx +; X86-NEXT: movl 16(%ebx), %edx +; X86-NEXT: movl 48(%ebx), %esi +; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: orl %esi, %edx +; X86-NEXT: movl (%ebx), %esi +; X86-NEXT: movl 32(%ebx), %ebx +; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: orl %ebx, %esi +; X86-NEXT: orl %edx, %esi +; X86-NEXT: orl %ecx, %esi +; X86-NEXT: orl %eax, %esi +; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: je .LBB26_1 +; X86-NEXT: # %bb.2: # %cond.false +; X86-NEXT: testl %ebx, %ebx +; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: jne .LBB26_3 +; X86-NEXT: # %bb.4: # %cond.false +; X86-NEXT: rep bsfl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload +; X86-NEXT: addl $32, %eax +; X86-NEXT: jmp .LBB26_5 +; X86-NEXT: .LBB26_1: +; X86-NEXT: 
movl $512, %ecx # imm = 0x200 +; X86-NEXT: jmp .LBB26_41 +; X86-NEXT: .LBB26_3: +; X86-NEXT: rep bsfl %ebx, %eax +; X86-NEXT: .LBB26_5: # %cond.false +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86-NEXT: testl %ecx, %ecx +; X86-NEXT: jne .LBB26_6 +; X86-NEXT: # %bb.7: # %cond.false +; X86-NEXT: rep bsfl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload +; X86-NEXT: addl $32, %ecx +; X86-NEXT: jmp .LBB26_8 +; X86-NEXT: .LBB26_6: +; X86-NEXT: rep bsfl %ecx, %ecx +; X86-NEXT: .LBB26_8: # %cond.false +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload +; X86-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload +; X86-NEXT: jne .LBB26_10 +; X86-NEXT: # %bb.9: # %cond.false +; X86-NEXT: addl $64, %ecx +; X86-NEXT: movl %ecx, %eax +; X86-NEXT: .LBB26_10: # %cond.false +; X86-NEXT: testl %esi, %esi +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload +; X86-NEXT: jne .LBB26_11 +; X86-NEXT: # %bb.12: # %cond.false +; X86-NEXT: rep bsfl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload +; X86-NEXT: addl $32, %ecx +; X86-NEXT: testl %edx, %edx +; X86-NEXT: je .LBB26_15 +; X86-NEXT: .LBB26_14: +; X86-NEXT: rep bsfl %edx, %edx +; X86-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload +; X86-NEXT: je .LBB26_17 +; X86-NEXT: jmp .LBB26_18 +; X86-NEXT: .LBB26_11: +; X86-NEXT: rep bsfl %esi, %ecx +; X86-NEXT: testl %edx, %edx +; X86-NEXT: jne .LBB26_14 +; X86-NEXT: .LBB26_15: # %cond.false +; X86-NEXT: rep bsfl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload +; X86-NEXT: addl $32, %edx +; X86-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload +; X86-NEXT: jne .LBB26_18 +; X86-NEXT: .LBB26_17: # %cond.false +; X86-NEXT: addl $64, %edx +; X86-NEXT: movl %edx, %ecx +; X86-NEXT: .LBB26_18: # %cond.false +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload +; X86-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload +; X86-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload +; X86-NEXT: orl %edx, %esi +; X86-NEXT: jne .LBB26_20 +; X86-NEXT: # %bb.19: # %cond.false +; X86-NEXT: subl $-128, %ecx +; X86-NEXT: movl %ecx, %eax +; X86-NEXT: .LBB26_20: # %cond.false +; X86-NEXT: addl $256, %eax # imm = 0x100 +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload +; X86-NEXT: testl %edx, %edx +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload +; X86-NEXT: jne .LBB26_21 +; X86-NEXT: # %bb.22: # %cond.false +; X86-NEXT: rep bsfl %edi, %ebx +; X86-NEXT: addl $32, %ebx +; X86-NEXT: jmp .LBB26_23 +; X86-NEXT: .LBB26_21: +; X86-NEXT: rep bsfl %edx, %ebx +; X86-NEXT: .LBB26_23: # %cond.false +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86-NEXT: testl %ecx, %ecx +; X86-NEXT: jne .LBB26_24 +; X86-NEXT: # %bb.25: # %cond.false +; X86-NEXT: rep bsfl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload +; X86-NEXT: addl $32, %ecx +; X86-NEXT: orl %edi, %edx +; X86-NEXT: je .LBB26_27 +; X86-NEXT: jmp .LBB26_28 +; X86-NEXT: .LBB26_24: +; X86-NEXT: rep bsfl %ecx, %ecx +; X86-NEXT: orl %edi, %edx +; X86-NEXT: jne .LBB26_28 +; X86-NEXT: .LBB26_27: # %cond.false +; X86-NEXT: addl $64, %ecx +; X86-NEXT: movl %ecx, %ebx +; X86-NEXT: .LBB26_28: # %cond.false +; X86-NEXT: testl %esi, %esi +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload +; X86-NEXT: jne .LBB26_29 +; X86-NEXT: # %bb.30: # %cond.false +; X86-NEXT: rep 
bsfl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload +; X86-NEXT: addl $32, %ecx +; X86-NEXT: testl %edx, %edx +; X86-NEXT: je .LBB26_33 +; X86-NEXT: .LBB26_32: +; X86-NEXT: rep bsfl %edx, %edx +; X86-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload +; X86-NEXT: je .LBB26_35 +; X86-NEXT: jmp .LBB26_36 +; X86-NEXT: .LBB26_29: +; X86-NEXT: rep bsfl %esi, %ecx +; X86-NEXT: testl %edx, %edx +; X86-NEXT: jne .LBB26_32 +; X86-NEXT: .LBB26_33: # %cond.false +; X86-NEXT: rep bsfl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload +; X86-NEXT: addl $32, %edx +; X86-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload +; X86-NEXT: jne .LBB26_36 +; X86-NEXT: .LBB26_35: # %cond.false +; X86-NEXT: addl $64, %edx +; X86-NEXT: movl %edx, %ecx +; X86-NEXT: .LBB26_36: # %cond.false +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload +; X86-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload +; X86-NEXT: movl %edi, %esi +; X86-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload +; X86-NEXT: orl %edx, %esi +; X86-NEXT: jne .LBB26_38 +; X86-NEXT: # %bb.37: # %cond.false +; X86-NEXT: subl $-128, %ecx +; X86-NEXT: movl %ecx, %ebx +; X86-NEXT: .LBB26_38: # %cond.false +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload +; X86-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload +; X86-NEXT: orl %ecx, %edx +; X86-NEXT: movl %edi, %ecx +; X86-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload +; X86-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload +; X86-NEXT: orl %ecx, %esi +; X86-NEXT: orl %edx, %esi +; X86-NEXT: movl %ebx, %ecx +; X86-NEXT: jne .LBB26_40 +; X86-NEXT: # %bb.39: # %cond.false +; X86-NEXT: movl %eax, %ecx +; X86-NEXT: .LBB26_40: # %cond.false +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload +; X86-NEXT: .LBB26_41: # %cond.end +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-NEXT: movl %ecx, %esi +; X86-NEXT: shrl $3, %esi +; X86-NEXT: andl $60, %esi +; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: leal {{[0-9]+}}(%esp), %edx +; X86-NEXT: subl %esi, %edx +; X86-NEXT: movl $0, {{[0-9]+}}(%esp) +; X86-NEXT: movl $0, {{[0-9]+}}(%esp) +; X86-NEXT: movl $0, {{[0-9]+}}(%esp) +; X86-NEXT: movl $0, {{[0-9]+}}(%esp) +; X86-NEXT: movl $0, {{[0-9]+}}(%esp) +; X86-NEXT: movl $0, {{[0-9]+}}(%esp) +; X86-NEXT: movl $0, {{[0-9]+}}(%esp) +; X86-NEXT: movl $0, {{[0-9]+}}(%esp) +; X86-NEXT: movl $0, {{[0-9]+}}(%esp) +; X86-NEXT: movl $0, {{[0-9]+}}(%esp) +; X86-NEXT: movl $0, {{[0-9]+}}(%esp) +; X86-NEXT: movl $0, {{[0-9]+}}(%esp) +; X86-NEXT: movl $0, {{[0-9]+}}(%esp) +; X86-NEXT: movl $0, {{[0-9]+}}(%esp) +; X86-NEXT: movl $0, {{[0-9]+}}(%esp) +; X86-NEXT: movl $1, {{[0-9]+}}(%esp) +; X86-NEXT: movl $0, {{[0-9]+}}(%esp) +; X86-NEXT: movl $0, {{[0-9]+}}(%esp) +; X86-NEXT: movl $0, {{[0-9]+}}(%esp) +; X86-NEXT: movl $0, {{[0-9]+}}(%esp) +; X86-NEXT: movl $0, {{[0-9]+}}(%esp) +; X86-NEXT: movl $0, {{[0-9]+}}(%esp) +; X86-NEXT: movl $0, {{[0-9]+}}(%esp) +; X86-NEXT: movl $0, {{[0-9]+}}(%esp) +; X86-NEXT: movl $0, {{[0-9]+}}(%esp) +; X86-NEXT: movl $0, {{[0-9]+}}(%esp) +; X86-NEXT: movl $0, {{[0-9]+}}(%esp) +; X86-NEXT: movl $0, {{[0-9]+}}(%esp) +; X86-NEXT: movl $0, {{[0-9]+}}(%esp) +; X86-NEXT: movl $0, {{[0-9]+}}(%esp) +; X86-NEXT: 
movl $0, {{[0-9]+}}(%esp) +; X86-NEXT: movl $0, {{[0-9]+}}(%esp) +; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: andl $31, %ecx +; X86-NEXT: movl 56(%edx), %edi +; X86-NEXT: movl 60(%edx), %esi +; X86-NEXT: shldl %cl, %edi, %esi +; X86-NEXT: notl %esi +; X86-NEXT: andl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload +; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl 52(%edx), %esi +; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: shldl %cl, %esi, %edi +; X86-NEXT: notl %edi +; X86-NEXT: andl %eax, %edi +; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl 40(%edx), %eax +; X86-NEXT: movl 44(%edx), %esi +; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: shldl %cl, %eax, %esi +; X86-NEXT: notl %esi +; X86-NEXT: andl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload +; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl 36(%edx), %esi +; X86-NEXT: shldl %cl, %esi, %eax +; X86-NEXT: notl %eax +; X86-NEXT: andl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload +; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl 32(%edx), %eax +; X86-NEXT: shldl %cl, %eax, %esi +; X86-NEXT: notl %esi +; X86-NEXT: andl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload +; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl 28(%edx), %esi +; X86-NEXT: shldl %cl, %esi, %eax +; X86-NEXT: notl %eax +; X86-NEXT: andl %ebx, %eax +; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl 24(%edx), %eax +; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: shldl %cl, %eax, %esi +; X86-NEXT: notl %esi +; X86-NEXT: andl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload +; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl 4(%edx), %esi +; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl 8(%edx), %eax +; X86-NEXT: movl %edx, %ebx +; X86-NEXT: movl %eax, %edx +; X86-NEXT: shldl %cl, %esi, %edx +; X86-NEXT: notl %edx +; X86-NEXT: andl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload +; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl 12(%ebx), %edx +; X86-NEXT: movl %edx, %esi +; X86-NEXT: shldl %cl, %eax, %esi +; X86-NEXT: notl %esi +; X86-NEXT: andl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload +; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl 16(%ebx), %eax +; X86-NEXT: movl %eax, %esi +; X86-NEXT: shldl %cl, %edx, %esi +; X86-NEXT: notl %esi +; X86-NEXT: andl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload +; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl 20(%ebx), %edx +; X86-NEXT: movl %edx, %esi +; X86-NEXT: shldl %cl, %eax, %esi +; X86-NEXT: notl %esi +; X86-NEXT: andl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-NEXT: shldl %cl, %edx, %eax +; X86-NEXT: notl %eax +; X86-NEXT: andl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload +; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-NEXT: negl %eax +; X86-NEXT: movl 208(%esp,%eax), %edx +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-NEXT: shldl %cl, %edx, %eax +; X86-NEXT: notl %eax +; X86-NEXT: andl {{[-0-9]+}}(%e{{[sb]}}p), 
%eax # 4-byte Folded Reload +; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-NEXT: shldl %cl, %eax, %edx +; X86-NEXT: notl %edx +; X86-NEXT: andl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload +; X86-NEXT: movl (%ebx), %eax +; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: shll %cl, %eax +; X86-NEXT: notl %eax +; X86-NEXT: andl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload +; X86-NEXT: # kill: def $cl killed $cl killed $ecx +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload +; X86-NEXT: shldl %cl, %edi, %ebx +; X86-NEXT: notl %ebx +; X86-NEXT: andl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload +; X86-NEXT: movl %ebx, %edi +; X86-NEXT: movl 8(%ebp), %ecx +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload +; X86-NEXT: movl %ebx, 24(%ecx) +; X86-NEXT: movl %esi, 20(%ecx) +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload +; X86-NEXT: movl %esi, 16(%ecx) +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload +; X86-NEXT: movl %esi, 12(%ecx) +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload +; X86-NEXT: movl %esi, 8(%ecx) +; X86-NEXT: movl %edi, 4(%ecx) +; X86-NEXT: movl %eax, (%ecx) +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-NEXT: movl %eax, 28(%ecx) +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-NEXT: movl %eax, 32(%ecx) +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-NEXT: movl %eax, 36(%ecx) +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-NEXT: movl %eax, 40(%ecx) +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-NEXT: movl %eax, 44(%ecx) +; X86-NEXT: movl %edx, 48(%ecx) +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-NEXT: movl %eax, 52(%ecx) +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-NEXT: movl %eax, 56(%ecx) +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-NEXT: movl %eax, 60(%ecx) +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-NEXT: leal -12(%ebp), %esp +; X86-NEXT: popl %esi +; X86-NEXT: popl %edi +; X86-NEXT: popl %ebx +; X86-NEXT: popl %ebp +; X86-NEXT: retl +; +; SSE-LABEL: blsr_u512: +; SSE: # %bb.0: +; SSE-NEXT: pushq %r15 +; SSE-NEXT: pushq %r14 +; SSE-NEXT: pushq %r12 +; SSE-NEXT: pushq %rbx +; SSE-NEXT: pushq %rax +; SSE-NEXT: movq 56(%rdi), %rcx +; SSE-NEXT: movq 48(%rdi), %rdx +; SSE-NEXT: movq 40(%rdi), %rsi +; SSE-NEXT: movq 32(%rdi), %r11 +; SSE-NEXT: movq 24(%rdi), %r8 +; SSE-NEXT: movq 16(%rdi), %r9 +; SSE-NEXT: movq (%rdi), %rax +; SSE-NEXT: movq 8(%rdi), %r10 +; SSE-NEXT: rep bsfq %rax, %rbx +; SSE-NEXT: rep bsfq %r10, %r14 +; SSE-NEXT: addq $64, %r14 +; SSE-NEXT: testq %rax, %rax +; SSE-NEXT: cmovneq %rbx, %r14 +; SSE-NEXT: rep bsfq %r9, %r15 +; SSE-NEXT: rep bsfq %r8, %rbx +; SSE-NEXT: addq $64, %rbx +; SSE-NEXT: testq %r9, %r9 +; SSE-NEXT: cmovneq %r15, %rbx +; SSE-NEXT: subq $-128, %rbx +; SSE-NEXT: movq %rax, %r15 +; SSE-NEXT: movq %rax, %r12 +; SSE-NEXT: orq %r10, %r12 +; SSE-NEXT: cmovneq %r14, %rbx +; SSE-NEXT: rep bsfq %r11, %r12 +; SSE-NEXT: rep bsfq %rsi, %r14 +; SSE-NEXT: addq $64, %r14 +; SSE-NEXT: testq %r11, %r11 +; SSE-NEXT: cmovneq %r12, %r14 +; SSE-NEXT: xorps %xmm0, %xmm0 +; SSE-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) +; SSE-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) +; 
SSE-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) +; SSE-NEXT: movq %rax, -{{[0-9]+}}(%rsp) +; SSE-NEXT: rep bsfq %rdx, %r12 +; SSE-NEXT: movl $64, %eax +; SSE-NEXT: rep bsfq %rcx, %rax +; SSE-NEXT: addq $64, %rax +; SSE-NEXT: testq %rdx, %rdx +; SSE-NEXT: cmovneq %r12, %rax +; SSE-NEXT: subq $-128, %rax +; SSE-NEXT: movq %r11, -{{[0-9]+}}(%rsp) +; SSE-NEXT: orq %rsi, %r11 +; SSE-NEXT: cmovneq %r14, %rax +; SSE-NEXT: addq $256, %rax # imm = 0x100 +; SSE-NEXT: movq %r10, -{{[0-9]+}}(%rsp) +; SSE-NEXT: orq %r8, %r10 +; SSE-NEXT: orq %r9, %r15 +; SSE-NEXT: orq %r10, %r15 +; SSE-NEXT: cmovneq %rbx, %rax +; SSE-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) +; SSE-NEXT: movq %rcx, -{{[0-9]+}}(%rsp) +; SSE-NEXT: movq %rdx, -{{[0-9]+}}(%rsp) +; SSE-NEXT: movq %rsi, -{{[0-9]+}}(%rsp) +; SSE-NEXT: movq %r8, -{{[0-9]+}}(%rsp) +; SSE-NEXT: movq %r9, -{{[0-9]+}}(%rsp) +; SSE-NEXT: movl %eax, %ecx +; SSE-NEXT: andl $32, %ecx +; SSE-NEXT: movl %eax, %edx +; SSE-NEXT: andl $480, %edx # imm = 0x1E0 +; SSE-NEXT: shrl $3, %edx +; SSE-NEXT: movl %edx, %esi +; SSE-NEXT: andl $-8, %esi +; SSE-NEXT: movq -128(%rsp,%rsi), %r8 +; SSE-NEXT: shrq %cl, %r8 +; SSE-NEXT: movl -120(%rsp,%rsi), %esi +; SSE-NEXT: addl %esi, %esi +; SSE-NEXT: notl %ecx +; SSE-NEXT: # kill: def $cl killed $cl killed $ecx +; SSE-NEXT: shlq %cl, %rsi +; SSE-NEXT: orl %r8d, %esi +; SSE-NEXT: btrl %eax, %esi +; SSE-NEXT: movl %esi, (%rdi,%rdx) +; SSE-NEXT: # kill: def $eax killed $eax killed $rax +; SSE-NEXT: addq $8, %rsp +; SSE-NEXT: popq %rbx +; SSE-NEXT: popq %r12 +; SSE-NEXT: popq %r14 +; SSE-NEXT: popq %r15 +; SSE-NEXT: retq +; +; AVX2-LABEL: blsr_u512: +; AVX2: # %bb.0: +; AVX2-NEXT: pushq %r15 +; AVX2-NEXT: pushq %r14 +; AVX2-NEXT: pushq %r13 +; AVX2-NEXT: pushq %r12 +; AVX2-NEXT: pushq %rbx +; AVX2-NEXT: movq 56(%rdi), %rcx +; AVX2-NEXT: movq 40(%rdi), %rdx +; AVX2-NEXT: movq 32(%rdi), %r11 +; AVX2-NEXT: movq 24(%rdi), %rsi +; AVX2-NEXT: movq 16(%rdi), %r8 +; AVX2-NEXT: movq (%rdi), %r9 +; AVX2-NEXT: movq 8(%rdi), %r10 +; AVX2-NEXT: xorl %ebx, %ebx +; AVX2-NEXT: tzcntq %r9, %rbx +; AVX2-NEXT: tzcntq %r10, %rax +; AVX2-NEXT: addq $64, %rax +; AVX2-NEXT: testq %r9, %r9 +; AVX2-NEXT: cmovneq %rbx, %rax +; AVX2-NEXT: xorl %r14d, %r14d +; AVX2-NEXT: tzcntq %r8, %r14 +; AVX2-NEXT: xorl %ebx, %ebx +; AVX2-NEXT: tzcntq %rsi, %rbx +; AVX2-NEXT: addq $64, %rbx +; AVX2-NEXT: testq %r8, %r8 +; AVX2-NEXT: cmovneq %r14, %rbx +; AVX2-NEXT: subq $-128, %rbx +; AVX2-NEXT: movq %r9, %r14 +; AVX2-NEXT: movq %r9, %r15 +; AVX2-NEXT: orq %r10, %r15 +; AVX2-NEXT: cmovneq %rax, %rbx +; AVX2-NEXT: xorl %eax, %eax +; AVX2-NEXT: tzcntq %r11, %rax +; AVX2-NEXT: xorl %r12d, %r12d +; AVX2-NEXT: tzcntq %rdx, %r12 +; AVX2-NEXT: addq $64, %r12 +; AVX2-NEXT: testq %r11, %r11 +; AVX2-NEXT: cmovneq %rax, %r12 +; AVX2-NEXT: movq 48(%rdi), %r15 +; AVX2-NEXT: xorl %r13d, %r13d +; AVX2-NEXT: tzcntq %r15, %r13 +; AVX2-NEXT: xorl %eax, %eax +; AVX2-NEXT: tzcntq %rcx, %rax +; AVX2-NEXT: addq $64, %rax +; AVX2-NEXT: testq %r15, %r15 +; AVX2-NEXT: cmovneq %r13, %rax +; AVX2-NEXT: subq $-128, %rax +; AVX2-NEXT: vxorps %xmm0, %xmm0, %xmm0 +; AVX2-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp) +; AVX2-NEXT: movq %r11, -{{[0-9]+}}(%rsp) +; AVX2-NEXT: orq %rdx, %r11 +; AVX2-NEXT: cmovneq %r12, %rax +; AVX2-NEXT: addq $256, %rax # imm = 0x100 +; AVX2-NEXT: movq %r10, -{{[0-9]+}}(%rsp) +; AVX2-NEXT: orq %rsi, %r10 +; AVX2-NEXT: orq %r8, %r14 +; AVX2-NEXT: orq %r10, %r14 +; AVX2-NEXT: cmovneq %rbx, %rax +; AVX2-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp) +; AVX2-NEXT: movq %r9, -{{[0-9]+}}(%rsp) +; AVX2-NEXT: movq 
%rcx, -{{[0-9]+}}(%rsp) +; AVX2-NEXT: movq %r15, -{{[0-9]+}}(%rsp) +; AVX2-NEXT: movq %rdx, -{{[0-9]+}}(%rsp) +; AVX2-NEXT: movq %rsi, -{{[0-9]+}}(%rsp) +; AVX2-NEXT: movq %r8, -{{[0-9]+}}(%rsp) +; AVX2-NEXT: movl %eax, %ecx +; AVX2-NEXT: andl $32, %ecx +; AVX2-NEXT: movl %eax, %edx +; AVX2-NEXT: andl $480, %edx # imm = 0x1E0 +; AVX2-NEXT: shrl $3, %edx +; AVX2-NEXT: movl %edx, %esi +; AVX2-NEXT: andl $-8, %esi +; AVX2-NEXT: shrxq %rcx, -128(%rsp,%rsi), %r8 +; AVX2-NEXT: notl %ecx +; AVX2-NEXT: movl -120(%rsp,%rsi), %esi +; AVX2-NEXT: addl %esi, %esi +; AVX2-NEXT: shlxq %rcx, %rsi, %rcx +; AVX2-NEXT: orl %r8d, %ecx +; AVX2-NEXT: btrl %eax, %ecx +; AVX2-NEXT: movl %ecx, (%rdi,%rdx) +; AVX2-NEXT: # kill: def $eax killed $eax killed $rax +; AVX2-NEXT: popq %rbx +; AVX2-NEXT: popq %r12 +; AVX2-NEXT: popq %r13 +; AVX2-NEXT: popq %r14 +; AVX2-NEXT: popq %r15 +; AVX2-NEXT: vzeroupper +; AVX2-NEXT: retq +; +; AVX512-LABEL: blsr_u512: +; AVX512: # %bb.0: +; AVX512-NEXT: pushq %rax +; AVX512-NEXT: vmovups (%rdi), %ymm0 +; AVX512-NEXT: vmovups 32(%rdi), %ymm1 +; AVX512-NEXT: vmovdqu64 (%rdi), %zmm2 +; AVX512-NEXT: vpternlogd {{.*#+}} zmm3 = -1 +; AVX512-NEXT: vpaddq %zmm3, %zmm2, %zmm3 +; AVX512-NEXT: vpandnq %zmm3, %zmm2, %zmm3 +; AVX512-NEXT: vplzcntq %zmm3, %zmm3 +; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm4 = [64,128,192,256,320,384,448,512] +; AVX512-NEXT: vpsubq %zmm3, %zmm4, %zmm3 +; AVX512-NEXT: vptestmq %zmm2, %zmm2, %k1 +; AVX512-NEXT: vpbroadcastq {{.*#+}} zmm2 = [512,512,512,512,512,512,512,512] +; AVX512-NEXT: vpcompressq %zmm3, %zmm2 {%k1} +; AVX512-NEXT: vmovq %xmm2, %rax +; AVX512-NEXT: vpxor %xmm2, %xmm2, %xmm2 +; AVX512-NEXT: vmovdqu %ymm2, -{{[0-9]+}}(%rsp) +; AVX512-NEXT: vmovdqu %ymm2, -{{[0-9]+}}(%rsp) +; AVX512-NEXT: vmovups %ymm1, -{{[0-9]+}}(%rsp) +; AVX512-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp) +; AVX512-NEXT: movl %eax, %ecx +; AVX512-NEXT: andl $32, %ecx +; AVX512-NEXT: movl %ecx, %edx +; AVX512-NEXT: notl %edx +; AVX512-NEXT: movl %eax, %esi +; AVX512-NEXT: shrl $3, %esi +; AVX512-NEXT: movl %esi, %r8d +; AVX512-NEXT: andl $56, %r8d +; AVX512-NEXT: movl -120(%rsp,%r8), %r9d +; AVX512-NEXT: addl %r9d, %r9d +; AVX512-NEXT: shlxq %rdx, %r9, %rdx +; AVX512-NEXT: shrl $3, %ecx +; AVX512-NEXT: addq %rsp, %r8 +; AVX512-NEXT: addq $-128, %r8 +; AVX512-NEXT: orl (%rcx,%r8), %edx +; AVX512-NEXT: btrl %eax, %edx +; AVX512-NEXT: andl $60, %esi +; AVX512-NEXT: movl %edx, (%rdi,%rsi) +; AVX512-NEXT: # kill: def $eax killed $eax killed $rax +; AVX512-NEXT: popq %rcx +; AVX512-NEXT: vzeroupper +; AVX512-NEXT: retq + %ld = load i512, ptr %word + %tz = tail call range(i512 0, 513) i512 @llvm.cttz.i512(i512 %ld, i1 false) + %tz.cast = trunc nuw nsw i512 %tz to i32 + %tz.mask = and i512 %tz, 511 + %mask = shl nuw i512 1, %tz.mask + %mask.not = xor i512 %mask, -1 + %blsr = and i512 %ld, %mask.not + store i512 %blsr, ptr %word + ret i32 %tz.cast +}
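
Note (illustrative, not part of the patch): the blsr_u512 IR above exercises a BLSR-style pattern on an i512 value: use cttz to find the lowest set bit, clear that bit in memory, and return its index. A minimal C sketch of the same operation, assuming the 512-bit value is stored as eight 64-bit words (the helper name and word layout are hypothetical, chosen only for illustration):

    #include <stdint.h>

    /* Find the lowest set bit of a 512-bit value held in word[0..7], clear it
       in place, and return its bit index; return 512 when the value is zero,
       matching cttz with a defined result at zero. */
    static unsigned blsr_u512_sketch(uint64_t word[8]) {
        for (unsigned i = 0; i < 8; ++i) {
            if (word[i] != 0) {
                unsigned bit = (unsigned)__builtin_ctzll(word[i]);
                word[i] &= word[i] - 1;   /* BLSR: clear the lowest set bit */
                return i * 64 + bit;
            }
        }
        return 512;                       /* no bits set: nothing to clear */
    }

As the commit message notes, the generated code currently fails to form the load-btr-store RMW sequence for the final store because the loaded value has additional (but non-interfering) uses feeding the cttz.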