Inefficient code generated for inline asm with multiple output register operands #2466
This is a classic case that suffers from the lack of multiple return values. Because we can't express them in LLVM IR, asms that have multiple output constraints (= or +) have to return them through memory, which is grossly inefficient.
Specifically, the LLVM IR for the first function compiles the outputs to multiple =*r constraints, which are "indirect outputs".
Reduced testcase:
int sad16_sse2(void *v, unsigned char *blk2, unsigned char *blk1,
We compile this to:
_sad16_sse2:
GCC compiles this to:
_sad16_sse2:
This is what the .ll will contain in the future. I'm working through teaching the code generator to grok this:
; ModuleID = 'pr2094.c'
define i32 @sad16_sse2(i8* %v, i8* %blk2, i8* %blk1, i32 %stride, i32 %h) nounwind {
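As a rough sketch of the direction (my wording, not from the thread): with multiple-return-value support, an asm with several "=r" output constraints can return an aggregate directly, with the members pulled out by extractvalue, instead of storing each result through a stack slot. The constraint string and types below are illustrative only:

```llvm
; Sketch only: two register outputs returned as a struct, no stack traffic.
%pair = call { i32, i32 } asm "...", "=r,=r,r"(i32 %h)
%ret  = extractvalue { i32, i32 } %pair, 0
%blk  = extractvalue { i32, i32 } %pair, 1
```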
I believe the code generator now handles multiple-result .ll files. Time to update the front end.
Implemented. Here are the last codegen pieces:
Here's the CFE piece:
-Chris
The testcase inline-asm-mrv.c is failing on x86-32 Linux.
; ModuleID = 'inline-asm-mrv.c'
define i32 @sad16_sse2(i8* %v, i8* %blk2, i8* %blk1, i32 %stride, i32 %h) nounwind {
Sorry, my mistake: I accidentally tested using an old version of llvm-gcc. |
Ok, the new code is much better! :)
mentioned in issue llvm/llvm-bugzilla-archive#2095 |
mentioned in issue #1234 |
Extended Description
Testcase:
#include <stdint.h>
int sad16_sse2(void *v, uint8_t *blk2, uint8_t *blk1, int stride, int h)
{
int ret;
asm volatile(
"pxor %%xmm6, %%xmm6 \n\t"
//ASMALIGN(4)
"1: \n\t"
"movdqu (%1), %%xmm0 \n\t"
"movdqu (%1, %3), %%xmm1 \n\t"
"psadbw (%2), %%xmm0 \n\t"
"psadbw (%2, %3), %%xmm1 \n\t"
"paddw %%xmm0, %%xmm6 \n\t"
"paddw %%xmm1, %%xmm6 \n\t"
"lea (%1,%3,2), %1 \n\t"
"lea (%2,%3,2), %2 \n\t"
"sub $2, %0 \n\t"
" jg 1b \n\t"
: "+r" (h), "+r" (blk1), "+r" (blk2)
: "r" ((long)stride)
);
asm volatile(
"movhlps %%xmm6, %%xmm0 \n\t"
"paddw %%xmm0, %%xmm6 \n\t"
"movd %%xmm6, %0 \n\t"
: "=r"(ret)
);
return ret;
}
Generated code:
(We'll put aside for the moment the fact that this code is extremely dangerous because a compiler using certain kinds of optimizations might actually end up using the xmm regs between the two asm statements.)
The generated code ends up being rather inefficient in that it emits four unnecessary stores to the stack, plus the stack allocation for that space. I think it's because blk1 and blk2 have to be put into allocas at the IL level, and codegen isn't smart enough to eliminate them. Not sure what the right fix is; maybe inline asm should take advantage of the multiple return value work?
(I don't know how much fixing this will help, but this function shows up at the top of a profile in ffmpeg re-encoding from h.264 to mpeg4, so every bit likely helps.)