Skip to content

Commit

Permalink
simd rfx yuv
Browse files Browse the repository at this point in the history
  • Loading branch information
jsorg71 committed Apr 30, 2024
1 parent 4c25520 commit 5f9ca4f
Show file tree
Hide file tree
Showing 10 changed files with 505 additions and 41 deletions.
1 change: 1 addition & 0 deletions module/amd64/Makefile.am
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@ NAFLAGS += -DASM_ARCH_AMD64
ASMSOURCES = \
a8r8g8b8_to_a8b8g8r8_box_amd64_sse2.asm \
a8r8g8b8_to_nv12_box_amd64_sse2.asm \
a8r8g8b8_to_yuvalp_box_amd64_sse2.asm \
cpuid_amd64.asm \
i420_to_rgb32_amd64_sse2.asm \
uyvy_to_rgb32_amd64_sse2.asm \
Expand Down
178 changes: 178 additions & 0 deletions module/amd64/a8r8g8b8_to_yuvalp_box_amd64_sse2.asm
Original file line number Diff line number Diff line change
@@ -0,0 +1,178 @@
;
;Copyright 2024 Jay Sorg
;
;Permission to use, copy, modify, distribute, and sell this software and its
;documentation for any purpose is hereby granted without fee, provided that
;the above copyright notice appear in all copies and that both that
;copyright notice and this permission notice appear in supporting
;documentation.
;
;The above copyright notice and this permission notice shall be included in
;all copies or substantial portions of the Software.
;
;THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
;IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
;FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
;OPEN GROUP BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN
;AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
;CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
;
;ARGB to YUVALP
;amd64 SSE2
;
; notes
; s8 should be 16-byte aligned; unaligned input works but will be slower
; width must be a multiple of 8 and > 0
; height must be > 0

%include "common.asm"

PREPARE_RODATA
cd255 times 4 dd 255
cw128 times 8 dw 128
cw77 times 8 dw 77
cw150 times 8 dw 150
cw29 times 8 dw 29
cw43 times 8 dw 43
cw85 times 8 dw 85
cw107 times 8 dw 107
cw21 times 8 dw 21

%define LS8 [rsp + 0] ; s8
%define LSRC_STRIDE [rsp + 8] ; src_stride
%define LD8 [rsp + 16] ; d8
%define LDST_STRIDE [rsp + 24] ; dst_stride
%define LWIDTH [rsp + 32] ; width
%define LHEIGHT [rsp + 40] ; height

;The first six integer or pointer arguments are passed in registers
; RDI, RSI, RDX, RCX, R8, and R9

;int
;a8r8g8b8_to_yuvalp_box_amd64_sse2(const uint8_t *s8, int src_stride,
; uint8_t *d8, int dst_stride,
; int width, int height);
PROC a8r8g8b8_to_yuvalp_box_amd64_sse2
    ; System V AMD64 entry:
    ;   rdi = s8, esi = src_stride, rdx = d8, ecx = dst_stride,
    ;   r8d = width, r9d = height
    ; Converts 32-bit pixels (blue in bits 0-7, green 8-15, red 16-23,
    ; alpha 24-31) into four byte planes Y, U, V, A that start at d8 and
    ; are 64 * 64 bytes apart.
    ; Each inner iteration converts 8 pixels; pixels after the last full
    ; group of 8 in a row are not converted.
    ; Returns 0 in rax. If width < 8 or height <= 0 nothing is written.
    ; Clobbers rax, rcx, rsi, rdi, xmm0-xmm7 and flags; rbx is preserved.
    push    rbx
    sub     rsp, 48                     ; local vars, 48 bytes

    ; The strides are C ints, so only the low 32 bits of rsi / rcx are
    ; defined by the ABI. Sign-extend them before they take part in
    ; 64-bit pointer arithmetic (this also keeps negative strides valid).
    movsxd  rsi, esi
    movsxd  rcx, ecx

    mov     LS8, rdi                    ; s8
    mov     LSRC_STRIDE, rsi            ; src_stride, sign-extended
    mov     LD8, rdx                    ; d8
    mov     LDST_STRIDE, rcx            ; dst_stride, sign-extended
    mov     LWIDTH, r8                  ; width, only low 32 bits are read
    mov     LHEIGHT, r9                 ; height, only low 32 bits are read

    ; Both loops are "dec / jnz" loops, so a zero or negative count would
    ; wrap around and run off the buffers; bail out instead.
    cmp     r8d, 8
    jl      exit1                       ; fewer than 8 pixels per row
    test    r9d, r9d
    jle     exit1                       ; no rows

    pxor    xmm7, xmm7                  ; xmm7 = 0, used by packuswb below

    mov     ebx, r9d                    ; ebx = rows remaining

row_loop1:
    mov     rsi, LS8                    ; rsi = source row
    mov     rdi, LD8                    ; rdi = Y plane row

    mov     ecx, LWIDTH                 ; ecx = width
    shr     ecx, 3                      ; doing 8 pixels at a time

loop1:
    ; first 4 pixels: one 32-bit lane per channel
    movdqu  xmm0, [rsi]                 ; 4 pixels, 16 bytes
    movdqa  xmm1, xmm0                  ; blue
    pand    xmm1, [lsym(cd255)]         ; blue
    movdqa  xmm2, xmm0                  ; green
    psrld   xmm2, 8                     ; green
    pand    xmm2, [lsym(cd255)]         ; green
    movdqa  xmm3, xmm0                  ; red
    psrld   xmm3, 16                    ; red
    pand    xmm3, [lsym(cd255)]         ; red
    movdqa  xmm4, xmm0                  ; alpha
    psrld   xmm4, 24                    ; alpha, shift leaves only 8 bits

    ; next 4 pixels
    movdqu  xmm0, [rsi + 16]            ; 4 pixels, 16 bytes
    movdqa  xmm5, xmm0                  ; alpha
    psrld   xmm5, 24                    ; alpha, shift leaves only 8 bits
    packssdw xmm4, xmm5                 ; xmm4 = 8 alphas
    packuswb xmm4, xmm7
    movq    [rdi + 3 * 64 * 64], xmm4   ; out 8 bytes aaaaaaaa
    movdqa  xmm4, xmm0                  ; blue
    pand    xmm4, [lsym(cd255)]         ; blue
    movdqa  xmm5, xmm0                  ; green
    psrld   xmm5, 8                     ; green
    pand    xmm5, [lsym(cd255)]         ; green
    movdqa  xmm6, xmm0                  ; red
    psrld   xmm6, 16                    ; red
    pand    xmm6, [lsym(cd255)]         ; red

    ; narrow to 16-bit lanes, values are 0..255 so no saturation occurs
    packssdw xmm1, xmm4                 ; xmm1 = 8 blues
    packssdw xmm2, xmm5                 ; xmm2 = 8 greens
    packssdw xmm3, xmm6                 ; xmm3 = 8 reds

    ; _Y = (77 * _R + 150 * _G + 29 * _B) >> 8;
    ; coefficients sum to 256, so the sum is at most 65280 and fits in an
    ; unsigned word; hence the logical shift
    movdqa  xmm4, xmm1                  ; blue
    movdqa  xmm5, xmm2                  ; green
    movdqa  xmm6, xmm3                  ; red
    pmullw  xmm4, [lsym(cw29)]
    pmullw  xmm5, [lsym(cw150)]
    pmullw  xmm6, [lsym(cw77)]
    paddw   xmm4, xmm5
    paddw   xmm4, xmm6
    psrlw   xmm4, 8
    packuswb xmm4, xmm7
    movq    [rdi], xmm4                 ; out 8 bytes yyyyyyyy

    ; _U = ((-43 * _R - 85 * _G + 128 * _B) >> 8) + 128;
    ; result is signed (-32640..32640), hence the arithmetic shift
    movdqa  xmm4, xmm1                  ; blue
    movdqa  xmm5, xmm2                  ; green
    movdqa  xmm6, xmm3                  ; red
    pmullw  xmm4, [lsym(cw128)]
    pmullw  xmm5, [lsym(cw85)]
    pmullw  xmm6, [lsym(cw43)]
    psubw   xmm4, xmm5
    psubw   xmm4, xmm6
    psraw   xmm4, 8
    paddw   xmm4, [lsym(cw128)]
    packuswb xmm4, xmm7
    movq    [rdi + 1 * 64 * 64], xmm4   ; out 8 bytes uuuuuuuu

    ; _V = ((128 * _R - 107 * _G - 21 * _B) >> 8) + 128;
    movdqa  xmm6, xmm1                  ; blue
    movdqa  xmm5, xmm2                  ; green
    movdqa  xmm4, xmm3                  ; red
    pmullw  xmm4, [lsym(cw128)]
    pmullw  xmm5, [lsym(cw107)]
    pmullw  xmm6, [lsym(cw21)]
    psubw   xmm4, xmm5
    psubw   xmm4, xmm6
    psraw   xmm4, 8
    paddw   xmm4, [lsym(cw128)]
    packuswb xmm4, xmm7
    movq    [rdi + 2 * 64 * 64], xmm4   ; out 8 bytes vvvvvvvv

    ; move right: 8 source pixels (32 bytes), 8 bytes in each plane
    lea     rsi, [rsi + 32]
    lea     rdi, [rdi + 8]

    dec     ecx
    jnz     loop1

    ; update s8
    mov     rax, LS8                    ; s8
    add     rax, LSRC_STRIDE            ; s8 += src_stride
    mov     LS8, rax

    ; update d8
    mov     rax, LD8                    ; d8
    add     rax, LDST_STRIDE            ; d8 += dst_stride
    mov     LD8, rax

    dec     ebx
    jnz     row_loop1

exit1:
    xor     eax, eax                    ; return value 0
    add     rsp, 48                     ; local vars, 48 bytes
    pop     rbx
    ret
END_OF_FILE
4 changes: 4 additions & 0 deletions module/amd64/funcs_amd64.h
Original file line number Diff line number Diff line change
Expand Up @@ -43,6 +43,10 @@ a8r8g8b8_to_nv12_box_amd64_sse2(const uint8_t *s8, int src_stride,
uint8_t *d8_y, int dst_stride_y,
uint8_t *d8_uv, int dst_stride_uv,
int width, int height);
int
a8r8g8b8_to_yuvalp_box_amd64_sse2(const uint8_t *s8, int src_stride,
uint8_t *d8, int dst_stride,
int width, int height);

#endif

1 change: 1 addition & 0 deletions module/rdp.h
Original file line number Diff line number Diff line change
Expand Up @@ -297,6 +297,7 @@ struct _rdpRec

copy_box_proc a8r8g8b8_to_a8b8g8r8_box;
copy_box_dst2_proc a8r8g8b8_to_nv12_box;
copy_box_proc a8r8g8b8_to_yuvalp_box;

/* multimon */
struct monitor_info minfo[16]; /* client monitor data */
Expand Down
95 changes: 54 additions & 41 deletions module/rdpCapture.c
Original file line number Diff line number Diff line change
Expand Up @@ -124,24 +124,18 @@ rdpFillBox_yuvalp(int ax, int ay,
/* 19595 38470 7471
-11071 -21736 32807
32756 -27429 -5327 */
static int
rdpCopyBox_a8r8g8b8_to_yuvalp(int ax, int ay,
const uint8_t *src, int src_stride,
uint8_t *dst, int dst_stride,
BoxPtr rects, int num_rects)
int
a8r8g8b8_to_yuvalp_box(const uint8_t *s8, int src_stride,
uint8_t *d8, int dst_stride,
int width, int height)
{
const uint8_t *s8;
uint8_t *d8;
uint8_t *yptr;
uint8_t *uptr;
uint8_t *vptr;
uint8_t *aptr;
const uint32_t *s32;
int index;
int jndex;
int kndex;
int width;
int height;
uint32_t pixel;
uint8_t a;
int r;
Expand All @@ -150,6 +144,51 @@ rdpCopyBox_a8r8g8b8_to_yuvalp(int ax, int ay,
int y;
int u;
int v;

for (jndex = 0; jndex < height; jndex++)
{
s32 = (const uint32_t *) s8;
yptr = d8;
uptr = yptr + 64 * 64;
vptr = uptr + 64 * 64;
aptr = vptr + 64 * 64;
kndex = 0;
while (kndex < width)
{
pixel = *(s32++);
RGB_SPLIT(a, r, g, b, pixel);
y = (r * 19595 + g * 38470 + b * 7471) >> 16;
u = (r * -11071 + g * -21736 + b * 32807) >> 16;
v = (r * 32756 + g * -27429 + b * -5327) >> 16;
u = u + 128;
v = v + 128;
y = RDPCLAMP(y, 0, UCHAR_MAX);
u = RDPCLAMP(u, 0, UCHAR_MAX);
v = RDPCLAMP(v, 0, UCHAR_MAX);
*(yptr++) = y;
*(uptr++) = u;
*(vptr++) = v;
*(aptr++) = a;
kndex++;
}
d8 += dst_stride;
s8 += src_stride;
}
return 0;
}

/******************************************************************************/
static int
rdpCopyBox_a8r8g8b8_to_yuvalp(rdpClientCon *clientCon, int ax, int ay,
const uint8_t *src, int src_stride,
uint8_t *dst, int dst_stride,
BoxPtr rects, int num_rects)
{
const uint8_t *s8;
uint8_t *d8;
int index;
int width;
int height;
BoxPtr box;

dst = dst + (ay << 8) * (dst_stride >> 8) + (ax << 8);
Expand All @@ -162,35 +201,9 @@ rdpCopyBox_a8r8g8b8_to_yuvalp(int ax, int ay,
d8 += box->x1 - ax;
width = box->x2 - box->x1;
height = box->y2 - box->y1;
for (jndex = 0; jndex < height; jndex++)
{
s32 = (const uint32_t *) s8;
yptr = d8;
uptr = yptr + 64 * 64;
vptr = uptr + 64 * 64;
aptr = vptr + 64 * 64;
kndex = 0;
while (kndex < width)
{
pixel = *(s32++);
RGB_SPLIT(a, r, g, b, pixel);
y = (r * 19595 + g * 38470 + b * 7471) >> 16;
u = (r * -11071 + g * -21736 + b * 32807) >> 16;
v = (r * 32756 + g * -27429 + b * -5327) >> 16;
u = u + 128;
v = v + 128;
y = RDPCLAMP(y, 0, UCHAR_MAX);
u = RDPCLAMP(u, 0, UCHAR_MAX);
v = RDPCLAMP(v, 0, UCHAR_MAX);
*(yptr++) = y;
*(uptr++) = u;
*(vptr++) = v;
*(aptr++) = a;
kndex++;
}
d8 += 64;
s8 += src_stride;
}
clientCon->dev->a8r8g8b8_to_yuvalp_box(s8, src_stride,
d8, 64,
width, height);
}
return 0;
}
Expand Down Expand Up @@ -946,7 +959,7 @@ rdpCapture2(rdpClientCon *clientCon, RegionPtr in_reg, BoxPtr *out_rects,
rects = REGION_RECTS(&tile_reg);
num_rects = REGION_NUM_RECTS(&tile_reg);
crc = wyhash((const void*)rects, num_rects * sizeof(BoxRec), crc, _wyp);
rdpCopyBox_a8r8g8b8_to_yuvalp(x, y,
rdpCopyBox_a8r8g8b8_to_yuvalp(clientCon, x, y,
src, src_stride,
dst, dst_stride,
rects, num_rects);
Expand Down Expand Up @@ -975,7 +988,7 @@ rdpCapture2(rdpClientCon *clientCon, RegionPtr in_reg, BoxPtr *out_rects,
/* lazily only do this if hash wasn't identical */
if (rcode != rgnPART)
{
rdpCopyBox_a8r8g8b8_to_yuvalp(x, y,
rdpCopyBox_a8r8g8b8_to_yuvalp(clientCon, x, y,
src, src_stride,
dst, dst_stride,
&rect, 1);
Expand Down
4 changes: 4 additions & 0 deletions module/rdpCapture.h
Original file line number Diff line number Diff line change
Expand Up @@ -49,5 +49,9 @@ a8r8g8b8_to_nv12_box(const uint8_t *s8, int src_stride,
uint8_t *d8_y, int dst_stride_y,
uint8_t *d8_uv, int dst_stride_uv,
int width, int height);
extern _X_EXPORT int
a8r8g8b8_to_yuvalp_box(const uint8_t *s8, int src_stride,
uint8_t *d8, int dst_stride,
int width, int height);

#endif
Loading

0 comments on commit 5f9ca4f

Please sign in to comment.