diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..70869b3 --- /dev/null +++ b/.gitignore @@ -0,0 +1,32 @@ +*~ +aclocal.m4 +AUTHORS +autom4te.cache/ +ChangeLog +config_ac.h +config_ac-h.in +config.c +config.guess +config.log +config.status +config.sub +configure +compile +depcomp +.deps/ +install-sh +*.la +.libs +libtool +*.lo +ltmain.sh +Makefile +Makefile.in +missing +NEWS +*.o +*.pc +README +stamp-h1 +rfxcodectest +.dirstamp diff --git a/Makefile b/Makefile deleted file mode 100644 index 0d2e4ce..0000000 --- a/Makefile +++ /dev/null @@ -1,12 +0,0 @@ - -all: allmake - -allmake: - cd src; $(MAKE) $(MFLAGS) - cd tests; $(MAKE) $(MFLAGS) - -clean: allclean - -allclean: - cd src; $(MAKE) clean - cd tests; $(MAKE) clean diff --git a/Makefile.am b/Makefile.am new file mode 100644 index 0000000..d0034d0 --- /dev/null +++ b/Makefile.am @@ -0,0 +1,14 @@ +ACLOCAL_AMFLAGS = -I m4 + +pkgconfig_DATA = rfxcodec.pc + +EXTRA_DIST = bootstrap readme.txt + +SUBDIRS = \ + src \ + tests + +include_HEADERS = \ + include/rfxcodec_encode.h \ + include/rfxcodec_decode.h \ + include/rfxcodec_common.h diff --git a/acinclude.m4 b/acinclude.m4 new file mode 100644 index 0000000..fbfc98d --- /dev/null +++ b/acinclude.m4 @@ -0,0 +1,137 @@ +# AC_PROG_NASM +# -------------------------- +# Check that NASM exists and determine flags +AC_DEFUN([AC_PROG_NASM],[ + +AC_CHECK_PROGS(NASM, [nasm nasmw yasm]) +test -z "$NASM" && AC_MSG_ERROR([no nasm (Netwide Assembler) found]) + +AC_MSG_CHECKING([for object file format of host system]) +case "$host_os" in + cygwin* | mingw* | pw32* | interix*) + case "$host_cpu" in + x86_64) + objfmt='Win64-COFF' + ;; + *) + objfmt='Win32-COFF' + ;; + esac + ;; + msdosdjgpp* | go32*) + objfmt='COFF' + ;; + os2-emx*) # not tested + objfmt='MSOMF' # obj + ;; + linux*coff* | linux*oldld*) + objfmt='COFF' # ??? 
+ ;; + linux*aout*) + objfmt='a.out' + ;; + linux*) + case "$host_cpu" in + x86_64) + objfmt='ELF64' + ;; + *) + objfmt='ELF' + ;; + esac + ;; + freebsd* | netbsd* | openbsd*) + if echo __ELF__ | $CC -E - | grep __ELF__ > /dev/null; then + objfmt='BSD-a.out' + else + case "$host_cpu" in + x86_64 | amd64) + objfmt='ELF64' + ;; + *) + objfmt='ELF' + ;; + esac + fi + ;; + solaris* | sunos* | sysv* | sco*) + case "$host_cpu" in + x86_64) + objfmt='ELF64' + ;; + *) + objfmt='ELF' + ;; + esac + ;; + darwin* | rhapsody* | nextstep* | openstep* | macos*) + case "$host_cpu" in + x86_64) + objfmt='Mach-O64' + ;; + *) + objfmt='Mach-O' + ;; + esac + ;; + *) + objfmt='ELF ?' + ;; +esac + +AC_MSG_RESULT([$objfmt]) +if test "$objfmt" = 'ELF ?'; then + objfmt='ELF' + AC_MSG_WARN([unexpected host system. assumed that the format is $objfmt.]) +fi + +AC_MSG_CHECKING([for object file format specifier (NAFLAGS) ]) +case "$objfmt" in + MSOMF) NAFLAGS='-fobj -DOBJ32';; + Win32-COFF) NAFLAGS='-fwin32 -DWIN32';; + Win64-COFF) NAFLAGS='-fwin64 -DWIN64 -D__x86_64__';; + COFF) NAFLAGS='-fcoff -DCOFF';; + a.out) NAFLAGS='-faout -DAOUT';; + BSD-a.out) NAFLAGS='-faoutb -DAOUT';; + ELF) NAFLAGS='-felf -DELF';; + ELF64) NAFLAGS='-felf64 -DELF -D__x86_64__';; + RDF) NAFLAGS='-frdf -DRDF';; + Mach-O) NAFLAGS='-fmacho -DMACHO';; + Mach-O64) NAFLAGS='-fmacho64 -DMACHO -D__x86_64__';; +esac +AC_MSG_RESULT([$NAFLAGS]) +AC_SUBST([NAFLAGS]) + +AC_MSG_CHECKING([whether the assembler ($NASM $NAFLAGS) works]) +cat > conftest.asm <&AC_FD_CC + cat conftest.asm >&AC_FD_CC + rm -rf conftest* + AC_MSG_RESULT(no) + AC_MSG_ERROR([installation or configuration problem: assembler cannot create object files.]) +fi + +AC_MSG_CHECKING([whether the linker accepts assembler output]) +try_nasm='${CC-cc} -o conftest${ac_exeext} $LDFLAGS conftest.o $LIBS 1>&AC_FD_CC' +if AC_TRY_EVAL(try_nasm) && test -s conftest${ac_exeext}; then + rm -rf conftest* + AC_MSG_RESULT(yes) +else + rm -rf conftest* + AC_MSG_RESULT(no) + 
AC_MSG_ERROR([configuration problem: maybe object file format mismatch.]) +fi + +]) + diff --git a/bootstrap b/bootstrap new file mode 100755 index 0000000..a5ef9dd --- /dev/null +++ b/bootstrap @@ -0,0 +1,32 @@ +#!/bin/sh + +which autoconf +if ! test $? -eq 0 +then + echo "error, install autoconf" + exit 1 +fi + +which automake +if ! test $? -eq 0 +then + echo "error, install automake" + exit 1 +fi + +which libtool || which libtoolize +if ! test $? -eq 0 +then + echo "error, install libtool" + exit 1 +fi + +which pkg-config +if ! test $? -eq 0 +then + echo "error, install pkg-config" + exit 1 +fi + +touch configure.ac +autoreconf -fvi diff --git a/configure.ac b/configure.ac new file mode 100644 index 0000000..e5bc1d8 --- /dev/null +++ b/configure.ac @@ -0,0 +1,53 @@ +# Process this file with autoconf to produce a configure script + +AC_PREREQ(2.59) +AC_INIT([rfxcodec], [0.1.0], [jay.sorg@gmail.com]) +AC_CONFIG_HEADERS(config_ac.h:config_ac-h.in) +AM_INIT_AUTOMAKE([1.6 foreign]) +AC_CONFIG_MACRO_DIR([m4]) +m4_ifdef([AM_SILENT_RULES], [AM_SILENT_RULES]) +AC_PROG_CC +AC_C_CONST +AC_PROG_LIBTOOL +PKG_INSTALLDIR + +# SIMD is optional +AC_ARG_WITH([simd], + AC_HELP_STRING([--without-simd],[Omit SIMD extensions.])) +if test "x${with_simd}" != "xno"; then + # Check if we're on a supported CPU + AC_MSG_CHECKING([if we have SIMD optimisations for cpu type]) + case "$host_cpu" in + x86_64 | amd64) + AC_MSG_RESULT([yes (x86_64)]) + AC_PROG_NASM + simd_arch=x86_64 + ;; + i*86 | x86 | ia32) + AC_MSG_RESULT([yes (i386)]) + AC_PROG_NASM + simd_arch=i386 + ;; + *) + AC_MSG_RESULT([no ("$host_cpu")]) + AC_MSG_WARN([SIMD support not available for this CPU. 
Performance will suffer.]) + with_simd=no; + ;; + esac + if test "x${with_simd}" != "xno"; then + AC_DEFINE([WITH_SIMD], [1], [Use accelerated SIMD routines.]) + fi +fi + +AM_CONDITIONAL(WITH_SIMD_AMD64, [test x$simd_arch = xx86_64]) +AM_CONDITIONAL(WITH_SIMD_X86, [test x$simd_arch = xi386]) + +AC_CONFIG_FILES([Makefile + src/Makefile + tests/Makefile + rfxcodec.pc + rfxcodec-uninstalled.pc +]) + +AC_OUTPUT + diff --git a/include/rfxcodec_common.h b/include/rfxcodec_common.h new file mode 100644 index 0000000..0411c73 --- /dev/null +++ b/include/rfxcodec_common.h @@ -0,0 +1,40 @@ +/** + * RFX codec + * + * Copyright 2015 Jay Sorg + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#ifndef __RFXCODEC_COMMON_H +#define __RFXCODEC_COMMON_H + +#define RFX_FORMAT_BGRA 0 +#define RFX_FORMAT_RGBA 1 +#define RFX_FORMAT_BGR 2 +#define RFX_FORMAT_RGB 3 +#define RFX_FORMAT_YUV 4 /* YUV444 linear tiled mode */ + +#define RFX_FLAGS_NONE 0 /* default RFX_FLAGS_RLGR3 and RFX_FLAGS_SAFE */ + +#define RFX_FLAGS_SAFE 0 /* default */ +#define RFX_FLAGS_OPT1 (1 << 3) +#define RFX_FLAGS_OPT2 (1 << 4) +#define RFX_FLAGS_NOACCEL (1 << 6) + +#define RFX_FLAGS_RLGR3 0 /* default */ +#define RFX_FLAGS_RLGR1 1 + +#define RFX_FLAGS_ALPHAV1 1 /* used in flags for rfxcodec_encode */ + +#endif diff --git a/include/rfxcodec_decode.h b/include/rfxcodec_decode.h new file mode 100644 index 0000000..ed4b0a8 --- /dev/null +++ b/include/rfxcodec_decode.h @@ -0,0 +1,33 @@ +/** + * RFX codec decoder + * + * Copyright 2014-2015 Jay Sorg + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#ifndef __RFXCODEC_DECODE_H +#define __RFXCODEC_DECODE_H + +#include <rfxcodec_common.h> + +int +rfxcodec_decode_create(int width, int height, int format, int flags, + void **handle); +int +rfxcodec_decode_destroy(void *handle); +int +rfxcodec_decode(void *handle, char *cdata, int cdata_bytes, + char *data, int width, int height, int stride_bytes); + +#endif diff --git a/include/rfxcodec_encode.h b/include/rfxcodec_encode.h index f082dd2..04112b0 100644 --- a/include/rfxcodec_encode.h +++ b/include/rfxcodec_encode.h @@ -1,7 +1,7 @@ /** * RFX codec encoder * - * Copyright 2014 Jay Sorg + * Copyright 2014-2015 Jay Sorg * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -19,23 +19,7 @@ #ifndef __RFXCODEC_ENCODE_H #define __RFXCODEC_ENCODE_H -#define RFX_USE_ACCEL 0 - -#define RFX_FORMAT_BGRA 0 -#define RFX_FORMAT_RGBA 1 -#define RFX_FORMAT_BGR 2 -#define RFX_FORMAT_RGB 3 -#define RFX_FORMAT_YUV 4 /* YUV444 linear tiled mode */ - -#define RFX_FLAGS_NONE 0 /* default RFX_FLAGS_RLGR3 and RFX_FLAGS_SAFE */ - -#define RFX_FLAGS_RLGR3 0 /* default */ -#define RFX_FLAGS_RLGR1 1 - -#define RFX_FLAGS_SAFE 0 /* default */ -#define RFX_FLAGS_OPT1 (1 << 3) -#define RFX_FLAGS_OPT2 (1 << 4) -#define RFX_FLAGS_NOACCEL (1 << 6) +#include <rfxcodec_common.h> struct rfx_rect { @@ -49,8 +33,8 @@ struct rfx_tile { int x; /* multiple of 64 */ int y; /* multiple of 64 */ - int cx; /* must be 64 */ - int cy; /* must be 64 */ + int cx; /* must be 64 or less */ + int cy; /* must be 64 or less */ int quant_y; int quant_cb; int quant_cr; @@ -59,8 +43,12 @@ struct rfx_tile void * rfxcodec_encode_create(int width, int height, int format, int flags); int -rfxcodec_encode_destroy(void * handle); -/* quants, 10 ints per set, should be num_quants * 10 ints in quants) +rfxcodec_encode_create_ex(int width, int height, int format, int flags, + void **handle); +int +rfxcodec_encode_destroy(void *handle); +/* quants, 5 chars per set, should be 
num_quants * 5 chars in quants) + * each char is 2 quant values * quantizer order is * 0 - LL3 * 1 - LH3 @@ -75,8 +63,14 @@ rfxcodec_encode_destroy(void * handle); int rfxcodec_encode(void *handle, char *cdata, int *cdata_bytes, char *buf, int width, int height, int stride_bytes, - struct rfx_rect *region, int num_region, - struct rfx_tile *tiles, int num_tiles, - const int *quants, int num_quants); + const struct rfx_rect *region, int num_region, + const struct rfx_tile *tiles, int num_tiles, + const char *quants, int num_quants); +int +rfxcodec_encode_ex(void *handle, char *cdata, int *cdata_bytes, + char *buf, int width, int height, int stride_bytes, + const struct rfx_rect *region, int num_region, + const struct rfx_tile *tiles, int num_tiles, + const char *quants, int num_quants, int flags); #endif diff --git a/m4/pkg.m4 b/m4/pkg.m4 new file mode 100644 index 0000000..82bea96 --- /dev/null +++ b/m4/pkg.m4 @@ -0,0 +1,275 @@ +dnl pkg.m4 - Macros to locate and utilise pkg-config. -*- Autoconf -*- +dnl serial 11 (pkg-config-0.29.1) +dnl +dnl Copyright © 2004 Scott James Remnant . +dnl Copyright © 2012-2015 Dan Nicholson +dnl +dnl This program is free software; you can redistribute it and/or modify +dnl it under the terms of the GNU General Public License as published by +dnl the Free Software Foundation; either version 2 of the License, or +dnl (at your option) any later version. +dnl +dnl This program is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of +dnl MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +dnl General Public License for more details. +dnl +dnl You should have received a copy of the GNU General Public License +dnl along with this program; if not, write to the Free Software +dnl Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA +dnl 02111-1307, USA. 
+dnl +dnl As a special exception to the GNU General Public License, if you +dnl distribute this file as part of a program that contains a +dnl configuration script generated by Autoconf, you may include it under +dnl the same distribution terms that you use for the rest of that +dnl program. + +dnl PKG_PREREQ(MIN-VERSION) +dnl ----------------------- +dnl Since: 0.29 +dnl +dnl Verify that the version of the pkg-config macros are at least +dnl MIN-VERSION. Unlike PKG_PROG_PKG_CONFIG, which checks the user's +dnl installed version of pkg-config, this checks the developer's version +dnl of pkg.m4 when generating configure. +dnl +dnl To ensure that this macro is defined, also add: +dnl m4_ifndef([PKG_PREREQ], +dnl [m4_fatal([must install pkg-config 0.29 or later before running autoconf/autogen])]) +dnl +dnl See the "Since" comment for each macro you use to see what version +dnl of the macros you require. +m4_defun([PKG_PREREQ], +[m4_define([PKG_MACROS_VERSION], [0.29.1]) +m4_if(m4_version_compare(PKG_MACROS_VERSION, [$1]), -1, + [m4_fatal([pkg.m4 version $1 or higher is required but ]PKG_MACROS_VERSION[ found])]) +])dnl PKG_PREREQ + +dnl PKG_PROG_PKG_CONFIG([MIN-VERSION]) +dnl ---------------------------------- +dnl Since: 0.16 +dnl +dnl Search for the pkg-config tool and set the PKG_CONFIG variable to +dnl first found in the path. Checks that the version of pkg-config found +dnl is at least MIN-VERSION. If MIN-VERSION is not specified, 0.9.0 is +dnl used since that's the first version where most current features of +dnl pkg-config existed. 
+AC_DEFUN([PKG_PROG_PKG_CONFIG], +[m4_pattern_forbid([^_?PKG_[A-Z_]+$]) +m4_pattern_allow([^PKG_CONFIG(_(PATH|LIBDIR|SYSROOT_DIR|ALLOW_SYSTEM_(CFLAGS|LIBS)))?$]) +m4_pattern_allow([^PKG_CONFIG_(DISABLE_UNINSTALLED|TOP_BUILD_DIR|DEBUG_SPEW)$]) +AC_ARG_VAR([PKG_CONFIG], [path to pkg-config utility]) +AC_ARG_VAR([PKG_CONFIG_PATH], [directories to add to pkg-config's search path]) +AC_ARG_VAR([PKG_CONFIG_LIBDIR], [path overriding pkg-config's built-in search path]) + +if test "x$ac_cv_env_PKG_CONFIG_set" != "xset"; then + AC_PATH_TOOL([PKG_CONFIG], [pkg-config]) +fi +if test -n "$PKG_CONFIG"; then + _pkg_min_version=m4_default([$1], [0.9.0]) + AC_MSG_CHECKING([pkg-config is at least version $_pkg_min_version]) + if $PKG_CONFIG --atleast-pkgconfig-version $_pkg_min_version; then + AC_MSG_RESULT([yes]) + else + AC_MSG_RESULT([no]) + PKG_CONFIG="" + fi +fi[]dnl +])dnl PKG_PROG_PKG_CONFIG + +dnl PKG_CHECK_EXISTS(MODULES, [ACTION-IF-FOUND], [ACTION-IF-NOT-FOUND]) +dnl ------------------------------------------------------------------- +dnl Since: 0.18 +dnl +dnl Check to see whether a particular set of modules exists. Similar to +dnl PKG_CHECK_MODULES(), but does not set variables or print errors. +dnl +dnl Please remember that m4 expands AC_REQUIRE([PKG_PROG_PKG_CONFIG]) +dnl only at the first occurrence in configure.ac, so if the first place +dnl it's called might be skipped (such as if it is within an "if"), you +dnl have to call PKG_CHECK_EXISTS manually +AC_DEFUN([PKG_CHECK_EXISTS], +[AC_REQUIRE([PKG_PROG_PKG_CONFIG])dnl +if test -n "$PKG_CONFIG" && \ + AC_RUN_LOG([$PKG_CONFIG --exists --print-errors "$1"]); then + m4_default([$2], [:]) +m4_ifvaln([$3], [else + $3])dnl +fi]) + +dnl _PKG_CONFIG([VARIABLE], [COMMAND], [MODULES]) +dnl --------------------------------------------- +dnl Internal wrapper calling pkg-config via PKG_CONFIG and setting +dnl pkg_failed based on the result. 
+m4_define([_PKG_CONFIG], +[if test -n "$$1"; then + pkg_cv_[]$1="$$1" + elif test -n "$PKG_CONFIG"; then + PKG_CHECK_EXISTS([$3], + [pkg_cv_[]$1=`$PKG_CONFIG --[]$2 "$3" 2>/dev/null` + test "x$?" != "x0" && pkg_failed=yes ], + [pkg_failed=yes]) + else + pkg_failed=untried +fi[]dnl +])dnl _PKG_CONFIG + +dnl _PKG_SHORT_ERRORS_SUPPORTED +dnl --------------------------- +dnl Internal check to see if pkg-config supports short errors. +AC_DEFUN([_PKG_SHORT_ERRORS_SUPPORTED], +[AC_REQUIRE([PKG_PROG_PKG_CONFIG]) +if $PKG_CONFIG --atleast-pkgconfig-version 0.20; then + _pkg_short_errors_supported=yes +else + _pkg_short_errors_supported=no +fi[]dnl +])dnl _PKG_SHORT_ERRORS_SUPPORTED + + +dnl PKG_CHECK_MODULES(VARIABLE-PREFIX, MODULES, [ACTION-IF-FOUND], +dnl [ACTION-IF-NOT-FOUND]) +dnl -------------------------------------------------------------- +dnl Since: 0.4.0 +dnl +dnl Note that if there is a possibility the first call to +dnl PKG_CHECK_MODULES might not happen, you should be sure to include an +dnl explicit call to PKG_PROG_PKG_CONFIG in your configure.ac +AC_DEFUN([PKG_CHECK_MODULES], +[AC_REQUIRE([PKG_PROG_PKG_CONFIG])dnl +AC_ARG_VAR([$1][_CFLAGS], [C compiler flags for $1, overriding pkg-config])dnl +AC_ARG_VAR([$1][_LIBS], [linker flags for $1, overriding pkg-config])dnl + +pkg_failed=no +AC_MSG_CHECKING([for $1]) + +_PKG_CONFIG([$1][_CFLAGS], [cflags], [$2]) +_PKG_CONFIG([$1][_LIBS], [libs], [$2]) + +m4_define([_PKG_TEXT], [Alternatively, you may set the environment variables $1[]_CFLAGS +and $1[]_LIBS to avoid the need to call pkg-config. 
+See the pkg-config man page for more details.]) + +if test $pkg_failed = yes; then + AC_MSG_RESULT([no]) + _PKG_SHORT_ERRORS_SUPPORTED + if test $_pkg_short_errors_supported = yes; then + $1[]_PKG_ERRORS=`$PKG_CONFIG --short-errors --print-errors --cflags --libs "$2" 2>&1` + else + $1[]_PKG_ERRORS=`$PKG_CONFIG --print-errors --cflags --libs "$2" 2>&1` + fi + # Put the nasty error message in config.log where it belongs + echo "$$1[]_PKG_ERRORS" >&AS_MESSAGE_LOG_FD + + m4_default([$4], [AC_MSG_ERROR( +[Package requirements ($2) were not met: + +$$1_PKG_ERRORS + +Consider adjusting the PKG_CONFIG_PATH environment variable if you +installed software in a non-standard prefix. + +_PKG_TEXT])[]dnl + ]) +elif test $pkg_failed = untried; then + AC_MSG_RESULT([no]) + m4_default([$4], [AC_MSG_FAILURE( +[The pkg-config script could not be found or is too old. Make sure it +is in your PATH or set the PKG_CONFIG environment variable to the full +path to pkg-config. + +_PKG_TEXT + +To get pkg-config, see .])[]dnl + ]) +else + $1[]_CFLAGS=$pkg_cv_[]$1[]_CFLAGS + $1[]_LIBS=$pkg_cv_[]$1[]_LIBS + AC_MSG_RESULT([yes]) + $3 +fi[]dnl +])dnl PKG_CHECK_MODULES + + +dnl PKG_CHECK_MODULES_STATIC(VARIABLE-PREFIX, MODULES, [ACTION-IF-FOUND], +dnl [ACTION-IF-NOT-FOUND]) +dnl --------------------------------------------------------------------- +dnl Since: 0.29 +dnl +dnl Checks for existence of MODULES and gathers its build flags with +dnl static libraries enabled. Sets VARIABLE-PREFIX_CFLAGS from --cflags +dnl and VARIABLE-PREFIX_LIBS from --libs. +dnl +dnl Note that if there is a possibility the first call to +dnl PKG_CHECK_MODULES_STATIC might not happen, you should be sure to +dnl include an explicit call to PKG_PROG_PKG_CONFIG in your +dnl configure.ac. 
+AC_DEFUN([PKG_CHECK_MODULES_STATIC], +[AC_REQUIRE([PKG_PROG_PKG_CONFIG])dnl +_save_PKG_CONFIG=$PKG_CONFIG +PKG_CONFIG="$PKG_CONFIG --static" +PKG_CHECK_MODULES($@) +PKG_CONFIG=$_save_PKG_CONFIG[]dnl +])dnl PKG_CHECK_MODULES_STATIC + + +dnl PKG_INSTALLDIR([DIRECTORY]) +dnl ------------------------- +dnl Since: 0.27 +dnl +dnl Substitutes the variable pkgconfigdir as the location where a module +dnl should install pkg-config .pc files. By default the directory is +dnl $libdir/pkgconfig, but the default can be changed by passing +dnl DIRECTORY. The user can override through the --with-pkgconfigdir +dnl parameter. +AC_DEFUN([PKG_INSTALLDIR], +[m4_pushdef([pkg_default], [m4_default([$1], ['${libdir}/pkgconfig'])]) +m4_pushdef([pkg_description], + [pkg-config installation directory @<:@]pkg_default[@:>@]) +AC_ARG_WITH([pkgconfigdir], + [AS_HELP_STRING([--with-pkgconfigdir], pkg_description)],, + [with_pkgconfigdir=]pkg_default) +AC_SUBST([pkgconfigdir], [$with_pkgconfigdir]) +m4_popdef([pkg_default]) +m4_popdef([pkg_description]) +])dnl PKG_INSTALLDIR + + +dnl PKG_NOARCH_INSTALLDIR([DIRECTORY]) +dnl -------------------------------- +dnl Since: 0.27 +dnl +dnl Substitutes the variable noarch_pkgconfigdir as the location where a +dnl module should install arch-independent pkg-config .pc files. By +dnl default the directory is $datadir/pkgconfig, but the default can be +dnl changed by passing DIRECTORY. The user can override through the +dnl --with-noarch-pkgconfigdir parameter. 
+AC_DEFUN([PKG_NOARCH_INSTALLDIR], +[m4_pushdef([pkg_default], [m4_default([$1], ['${datadir}/pkgconfig'])]) +m4_pushdef([pkg_description], + [pkg-config arch-independent installation directory @<:@]pkg_default[@:>@]) +AC_ARG_WITH([noarch-pkgconfigdir], + [AS_HELP_STRING([--with-noarch-pkgconfigdir], pkg_description)],, + [with_noarch_pkgconfigdir=]pkg_default) +AC_SUBST([noarch_pkgconfigdir], [$with_noarch_pkgconfigdir]) +m4_popdef([pkg_default]) +m4_popdef([pkg_description]) +])dnl PKG_NOARCH_INSTALLDIR + + +dnl PKG_CHECK_VAR(VARIABLE, MODULE, CONFIG-VARIABLE, +dnl [ACTION-IF-FOUND], [ACTION-IF-NOT-FOUND]) +dnl ------------------------------------------- +dnl Since: 0.28 +dnl +dnl Retrieves the value of the pkg-config variable for the given module. +AC_DEFUN([PKG_CHECK_VAR], +[AC_REQUIRE([PKG_PROG_PKG_CONFIG])dnl +AC_ARG_VAR([$1], [value of $3 for $2, overriding pkg-config])dnl + +_PKG_CONFIG([$1], [variable="][$3]["], [$2]) +AS_VAR_COPY([$1], [pkg_cv_][$1]) + +AS_VAR_IF([$1], [""], [$5], [$4])dnl +])dnl PKG_CHECK_VAR diff --git a/rfxcodec-uninstalled.pc.in b/rfxcodec-uninstalled.pc.in new file mode 100644 index 0000000..9aed766 --- /dev/null +++ b/rfxcodec-uninstalled.pc.in @@ -0,0 +1,5 @@ +Name: rfxcodec +Description: Fast jpeg2000 codec compatible with MS RDP servers and xrdp +Version: @PACKAGE_VERSION@ +Cflags: -I${pc_top_builddir}/${pcfiledir}/include +Libs: ${pc_top_builddir}/${pcfiledir}/src/librfxencode.la diff --git a/rfxcodec.pc.in b/rfxcodec.pc.in new file mode 100644 index 0000000..8bd611c --- /dev/null +++ b/rfxcodec.pc.in @@ -0,0 +1,10 @@ +prefix=@prefix@ +exec_prefix=@exec_prefix@ +libdir=@libdir@ +includedir=@includedir@ + +Name: rfxcodec +Description: Fast jpeg2000 codec compatible with MS RDP servers and xrdp +Version: @PACKAGE_VERSION@ +Cflags: -I${includedir} +Libs: -L${libdir} -lrfxencode diff --git a/src/Makefile b/src/Makefile deleted file mode 100644 index 4c657c7..0000000 --- a/src/Makefile +++ /dev/null @@ -1,47 +0,0 @@ - -OBJS = 
rfxencode.o rfxcompose.o rfxencode_tile.o rfxencode_dwt.o \ -rfxencode_quantization.o rfxencode_differential.o \ -rfxencode_rlgr1.o rfxencode_rlgr3.o - -#OBJS += cpuid_x86.o rfxrlgr1_x86.o rfxrlgr3_x86.o rfxdwt_x86_sse2.o -#OBJS += cpuid_amd64.o rfxrlgr1_amd64.o rfxrlgr3_amd64.o rfxdwt_amd64_sse2.o - -CFLAGS = $(PROFIL) -g -O2 -Wall -fPIC -I../include -#-DRFX_USE_ACCEL - -LDFLAGS = - -LIBS = - -all: librfxencode.so - -librfxencode.so: $(OBJS) Makefile - $(CC) -shared -o librfxencode.so $(LDFLAGS) $(OBJS) $(LIBS) - $(AR) -rv librfxencode.a $(OBJS) - -cpuid_x86.o: x86/cpuid_x86.asm - yasm -f elf32 -g dwarf2 x86/cpuid_x86.asm - -rfxrlgr1_x86.o: x86/rfxrlgr1_x86.asm - yasm -f elf32 -g dwarf2 x86/rfxrlgr1_x86.asm - -rfxrlgr3_x86.o: x86/rfxrlgr3_x86.asm - yasm -f elf32 -g dwarf2 x86/rfxrlgr3_x86.asm - -rfxdwt_x86_sse2.o: x86/rfxdwt_x86_sse2.asm - yasm -f elf32 -g dwarf2 x86/rfxdwt_x86_sse2.asm - -cpuid_amd64.o: amd64/cpuid_amd64.asm - yasm -f elf64 -g dwarf2 amd64/cpuid_amd64.asm - -rfxrlgr1_amd64.o: amd64/rfxrlgr1_amd64.asm - yasm -f elf64 -g dwarf2 amd64/rfxrlgr1_amd64.asm - -rfxrlgr3_amd64.o: amd64/rfxrlgr3_amd64.asm - yasm -f elf64 -g dwarf2 amd64/rfxrlgr3_amd64.asm - -rfxdwt_amd64_sse2.o: amd64/rfxdwt_amd64_sse2.asm - yasm -f elf64 -g dwarf2 amd64/rfxdwt_amd64_sse2.asm - -clean: - rm -f $(OBJS) librfxencode.so librfxencode.a diff --git a/src/Makefile.am b/src/Makefile.am new file mode 100644 index 0000000..54eb6fe --- /dev/null +++ b/src/Makefile.am @@ -0,0 +1,57 @@ +EXTRA_DIST = $(AMD64_ASM) $(X86_ASM) nasm_lt.sh + +AMD64_ASM = \ + amd64/cpuid_amd64.asm \ + amd64/rfxcodec_encode_diff_rlgr1_amd64_sse2.asm \ + amd64/rfxcodec_encode_diff_rlgr3_amd64_sse2.asm \ + amd64/rfxcodec_encode_dwt_shift_amd64_sse2.asm \ + amd64/rfxcodec_encode_dwt_shift_amd64_sse41.asm + +X86_ASM = \ + x86/cpuid_x86.asm \ + x86/rfxcodec_encode_diff_rlgr1_x86_sse2.asm \ + x86/rfxcodec_encode_diff_rlgr3_x86_sse2.asm \ + x86/rfxcodec_encode_dwt_shift_x86_sse2.asm \ + 
x86/rfxcodec_encode_dwt_shift_x86_sse41.asm + +ASM_SOURCES = + +AM_CPPFLAGS = \ + -I$(top_srcdir)/include \ + -I../include + +if WITH_SIMD_AMD64 +ASM_SOURCES += $(AMD64_ASM) +AM_CPPFLAGS += -DSIMD_USE_ACCEL=1 -DRFX_USE_ACCEL_AMD64=1 +endif + +if WITH_SIMD_X86 +ASM_SOURCES += $(X86_ASM) +AM_CPPFLAGS += -DSIMD_USE_ACCEL=1 -DRFX_USE_ACCEL_X86=1 +endif + +noinst_HEADERS = \ + rfx_bitstream.h \ + rfxcommon.h \ + rfxcompose.h \ + rfxconstants.h \ + rfxencode_alpha.h \ + rfxencode_differential.h \ + rfxencode_dwt.h \ + rfxencode.h \ + rfxencode_quantization.h \ + rfxencode_rlgr1.h \ + rfxencode_rlgr3.h \ + rfxencode_tile.h \ + amd64/funcs_amd64.h \ + x86/funcs_x86.h + +lib_LTLIBRARIES = librfxencode.la + +librfxencode_la_SOURCES = $(noinst_HEADERS) rfxencode.c \ + rfxcompose.c rfxencode_tile.c rfxencode_dwt.c \ + rfxencode_quantization.c rfxencode_differential.c \ + rfxencode_rlgr1.c rfxencode_rlgr3.c rfxencode_alpha.c $(ASM_SOURCES) + +.asm.lo: + $(LIBTOOL) --mode=compile $(srcdir)/nasm_lt.sh $(NASM) $(NAFLAGS) -I$(srcdir) -I. 
$< -o $@ diff --git a/src/amd64/cpuid_amd64.asm b/src/amd64/cpuid_amd64.asm index b97937a..e561b2d 100644 --- a/src/amd64/cpuid_amd64.asm +++ b/src/amd64/cpuid_amd64.asm @@ -1,3 +1,6 @@ +%ifidn __OUTPUT_FORMAT__,elf64 +SECTION .note.GNU-stack noalloc noexec nowrite progbits +%endif SECTION .text @@ -13,10 +16,14 @@ SECTION .text ;int ;cpuid_amd64(int eax_in, int ecx_in, int *eax, int *ebx, int *ecx, int *edx) +%ifidn __OUTPUT_FORMAT__,elf64 PROC cpuid_amd64 +%else +PROC _cpuid_amd64 +%endif ; save registers push rbx - + push rdx push rcx push r8 @@ -33,9 +40,9 @@ PROC cpuid_amd64 mov [rdi], ebx pop rdi mov [rdi], eax - mov eax, 0 + mov rax, 0 ; restore registers pop rbx - ret; + ret align 16 diff --git a/src/amd64/funcs_amd64.h b/src/amd64/funcs_amd64.h index 02cf6c8..124f838 100644 --- a/src/amd64/funcs_amd64.h +++ b/src/amd64/funcs_amd64.h @@ -1,5 +1,5 @@ /* -Copyright 2014 Jay Sorg +Copyright 2014-2015 Jay Sorg Permission to use, copy, modify, distribute, and sell this software and its documentation for any purpose is hereby granted without fee, provided that @@ -24,12 +24,48 @@ amd64 asm files #ifndef __FUNCS_AMD64_H #define __FUNCS_AMD64_H +#ifdef __cplusplus +extern "C" { +#endif + int cpuid_amd64(int eax_in, int ecx_in, int *eax, int *ebx, int *ecx, int *edx); + +int +rfxcodec_encode_dwt_shift_amd64_sse2(const char *qtable, + unsigned char *data, + short *dwt_buffer1, + short *dwt_buffer); +int +rfxcodec_encode_dwt_shift_amd64_sse41(const char *qtable, + unsigned char *data, + short *dwt_buffer1, + short *dwt_buffer); +int +rfxcodec_encode_diff_rlgr1_amd64_sse2(short *co, + void *dst, int dst_bytes); int -dwt_shift_amd64_sse2(unsigned char *yuvs, int width, int height, int *rgbs); +rfxcodec_encode_diff_rlgr3_amd64_sse2(short *co, + void *dst, int dst_bytes); + +int +rfxcodec_decode_rlgr1_diff_amd64_sse2(void *data, int data_bytes, + short *out_data); +int +rfxcodec_decode_rlgr3_diff_amd64_sse2(void *data, int data_bytes, + short *out_data); int 
-diff_rlgr3_amd64(sint16 *co, int num_co, uint8 *dst, int dst_bytes); +rfxcodec_decode_shift_idwt_amd64_sse2(char *qtable, short *src, short *dst); +int +rfxcodec_decode_yuv2rgb_amd64_sse2(short *ydata, short *udata, short *vdata, + unsigned int *rgbdata, int stride); +int +rfxcodec_decode_yuva2argb_amd64_sse2(short *ydata, short *udata, + short *vdata, char *adata, + unsigned int *rgbdata, int stride); +#ifdef __cplusplus +} #endif +#endif diff --git a/src/amd64/rfxcodec_encode_diff_rlgr1_amd64_sse2.asm b/src/amd64/rfxcodec_encode_diff_rlgr1_amd64_sse2.asm new file mode 100644 index 0000000..b2de84f --- /dev/null +++ b/src/amd64/rfxcodec_encode_diff_rlgr1_amd64_sse2.asm @@ -0,0 +1,36 @@ +%ifidn __OUTPUT_FORMAT__,elf64 +section .note.GNU-stack noalloc noexec nowrite progbits +%endif + +section .data + const1 times 8 dw 1 + +section .text + +%macro PROC 1 + align 16 + global %1 + %1: +%endmacro + +;The first six integer or pointer arguments are passed in registers +;RDI, RSI, RDX, RCX, R8, and R9 + +;int +;rfxcodec_encode_diff_rlgr1_amd64_sse2(short *co, +; void *dst, int dst_bytes); + +%ifidn __OUTPUT_FORMAT__,elf64 +PROC rfxcodec_encode_diff_rlgr1_amd64_sse2 +%else +PROC _rfxcodec_encode_diff_rlgr1_amd64_sse2 +%endif + ; save registers + push rbx + + mov rax, 0 + ; restore registers + pop rbx + ret + align 16 + diff --git a/src/amd64/rfxcodec_encode_diff_rlgr3_amd64_sse2.asm b/src/amd64/rfxcodec_encode_diff_rlgr3_amd64_sse2.asm new file mode 100644 index 0000000..f5712be --- /dev/null +++ b/src/amd64/rfxcodec_encode_diff_rlgr3_amd64_sse2.asm @@ -0,0 +1,31 @@ +%ifidn __OUTPUT_FORMAT__,elf64 +section .note.GNU-stack noalloc noexec nowrite progbits +%endif + +section .data + const1 times 8 dw 1 + +section .text + +%macro PROC 1 + align 16 + global %1 + %1: +%endmacro + +;int +;rfxcodec_encode_diff_rlgr3_amd64_sse2(short *co, +; void *dst, int dst_bytes); + +%ifidn __OUTPUT_FORMAT__,elf64 +PROC rfxcodec_encode_diff_rlgr3_amd64_sse2 +%else +PROC 
_rfxcodec_encode_diff_rlgr3_amd64_sse2 +%endif + ; save registers + push rbx + mov rax, 0 + pop rbx + ret + align 16 + diff --git a/src/amd64/rfxcodec_encode_dwt_shift_amd64_sse2.asm b/src/amd64/rfxcodec_encode_dwt_shift_amd64_sse2.asm new file mode 100644 index 0000000..ee97588 --- /dev/null +++ b/src/amd64/rfxcodec_encode_dwt_shift_amd64_sse2.asm @@ -0,0 +1,1503 @@ +; +;Copyright 2016 Jay Sorg +; +;Permission to use, copy, modify, distribute, and sell this software and its +;documentation for any purpose is hereby granted without fee, provided that +;the above copyright notice appear in all copies and that both that +;copyright notice and this permission notice appear in supporting +;documentation. +; +;The above copyright notice and this permission notice shall be included in +;all copies or substantial portions of the Software. +; +;THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +;IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +;FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +;OPEN GROUP BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN +;AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN +;CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
+; +;amd64 asm dwt + +%ifidn __OUTPUT_FORMAT__,elf64 +section .note.GNU-stack noalloc noexec nowrite progbits +%endif + +section .data + align 16 + cw128 times 8 dw 128 + cdFFFF times 4 dd 65535 + ; these are 1 << (factor - 1) 0 to 15 is factor + cwa0 times 8 dw 0 ; 0 + cwa1 times 8 dw 1 ; 1 + cwa2 times 8 dw 2 ; 2 + cwa4 times 8 dw 4 ; 3 + cwa8 times 8 dw 8 ; 4 + cwa16 times 8 dw 16 ; 5 + cwa32 times 8 dw 32 ; 6 + cwa64 times 8 dw 64 ; 7 + cwa128 times 8 dw 128 ; 8 + cwa256 times 8 dw 256 ; 9 + cwa512 times 8 dw 512 ; 10 + cwa1024 times 8 dw 1024 ; 11 + cwa2048 times 8 dw 2048 ; 12 + cwa4096 times 8 dw 4096 ; 13 + cwa8192 times 8 dw 8192 ; 14 + cwa16384 times 8 dw 16384 ; 15 + +section .text + +%macro PROC 1 + align 16 + global %1 + %1: +%endmacro + +;****************************************************************************** +; source 16 bit signed, 16 pixel width +rfx_dwt_2d_encode_block_horiz_16_16: + mov ecx, 8 +loop1a: + ; pre / post + movdqa xmm1, [rsi] ; src[2n] + movdqa xmm2, [rsi + 16] + movdqa xmm6, xmm1 + movdqa xmm7, xmm2 + pand xmm1, [rel cdFFFF] + pand xmm2, [rel cdFFFF] + pslld xmm1, 16 + pslld xmm2, 16 + psrad xmm1, 16 + psrad xmm2, 16 + packssdw xmm1, xmm2 + movdqa xmm2, xmm6 ; src[2n + 1] + movdqa xmm3, xmm7 + psrldq xmm2, 2 + psrldq xmm3, 2 + pand xmm2, [rel cdFFFF] + pand xmm3, [rel cdFFFF] + pslld xmm2, 16 + pslld xmm3, 16 + psrad xmm2, 16 + psrad xmm3, 16 + packssdw xmm2, xmm3 + movdqa xmm3, xmm6 ; src[2n + 2] + movdqa xmm4, xmm7 + psrldq xmm3, 4 + psrldq xmm4, 4 + movd eax, xmm7 + movd xmm5, eax + pslldq xmm5, 12 + por xmm3, xmm5 + movdqa xmm5, xmm7 + psrldq xmm5, 12 + pslldq xmm5, 12 + por xmm4, xmm5 + pand xmm3, [rel cdFFFF] + pand xmm4, [rel cdFFFF] + pslld xmm3, 16 + pslld xmm4, 16 + psrad xmm3, 16 + psrad xmm4, 16 + packssdw xmm3, xmm4 + movdqa xmm4, xmm1 + movdqa xmm5, xmm2 + movdqa xmm6, xmm3 + ; h[n] = (src[2n + 1] - ((src[2n] + src[2n + 2]) >> 1)) >> 1 + paddw xmm4, xmm6 + psraw xmm4, 1 + psubw xmm5, xmm4 + psraw xmm5, 1 + movdqa 
xmm6, xmm5 ; out hi + paddw xmm6, xmm8 + psraw xmm6, xmm9 + movdqa [rdi], xmm6 + ; l[n] = src[2n] + ((h[n - 1] + h[n]) >> 1) + movdqa xmm7, xmm5 + movd eax, xmm7 + pslldq xmm7, 2 + and eax, 0xFFFF + movd xmm6, eax + por xmm7, xmm6 + paddw xmm5, xmm7 + psraw xmm5, 1 + paddw xmm5, xmm1 + + movdqa xmm6, xmm5 ; out lo + paddw xmm6, xmm10 + psraw xmm6, xmm11 + movdqa [rdx], xmm6 + + ; move right + lea rsi, [rsi + 16 * 2] + lea rdi, [rdi + 8 * 2] + lea rdx, [rdx + 8 * 2] + + ; move left + lea rsi, [rsi - 16 * 2] + lea rdi, [rdi - 8 * 2] + lea rdx, [rdx - 8 * 2] + + ; move down + lea rsi, [rsi + 16 * 2] + lea rdi, [rdi + 8 * 2] + lea rdx, [rdx + 8 * 2] + + dec ecx + jnz loop1a + + ret + +;****************************************************************************** +; source 16 bit signed, 16 pixel width +rfx_dwt_2d_encode_block_verti_16_16: + mov ecx, 2 +loop1b: + ; pre + movdqa xmm1, [rsi] ; src[2n] + movdqa xmm2, [rsi + 16 * 2] ; src[2n + 1] + movdqa xmm3, [rsi + 16 * 2 * 2] ; src[2n + 2] + movdqa xmm4, xmm1 + movdqa xmm5, xmm2 + movdqa xmm6, xmm3 + ; h[n] = (src[2n + 1] - ((src[2n] + src[2n + 2]) >> 1)) >> 1 + paddw xmm4, xmm6 + psraw xmm4, 1 + psubw xmm5, xmm4 + psraw xmm5, 1 + movdqa [rdi], xmm5 ; out hi + movdqa xmm6, xmm5 ; save hi + ; l[n] = src[2n] + ((h[n - 1] + h[n]) >> 1) + paddw xmm5, xmm1 + movdqa [rdx], xmm5 ; out lo + movdqa xmm7, xmm6 ; save hi + ; move down + lea rsi, [rsi + 16 * 2 * 2] ; 2 rows + lea rdi, [rdi + 16 * 2] ; 1 row + lea rdx, [rdx + 16 * 2] ; 1 row + + ; loop + shl ecx, 16 + mov cx, 6 +loop2b: + movdqa xmm1, xmm3 ; src[2n] + movdqa xmm2, [rsi + 16 * 2] ; src[2n + 1] + movdqa xmm3, [rsi + 16 * 2 * 2] ; src[2n + 2] + movdqa xmm4, xmm1 + movdqa xmm5, xmm2 + movdqa xmm6, xmm3 + ; h[n] = (src[2n + 1] - ((src[2n] + src[2n + 2]) >> 1)) >> 1 + paddw xmm4, xmm6 + psraw xmm4, 1 + psubw xmm5, xmm4 + psraw xmm5, 1 + movdqa [rdi], xmm5 ; out hi + movdqa xmm6, xmm5 ; save hi + ; l[n] = src[2n] + ((h[n - 1] + h[n]) >> 1) + paddw xmm5, xmm7 + psraw xmm5, 
1 + paddw xmm5, xmm1 + movdqa [rdx], xmm5 ; out lo + movdqa xmm7, xmm6 ; save hi + ; move down + lea rsi, [rsi + 16 * 2 * 2] ; 2 rows + lea rdi, [rdi + 16 * 2] ; 1 row + lea rdx, [rdx + 16 * 2] ; 1 row + + dec cx + jnz loop2b + shr ecx, 16 + + ; post + movdqa xmm1, xmm3 ; src[2n] + movdqa xmm2, [rsi + 16 * 2] ; src[2n + 1] + movdqa xmm4, xmm1 + movdqa xmm5, xmm2 + movdqa xmm6, xmm3 + ; h[n] = (src[2n + 1] - ((src[2n] + src[2n + 2]) >> 1)) >> 1 + paddw xmm4, xmm6 + psraw xmm4, 1 + psubw xmm5, xmm4 + psraw xmm5, 1 + movdqa [rdi], xmm5 ; out hi + ; l[n] = src[2n] + ((h[n - 1] + h[n]) >> 1) + paddw xmm5, xmm7 + psraw xmm5, 1 + paddw xmm5, xmm1 + movdqa [rdx], xmm5 ; out lo + ; move down + lea rsi, [rsi + 16 * 2 * 2] ; 2 row + lea rdi, [rdi + 16 * 2] ; 1 row + lea rdx, [rdx + 16 * 2] ; 1 row + + ; move up + lea rsi, [rsi - 16 * 16 * 2] + lea rdi, [rdi - 8 * 16 * 2] + lea rdx, [rdx - 8 * 16 * 2] + + ; move right + lea rsi, [rsi + 16] + lea rdi, [rdi + 16] + lea rdx, [rdx + 16] + + dec ecx + jnz loop1b + + ret + +;****************************************************************************** +; source 16 bit signed, 32 pixel width +rfx_dwt_2d_encode_block_horiz_16_32: + mov ecx, 16 +loop1c: + ; pre + movdqa xmm1, [rsi] ; src[2n] + movdqa xmm2, [rsi + 16] + movdqa xmm6, xmm1 + movdqa xmm7, xmm2 + pand xmm1, [rel cdFFFF] + pand xmm2, [rel cdFFFF] + pslld xmm1, 16 + pslld xmm2, 16 + psrad xmm1, 16 + psrad xmm2, 16 + packssdw xmm1, xmm2 + movdqa xmm2, xmm6 ; src[2n + 1] + movdqa xmm3, xmm7 + psrldq xmm2, 2 + psrldq xmm3, 2 + pand xmm2, [rel cdFFFF] + pand xmm3, [rel cdFFFF] + pslld xmm2, 16 + pslld xmm3, 16 + psrad xmm2, 16 + psrad xmm3, 16 + packssdw xmm2, xmm3 + movdqa xmm3, xmm6 ; src[2n + 2] + movdqa xmm4, xmm7 + psrldq xmm3, 4 + psrldq xmm4, 4 + movd eax, xmm7 + movd xmm5, eax + pslldq xmm5, 12 + por xmm3, xmm5 + mov eax, [rsi + 32] + movd xmm5, eax + pslldq xmm5, 12 + por xmm4, xmm5 + pand xmm3, [rel cdFFFF] + pand xmm4, [rel cdFFFF] + pslld xmm3, 16 + pslld xmm4, 16 + 
psrad xmm3, 16 + psrad xmm4, 16 + packssdw xmm3, xmm4 + movdqa xmm4, xmm1 + movdqa xmm5, xmm2 + movdqa xmm6, xmm3 + ; h[n] = (src[2n + 1] - ((src[2n] + src[2n + 2]) >> 1)) >> 1 + paddw xmm4, xmm6 + psraw xmm4, 1 + psubw xmm5, xmm4 + psraw xmm5, 1 + + movdqa xmm6, xmm5 ; out hi + paddw xmm6, xmm8 + psraw xmm6, xmm9 + movdqa [rdi], xmm6 + movdqa xmm2, xmm5 ; save hi + + ; l[n] = src[2n] + ((h[n - 1] + h[n]) >> 1) + movdqa xmm7, xmm5 + movd eax, xmm7 + pslldq xmm7, 2 + and eax, 0xFFFF + movd xmm6, eax + por xmm7, xmm6 + paddw xmm5, xmm7 + psraw xmm5, 1 + paddw xmm5, xmm1 + + psrldq xmm2, 14 + movd ebx, xmm2 ; save hi + + movdqa xmm6, xmm5 ; out lo + paddw xmm6, xmm10 + psraw xmm6, xmm11 + movdqa [rdx], xmm6 + + ; move right + lea rsi, [rsi + 16 * 2] + lea rdi, [rdi + 8 * 2] + lea rdx, [rdx + 8 * 2] + + ; post + movdqa xmm1, [rsi] ; src[2n] + movdqa xmm2, [rsi + 16] + movdqa xmm6, xmm1 + movdqa xmm7, xmm2 + pand xmm1, [rel cdFFFF] + pand xmm2, [rel cdFFFF] + pslld xmm1, 16 + pslld xmm2, 16 + psrad xmm1, 16 + psrad xmm2, 16 + packssdw xmm1, xmm2 + movdqa xmm2, xmm6 ; src[2n + 1] + movdqa xmm3, xmm7 + psrldq xmm2, 2 + psrldq xmm3, 2 + pand xmm2, [rel cdFFFF] + pand xmm3, [rel cdFFFF] + pslld xmm2, 16 + pslld xmm3, 16 + psrad xmm2, 16 + psrad xmm3, 16 + packssdw xmm2, xmm3 + movdqa xmm3, xmm6 ; src[2n + 2] + movdqa xmm4, xmm7 + psrldq xmm3, 4 + psrldq xmm4, 4 + movd eax, xmm7 + movd xmm5, eax + pslldq xmm5, 12 + por xmm3, xmm5 + movdqa xmm5, xmm7 + psrldq xmm5, 12 + pslldq xmm5, 12 + por xmm4, xmm5 + pand xmm3, [rel cdFFFF] + pand xmm4, [rel cdFFFF] + pslld xmm3, 16 + pslld xmm4, 16 + psrad xmm3, 16 + psrad xmm4, 16 + packssdw xmm3, xmm4 + movdqa xmm4, xmm1 + movdqa xmm5, xmm2 + movdqa xmm6, xmm3 + ; h[n] = (src[2n + 1] - ((src[2n] + src[2n + 2]) >> 1)) >> 1 + paddw xmm4, xmm6 + psraw xmm4, 1 + psubw xmm5, xmm4 + psraw xmm5, 1 + + movdqa xmm6, xmm5 ; out hi + paddw xmm6, xmm8 + psraw xmm6, xmm9 + movdqa [rdi], xmm6 + + ; l[n] = src[2n] + ((h[n - 1] + h[n]) >> 1) + movdqa 
xmm7, xmm5 + pslldq xmm7, 2 + movd xmm6, ebx + por xmm7, xmm6 + paddw xmm5, xmm7 + psraw xmm5, 1 + paddw xmm5, xmm1 + + movdqa xmm6, xmm5 ; out lo + paddw xmm6, xmm10 + psraw xmm6, xmm11 + movdqa [rdx], xmm6 + + ; move right + lea rsi, [rsi + 16 * 2] + lea rdi, [rdi + 8 * 2] + lea rdx, [rdx + 8 * 2] + + ; move left + lea rsi, [rsi - 32 * 2] + lea rdi, [rdi - 16 * 2] + lea rdx, [rdx - 16 * 2] + + ; move down + lea rsi, [rsi + 32 * 2] + lea rdi, [rdi + 16 * 2] + lea rdx, [rdx + 16 * 2] + + dec ecx + jnz loop1c + + ret + +;****************************************************************************** +; source 16 bit signed, 32 pixel width +rfx_dwt_2d_encode_block_horiz_16_32_no_lo: + mov ecx, 16 +loop1c1: + ; pre + movdqa xmm1, [rsi] ; src[2n] + movdqa xmm2, [rsi + 16] + movdqa xmm6, xmm1 + movdqa xmm7, xmm2 + pand xmm1, [rel cdFFFF] + pand xmm2, [rel cdFFFF] + pslld xmm1, 16 + pslld xmm2, 16 + psrad xmm1, 16 + psrad xmm2, 16 + packssdw xmm1, xmm2 + movdqa xmm2, xmm6 ; src[2n + 1] + movdqa xmm3, xmm7 + psrldq xmm2, 2 + psrldq xmm3, 2 + pand xmm2, [rel cdFFFF] + pand xmm3, [rel cdFFFF] + pslld xmm2, 16 + pslld xmm3, 16 + psrad xmm2, 16 + psrad xmm3, 16 + packssdw xmm2, xmm3 + movdqa xmm3, xmm6 ; src[2n + 2] + movdqa xmm4, xmm7 + psrldq xmm3, 4 + psrldq xmm4, 4 + movd eax, xmm7 + movd xmm5, eax + pslldq xmm5, 12 + por xmm3, xmm5 + mov eax, [rsi + 32] + movd xmm5, eax + pslldq xmm5, 12 + por xmm4, xmm5 + pand xmm3, [rel cdFFFF] + pand xmm4, [rel cdFFFF] + pslld xmm3, 16 + pslld xmm4, 16 + psrad xmm3, 16 + psrad xmm4, 16 + packssdw xmm3, xmm4 + movdqa xmm4, xmm1 + movdqa xmm5, xmm2 + movdqa xmm6, xmm3 + ; h[n] = (src[2n + 1] - ((src[2n] + src[2n + 2]) >> 1)) >> 1 + paddw xmm4, xmm6 + psraw xmm4, 1 + psubw xmm5, xmm4 + psraw xmm5, 1 + + movdqa xmm6, xmm5 ; out hi + paddw xmm6, xmm8 + psraw xmm6, xmm9 + movdqa [rdi], xmm6 + movdqa xmm2, xmm5 ; save hi + + ; l[n] = src[2n] + ((h[n - 1] + h[n]) >> 1) + movdqa xmm7, xmm5 + movd eax, xmm7 + pslldq xmm7, 2 + and eax, 0xFFFF + 
movd xmm6, eax + por xmm7, xmm6 + paddw xmm5, xmm7 + psraw xmm5, 1 + paddw xmm5, xmm1 + + psrldq xmm2, 14 + movd ebx, xmm2 ; save hi + + movdqa [rdx], xmm5 ; out lo + + ; move right + lea rsi, [rsi + 16 * 2] + lea rdi, [rdi + 8 * 2] + lea rdx, [rdx + 8 * 2] + + ; post + movdqa xmm1, [rsi] ; src[2n] + movdqa xmm2, [rsi + 16] + movdqa xmm6, xmm1 + movdqa xmm7, xmm2 + pand xmm1, [rel cdFFFF] + pand xmm2, [rel cdFFFF] + pslld xmm1, 16 + pslld xmm2, 16 + psrad xmm1, 16 + psrad xmm2, 16 + packssdw xmm1, xmm2 + movdqa xmm2, xmm6 ; src[2n + 1] + movdqa xmm3, xmm7 + psrldq xmm2, 2 + psrldq xmm3, 2 + pand xmm2, [rel cdFFFF] + pand xmm3, [rel cdFFFF] + pslld xmm2, 16 + pslld xmm3, 16 + psrad xmm2, 16 + psrad xmm3, 16 + packssdw xmm2, xmm3 + movdqa xmm3, xmm6 ; src[2n + 2] + movdqa xmm4, xmm7 + psrldq xmm3, 4 + psrldq xmm4, 4 + movd eax, xmm7 + movd xmm5, eax + pslldq xmm5, 12 + por xmm3, xmm5 + movdqa xmm5, xmm7 + psrldq xmm5, 12 + pslldq xmm5, 12 + por xmm4, xmm5 + pand xmm3, [rel cdFFFF] + pand xmm4, [rel cdFFFF] + pslld xmm3, 16 + pslld xmm4, 16 + psrad xmm3, 16 + psrad xmm4, 16 + packssdw xmm3, xmm4 + movdqa xmm4, xmm1 + movdqa xmm5, xmm2 + movdqa xmm6, xmm3 + ; h[n] = (src[2n + 1] - ((src[2n] + src[2n + 2]) >> 1)) >> 1 + paddw xmm4, xmm6 + psraw xmm4, 1 + psubw xmm5, xmm4 + psraw xmm5, 1 + + movdqa xmm6, xmm5 ; out hi + paddw xmm6, xmm8 + psraw xmm6, xmm9 + movdqa [rdi], xmm6 + + ; l[n] = src[2n] + ((h[n - 1] + h[n]) >> 1) + movdqa xmm7, xmm5 + pslldq xmm7, 2 + movd xmm6, ebx + por xmm7, xmm6 + paddw xmm5, xmm7 + psraw xmm5, 1 + paddw xmm5, xmm1 + + movdqa [rdx], xmm5 ; out lo + + ; move right + lea rsi, [rsi + 16 * 2] + lea rdi, [rdi + 8 * 2] + lea rdx, [rdx + 8 * 2] + + ; move left + lea rsi, [rsi - 32 * 2] + lea rdi, [rdi - 16 * 2] + lea rdx, [rdx - 16 * 2] + + ; move down + lea rsi, [rsi + 32 * 2] + lea rdi, [rdi + 16 * 2] + lea rdx, [rdx + 16 * 2] + + dec ecx + jnz loop1c1 + + ret + +;****************************************************************************** +; 
source 16 bit signed, 32 pixel width +rfx_dwt_2d_encode_block_verti_16_32: + mov ecx, 4 +loop1d: + ; pre + movdqa xmm1, [rsi] ; src[2n] + movdqa xmm2, [rsi + 32 * 2] ; src[2n + 1] + movdqa xmm3, [rsi + 32 * 2 * 2] ; src[2n + 2] + movdqa xmm4, xmm1 + movdqa xmm5, xmm2 + movdqa xmm6, xmm3 + ; h[n] = (src[2n + 1] - ((src[2n] + src[2n + 2]) >> 1)) >> 1 + paddw xmm4, xmm6 + psraw xmm4, 1 + psubw xmm5, xmm4 + psraw xmm5, 1 + movdqa [rdi], xmm5 ; out hi + movdqa xmm6, xmm5 ; save hi + ; l[n] = src[2n] + ((h[n - 1] + h[n]) >> 1) + paddw xmm5, xmm1 + movdqa [rdx], xmm5 ; out lo + movdqa xmm7, xmm6 ; save hi + ; move down + lea rsi, [rsi + 32 * 2 * 2] ; 2 rows + lea rdi, [rdi + 32 * 2] ; 1 row + lea rdx, [rdx + 32 * 2] ; 1 row + + ; loop + shl ecx, 16 + mov cx, 14 +loop2d: + movdqa xmm1, xmm3 ; src[2n] + movdqa xmm2, [rsi + 32 * 2] ; src[2n + 1] + movdqa xmm3, [rsi + 32 * 2 * 2] ; src[2n + 2] + movdqa xmm4, xmm1 + movdqa xmm5, xmm2 + movdqa xmm6, xmm3 + ; h[n] = (src[2n + 1] - ((src[2n] + src[2n + 2]) >> 1)) >> 1 + paddw xmm4, xmm6 + psraw xmm4, 1 + psubw xmm5, xmm4 + psraw xmm5, 1 + movdqa [rdi], xmm5 ; out hi + movdqa xmm6, xmm5 ; save hi + ; l[n] = src[2n] + ((h[n - 1] + h[n]) >> 1) + paddw xmm5, xmm7 + psraw xmm5, 1 + paddw xmm5, xmm1 + movdqa [rdx], xmm5 ; out lo + movdqa xmm7, xmm6 ; save hi + ; move down + lea rsi, [rsi + 32 * 2 * 2] ; 2 rows + lea rdi, [rdi + 32 * 2] ; 1 row + lea rdx, [rdx + 32 * 2] ; 1 row + + dec cx + jnz loop2d + shr ecx, 16 + + ; post + movdqa xmm1, xmm3 ; src[2n] + movdqa xmm2, [rsi + 32 * 2] ; src[2n + 1] + movdqa xmm4, xmm1 + movdqa xmm5, xmm2 + movdqa xmm6, xmm3 + ; h[n] = (src[2n + 1] - ((src[2n] + src[2n + 2]) >> 1)) >> 1 + paddw xmm4, xmm6 + psraw xmm4, 1 + psubw xmm5, xmm4 + psraw xmm5, 1 + movdqa [rdi], xmm5 ; out hi + ; l[n] = src[2n] + ((h[n - 1] + h[n]) >> 1) + paddw xmm5, xmm7 + psraw xmm5, 1 + paddw xmm5, xmm1 + movdqa [rdx], xmm5 ; out lo + ; move down + lea rsi, [rsi + 32 * 2 * 2] ; 2 row + lea rdi, [rdi + 32 * 2] ; 1 row + lea 
rdx, [rdx + 32 * 2] ; 1 row + + ; move up + lea rsi, [rsi - 32 * 32 * 2] + lea rdi, [rdi - 16 * 32 * 2] + lea rdx, [rdx - 16 * 32 * 2] + + ; move right + lea rsi, [rsi + 16] + lea rdi, [rdi + 16] + lea rdx, [rdx + 16] + + dec ecx + jnz loop1d + + ret + +;****************************************************************************** +; source 16 bit signed, 64 pixel width +rfx_dwt_2d_encode_block_horiz_16_64: + mov ecx, 32 +loop1e: + ; pre + movdqa xmm1, [rsi] ; src[2n] + movdqa xmm2, [rsi + 16] + movdqa xmm6, xmm1 + movdqa xmm7, xmm2 + pand xmm1, [rel cdFFFF] + pand xmm2, [rel cdFFFF] + pslld xmm1, 16 + pslld xmm2, 16 + psrad xmm1, 16 + psrad xmm2, 16 + packssdw xmm1, xmm2 + movdqa xmm2, xmm6 ; src[2n + 1] + movdqa xmm3, xmm7 + psrldq xmm2, 2 + psrldq xmm3, 2 + pand xmm2, [rel cdFFFF] + pand xmm3, [rel cdFFFF] + pslld xmm2, 16 + pslld xmm3, 16 + psrad xmm2, 16 + psrad xmm3, 16 + packssdw xmm2, xmm3 + movdqa xmm3, xmm6 ; src[2n + 2] + movdqa xmm4, xmm7 + psrldq xmm3, 4 + psrldq xmm4, 4 + movd eax, xmm7 + movd xmm5, eax + pslldq xmm5, 12 + por xmm3, xmm5 + mov eax, [rsi + 32] + movd xmm5, eax + pslldq xmm5, 12 + por xmm4, xmm5 + pand xmm3, [rel cdFFFF] + pand xmm4, [rel cdFFFF] + pslld xmm3, 16 + pslld xmm4, 16 + psrad xmm3, 16 + psrad xmm4, 16 + packssdw xmm3, xmm4 + movdqa xmm4, xmm1 + movdqa xmm5, xmm2 + movdqa xmm6, xmm3 + ; h[n] = (src[2n + 1] - ((src[2n] + src[2n + 2]) >> 1)) >> 1 + paddw xmm4, xmm6 + psraw xmm4, 1 + psubw xmm5, xmm4 + psraw xmm5, 1 + + movdqa xmm6, xmm5 ; out hi + paddw xmm6, xmm8 + psraw xmm6, xmm9 + movdqa [rdi], xmm6 + movdqa xmm2, xmm5 ; save hi + + ; l[n] = src[2n] + ((h[n - 1] + h[n]) >> 1) + movdqa xmm7, xmm5 + movd eax, xmm7 + pslldq xmm7, 2 + and eax, 0xFFFF + movd xmm6, eax + por xmm7, xmm6 + paddw xmm5, xmm7 + psraw xmm5, 1 + paddw xmm5, xmm1 + + psrldq xmm2, 14 + movd ebx, xmm2 ; save hi + + movdqa xmm6, xmm5 ; out lo + paddw xmm6, xmm10 + psraw xmm6, xmm11 + movdqa [rdx], xmm6 + + ; move right + lea rsi, [rsi + 16 * 2] + lea rdi, 
[rdi + 8 * 2] + lea rdx, [rdx + 8 * 2] + + ; loop + shl ecx, 16 + mov cx, 2 +loop2e: + movdqa xmm1, [rsi] ; src[2n] + movdqa xmm2, [rsi + 16] + movdqa xmm6, xmm1 + movdqa xmm7, xmm2 + pand xmm1, [rel cdFFFF] + pand xmm2, [rel cdFFFF] + pslld xmm1, 16 + pslld xmm2, 16 + psrad xmm1, 16 + psrad xmm2, 16 + packssdw xmm1, xmm2 + movdqa xmm2, xmm6 ; src[2n + 1] + movdqa xmm3, xmm7 + psrldq xmm2, 2 + psrldq xmm3, 2 + pand xmm2, [rel cdFFFF] + pand xmm3, [rel cdFFFF] + pslld xmm2, 16 + pslld xmm3, 16 + psrad xmm2, 16 + psrad xmm3, 16 + packssdw xmm2, xmm3 + movdqa xmm3, xmm6 ; src[2n + 2] + movdqa xmm4, xmm7 + psrldq xmm3, 4 + psrldq xmm4, 4 + movd eax, xmm7 + movd xmm5, eax + pslldq xmm5, 12 + por xmm3, xmm5 + mov eax, [rsi + 32] + movd xmm5, eax + pslldq xmm5, 12 + por xmm4, xmm5 + pand xmm3, [rel cdFFFF] + pand xmm4, [rel cdFFFF] + pslld xmm3, 16 + pslld xmm4, 16 + psrad xmm3, 16 + psrad xmm4, 16 + packssdw xmm3, xmm4 + movdqa xmm4, xmm1 + movdqa xmm5, xmm2 + movdqa xmm6, xmm3 + ; h[n] = (src[2n + 1] - ((src[2n] + src[2n + 2]) >> 1)) >> 1 + paddw xmm4, xmm6 + psraw xmm4, 1 + psubw xmm5, xmm4 + psraw xmm5, 1 + + movdqa xmm6, xmm5 ; out hi + paddw xmm6, xmm8 + psraw xmm6, xmm9 + movdqa [rdi], xmm6 + movdqa xmm2, xmm5 ; save hi + + ; l[n] = src[2n] + ((h[n - 1] + h[n]) >> 1) + movdqa xmm7, xmm5 + pslldq xmm7, 2 + movd xmm6, ebx + por xmm7, xmm6 + paddw xmm5, xmm7 + psraw xmm5, 1 + paddw xmm5, xmm1 + + psrldq xmm2, 14 + movd ebx, xmm2 ; save hi + + movdqa xmm6, xmm5 ; out lo + paddw xmm6, xmm10 + psraw xmm6, xmm11 + movdqa [rdx], xmm6 + + ; move right + lea rsi, [rsi + 16 * 2] + lea rdi, [rdi + 8 * 2] + lea rdx, [rdx + 8 * 2] + + dec cx + jnz loop2e + shr ecx, 16 + + ; post + movdqa xmm1, [rsi] ; src[2n] + movdqa xmm2, [rsi + 16] + movdqa xmm6, xmm1 + movdqa xmm7, xmm2 + pand xmm1, [rel cdFFFF] + pand xmm2, [rel cdFFFF] + pslld xmm1, 16 + pslld xmm2, 16 + psrad xmm1, 16 + psrad xmm2, 16 + packssdw xmm1, xmm2 + movdqa xmm2, xmm6 ; src[2n + 1] + movdqa xmm3, xmm7 + psrldq 
xmm2, 2 + psrldq xmm3, 2 + pand xmm2, [rel cdFFFF] + pand xmm3, [rel cdFFFF] + pslld xmm2, 16 + pslld xmm3, 16 + psrad xmm2, 16 + psrad xmm3, 16 + packssdw xmm2, xmm3 + movdqa xmm3, xmm6 ; src[2n + 2] + movdqa xmm4, xmm7 + psrldq xmm3, 4 + psrldq xmm4, 4 + movd eax, xmm7 + movd xmm5, eax + pslldq xmm5, 12 + por xmm3, xmm5 + movdqa xmm5, xmm7 + psrldq xmm5, 12 + pslldq xmm5, 12 + por xmm4, xmm5 + pand xmm3, [rel cdFFFF] + pand xmm4, [rel cdFFFF] + pslld xmm3, 16 + pslld xmm4, 16 + psrad xmm3, 16 + psrad xmm4, 16 + packssdw xmm3, xmm4 + movdqa xmm4, xmm1 + movdqa xmm5, xmm2 + movdqa xmm6, xmm3 + ; h[n] = (src[2n + 1] - ((src[2n] + src[2n + 2]) >> 1)) >> 1 + paddw xmm4, xmm6 + psraw xmm4, 1 + psubw xmm5, xmm4 + psraw xmm5, 1 + + movdqa xmm6, xmm5 ; out hi + paddw xmm6, xmm8 + psraw xmm6, xmm9 + movdqa [rdi], xmm6 + + ; l[n] = src[2n] + ((h[n - 1] + h[n]) >> 1) + movdqa xmm7, xmm5 + pslldq xmm7, 2 + movd xmm6, ebx + por xmm7, xmm6 + paddw xmm5, xmm7 + psraw xmm5, 1 + paddw xmm5, xmm1 + + movdqa xmm6, xmm5 ; out lo + paddw xmm6, xmm10 + psraw xmm6, xmm11 + movdqa [rdx], xmm6 + + ; move right + lea rsi, [rsi + 16 * 2] + lea rdi, [rdi + 8 * 2] + lea rdx, [rdx + 8 * 2] + + ; move left + lea rsi, [rsi - 64 * 2] + lea rdi, [rdi - 32 * 2] + lea rdx, [rdx - 32 * 2] + + ; move down + lea rsi, [rsi + 64 * 2] + lea rdi, [rdi + 32 * 2] + lea rdx, [rdx + 32 * 2] + + dec ecx + jnz loop1e + + ret + +;****************************************************************************** +; source 16 bit signed, 64 pixel width +rfx_dwt_2d_encode_block_horiz_16_64_no_lo: + mov ecx, 32 +loop1e1: + ; pre + movdqa xmm1, [rsi] ; src[2n] + movdqa xmm2, [rsi + 16] + movdqa xmm6, xmm1 + movdqa xmm7, xmm2 + pand xmm1, [rel cdFFFF] + pand xmm2, [rel cdFFFF] + pslld xmm1, 16 + pslld xmm2, 16 + psrad xmm1, 16 + psrad xmm2, 16 + packssdw xmm1, xmm2 + movdqa xmm2, xmm6 ; src[2n + 1] + movdqa xmm3, xmm7 + psrldq xmm2, 2 + psrldq xmm3, 2 + pand xmm2, [rel cdFFFF] + pand xmm3, [rel cdFFFF] + pslld xmm2, 16 + 
pslld xmm3, 16 + psrad xmm2, 16 + psrad xmm3, 16 + packssdw xmm2, xmm3 + movdqa xmm3, xmm6 ; src[2n + 2] + movdqa xmm4, xmm7 + psrldq xmm3, 4 + psrldq xmm4, 4 + movd eax, xmm7 + movd xmm5, eax + pslldq xmm5, 12 + por xmm3, xmm5 + mov eax, [rsi + 32] + movd xmm5, eax + pslldq xmm5, 12 + por xmm4, xmm5 + pand xmm3, [rel cdFFFF] + pand xmm4, [rel cdFFFF] + pslld xmm3, 16 + pslld xmm4, 16 + psrad xmm3, 16 + psrad xmm4, 16 + packssdw xmm3, xmm4 + movdqa xmm4, xmm1 + movdqa xmm5, xmm2 + movdqa xmm6, xmm3 + ; h[n] = (src[2n + 1] - ((src[2n] + src[2n + 2]) >> 1)) >> 1 + paddw xmm4, xmm6 + psraw xmm4, 1 + psubw xmm5, xmm4 + psraw xmm5, 1 + + movdqa xmm6, xmm5 ; out hi + paddw xmm6, xmm8 + psraw xmm6, xmm9 + movdqa [rdi], xmm6 + movdqa xmm2, xmm5 ; save hi + + ; l[n] = src[2n] + ((h[n - 1] + h[n]) >> 1) + movdqa xmm7, xmm5 + movd eax, xmm7 + pslldq xmm7, 2 + and eax, 0xFFFF + movd xmm6, eax + por xmm7, xmm6 + paddw xmm5, xmm7 + psraw xmm5, 1 + paddw xmm5, xmm1 + + psrldq xmm2, 14 + movd ebx, xmm2 ; save hi + + movdqa [rdx], xmm5 ; out lo + + ; move right + lea rsi, [rsi + 16 * 2] + lea rdi, [rdi + 8 * 2] + lea rdx, [rdx + 8 * 2] + + ; loop + shl ecx, 16 + mov cx, 2 +loop2e1: + movdqa xmm1, [rsi] ; src[2n] + movdqa xmm2, [rsi + 16] + movdqa xmm6, xmm1 + movdqa xmm7, xmm2 + pand xmm1, [rel cdFFFF] + pand xmm2, [rel cdFFFF] + pslld xmm1, 16 + pslld xmm2, 16 + psrad xmm1, 16 + psrad xmm2, 16 + packssdw xmm1, xmm2 + movdqa xmm2, xmm6 ; src[2n + 1] + movdqa xmm3, xmm7 + psrldq xmm2, 2 + psrldq xmm3, 2 + pand xmm2, [rel cdFFFF] + pand xmm3, [rel cdFFFF] + pslld xmm2, 16 + pslld xmm3, 16 + psrad xmm2, 16 + psrad xmm3, 16 + packssdw xmm2, xmm3 + movdqa xmm3, xmm6 ; src[2n + 2] + movdqa xmm4, xmm7 + psrldq xmm3, 4 + psrldq xmm4, 4 + movd eax, xmm7 + movd xmm5, eax + pslldq xmm5, 12 + por xmm3, xmm5 + mov eax, [rsi + 32] + movd xmm5, eax + pslldq xmm5, 12 + por xmm4, xmm5 + pand xmm3, [rel cdFFFF] + pand xmm4, [rel cdFFFF] + pslld xmm3, 16 + pslld xmm4, 16 + psrad xmm3, 16 + psrad 
xmm4, 16 + packssdw xmm3, xmm4 + movdqa xmm4, xmm1 + movdqa xmm5, xmm2 + movdqa xmm6, xmm3 + ; h[n] = (src[2n + 1] - ((src[2n] + src[2n + 2]) >> 1)) >> 1 + paddw xmm4, xmm6 + psraw xmm4, 1 + psubw xmm5, xmm4 + psraw xmm5, 1 + + movdqa xmm6, xmm5 ; out hi + paddw xmm6, xmm8 + psraw xmm6, xmm9 + movdqa [rdi], xmm6 + movdqa xmm2, xmm5 ; save hi + + ; l[n] = src[2n] + ((h[n - 1] + h[n]) >> 1) + movdqa xmm7, xmm5 + pslldq xmm7, 2 + movd xmm6, ebx + por xmm7, xmm6 + paddw xmm5, xmm7 + psraw xmm5, 1 + paddw xmm5, xmm1 + + psrldq xmm2, 14 + movd ebx, xmm2 ; save hi + + movdqa [rdx], xmm5 ; out lo + + ; move right + lea rsi, [rsi + 16 * 2] + lea rdi, [rdi + 8 * 2] + lea rdx, [rdx + 8 * 2] + + dec cx + jnz loop2e1 + shr ecx, 16 + + ; post + movdqa xmm1, [rsi] ; src[2n] + movdqa xmm2, [rsi + 16] + movdqa xmm6, xmm1 + movdqa xmm7, xmm2 + pand xmm1, [rel cdFFFF] + pand xmm2, [rel cdFFFF] + pslld xmm1, 16 + pslld xmm2, 16 + psrad xmm1, 16 + psrad xmm2, 16 + packssdw xmm1, xmm2 + movdqa xmm2, xmm6 ; src[2n + 1] + movdqa xmm3, xmm7 + psrldq xmm2, 2 + psrldq xmm3, 2 + pand xmm2, [rel cdFFFF] + pand xmm3, [rel cdFFFF] + pslld xmm2, 16 + pslld xmm3, 16 + psrad xmm2, 16 + psrad xmm3, 16 + packssdw xmm2, xmm3 + movdqa xmm3, xmm6 ; src[2n + 2] + movdqa xmm4, xmm7 + psrldq xmm3, 4 + psrldq xmm4, 4 + movd eax, xmm7 + movd xmm5, eax + pslldq xmm5, 12 + por xmm3, xmm5 + movdqa xmm5, xmm7 + psrldq xmm5, 12 + pslldq xmm5, 12 + por xmm4, xmm5 + pand xmm3, [rel cdFFFF] + pand xmm4, [rel cdFFFF] + pslld xmm3, 16 + pslld xmm4, 16 + psrad xmm3, 16 + psrad xmm4, 16 + packssdw xmm3, xmm4 + movdqa xmm4, xmm1 + movdqa xmm5, xmm2 + movdqa xmm6, xmm3 + ; h[n] = (src[2n + 1] - ((src[2n] + src[2n + 2]) >> 1)) >> 1 + paddw xmm4, xmm6 + psraw xmm4, 1 + psubw xmm5, xmm4 + psraw xmm5, 1 + + movdqa xmm6, xmm5 ; out hi + paddw xmm6, xmm8 + psraw xmm6, xmm9 + movdqa [rdi], xmm6 + + ; l[n] = src[2n] + ((h[n - 1] + h[n]) >> 1) + movdqa xmm7, xmm5 + pslldq xmm7, 2 + movd xmm6, ebx + por xmm7, xmm6 + paddw xmm5, 
xmm7 + psraw xmm5, 1 + paddw xmm5, xmm1 + + movdqa [rdx], xmm5 ; out lo + + ; move right + lea rsi, [rsi + 16 * 2] + lea rdi, [rdi + 8 * 2] + lea rdx, [rdx + 8 * 2] + + ; move left + lea rsi, [rsi - 64 * 2] + lea rdi, [rdi - 32 * 2] + lea rdx, [rdx - 32 * 2] + + ; move down + lea rsi, [rsi + 64 * 2] + lea rdi, [rdi + 32 * 2] + lea rdx, [rdx + 32 * 2] + + dec ecx + jnz loop1e1 + + ret + +;****************************************************************************** +; source 8 bit unsigned, 64 pixel width +rfx_dwt_2d_encode_block_verti_8_64: + mov ecx, 8 +loop1f: + ; pre + movq xmm1, [rsi] ; src[2n] + movq xmm2, [rsi + 64 * 1] ; src[2n + 1] + movq xmm3, [rsi + 64 * 1 * 2] ; src[2n + 2] + punpcklbw xmm1, xmm0 + punpcklbw xmm2, xmm0 + punpcklbw xmm3, xmm0 + psubw xmm1, [rel cw128] + psubw xmm2, [rel cw128] + psubw xmm3, [rel cw128] + psllw xmm1, 5 + psllw xmm2, 5 + psllw xmm3, 5 + movdqa xmm4, xmm1 + movdqa xmm5, xmm2 + movdqa xmm6, xmm3 + ; h[n] = (src[2n + 1] - ((src[2n] + src[2n + 2]) >> 1)) >> 1 + paddw xmm4, xmm6 + psraw xmm4, 1 + psubw xmm5, xmm4 + psraw xmm5, 1 + movdqa [rdi], xmm5 ; out hi + movdqa xmm6, xmm5 ; save hi + ; l[n] = src[2n] + ((h[n - 1] + h[n]) >> 1) + paddw xmm5, xmm1 + movdqa [rdx], xmm5 ; out lo + movdqa xmm7, xmm6 ; save hi + ; move down + lea rsi, [rsi + 64 * 1 * 2] ; 2 rows + lea rdi, [rdi + 64 * 2] ; 1 row + lea rdx, [rdx + 64 * 2] ; 1 row + + ; loop + shl ecx, 16 + mov cx, 30 +loop2f: + movdqa xmm1, xmm3 ; src[2n] + movq xmm2, [rsi + 64 * 1] ; src[2n + 1] + movq xmm3, [rsi + 64 * 1 * 2] ; src[2n + 2] + punpcklbw xmm2, xmm0 + punpcklbw xmm3, xmm0 + psubw xmm2, [rel cw128] + psubw xmm3, [rel cw128] + psllw xmm2, 5 + psllw xmm3, 5 + movdqa xmm4, xmm1 + movdqa xmm5, xmm2 + movdqa xmm6, xmm3 + ; h[n] = (src[2n + 1] - ((src[2n] + src[2n + 2]) >> 1)) >> 1 + paddw xmm4, xmm6 + psraw xmm4, 1 + psubw xmm5, xmm4 + psraw xmm5, 1 + movdqa [rdi], xmm5 ; out hi + movdqa xmm6, xmm5 ; save hi + ; l[n] = src[2n] + ((h[n - 1] + h[n]) >> 1) + paddw xmm5, 
xmm7 + psraw xmm5, 1 + paddw xmm5, xmm1 + movdqa [rdx], xmm5 ; out lo + movdqa xmm7, xmm6 ; save hi + ; move down + lea rsi, [rsi + 64 * 1 * 2] ; 2 rows + lea rdi, [rdi + 64 * 2] ; 1 row + lea rdx, [rdx + 64 * 2] ; 1 row + + dec cx + jnz loop2f + shr ecx, 16 + + ; post + movdqa xmm1, xmm3 ; src[2n] + movq xmm2, [rsi + 64 * 1] ; src[2n + 1] + punpcklbw xmm2, xmm0 + psubw xmm2, [rel cw128] + psllw xmm2, 5 + movdqa xmm4, xmm1 + movdqa xmm5, xmm2 + movdqa xmm6, xmm3 + ; h[n] = (src[2n + 1] - ((src[2n] + src[2n + 2]) >> 1)) >> 1 + paddw xmm4, xmm6 + psraw xmm4, 1 + psubw xmm5, xmm4 + psraw xmm5, 1 + movdqa [rdi], xmm5 ; out hi + ; l[n] = src[2n] + ((h[n - 1] + h[n]) >> 1) + paddw xmm5, xmm7 + psraw xmm5, 1 + paddw xmm5, xmm1 + movdqa [rdx], xmm5 ; out lo + ; move down + lea rsi, [rsi + 64 * 1 * 2] ; 2 rows + lea rdi, [rdi + 64 * 2] ; 1 row + lea rdx, [rdx + 64 * 2] ; 1 row + + ; move up + lea rsi, [rsi - 64 * 1 * 64] + lea rdi, [rdi - 32 * 64 * 2] + lea rdx, [rdx - 32 * 64 * 2] + + ; move right + lea rsi, [rsi + 8] + lea rdi, [rdi + 16] + lea rdx, [rdx + 16] + + dec ecx + jnz loop1f + + ret + +set_quants_hi: ; in: rax = quant value the caller read from qtable; out: xmm8, xmm9; clobbers rax, rdx + sub rax, 6 - 5 ; rax = quant value minus 1 + movd xmm9, eax ; xmm9 = shift count the horiz kernels apply to hi output (psraw xmm6, xmm9) + imul rax, 16 ; each cwa table entry is 16 bytes + lea rdx, [rel cwa0] + add rdx, rax + movdqa xmm8, [rdx] ; xmm8 = term the horiz kernels add to hi output before shifting + ret + +set_quants_lo: ; same as set_quants_hi, but loads xmm11 (shift count) and xmm10 (added term) for lo output + sub rax, 6 - 5 + movd xmm11, eax + imul rax, 16 + lea rdx, [rel cwa0] + add rdx, rax + movdqa xmm10, [rdx] + ret + +;The first six integer or pointer arguments are passed in registers +;RDI, RSI, RDX, RCX, R8, and R9 + +;int +;rfxcodec_encode_dwt_shift_amd64_sse2(const char *qtable, +; unsigned char *in_buffer, +; short *out_buffer, +; short *work_buffer); + +;****************************************************************************** +%ifidn __OUTPUT_FORMAT__,elf64 +PROC rfxcodec_encode_dwt_shift_amd64_sse2 +%else +PROC _rfxcodec_encode_dwt_shift_amd64_sse2 +%endif + ; save registers; after these pushes [rsp] = rdi (qtable), [rsp+8] = rsi (in_buffer), [rsp+16] = rcx (work_buffer), [rsp+24] = rdx (out_buffer) + push rbx + push rdx + push rcx + push rsi + push rdi + pxor xmm0, xmm0 ; xmm0 = 0, the punpcklbw operand in the 8 bit vertical kernel + + ; vertical DWT to work buffer, level 1 + mov rsi, [rsp + 8] ; src + mov
rdi, [rsp + 16] ; dst hi + lea rdi, [rdi + 64 * 32 * 2] ; dst hi + mov rdx, [rsp + 16] ; dst lo + call rfx_dwt_2d_encode_block_verti_8_64 + + ; horizontal DWT to out buffer, level 1, part 1 + xor rax, rax + mov rdx, [rsp] + mov al, [rdx + 4] + and al, 0xF + call set_quants_hi + mov rsi, [rsp + 16] ; src + mov rdi, [rsp + 24] ; dst hi - HL1 + mov rdx, [rsp + 24] ; dst lo - LL1 + lea rdx, [rdx + 32 * 32 * 6] ; dst lo - LL1 + call rfx_dwt_2d_encode_block_horiz_16_64_no_lo + + ; horizontal DWT to out buffer, level 1, part 2 + xor rax, rax + mov rdx, [rsp] + mov al, [rdx + 4] + shr al, 4 + call set_quants_hi + xor rax, rax + mov rdx, [rsp] + mov al, [rdx + 3] + shr al, 4 + call set_quants_lo + mov rsi, [rsp + 16] ; src + lea rsi, [rsi + 64 * 32 * 2] ; src + mov rdi, [rsp + 24] ; dst hi - HH1 + lea rdi, [rdi + 32 * 32 * 4] ; dst hi - HH1 + mov rdx, [rsp + 24] ; dst lo - LH1 + lea rdx, [rdx + 32 * 32 * 2] ; dst lo - LH1 + call rfx_dwt_2d_encode_block_horiz_16_64 + + ; vertical DWT to work buffer, level 2 + mov rsi, [rsp + 24] ; src + lea rsi, [rsi + 32 * 32 * 6] ; src + mov rdi, [rsp + 16] ; dst hi + lea rdi, [rdi + 32 * 16 * 2] ; dst hi + mov rdx, [rsp + 16] ; dst lo + call rfx_dwt_2d_encode_block_verti_16_32 + + ; horizontal DWT to out buffer, level 2, part 1 + xor rax, rax + mov rdx, [rsp] + mov al, [rdx + 2] + shr al, 4 + call set_quants_hi + mov rsi, [rsp + 16] ; src + ; 32 * 32 * 6 + 16 * 16 * 0 = 6144 + mov rdi, [rsp + 24] ; dst hi - HL2 + lea rdi, [rdi + 6144] ; dst hi - HL2 + ; 32 * 32 * 6 + 16 * 16 * 6 = 7680 + mov rdx, [rsp + 24] ; dst lo - LL2 + lea rdx, [rdx + 7680] ; dst lo - LL2 + call rfx_dwt_2d_encode_block_horiz_16_32_no_lo + + ; horizontal DWT to out buffer, level 2, part 2 + xor rax, rax + mov rdx, [rsp] + mov al, [rdx + 3] + and al, 0xF + call set_quants_hi + xor rax, rax + mov rdx, [rsp] + mov al, [rdx + 2] + and al, 0xF + call set_quants_lo + mov rsi, [rsp + 16] ; src + lea rsi, [rsi + 32 * 16 * 2] ; src + ; 32 * 32 * 6 + 16 * 16 * 4 = 7168 + mov
rdi, [rsp + 24] ; dst hi - HH2 + lea rdi, [rdi + 7168] ; dst hi - HH2 + ; 32 * 32 * 6 + 16 * 16 * 2 = 6656 + mov rdx, [rsp + 24] ; dst lo - LH2 + lea rdx, [rdx + 6656] ; dst lo - LH2 + call rfx_dwt_2d_encode_block_horiz_16_32 + + ; vertical DWT to work buffer, level 3 + ; 32 * 32 * 6 + 16 * 16 * 6 = 7680 + mov rsi, [rsp + 24] ; src + lea rsi, [rsi + 7680] ; src + mov rdi, [rsp + 16] ; dst hi + lea rdi, [rdi + 16 * 8 * 2] ; dst hi + mov rdx, [rsp + 16] ; dst lo + call rfx_dwt_2d_encode_block_verti_16_16 + + ; horizontal DWT to out buffer, level 3, part 1 + xor rax, rax + mov rdx, [rsp] + mov al, [rdx + 1] + and al, 0xF + call set_quants_hi + xor rax, rax + mov rdx, [rsp] + mov al, [rdx + 0] + and al, 0xF + call set_quants_lo + mov rsi, [rsp + 16] ; src + ; 32 * 32 * 6 + 16 * 16 * 6 + 8 * 8 * 0 = 7680 + mov rdi, [rsp + 24] ; dst hi - HL3 + lea rdi, [rdi + 7680] ; dst hi - HL3 + ; 32 * 32 * 6 + 16 * 16 * 6 + 8 * 8 * 6 = 8064 + mov rdx, [rsp + 24] ; dst lo - LL3 + lea rdx, [rdx + 8064] ; dst lo - LL3 + call rfx_dwt_2d_encode_block_horiz_16_16 + + ; horizontal DWT to out buffer, level 3, part 2 + xor rax, rax + mov rdx, [rsp] + mov al, [rdx + 1] + shr al, 4 + call set_quants_hi + xor rax, rax + mov rdx, [rsp] + mov al, [rdx + 0] + shr al, 4 + call set_quants_lo + mov rsi, [rsp + 16] ; src + lea rsi, [rsi + 16 * 8 * 2] ; src + ; 32 * 32 * 6 + 16 * 16 * 6 + 8 * 8 * 4 = 7936 + mov rdi, [rsp + 24] ; dst hi - HH3 + lea rdi, [rdi + 7936] ; dst hi - HH3 + ; 32 * 32 * 6 + 16 * 16 * 6 + 8 * 8 * 2 = 7808 + mov rdx, [rsp + 24] ; dst lo - LH3 + lea rdx, [rdx + 7808] ; dst lo - LH3 + call rfx_dwt_2d_encode_block_horiz_16_16 + + mov rax, 0 ; return value 0 + ; restore registers + pop rdi + pop rsi + pop rcx + pop rdx + pop rbx + ret + align 16 + diff --git a/src/amd64/rfxcodec_encode_dwt_shift_amd64_sse41.asm b/src/amd64/rfxcodec_encode_dwt_shift_amd64_sse41.asm new file mode 100644 index 0000000..ab52808 --- /dev/null +++ b/src/amd64/rfxcodec_encode_dwt_shift_amd64_sse41.asm @@ -0,0 +1,1371 @@ +;
+;Copyright 2016 Jay Sorg +; +;Permission to use, copy, modify, distribute, and sell this software and its +;documentation for any purpose is hereby granted without fee, provided that +;the above copyright notice appear in all copies and that both that +;copyright notice and this permission notice appear in supporting +;documentation. +; +;The above copyright notice and this permission notice shall be included in +;all copies or substantial portions of the Software. +; +;THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +;IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +;FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +;OPEN GROUP BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN +;AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN +;CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. +; +;amd64 asm dwt + +%ifidn __OUTPUT_FORMAT__,elf64 +section .note.GNU-stack noalloc noexec nowrite progbits +%endif + +section .data + align 16 + cw128 times 8 dw 128 + cdFFFF times 4 dd 65535 + ; these are 1 << (factor - 1) 0 to 15 is factor + cwa0 times 8 dw 0 ; 0 + cwa1 times 8 dw 1 ; 1 + cwa2 times 8 dw 2 ; 2 + cwa4 times 8 dw 4 ; 3 + cwa8 times 8 dw 8 ; 4 + cwa16 times 8 dw 16 ; 5 + cwa32 times 8 dw 32 ; 6 + cwa64 times 8 dw 64 ; 7 + cwa128 times 8 dw 128 ; 8 + cwa256 times 8 dw 256 ; 9 + cwa512 times 8 dw 512 ; 10 + cwa1024 times 8 dw 1024 ; 11 + cwa2048 times 8 dw 2048 ; 12 + cwa4096 times 8 dw 4096 ; 13 + cwa8192 times 8 dw 8192 ; 14 + cwa16384 times 8 dw 16384 ; 15 + +section .text + +%macro PROC 1 + align 16 + global %1 + %1: +%endmacro + +;****************************************************************************** +; source 16 bit signed, 16 pixel width +rfx_dwt_2d_encode_block_horiz_16_16: + mov ecx, 8 +loop1a: + ; pre / post + movdqa xmm1, [rsi] ; src[2n] + movdqa xmm2, [rsi + 16] + movdqa xmm6, xmm1 + movdqa xmm7, xmm2 + pand xmm1, 
[rel cdFFFF] + pand xmm2, [rel cdFFFF] + packusdw xmm1, xmm2 + movdqa xmm2, xmm6 ; src[2n + 1] + movdqa xmm3, xmm7 + psrldq xmm2, 2 + psrldq xmm3, 2 + pand xmm2, [rel cdFFFF] + pand xmm3, [rel cdFFFF] + packusdw xmm2, xmm3 + movdqa xmm3, xmm6 ; src[2n + 2] + movdqa xmm4, xmm7 + psrldq xmm3, 4 + psrldq xmm4, 4 + movd eax, xmm7 + movd xmm5, eax + pslldq xmm5, 12 + por xmm3, xmm5 + movdqa xmm5, xmm7 + psrldq xmm5, 12 + pslldq xmm5, 12 + por xmm4, xmm5 + pand xmm3, [rel cdFFFF] + pand xmm4, [rel cdFFFF] + packusdw xmm3, xmm4 + movdqa xmm4, xmm1 + movdqa xmm5, xmm2 + movdqa xmm6, xmm3 + ; h[n] = (src[2n + 1] - ((src[2n] + src[2n + 2]) >> 1)) >> 1 + paddw xmm4, xmm6 + psraw xmm4, 1 + psubw xmm5, xmm4 + psraw xmm5, 1 + movdqa xmm6, xmm5 ; out hi + paddw xmm6, xmm8 + psraw xmm6, xmm9 + movdqa [rdi], xmm6 + ; l[n] = src[2n] + ((h[n - 1] + h[n]) >> 1) + movdqa xmm7, xmm5 + movd eax, xmm7 + pslldq xmm7, 2 + and eax, 0xFFFF + movd xmm6, eax + por xmm7, xmm6 + paddw xmm5, xmm7 + psraw xmm5, 1 + paddw xmm5, xmm1 + + movdqa xmm6, xmm5 ; out lo + paddw xmm6, xmm10 + psraw xmm6, xmm11 + movdqa [rdx], xmm6 + + ; move right + lea rsi, [rsi + 16 * 2] + lea rdi, [rdi + 8 * 2] + lea rdx, [rdx + 8 * 2] + + ; move left + lea rsi, [rsi - 16 * 2] + lea rdi, [rdi - 8 * 2] + lea rdx, [rdx - 8 * 2] + + ; move down + lea rsi, [rsi + 16 * 2] + lea rdi, [rdi + 8 * 2] + lea rdx, [rdx + 8 * 2] + + dec ecx + jnz loop1a + + ret + +;****************************************************************************** +; source 16 bit signed, 16 pixel width +rfx_dwt_2d_encode_block_verti_16_16: + mov ecx, 2 +loop1b: + ; pre + movdqa xmm1, [rsi] ; src[2n] + movdqa xmm2, [rsi + 16 * 2] ; src[2n + 1] + movdqa xmm3, [rsi + 16 * 2 * 2] ; src[2n + 2] + movdqa xmm4, xmm1 + movdqa xmm5, xmm2 + movdqa xmm6, xmm3 + ; h[n] = (src[2n + 1] - ((src[2n] + src[2n + 2]) >> 1)) >> 1 + paddw xmm4, xmm6 + psraw xmm4, 1 + psubw xmm5, xmm4 + psraw xmm5, 1 + movdqa [rdi], xmm5 ; out hi + movdqa xmm6, xmm5 ; save hi + ; l[n] = 
src[2n] + ((h[n - 1] + h[n]) >> 1) + paddw xmm5, xmm1 + movdqa [rdx], xmm5 ; out lo + movdqa xmm7, xmm6 ; save hi + ; move down + lea rsi, [rsi + 16 * 2 * 2] ; 2 rows + lea rdi, [rdi + 16 * 2] ; 1 row + lea rdx, [rdx + 16 * 2] ; 1 row + + ; loop + shl ecx, 16 + mov cx, 6 +loop2b: + movdqa xmm1, xmm3 ; src[2n] + movdqa xmm2, [rsi + 16 * 2] ; src[2n + 1] + movdqa xmm3, [rsi + 16 * 2 * 2] ; src[2n + 2] + movdqa xmm4, xmm1 + movdqa xmm5, xmm2 + movdqa xmm6, xmm3 + ; h[n] = (src[2n + 1] - ((src[2n] + src[2n + 2]) >> 1)) >> 1 + paddw xmm4, xmm6 + psraw xmm4, 1 + psubw xmm5, xmm4 + psraw xmm5, 1 + movdqa [rdi], xmm5 ; out hi + movdqa xmm6, xmm5 ; save hi + ; l[n] = src[2n] + ((h[n - 1] + h[n]) >> 1) + paddw xmm5, xmm7 + psraw xmm5, 1 + paddw xmm5, xmm1 + movdqa [rdx], xmm5 ; out lo + movdqa xmm7, xmm6 ; save hi + ; move down + lea rsi, [rsi + 16 * 2 * 2] ; 2 rows + lea rdi, [rdi + 16 * 2] ; 1 row + lea rdx, [rdx + 16 * 2] ; 1 row + + dec cx + jnz loop2b + shr ecx, 16 + + ; post + movdqa xmm1, xmm3 ; src[2n] + movdqa xmm2, [rsi + 16 * 2] ; src[2n + 1] + movdqa xmm4, xmm1 + movdqa xmm5, xmm2 + movdqa xmm6, xmm3 + ; h[n] = (src[2n + 1] - ((src[2n] + src[2n + 2]) >> 1)) >> 1 + paddw xmm4, xmm6 + psraw xmm4, 1 + psubw xmm5, xmm4 + psraw xmm5, 1 + movdqa [rdi], xmm5 ; out hi + ; l[n] = src[2n] + ((h[n - 1] + h[n]) >> 1) + paddw xmm5, xmm7 + psraw xmm5, 1 + paddw xmm5, xmm1 + movdqa [rdx], xmm5 ; out lo + ; move down + lea rsi, [rsi + 16 * 2 * 2] ; 2 row + lea rdi, [rdi + 16 * 2] ; 1 row + lea rdx, [rdx + 16 * 2] ; 1 row + + ; move up + lea rsi, [rsi - 16 * 16 * 2] + lea rdi, [rdi - 8 * 16 * 2] + lea rdx, [rdx - 8 * 16 * 2] + + ; move right + lea rsi, [rsi + 16] + lea rdi, [rdi + 16] + lea rdx, [rdx + 16] + + dec ecx + jnz loop1b + + ret + +;****************************************************************************** +; source 16 bit signed, 32 pixel width +rfx_dwt_2d_encode_block_horiz_16_32: + mov ecx, 16 +loop1c: + ; pre + movdqa xmm1, [rsi] ; src[2n] + movdqa xmm2, [rsi + 
16] + movdqa xmm6, xmm1 + movdqa xmm7, xmm2 + pand xmm1, [rel cdFFFF] + pand xmm2, [rel cdFFFF] + packusdw xmm1, xmm2 + movdqa xmm2, xmm6 ; src[2n + 1] + movdqa xmm3, xmm7 + psrldq xmm2, 2 + psrldq xmm3, 2 + pand xmm2, [rel cdFFFF] + pand xmm3, [rel cdFFFF] + packusdw xmm2, xmm3 + movdqa xmm3, xmm6 ; src[2n + 2] + movdqa xmm4, xmm7 + psrldq xmm3, 4 + psrldq xmm4, 4 + movd eax, xmm7 + movd xmm5, eax + pslldq xmm5, 12 + por xmm3, xmm5 + mov eax, [rsi + 32] + movd xmm5, eax + pslldq xmm5, 12 + por xmm4, xmm5 + pand xmm3, [rel cdFFFF] + pand xmm4, [rel cdFFFF] + packusdw xmm3, xmm4 + movdqa xmm4, xmm1 + movdqa xmm5, xmm2 + movdqa xmm6, xmm3 + ; h[n] = (src[2n + 1] - ((src[2n] + src[2n + 2]) >> 1)) >> 1 + paddw xmm4, xmm6 + psraw xmm4, 1 + psubw xmm5, xmm4 + psraw xmm5, 1 + + movdqa xmm6, xmm5 ; out hi + paddw xmm6, xmm8 + psraw xmm6, xmm9 + movdqa [rdi], xmm6 + movdqa xmm2, xmm5 ; save hi + + ; l[n] = src[2n] + ((h[n - 1] + h[n]) >> 1) + movdqa xmm7, xmm5 + movd eax, xmm7 + pslldq xmm7, 2 + and eax, 0xFFFF + movd xmm6, eax + por xmm7, xmm6 + paddw xmm5, xmm7 + psraw xmm5, 1 + paddw xmm5, xmm1 + + psrldq xmm2, 14 + movd ebx, xmm2 ; save hi + + movdqa xmm6, xmm5 ; out lo + paddw xmm6, xmm10 + psraw xmm6, xmm11 + movdqa [rdx], xmm6 + + ; move right + lea rsi, [rsi + 16 * 2] + lea rdi, [rdi + 8 * 2] + lea rdx, [rdx + 8 * 2] + + ; post + movdqa xmm1, [rsi] ; src[2n] + movdqa xmm2, [rsi + 16] + movdqa xmm6, xmm1 + movdqa xmm7, xmm2 + pand xmm1, [rel cdFFFF] + pand xmm2, [rel cdFFFF] + packusdw xmm1, xmm2 + movdqa xmm2, xmm6 ; src[2n + 1] + movdqa xmm3, xmm7 + psrldq xmm2, 2 + psrldq xmm3, 2 + pand xmm2, [rel cdFFFF] + pand xmm3, [rel cdFFFF] + packusdw xmm2, xmm3 + movdqa xmm3, xmm6 ; src[2n + 2] + movdqa xmm4, xmm7 + psrldq xmm3, 4 + psrldq xmm4, 4 + movd eax, xmm7 + movd xmm5, eax + pslldq xmm5, 12 + por xmm3, xmm5 + movdqa xmm5, xmm7 + psrldq xmm5, 12 + pslldq xmm5, 12 + por xmm4, xmm5 + pand xmm3, [rel cdFFFF] + pand xmm4, [rel cdFFFF] + packusdw xmm3, xmm4 + movdqa 
xmm4, xmm1 + movdqa xmm5, xmm2 + movdqa xmm6, xmm3 + ; h[n] = (src[2n + 1] - ((src[2n] + src[2n + 2]) >> 1)) >> 1 + paddw xmm4, xmm6 + psraw xmm4, 1 + psubw xmm5, xmm4 + psraw xmm5, 1 + + movdqa xmm6, xmm5 ; out hi + paddw xmm6, xmm8 + psraw xmm6, xmm9 + movdqa [rdi], xmm6 + + ; l[n] = src[2n] + ((h[n - 1] + h[n]) >> 1) + movdqa xmm7, xmm5 + pslldq xmm7, 2 + movd xmm6, ebx + por xmm7, xmm6 + paddw xmm5, xmm7 + psraw xmm5, 1 + paddw xmm5, xmm1 + + movdqa xmm6, xmm5 ; out lo + paddw xmm6, xmm10 + psraw xmm6, xmm11 + movdqa [rdx], xmm6 + + ; move right + lea rsi, [rsi + 16 * 2] + lea rdi, [rdi + 8 * 2] + lea rdx, [rdx + 8 * 2] + + ; move left + lea rsi, [rsi - 32 * 2] + lea rdi, [rdi - 16 * 2] + lea rdx, [rdx - 16 * 2] + + ; move down + lea rsi, [rsi + 32 * 2] + lea rdi, [rdi + 16 * 2] + lea rdx, [rdx + 16 * 2] + + dec ecx + jnz loop1c + + ret + +;****************************************************************************** +; source 16 bit signed, 32 pixel width +rfx_dwt_2d_encode_block_horiz_16_32_no_lo: + mov ecx, 16 +loop1c1: + ; pre + movdqa xmm1, [rsi] ; src[2n] + movdqa xmm2, [rsi + 16] + movdqa xmm6, xmm1 + movdqa xmm7, xmm2 + pand xmm1, [rel cdFFFF] + pand xmm2, [rel cdFFFF] + packusdw xmm1, xmm2 + movdqa xmm2, xmm6 ; src[2n + 1] + movdqa xmm3, xmm7 + psrldq xmm2, 2 + psrldq xmm3, 2 + pand xmm2, [rel cdFFFF] + pand xmm3, [rel cdFFFF] + packusdw xmm2, xmm3 + movdqa xmm3, xmm6 ; src[2n + 2] + movdqa xmm4, xmm7 + psrldq xmm3, 4 + psrldq xmm4, 4 + movd eax, xmm7 + movd xmm5, eax + pslldq xmm5, 12 + por xmm3, xmm5 + mov eax, [rsi + 32] + movd xmm5, eax + pslldq xmm5, 12 + por xmm4, xmm5 + pand xmm3, [rel cdFFFF] + pand xmm4, [rel cdFFFF] + packusdw xmm3, xmm4 + movdqa xmm4, xmm1 + movdqa xmm5, xmm2 + movdqa xmm6, xmm3 + ; h[n] = (src[2n + 1] - ((src[2n] + src[2n + 2]) >> 1)) >> 1 + paddw xmm4, xmm6 + psraw xmm4, 1 + psubw xmm5, xmm4 + psraw xmm5, 1 + + movdqa xmm6, xmm5 ; out hi + paddw xmm6, xmm8 + psraw xmm6, xmm9 + movdqa [rdi], xmm6 + movdqa xmm2, xmm5 ; save 
hi + + ; l[n] = src[2n] + ((h[n - 1] + h[n]) >> 1) + movdqa xmm7, xmm5 + movd eax, xmm7 + pslldq xmm7, 2 + and eax, 0xFFFF + movd xmm6, eax + por xmm7, xmm6 + paddw xmm5, xmm7 + psraw xmm5, 1 + paddw xmm5, xmm1 + + psrldq xmm2, 14 + movd ebx, xmm2 ; save hi + + movdqa [rdx], xmm5 ; out lo + + ; move right + lea rsi, [rsi + 16 * 2] + lea rdi, [rdi + 8 * 2] + lea rdx, [rdx + 8 * 2] + + ; post + movdqa xmm1, [rsi] ; src[2n] + movdqa xmm2, [rsi + 16] + movdqa xmm6, xmm1 + movdqa xmm7, xmm2 + pand xmm1, [rel cdFFFF] + pand xmm2, [rel cdFFFF] + packusdw xmm1, xmm2 + movdqa xmm2, xmm6 ; src[2n + 1] + movdqa xmm3, xmm7 + psrldq xmm2, 2 + psrldq xmm3, 2 + pand xmm2, [rel cdFFFF] + pand xmm3, [rel cdFFFF] + packusdw xmm2, xmm3 + movdqa xmm3, xmm6 ; src[2n + 2] + movdqa xmm4, xmm7 + psrldq xmm3, 4 + psrldq xmm4, 4 + movd eax, xmm7 + movd xmm5, eax + pslldq xmm5, 12 + por xmm3, xmm5 + movdqa xmm5, xmm7 + psrldq xmm5, 12 + pslldq xmm5, 12 + por xmm4, xmm5 + pand xmm3, [rel cdFFFF] + pand xmm4, [rel cdFFFF] + packusdw xmm3, xmm4 + movdqa xmm4, xmm1 + movdqa xmm5, xmm2 + movdqa xmm6, xmm3 + ; h[n] = (src[2n + 1] - ((src[2n] + src[2n + 2]) >> 1)) >> 1 + paddw xmm4, xmm6 + psraw xmm4, 1 + psubw xmm5, xmm4 + psraw xmm5, 1 + + movdqa xmm6, xmm5 ; out hi + paddw xmm6, xmm8 + psraw xmm6, xmm9 + movdqa [rdi], xmm6 + + ; l[n] = src[2n] + ((h[n - 1] + h[n]) >> 1) + movdqa xmm7, xmm5 + pslldq xmm7, 2 + movd xmm6, ebx + por xmm7, xmm6 + paddw xmm5, xmm7 + psraw xmm5, 1 + paddw xmm5, xmm1 + + movdqa [rdx], xmm5 ; out lo + + ; move right + lea rsi, [rsi + 16 * 2] + lea rdi, [rdi + 8 * 2] + lea rdx, [rdx + 8 * 2] + + ; move left + lea rsi, [rsi - 32 * 2] + lea rdi, [rdi - 16 * 2] + lea rdx, [rdx - 16 * 2] + + ; move down + lea rsi, [rsi + 32 * 2] + lea rdi, [rdi + 16 * 2] + lea rdx, [rdx + 16 * 2] + + dec ecx + jnz loop1c1 + + ret + +;****************************************************************************** +; source 16 bit signed, 32 pixel width +rfx_dwt_2d_encode_block_verti_16_32: + 
mov ecx, 4 +loop1d: + ; pre + movdqa xmm1, [rsi] ; src[2n] + movdqa xmm2, [rsi + 32 * 2] ; src[2n + 1] + movdqa xmm3, [rsi + 32 * 2 * 2] ; src[2n + 2] + movdqa xmm4, xmm1 + movdqa xmm5, xmm2 + movdqa xmm6, xmm3 + ; h[n] = (src[2n + 1] - ((src[2n] + src[2n + 2]) >> 1)) >> 1 + paddw xmm4, xmm6 + psraw xmm4, 1 + psubw xmm5, xmm4 + psraw xmm5, 1 + movdqa [rdi], xmm5 ; out hi + movdqa xmm6, xmm5 ; save hi + ; l[n] = src[2n] + ((h[n - 1] + h[n]) >> 1) + paddw xmm5, xmm1 + movdqa [rdx], xmm5 ; out lo + movdqa xmm7, xmm6 ; save hi + ; move down + lea rsi, [rsi + 32 * 2 * 2] ; 2 rows + lea rdi, [rdi + 32 * 2] ; 1 row + lea rdx, [rdx + 32 * 2] ; 1 row + + ; loop + shl ecx, 16 + mov cx, 14 +loop2d: + movdqa xmm1, xmm3 ; src[2n] + movdqa xmm2, [rsi + 32 * 2] ; src[2n + 1] + movdqa xmm3, [rsi + 32 * 2 * 2] ; src[2n + 2] + movdqa xmm4, xmm1 + movdqa xmm5, xmm2 + movdqa xmm6, xmm3 + ; h[n] = (src[2n + 1] - ((src[2n] + src[2n + 2]) >> 1)) >> 1 + paddw xmm4, xmm6 + psraw xmm4, 1 + psubw xmm5, xmm4 + psraw xmm5, 1 + movdqa [rdi], xmm5 ; out hi + movdqa xmm6, xmm5 ; save hi + ; l[n] = src[2n] + ((h[n - 1] + h[n]) >> 1) + paddw xmm5, xmm7 + psraw xmm5, 1 + paddw xmm5, xmm1 + movdqa [rdx], xmm5 ; out lo + movdqa xmm7, xmm6 ; save hi + ; move down + lea rsi, [rsi + 32 * 2 * 2] ; 2 rows + lea rdi, [rdi + 32 * 2] ; 1 row + lea rdx, [rdx + 32 * 2] ; 1 row + + dec cx + jnz loop2d + shr ecx, 16 + + ; post + movdqa xmm1, xmm3 ; src[2n] + movdqa xmm2, [rsi + 32 * 2] ; src[2n + 1] + movdqa xmm4, xmm1 + movdqa xmm5, xmm2 + movdqa xmm6, xmm3 + ; h[n] = (src[2n + 1] - ((src[2n] + src[2n + 2]) >> 1)) >> 1 + paddw xmm4, xmm6 + psraw xmm4, 1 + psubw xmm5, xmm4 + psraw xmm5, 1 + movdqa [rdi], xmm5 ; out hi + ; l[n] = src[2n] + ((h[n - 1] + h[n]) >> 1) + paddw xmm5, xmm7 + psraw xmm5, 1 + paddw xmm5, xmm1 + movdqa [rdx], xmm5 ; out lo + ; move down + lea rsi, [rsi + 32 * 2 * 2] ; 2 row + lea rdi, [rdi + 32 * 2] ; 1 row + lea rdx, [rdx + 32 * 2] ; 1 row + + ; move up + lea rsi, [rsi - 32 * 32 * 2] + lea 
rdi, [rdi - 16 * 32 * 2] + lea rdx, [rdx - 16 * 32 * 2] + + ; move right + lea rsi, [rsi + 16] + lea rdi, [rdi + 16] + lea rdx, [rdx + 16] + + dec ecx + jnz loop1d + + ret + +;****************************************************************************** +; source 16 bit signed, 64 pixel width +rfx_dwt_2d_encode_block_horiz_16_64: + mov ecx, 32 +loop1e: + ; pre + movdqa xmm1, [rsi] ; src[2n] + movdqa xmm2, [rsi + 16] + movdqa xmm6, xmm1 + movdqa xmm7, xmm2 + pand xmm1, [rel cdFFFF] + pand xmm2, [rel cdFFFF] + packusdw xmm1, xmm2 + movdqa xmm2, xmm6 ; src[2n + 1] + movdqa xmm3, xmm7 + psrldq xmm2, 2 + psrldq xmm3, 2 + pand xmm2, [rel cdFFFF] + pand xmm3, [rel cdFFFF] + packusdw xmm2, xmm3 + movdqa xmm3, xmm6 ; src[2n + 2] + movdqa xmm4, xmm7 + psrldq xmm3, 4 + psrldq xmm4, 4 + movd eax, xmm7 + movd xmm5, eax + pslldq xmm5, 12 + por xmm3, xmm5 + mov eax, [rsi + 32] + movd xmm5, eax + pslldq xmm5, 12 + por xmm4, xmm5 + pand xmm3, [rel cdFFFF] + pand xmm4, [rel cdFFFF] + packusdw xmm3, xmm4 + movdqa xmm4, xmm1 + movdqa xmm5, xmm2 + movdqa xmm6, xmm3 + ; h[n] = (src[2n + 1] - ((src[2n] + src[2n + 2]) >> 1)) >> 1 + paddw xmm4, xmm6 + psraw xmm4, 1 + psubw xmm5, xmm4 + psraw xmm5, 1 + + movdqa xmm6, xmm5 ; out hi + paddw xmm6, xmm8 + psraw xmm6, xmm9 + movdqa [rdi], xmm6 + movdqa xmm2, xmm5 ; save hi + + ; l[n] = src[2n] + ((h[n - 1] + h[n]) >> 1) + movdqa xmm7, xmm5 + movd eax, xmm7 + pslldq xmm7, 2 + and eax, 0xFFFF + movd xmm6, eax + por xmm7, xmm6 + paddw xmm5, xmm7 + psraw xmm5, 1 + paddw xmm5, xmm1 + + psrldq xmm2, 14 + movd ebx, xmm2 ; save hi + + movdqa xmm6, xmm5 ; out lo + paddw xmm6, xmm10 + psraw xmm6, xmm11 + movdqa [rdx], xmm6 + + ; move right + lea rsi, [rsi + 16 * 2] + lea rdi, [rdi + 8 * 2] + lea rdx, [rdx + 8 * 2] + + ; loop + shl ecx, 16 + mov cx, 2 +loop2e: + movdqa xmm1, [rsi] ; src[2n] + movdqa xmm2, [rsi + 16] + movdqa xmm6, xmm1 + movdqa xmm7, xmm2 + pand xmm1, [rel cdFFFF] + pand xmm2, [rel cdFFFF] + packusdw xmm1, xmm2 + movdqa xmm2, xmm6 ; 
src[2n + 1] + movdqa xmm3, xmm7 + psrldq xmm2, 2 + psrldq xmm3, 2 + pand xmm2, [rel cdFFFF] + pand xmm3, [rel cdFFFF] + packusdw xmm2, xmm3 + movdqa xmm3, xmm6 ; src[2n + 2] + movdqa xmm4, xmm7 + psrldq xmm3, 4 + psrldq xmm4, 4 + movd eax, xmm7 + movd xmm5, eax + pslldq xmm5, 12 + por xmm3, xmm5 + mov eax, [rsi + 32] + movd xmm5, eax + pslldq xmm5, 12 + por xmm4, xmm5 + pand xmm3, [rel cdFFFF] + pand xmm4, [rel cdFFFF] + packusdw xmm3, xmm4 + movdqa xmm4, xmm1 + movdqa xmm5, xmm2 + movdqa xmm6, xmm3 + ; h[n] = (src[2n + 1] - ((src[2n] + src[2n + 2]) >> 1)) >> 1 + paddw xmm4, xmm6 + psraw xmm4, 1 + psubw xmm5, xmm4 + psraw xmm5, 1 + + movdqa xmm6, xmm5 ; out hi + paddw xmm6, xmm8 + psraw xmm6, xmm9 + movdqa [rdi], xmm6 + movdqa xmm2, xmm5 ; save hi + + ; l[n] = src[2n] + ((h[n - 1] + h[n]) >> 1) + movdqa xmm7, xmm5 + pslldq xmm7, 2 + movd xmm6, ebx + por xmm7, xmm6 + paddw xmm5, xmm7 + psraw xmm5, 1 + paddw xmm5, xmm1 + + psrldq xmm2, 14 + movd ebx, xmm2 ; save hi + + movdqa xmm6, xmm5 ; out lo + paddw xmm6, xmm10 + psraw xmm6, xmm11 + movdqa [rdx], xmm6 + + ; move right + lea rsi, [rsi + 16 * 2] + lea rdi, [rdi + 8 * 2] + lea rdx, [rdx + 8 * 2] + + dec cx + jnz loop2e + shr ecx, 16 + + ; post + movdqa xmm1, [rsi] ; src[2n] + movdqa xmm2, [rsi + 16] + movdqa xmm6, xmm1 + movdqa xmm7, xmm2 + pand xmm1, [rel cdFFFF] + pand xmm2, [rel cdFFFF] + packusdw xmm1, xmm2 + movdqa xmm2, xmm6 ; src[2n + 1] + movdqa xmm3, xmm7 + psrldq xmm2, 2 + psrldq xmm3, 2 + pand xmm2, [rel cdFFFF] + pand xmm3, [rel cdFFFF] + packusdw xmm2, xmm3 + movdqa xmm3, xmm6 ; src[2n + 2] + movdqa xmm4, xmm7 + psrldq xmm3, 4 + psrldq xmm4, 4 + movd eax, xmm7 + movd xmm5, eax + pslldq xmm5, 12 + por xmm3, xmm5 + movdqa xmm5, xmm7 + psrldq xmm5, 12 + pslldq xmm5, 12 + por xmm4, xmm5 + pand xmm3, [rel cdFFFF] + pand xmm4, [rel cdFFFF] + packusdw xmm3, xmm4 + movdqa xmm4, xmm1 + movdqa xmm5, xmm2 + movdqa xmm6, xmm3 + ; h[n] = (src[2n + 1] - ((src[2n] + src[2n + 2]) >> 1)) >> 1 + paddw xmm4, xmm6 + psraw 
xmm4, 1 + psubw xmm5, xmm4 + psraw xmm5, 1 + + movdqa xmm6, xmm5 ; out hi + paddw xmm6, xmm8 + psraw xmm6, xmm9 + movdqa [rdi], xmm6 + + ; l[n] = src[2n] + ((h[n - 1] + h[n]) >> 1) + movdqa xmm7, xmm5 + pslldq xmm7, 2 + movd xmm6, ebx + por xmm7, xmm6 + paddw xmm5, xmm7 + psraw xmm5, 1 + paddw xmm5, xmm1 + + movdqa xmm6, xmm5 ; out lo + paddw xmm6, xmm10 + psraw xmm6, xmm11 + movdqa [rdx], xmm6 + + ; move right + lea rsi, [rsi + 16 * 2] + lea rdi, [rdi + 8 * 2] + lea rdx, [rdx + 8 * 2] + + ; move left + lea rsi, [rsi - 64 * 2] + lea rdi, [rdi - 32 * 2] + lea rdx, [rdx - 32 * 2] + + ; move down + lea rsi, [rsi + 64 * 2] + lea rdi, [rdi + 32 * 2] + lea rdx, [rdx + 32 * 2] + + dec ecx + jnz loop1e + + ret + +;****************************************************************************** +; source 16 bit signed, 64 pixel width +rfx_dwt_2d_encode_block_horiz_16_64_no_lo: + mov ecx, 32 +loop1e1: + ; pre + movdqa xmm1, [rsi] ; src[2n] + movdqa xmm2, [rsi + 16] + movdqa xmm6, xmm1 + movdqa xmm7, xmm2 + pand xmm1, [rel cdFFFF] + pand xmm2, [rel cdFFFF] + packusdw xmm1, xmm2 + movdqa xmm2, xmm6 ; src[2n + 1] + movdqa xmm3, xmm7 + psrldq xmm2, 2 + psrldq xmm3, 2 + pand xmm2, [rel cdFFFF] + pand xmm3, [rel cdFFFF] + packusdw xmm2, xmm3 + movdqa xmm3, xmm6 ; src[2n + 2] + movdqa xmm4, xmm7 + psrldq xmm3, 4 + psrldq xmm4, 4 + movd eax, xmm7 + movd xmm5, eax + pslldq xmm5, 12 + por xmm3, xmm5 + mov eax, [rsi + 32] + movd xmm5, eax + pslldq xmm5, 12 + por xmm4, xmm5 + pand xmm3, [rel cdFFFF] + pand xmm4, [rel cdFFFF] + packusdw xmm3, xmm4 + movdqa xmm4, xmm1 + movdqa xmm5, xmm2 + movdqa xmm6, xmm3 + ; h[n] = (src[2n + 1] - ((src[2n] + src[2n + 2]) >> 1)) >> 1 + paddw xmm4, xmm6 + psraw xmm4, 1 + psubw xmm5, xmm4 + psraw xmm5, 1 + + movdqa xmm6, xmm5 ; out hi + paddw xmm6, xmm8 + psraw xmm6, xmm9 + movdqa [rdi], xmm6 + movdqa xmm2, xmm5 ; save hi + + ; l[n] = src[2n] + ((h[n - 1] + h[n]) >> 1) + movdqa xmm7, xmm5 + movd eax, xmm7 + pslldq xmm7, 2 + and eax, 0xFFFF + movd xmm6, eax 
+ por xmm7, xmm6 + paddw xmm5, xmm7 + psraw xmm5, 1 + paddw xmm5, xmm1 + + psrldq xmm2, 14 + movd ebx, xmm2 ; save hi + + movdqa [rdx], xmm5 ; out lo + + ; move right + lea rsi, [rsi + 16 * 2] + lea rdi, [rdi + 8 * 2] + lea rdx, [rdx + 8 * 2] + + ; loop + shl ecx, 16 + mov cx, 2 +loop2e1: + movdqa xmm1, [rsi] ; src[2n] + movdqa xmm2, [rsi + 16] + movdqa xmm6, xmm1 + movdqa xmm7, xmm2 + pand xmm1, [rel cdFFFF] + pand xmm2, [rel cdFFFF] + packusdw xmm1, xmm2 + movdqa xmm2, xmm6 ; src[2n + 1] + movdqa xmm3, xmm7 + psrldq xmm2, 2 + psrldq xmm3, 2 + pand xmm2, [rel cdFFFF] + pand xmm3, [rel cdFFFF] + packusdw xmm2, xmm3 + movdqa xmm3, xmm6 ; src[2n + 2] + movdqa xmm4, xmm7 + psrldq xmm3, 4 + psrldq xmm4, 4 + movd eax, xmm7 + movd xmm5, eax + pslldq xmm5, 12 + por xmm3, xmm5 + mov eax, [rsi + 32] + movd xmm5, eax + pslldq xmm5, 12 + por xmm4, xmm5 + pand xmm3, [rel cdFFFF] + pand xmm4, [rel cdFFFF] + packusdw xmm3, xmm4 + movdqa xmm4, xmm1 + movdqa xmm5, xmm2 + movdqa xmm6, xmm3 + ; h[n] = (src[2n + 1] - ((src[2n] + src[2n + 2]) >> 1)) >> 1 + paddw xmm4, xmm6 + psraw xmm4, 1 + psubw xmm5, xmm4 + psraw xmm5, 1 + + movdqa xmm6, xmm5 ; out hi + paddw xmm6, xmm8 + psraw xmm6, xmm9 + movdqa [rdi], xmm6 + movdqa xmm2, xmm5 ; save hi + + ; l[n] = src[2n] + ((h[n - 1] + h[n]) >> 1) + movdqa xmm7, xmm5 + pslldq xmm7, 2 + movd xmm6, ebx + por xmm7, xmm6 + paddw xmm5, xmm7 + psraw xmm5, 1 + paddw xmm5, xmm1 + + psrldq xmm2, 14 + movd ebx, xmm2 ; save hi + + movdqa [rdx], xmm5 ; out lo + + ; move right + lea rsi, [rsi + 16 * 2] + lea rdi, [rdi + 8 * 2] + lea rdx, [rdx + 8 * 2] + + dec cx + jnz loop2e1 + shr ecx, 16 + + ; post + movdqa xmm1, [rsi] ; src[2n] + movdqa xmm2, [rsi + 16] + movdqa xmm6, xmm1 + movdqa xmm7, xmm2 + pand xmm1, [rel cdFFFF] + pand xmm2, [rel cdFFFF] + packusdw xmm1, xmm2 + movdqa xmm2, xmm6 ; src[2n + 1] + movdqa xmm3, xmm7 + psrldq xmm2, 2 + psrldq xmm3, 2 + pand xmm2, [rel cdFFFF] + pand xmm3, [rel cdFFFF] + packusdw xmm2, xmm3 + movdqa xmm3, xmm6 ; src[2n + 
2] + movdqa xmm4, xmm7 + psrldq xmm3, 4 + psrldq xmm4, 4 + movd eax, xmm7 + movd xmm5, eax + pslldq xmm5, 12 + por xmm3, xmm5 + movdqa xmm5, xmm7 + psrldq xmm5, 12 + pslldq xmm5, 12 + por xmm4, xmm5 + pand xmm3, [rel cdFFFF] + pand xmm4, [rel cdFFFF] + packusdw xmm3, xmm4 + movdqa xmm4, xmm1 + movdqa xmm5, xmm2 + movdqa xmm6, xmm3 + ; h[n] = (src[2n + 1] - ((src[2n] + src[2n + 2]) >> 1)) >> 1 + paddw xmm4, xmm6 + psraw xmm4, 1 + psubw xmm5, xmm4 + psraw xmm5, 1 + + movdqa xmm6, xmm5 ; out hi + paddw xmm6, xmm8 + psraw xmm6, xmm9 + movdqa [rdi], xmm6 + + ; l[n] = src[2n] + ((h[n - 1] + h[n]) >> 1) + movdqa xmm7, xmm5 + pslldq xmm7, 2 + movd xmm6, ebx + por xmm7, xmm6 + paddw xmm5, xmm7 + psraw xmm5, 1 + paddw xmm5, xmm1 + + movdqa [rdx], xmm5 ; out lo + + ; move right + lea rsi, [rsi + 16 * 2] + lea rdi, [rdi + 8 * 2] + lea rdx, [rdx + 8 * 2] + + ; move left + lea rsi, [rsi - 64 * 2] + lea rdi, [rdi - 32 * 2] + lea rdx, [rdx - 32 * 2] + + ; move down + lea rsi, [rsi + 64 * 2] + lea rdi, [rdi + 32 * 2] + lea rdx, [rdx + 32 * 2] + + dec ecx + jnz loop1e1 + + ret + +;****************************************************************************** +; source 8 bit unsigned, 64 pixel width +rfx_dwt_2d_encode_block_verti_8_64: + mov ecx, 8 +loop1f: + ; pre + movq xmm1, [rsi] ; src[2n] + movq xmm2, [rsi + 64 * 1] ; src[2n + 1] + movq xmm3, [rsi + 64 * 1 * 2] ; src[2n + 2] + punpcklbw xmm1, xmm0 + punpcklbw xmm2, xmm0 + punpcklbw xmm3, xmm0 + psubw xmm1, [rel cw128] + psubw xmm2, [rel cw128] + psubw xmm3, [rel cw128] + psllw xmm1, 5 + psllw xmm2, 5 + psllw xmm3, 5 + movdqa xmm4, xmm1 + movdqa xmm5, xmm2 + movdqa xmm6, xmm3 + ; h[n] = (src[2n + 1] - ((src[2n] + src[2n + 2]) >> 1)) >> 1 + paddw xmm4, xmm6 + psraw xmm4, 1 + psubw xmm5, xmm4 + psraw xmm5, 1 + movdqa [rdi], xmm5 ; out hi + movdqa xmm6, xmm5 ; save hi + ; l[n] = src[2n] + ((h[n - 1] + h[n]) >> 1) + paddw xmm5, xmm1 + movdqa [rdx], xmm5 ; out lo + movdqa xmm7, xmm6 ; save hi + ; move down + lea rsi, [rsi + 64 * 1 * 
2] ; 2 rows + lea rdi, [rdi + 64 * 2] ; 1 row + lea rdx, [rdx + 64 * 2] ; 1 row + + ; loop + shl ecx, 16 + mov cx, 30 +loop2f: + movdqa xmm1, xmm3 ; src[2n] + movq xmm2, [rsi + 64 * 1] ; src[2n + 1] + movq xmm3, [rsi + 64 * 1 * 2] ; src[2n + 2] + punpcklbw xmm2, xmm0 + punpcklbw xmm3, xmm0 + psubw xmm2, [rel cw128] + psubw xmm3, [rel cw128] + psllw xmm2, 5 + psllw xmm3, 5 + movdqa xmm4, xmm1 + movdqa xmm5, xmm2 + movdqa xmm6, xmm3 + ; h[n] = (src[2n + 1] - ((src[2n] + src[2n + 2]) >> 1)) >> 1 + paddw xmm4, xmm6 + psraw xmm4, 1 + psubw xmm5, xmm4 + psraw xmm5, 1 + movdqa [rdi], xmm5 ; out hi + movdqa xmm6, xmm5 ; save hi + ; l[n] = src[2n] + ((h[n - 1] + h[n]) >> 1) + paddw xmm5, xmm7 + psraw xmm5, 1 + paddw xmm5, xmm1 + movdqa [rdx], xmm5 ; out lo + movdqa xmm7, xmm6 ; save hi + ; move down + lea rsi, [rsi + 64 * 1 * 2] ; 2 rows + lea rdi, [rdi + 64 * 2] ; 1 row + lea rdx, [rdx + 64 * 2] ; 1 row + + dec cx + jnz loop2f + shr ecx, 16 + + ; post + movdqa xmm1, xmm3 ; src[2n] + movq xmm2, [rsi + 64 * 1] ; src[2n + 1] + punpcklbw xmm2, xmm0 + psubw xmm2, [rel cw128] + psllw xmm2, 5 + movdqa xmm4, xmm1 + movdqa xmm5, xmm2 + movdqa xmm6, xmm3 + ; h[n] = (src[2n + 1] - ((src[2n] + src[2n + 2]) >> 1)) >> 1 + paddw xmm4, xmm6 + psraw xmm4, 1 + psubw xmm5, xmm4 + psraw xmm5, 1 + movdqa [rdi], xmm5 ; out hi + ; l[n] = src[2n] + ((h[n - 1] + h[n]) >> 1) + paddw xmm5, xmm7 + psraw xmm5, 1 + paddw xmm5, xmm1 + movdqa [rdx], xmm5 ; out lo + ; move down + lea rsi, [rsi + 64 * 1 * 2] ; 2 rows + lea rdi, [rdi + 64 * 2] ; 1 row + lea rdx, [rdx + 64 * 2] ; 1 row + + ; move up + lea rsi, [rsi - 64 * 1 * 64] + lea rdi, [rdi - 32 * 64 * 2] + lea rdx, [rdx - 32 * 64 * 2] + + ; move right + lea rsi, [rsi + 8] + lea rdi, [rdi + 16] + lea rdx, [rdx + 16] + + dec ecx + jnz loop1f + + ret + +set_quants_hi: ; in: rax = quant value; out: xmm9 = shift count, xmm8 = words added to hi output before the shift; clobbers rax, rdx + sub rax, 6 - 5 ; factor = quant - 1; NOTE(review): the 5 presumably matches the psllw 5 pre-scale in the 8 bit vertical pass - confirm + movd xmm9, eax ; shift count used by psraw on hi output + imul rax, 16 ; each cwa table entry is 16 bytes + lea rdx, [rel cwa0] + add rdx, rax + movdqa xmm8, [rdx] ; 8 words of 1 << (factor - 1) + ret + +set_quants_lo: ; same as set_quants_hi but for lo output: shift count goes to xmm11, table entry to xmm10 + sub rax, 6 - 5 ; factor = quant - 1 + movd xmm11, eax ; shift count used by psraw on lo output + imul rax, 16 ; each cwa table entry is 16 bytes + lea 
rdx, [rel cwa0] + add rdx, rax + movdqa xmm10, [rdx] ; xmm10 = table entry added to lo output before the psraw by xmm11 + ret + +;The first six integer or pointer arguments are passed in registers +;RDI, RSI, RDX, RCX, R8, and R9 + +;int +;rfxcodec_encode_dwt_shift_amd64_sse41(const char *qtable, +; unsigned char *in_buffer, +; short *out_buffer, +; short *work_buffer); + +;****************************************************************************** +%ifidn __OUTPUT_FORMAT__,elf64 +PROC rfxcodec_encode_dwt_shift_amd64_sse41 +%else +PROC _rfxcodec_encode_dwt_shift_amd64_sse41 +%endif + ; save registers; after the five pushes [rsp] = qtable, [rsp + 8] = in_buffer, [rsp + 16] = work_buffer, [rsp + 24] = out_buffer + push rbx + push rdx + push rcx + push rsi + push rdi + pxor xmm0, xmm0 ; zero register, used by punpcklbw in rfx_dwt_2d_encode_block_verti_8_64 + + ; vertical DWT to work buffer, level 1 + mov rsi, [rsp + 8] ; src + mov rdi, [rsp + 16] ; dst hi + lea rdi, [rdi + 64 * 32 * 2] ; dst hi + mov rdx, [rsp + 16] ; dst lo + call rfx_dwt_2d_encode_block_verti_8_64 + + ; horizontal DWT to out buffer, level 1, part 1 + xor rax, rax + mov rdx, [rsp] + mov al, [rdx + 4] + and al, 0xF ; low nibble of qtable byte 4, quant for HL1 + call set_quants_hi + mov rsi, [rsp + 16] ; src + mov rdi, [rsp + 24] ; dst hi - HL1 + mov rdx, [rsp + 24] ; dst lo - LL1 + lea rdx, [rdx + 32 * 32 * 6] ; dst lo - LL1 + call rfx_dwt_2d_encode_block_horiz_16_64_no_lo + + ; horizontal DWT to out buffer, level 1, part 2 + xor rax, rax + mov rdx, [rsp] + mov al, [rdx + 4] + shr al, 4 ; high nibble of qtable byte 4, quant for HH1 + call set_quants_hi + xor rax, rax + mov rdx, [rsp] + mov al, [rdx + 3] + shr al, 4 ; high nibble of qtable byte 3, quant for LH1 + call set_quants_lo + mov rsi, [rsp + 16] ; src + lea rsi, [rsi + 64 * 32 * 2] ; src + mov rdi, [rsp + 24] ; dst hi - HH1 + lea rdi, [rdi + 32 * 32 * 4] ; dst hi - HH1 + mov rdx, [rsp + 24] ; dst lo - LH1 + lea rdx, [rdx + 32 * 32 * 2] ; dst lo - LH1 + call rfx_dwt_2d_encode_block_horiz_16_64 + + ; vertical DWT to work buffer, level 2 + mov rsi, [rsp + 24] ; src + lea rsi, [rsi + 32 * 32 * 6] ; src + mov rdi, [rsp + 16] ; dst hi + lea rdi, [rdi + 32 * 16 * 2] ; dst hi + mov rdx, [rsp + 16] ; dst lo + call rfx_dwt_2d_encode_block_verti_16_32 + + ; horizontal DWT to out buffer, level 2, part 1 + xor rax, rax + mov rdx, [rsp] + 
mov al, [rdx + 2] + shr al, 4 ; high nibble of qtable byte 2, quant for HL2 + call set_quants_hi + mov rsi, [rsp + 16] ; src + ; 32 * 32 * 6 + 16 * 16 * 0 = 6144 + mov rdi, [rsp + 24] ; dst hi - HL2 + lea rdi, [rdi + 6144] ; dst hi - HL2 + ; 32 * 32 * 6 + 16 * 16 * 6 = 7680 + mov rdx, [rsp + 24] ; dst lo - LL2 + lea rdx, [rdx + 7680] ; dst lo - LL2 + call rfx_dwt_2d_encode_block_horiz_16_32_no_lo + + ; horizontal DWT to out buffer, level 2, part 2 + xor rax, rax + mov rdx, [rsp] + mov al, [rdx + 3] + and al, 0xF ; low nibble of qtable byte 3, quant for HH2 + call set_quants_hi + xor rax, rax + mov rdx, [rsp] + mov al, [rdx + 2] + and al, 0xF ; low nibble of qtable byte 2, quant for LH2 + call set_quants_lo + mov rsi, [rsp + 16] ; src + lea rsi, [rsi + 32 * 16 * 2] ; src + ; 32 * 32 * 6 + 16 * 16 * 4 = 7168 + mov rdi, [rsp + 24] ; dst hi - HH2 + lea rdi, [rdi + 7168] ; dst hi - HH2 + ; 32 * 32 * 6 + 16 * 16 * 2 = 6656 + mov rdx, [rsp + 24] ; dst lo - LH2 + lea rdx, [rdx + 6656] ; dst lo - LH2 + call rfx_dwt_2d_encode_block_horiz_16_32 + + ; vertical DWT to work buffer, level 3 + ; 32 * 32 * 6 + 16 * 16 * 6 = 7680 + mov rsi, [rsp + 24] ; src + lea rsi, [rsi + 7680] ; src + mov rdi, [rsp + 16] ; dst hi + lea rdi, [rdi + 16 * 8 * 2] ; dst hi + mov rdx, [rsp + 16] ; dst lo + call rfx_dwt_2d_encode_block_verti_16_16 + + ; horizontal DWT to out buffer, level 3, part 1 + xor rax, rax + mov rdx, [rsp] + mov al, [rdx + 1] + and al, 0xF ; low nibble of qtable byte 1, quant for HL3 + call set_quants_hi + xor rax, rax + mov rdx, [rsp] + mov al, [rdx + 0] + and al, 0xF ; low nibble of qtable byte 0, quant for LL3 + call set_quants_lo + mov rsi, [rsp + 16] ; src + ; 32 * 32 * 6 + 16 * 16 * 6 + 8 * 8 * 0 = 7680 + mov rdi, [rsp + 24] ; dst hi - HL3 + lea rdi, [rdi + 7680] ; dst hi - HL3 + ; 32 * 32 * 6 + 16 * 16 * 6 + 8 * 8 * 6 = 8064 + mov rdx, [rsp + 24] ; dst lo - LL3 + lea rdx, [rdx + 8064] ; dst lo - LL3 + call rfx_dwt_2d_encode_block_horiz_16_16 + + ; horizontal DWT to out buffer, level 3, part 2 + xor rax, rax + mov rdx, [rsp] + mov al, [rdx + 1] + shr al, 4 ; high nibble of qtable byte 1, quant for HH3 + call set_quants_hi + xor rax, rax + mov rdx, [rsp] + mov al, [rdx + 0] + shr al, 4 ; high nibble of qtable byte 0, quant for LH3 + call set_quants_lo + mov rsi, [rsp + 16] ; src + lea 
rsi, [rsi + 16 * 8 * 2] ; src + ; 32 * 32 * 6 + 16 * 16 * 6 + 8 * 8 * 4 = 7936 + mov rdi, [rsp + 24] ; dst hi - HH3 + lea rdi, [rdi + 7936] ; dst hi - HH3 + ; 32 * 32 * 6 + 16 * 16 * 6 + 8 * 8 * 2 = 7808 + mov rdx, [rsp + 24] ; dst lo - LH3 + lea rdx, [rdx + 7808] ; dst lo - LH3 + call rfx_dwt_2d_encode_block_horiz_16_16 + + mov rax, 0 + ; restore registers + pop rdi + pop rsi + pop rcx + pop rdx + pop rbx + ret + align 16 + diff --git a/src/amd64/rfxdwt_amd64_sse2.asm b/src/amd64/rfxdwt_amd64_sse2.asm deleted file mode 100644 index 4648371..0000000 --- a/src/amd64/rfxdwt_amd64_sse2.asm +++ /dev/null @@ -1,21 +0,0 @@ - -section .data - const1 times 8 dw 1 - -%macro PROC 1 - align 16 - global %1 - %1: -%endmacro - -;int -;dwt_shift_amd64_sse2(const int* qtable, sint8* src, sint16* dst, sint16* temp) - -PROC dwt_shift_amd64_sse2 - ; save registers - push rbx - mov rax, 0 - pop rbx - ret - align 16 - diff --git a/src/amd64/rfxrlgr1_amd64.asm b/src/amd64/rfxrlgr1_amd64.asm deleted file mode 100644 index 7c80678..0000000 --- a/src/amd64/rfxrlgr1_amd64.asm +++ /dev/null @@ -1,21 +0,0 @@ - -section .data - const1 times 8 dw 1 - -%macro PROC 1 - align 16 - global %1 - %1: -%endmacro - -;int -;diff_rlgr1_amd64(sint16 *co, int num_co, uint8 *dst, int dst_bytes); - -PROC diff_rlgr1_amd64 - ; save registers - push rbx - mov rax, 0 - pop rbx - ret - align 16 - diff --git a/src/amd64/rfxrlgr3_amd64.asm b/src/amd64/rfxrlgr3_amd64.asm deleted file mode 100644 index 3270760..0000000 --- a/src/amd64/rfxrlgr3_amd64.asm +++ /dev/null @@ -1,21 +0,0 @@ - -section .data - const1 times 8 dw 1 - -%macro PROC 1 - align 16 - global %1 - %1: -%endmacro - -;int -;diff_rlgr3_amd64(sint16 *co, int num_co, uint8 *dst, int dst_bytes); - -PROC diff_rlgr3_amd64 - ; save registers - push rbx - mov rax, 0 - pop rbx - ret - align 16 - diff --git a/src/nasm_lt.sh b/src/nasm_lt.sh new file mode 100755 index 0000000..6cd7329 --- /dev/null +++ b/src/nasm_lt.sh @@ -0,0 +1,57 @@ +#! 
/bin/sh +command="" +infile="" +o_opt=no +pic=no +while [ $# -gt 0 ]; do + case "$1" in + -DPIC|-fPIC|-fpic|-Kpic|-KPIC) + if [ "$pic" != "yes" ] ; then + command="$command -DPIC" + pic=yes + fi + ;; + -f|-fbin|-faout|-faoutb|-fcoff|-felf|-felf64|-fas86| \ + -fobj|-fwin32|-fwin64|-frdf|-fieee|-fmacho|-fmacho64) + # it's a file format specifier for nasm. + command="$command $1" + ;; + -f*) + # maybe a code-generation flag for gcc. + ;; + -[Ii]*) + incdir=`echo "$1" | sed 's/^-[Ii]//'` + if [ "x$incdir" = x -a "x$2" != x ] ; then + case "$2" in + -*) ;; + *) incdir="$2"; shift;; + esac + fi + if [ "x$incdir" != x ] ; then + # In the case of NASM, the trailing slash is necessary. + incdir=`echo "$incdir" | sed 's%/*$%/%'` + command="$command -I$incdir" + fi + ;; + -o*) + o_opt=yes + command="$command $1" + ;; + *.asm) + infile=$1 + command="$command $1" + ;; + *) + command="$command $1" + ;; + esac + shift +done +if [ "$o_opt" != yes ] ; then + # By default, NASM creates an output file + # in the same directory as the input file. + outfile="-o `echo $infile | sed -e 's%^.*/%%' -e 's%\.[^.]*$%%'`.o" + command="$command $outfile" +fi +echo $command +exec $command diff --git a/src/rfxcommon.h b/src/rfxcommon.h index 6b4c6f8..74514e9 100644 --- a/src/rfxcommon.h +++ b/src/rfxcommon.h @@ -1,7 +1,7 @@ /** * RFX codec * - * Copyright 2014 Jay Sorg + * Copyright 2014-2015 Jay Sorg * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -23,6 +23,8 @@ #define MAX(_val1, _val2) (_val1) > (_val2) ? (_val1) : (_val2) #define MINMAX(_v, _l, _h) ((_v) < (_l) ? (_l) : ((_v) > (_h) ? 
(_h) : (_v))) +#define DWT_FACTOR 5 + typedef signed char sint8; typedef unsigned char uint8; typedef signed short sint16; diff --git a/src/rfxcompose.c b/src/rfxcompose.c index d3af2dd..f208a32 100644 --- a/src/rfxcompose.c +++ b/src/rfxcompose.c @@ -3,6 +3,7 @@ * RemoteFX Codec Library * * Copyright 2011 Vic Lee + * Copyright 2015 Jay Sorg * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -28,12 +29,16 @@ #include "rfxconstants.h" #include "rfxencode_tile.h" +#define LLOG_LEVEL 1 +#define LLOGLN(_level, _args) \ + do { if (_level < LLOG_LEVEL) { printf _args ; printf("\n"); } } while (0) + /* * LL3, LH3, HL3, HH3, LH2, HL2, HH2, LH1, HL1, HH1 */ -static const int g_rfx_default_quantization_values[] = +static const unsigned char g_rfx_default_quantization_values[] = { - 6, 6, 6, 6, 7, 7, 8, 8, 8, 9 + 0x66, 0x66, 0x77, 0x88, 0x98 }; /******************************************************************************/ @@ -168,7 +173,7 @@ rfx_compose_message_frame_begin(struct rfxencode* enc, STREAM* s) /******************************************************************************/ static int rfx_compose_message_region(struct rfxencode* enc, STREAM* s, - struct rfx_rect *regions, int num_regions) + const struct rfx_rect *regions, int num_regions) { int size; int i; @@ -200,7 +205,7 @@ rfx_compose_message_region(struct rfxencode* enc, STREAM* s, static int rfx_compose_message_tile_yuv(struct rfxencode *enc, STREAM *s, char *tile_data, int tile_width, int tile_height, - int stride_bytes, const int *quantVals, + int stride_bytes, const char *quantVals, int quantIdxY, int quantIdxCb, int quantIdxCr, int xIdx, int yIdx) { @@ -221,9 +226,9 @@ rfx_compose_message_tile_yuv(struct rfxencode *enc, STREAM *s, stream_seek(s, 6); /* YLen, CbLen, CrLen */ if (rfx_encode_yuv(enc, tile_data, tile_width, tile_height, stride_bytes, - quantVals + quantIdxY * 10, - quantVals + quantIdxCb * 10, - 
quantVals + quantIdxCr * 10, + quantVals + quantIdxY * 5, + quantVals + quantIdxCb * 5, + quantVals + quantIdxCr * 5, s, &YLen, &CbLen, &CrLen) != 0) { return 1; @@ -239,11 +244,56 @@ rfx_compose_message_tile_yuv(struct rfxencode *enc, STREAM *s, return 0; } +/******************************************************************************/ +static int +rfx_compose_message_tile_yuva(struct rfxencode *enc, STREAM *s, + char *tile_data, int tile_width, int tile_height, + int stride_bytes, const char *quantVals, + int quantIdxY, int quantIdxCb, int quantIdxCr, + int xIdx, int yIdx) +{ + int YLen = 0; + int CbLen = 0; + int CrLen = 0; + int ALen = 0; + int start_pos; + int end_pos; + + start_pos = stream_get_pos(s); + stream_write_uint16(s, CBT_TILE); /* BlockT.blockType */ + stream_seek_uint32(s); /* set BlockT.blockLen later */ + stream_write_uint8(s, quantIdxY); + stream_write_uint8(s, quantIdxCb); + stream_write_uint8(s, quantIdxCr); + stream_write_uint16(s, xIdx); + stream_write_uint16(s, yIdx); + stream_seek(s, 8); /* YLen, CbLen, CrLen, ALen */ + if (rfx_encode_yuva(enc, tile_data, tile_width, tile_height, + stride_bytes, + quantVals + quantIdxY * 5, + quantVals + quantIdxCb * 5, + quantVals + quantIdxCr * 5, + s, &YLen, &CbLen, &CrLen, &ALen) != 0) + { + return 1; + } + end_pos = stream_get_pos(s); + stream_set_pos(s, start_pos + 2); + stream_write_uint32(s, 19 + YLen + CbLen + CrLen + ALen); /* BlockT.blockLen */ + stream_set_pos(s, start_pos + 13); + stream_write_uint16(s, YLen); + stream_write_uint16(s, CbLen); + stream_write_uint16(s, CrLen); + stream_write_uint16(s, ALen); + stream_set_pos(s, end_pos); + return 0; +} + /******************************************************************************/ static int rfx_compose_message_tile_rgb(struct rfxencode *enc, STREAM *s, char *tile_data, int tile_width, int tile_height, - int stride_bytes, const int *quantVals, + int stride_bytes, const char *quantVals, int quantIdxY, int quantIdxCb, int quantIdxCr, int 
xIdx, int yIdx) { @@ -264,9 +314,9 @@ rfx_compose_message_tile_rgb(struct rfxencode *enc, STREAM *s, stream_seek(s, 6); /* YLen, CbLen, CrLen */ if (rfx_encode_rgb(enc, tile_data, tile_width, tile_height, stride_bytes, - quantVals + quantIdxY * 10, - quantVals + quantIdxCb * 10, - quantVals + quantIdxCr * 10, + quantVals + quantIdxY * 5, + quantVals + quantIdxCb * 5, + quantVals + quantIdxCr * 5, s, &YLen, &CbLen, &CrLen) != 0) { return 1; @@ -282,21 +332,68 @@ rfx_compose_message_tile_rgb(struct rfxencode *enc, STREAM *s, return 0; } +/******************************************************************************/ +static int +rfx_compose_message_tile_argb(struct rfxencode *enc, STREAM *s, + char *tile_data, int tile_width, int tile_height, + int stride_bytes, const char *quantVals, + int quantIdxY, int quantIdxCb, int quantIdxCr, + int xIdx, int yIdx) +{ + int YLen = 0; + int CbLen = 0; + int CrLen = 0; + int ALen = 0; + int start_pos; + int end_pos; + + LLOGLN(10, ("rfx_compose_message_tile_argb:")); + start_pos = stream_get_pos(s); + stream_write_uint16(s, CBT_TILE); /* BlockT.blockType */ + stream_seek_uint32(s); /* set BlockT.blockLen later */ + stream_write_uint8(s, quantIdxY); + stream_write_uint8(s, quantIdxCb); + stream_write_uint8(s, quantIdxCr); + stream_write_uint16(s, xIdx); + stream_write_uint16(s, yIdx); + stream_seek(s, 8); /* YLen, CbLen, CrLen, ALen */ + if (rfx_encode_argb(enc, tile_data, tile_width, tile_height, + stride_bytes, + quantVals + quantIdxY * 5, + quantVals + quantIdxCb * 5, + quantVals + quantIdxCr * 5, + s, &YLen, &CbLen, &CrLen, &ALen) != 0) + { + LLOGLN(10, ("rfx_compose_message_tile_argb: rfx_encode_argb failed")); + return 1; + } + end_pos = stream_get_pos(s); + stream_set_pos(s, start_pos + 2); + stream_write_uint32(s, 19 + YLen + CbLen + CrLen + ALen); /* BlockT.blockLen */ + stream_set_pos(s, start_pos + 13); + stream_write_uint16(s, YLen); + stream_write_uint16(s, CbLen); + stream_write_uint16(s, CrLen); + 
stream_write_uint16(s, ALen); + stream_set_pos(s, end_pos); + return 0; +} + /******************************************************************************/ static int rfx_compose_message_tileset(struct rfxencode* enc, STREAM* s, char* buf, int width, int height, int stride_bytes, - struct rfx_tile *tiles, int num_tiles, - const int *quants, int num_quants) + const struct rfx_tile *tiles, int num_tiles, + const char *quants, int num_quants, + int flags) { int size; int start_pos; int end_pos; int index; int numQuants; - const int *quantVals; - const int *quantValsPtr; + const char *quantVals; int quantIdxY; int quantIdxCb; int quantIdxCr; @@ -308,10 +405,11 @@ rfx_compose_message_tileset(struct rfxencode* enc, STREAM* s, int cy; char *tile_data; + LLOGLN(10, ("rfx_compose_message_tileset:")); if (quants == 0) { numQuants = 1; - quantVals = g_rfx_default_quantization_values; + quantVals = (const char *) g_rfx_default_quantization_values; } else { @@ -321,7 +419,15 @@ rfx_compose_message_tileset(struct rfxencode* enc, STREAM* s, numTiles = num_tiles; size = 22 + numQuants * 5; start_pos = stream_get_pos(s); - stream_write_uint16(s, WBT_EXTENSION); /* CodecChannelT.blockType */ + if (flags & RFX_FLAGS_ALPHAV1) + { + LLOGLN(10, ("rfx_compose_message_tileset: RFX_FLAGS_ALPHAV1 set")); + stream_write_uint16(s, WBT_EXTENSION_PLUS); /* CodecChannelT.blockType */ + } + else + { + stream_write_uint16(s, WBT_EXTENSION); /* CodecChannelT.blockType */ + } stream_seek_uint32(s); /* set CodecChannelT.blockLen later */ stream_write_uint8(s, 1); /* CodecChannelT.codecId */ stream_write_uint8(s, 0); /* CodecChannelT.channelId */ @@ -332,54 +438,100 @@ rfx_compose_message_tileset(struct rfxencode* enc, STREAM* s, stream_write_uint8(s, 0x40); /* tileSize */ stream_write_uint16(s, numTiles); /* numTiles */ stream_seek_uint32(s); /* set tilesDataSize later */ - quantValsPtr = quantVals; - for (index = 0; index < numQuants * 5; index++) - { - stream_write_uint8(s, quantValsPtr[0] + 
(quantValsPtr[1] << 4)); - quantValsPtr += 2; - } + memcpy(s->p, quantVals, numQuants * 5); + s->p += numQuants * 5; end_pos = stream_get_pos(s); if (enc->format == RFX_FORMAT_YUV) { - for (index = 0; index < numTiles; index++) + if (flags & RFX_FLAGS_ALPHAV1) + { + for (index = 0; index < numTiles; index++) + { + x = tiles[index].x; + y = tiles[index].y; + cx = tiles[index].cx; + cy = tiles[index].cy; + quantIdxY = tiles[index].quant_y; + quantIdxCb = tiles[index].quant_cb; + quantIdxCr = tiles[index].quant_cr; + tile_data = buf + (y << 8) * (stride_bytes >> 8) + (x << 8); + if (rfx_compose_message_tile_yuva(enc, s, + tile_data, cx, cy, stride_bytes, + quantVals, + quantIdxY, quantIdxCb, quantIdxCr, + x / 64, y / 64) != 0) + { + return 1; + } + } + } + else { - x = tiles[index].x; - y = tiles[index].y; - cx = tiles[index].cx; - cy = tiles[index].cy; - quantIdxY = tiles[index].quant_y; - quantIdxCb = tiles[index].quant_cb; - quantIdxCr = tiles[index].quant_cr; - tile_data = buf + (y << 8) * (stride_bytes >> 8) + (x << 8); - if (rfx_compose_message_tile_yuv(enc, s, - tile_data, cx, cy, stride_bytes, - quantVals, - quantIdxY, quantIdxCb, quantIdxCr, - x / 64, y / 64) != 0) + for (index = 0; index < numTiles; index++) { - return 1; + x = tiles[index].x; + y = tiles[index].y; + cx = tiles[index].cx; + cy = tiles[index].cy; + quantIdxY = tiles[index].quant_y; + quantIdxCb = tiles[index].quant_cb; + quantIdxCr = tiles[index].quant_cr; + tile_data = buf + (y << 8) * (stride_bytes >> 8) + (x << 8); + if (rfx_compose_message_tile_yuv(enc, s, + tile_data, cx, cy, stride_bytes, + quantVals, + quantIdxY, quantIdxCb, quantIdxCr, + x / 64, y / 64) != 0) + { + return 1; + } } } } else { - for (index = 0; index < numTiles; index++) + if (flags & RFX_FLAGS_ALPHAV1) + { + for (index = 0; index < numTiles; index++) + { + x = tiles[index].x; + y = tiles[index].y; + cx = tiles[index].cx; + cy = tiles[index].cy; + quantIdxY = tiles[index].quant_y; + quantIdxCb = tiles[index].quant_cb; + 
quantIdxCr = tiles[index].quant_cr; + tile_data = buf + y * stride_bytes + x * (enc->bits_per_pixel / 8); + if (rfx_compose_message_tile_argb(enc, s, + tile_data, cx, cy, stride_bytes, + quantVals, + quantIdxY, quantIdxCb, quantIdxCr, + x / 64, y / 64) != 0) + { + return 1; + } + } + } + else { - x = tiles[index].x; - y = tiles[index].y; - cx = tiles[index].cx; - cy = tiles[index].cy; - quantIdxY = tiles[index].quant_y; - quantIdxCb = tiles[index].quant_cb; - quantIdxCr = tiles[index].quant_cr; - tile_data = buf + y * stride_bytes + x * (enc->bits_per_pixel / 8); - if (rfx_compose_message_tile_rgb(enc, s, - tile_data, cx, cy, stride_bytes, - quantVals, - quantIdxY, quantIdxCb, quantIdxCr, - x / 64, y / 64) != 0) + for (index = 0; index < numTiles; index++) { - return 1; + x = tiles[index].x; + y = tiles[index].y; + cx = tiles[index].cx; + cy = tiles[index].cy; + quantIdxY = tiles[index].quant_y; + quantIdxCb = tiles[index].quant_cb; + quantIdxCr = tiles[index].quant_cr; + tile_data = buf + y * stride_bytes + x * (enc->bits_per_pixel / 8); + if (rfx_compose_message_tile_rgb(enc, s, + tile_data, cx, cy, stride_bytes, + quantVals, + quantIdxY, quantIdxCb, quantIdxCr, + x / 64, y / 64) != 0) + { + return 1; + } } } } @@ -412,10 +564,10 @@ rfx_compose_message_frame_end(struct rfxencode* enc, STREAM* s) /******************************************************************************/ int rfx_compose_message_data(struct rfxencode* enc, STREAM* s, - struct rfx_rect *regions, int num_regions, + const struct rfx_rect *regions, int num_regions, char *buf, int width, int height, int stride_bytes, - struct rfx_tile *tiles, int num_tiles, - const int *quants, int num_quants) + const struct rfx_tile *tiles, int num_tiles, + const char *quants, int num_quants, int flags) { if (rfx_compose_message_frame_begin(enc, s) != 0) { @@ -426,7 +578,8 @@ rfx_compose_message_data(struct rfxencode* enc, STREAM* s, return 1; } if (rfx_compose_message_tileset(enc, s, buf, width, height, 
stride_bytes, - tiles, num_tiles, quants, num_quants) != 0) + tiles, num_tiles, quants, num_quants, + flags) != 0) { return 1; } diff --git a/src/rfxcompose.h b/src/rfxcompose.h index aab4770..7d30233 100644 --- a/src/rfxcompose.h +++ b/src/rfxcompose.h @@ -1,7 +1,7 @@ /** * RFX codec encoder * - * Copyright 2014 Jay Sorg + * Copyright 2014-2015 Jay Sorg * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -25,9 +25,9 @@ int rfx_compose_message_header(struct rfxencode* enc, STREAM* s); int rfx_compose_message_data(struct rfxencode* enc, STREAM* s, - struct rfx_rect *regions, int num_regions, + const struct rfx_rect *regions, int num_regions, char *buf, int width, int height, int stride_bytes, - struct rfx_tile *tiles, int num_tiles, - const int *quants, int num_quants); + const struct rfx_tile *tiles, int num_tiles, + const char *quants, int num_quants, int flags); #endif diff --git a/src/rfxconstants.h b/src/rfxconstants.h index 05cb18d..770fccb 100644 --- a/src/rfxconstants.h +++ b/src/rfxconstants.h @@ -39,6 +39,7 @@ enum _RLGR_MODE #define WBT_FRAME_END 0xCCC5 #define WBT_REGION 0xCCC6 #define WBT_EXTENSION 0xCCC7 +#define WBT_EXTENSION_PLUS 0xDDD7 #define CBT_REGION 0xCAC1 #define CBT_TILESET 0xCAC2 #define CBT_TILE 0xCAC3 diff --git a/src/rfxencode.c b/src/rfxencode.c index 4ad57f8..9bbf103 100644 --- a/src/rfxencode.c +++ b/src/rfxencode.c @@ -1,7 +1,7 @@ /** * RFX codec encoder * - * Copyright 2014 Jay Sorg + * Copyright 2014-2015 Jay Sorg * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
@@ -28,103 +28,46 @@ #include "rfxconstants.h" #include "rfxencode_tile.h" -/******************************************************************************/ -static void -cpuid(int func, int *eax, int *ebx, int *ecx, int *edx) -{ - *eax = 0; - *ebx = 0; - *ecx = 0; - *edx = 0; -#ifdef __GNUC__ -#if defined(__i386__) || defined(__x86_64__) - *eax = func; - __asm volatile - ( - "mov %%ebx, %%edi;" - "cpuid;" - "mov %%ebx, %%esi;" - "mov %%edi, %%ebx;" - :"+a" (*eax), "=S" (*ebx), "=c" (*ecx), "=d" (*edx) - : :"edi" - ); -#endif +#ifdef RFX_USE_ACCEL_X86 +#include "x86/funcs_x86.h" #endif -} - -#if 0 -inline unsigned int get_cpu_feature_flags() -{ - unsigned int features; - - __asm - { - // Save registers - push eax - push ebx - push ecx - push edx - - // Get the feature flags (eax=1) from edx - mov eax, 1 - cpuid - mov features, edx - - // Restore registers - pop edx - pop ecx - pop ebx - pop eax - } - - return features; -} - -#define cpuid(func,a,b,c,d)\ - asm {\ - mov eax, func\ - cpuid\ - mov a, eax\ - mov b, ebx\ - mov c, ecx\ - mov d, edx\ - } -#endif - -// http://softpixel.com/~cwright/programming/simd/cpuid.php - -#define SSE4_1_FLAG 0x080000 -#define SSE4_2_FLAG 0x100000 - -/* -Function 0x80000001: -bit (edx) feature -22 AMD MMX Extensions -30 3DNow!2 -31 3DNow! 
-*/ - -#if 0 -#define cpuid(_func, _ax, _bx, _cx, _dx) \ - __asm volatile ("cpuid": \ - "=a" (_ax), "=b" (_bx), "=c" (_cx), "=d" (_dx) : "a" (_func)); +#ifdef RFX_USE_ACCEL_AMD64 +#include "amd64/funcs_amd64.h" #endif /******************************************************************************/ -void * -rfxcodec_encode_create(int width, int height, int format, int flags) +int +rfxcodec_encode_create_ex(int width, int height, int format, int flags, + void **handle) { struct rfxencode *enc; - int ax, bx, cx, dx; + int ax; + int bx; + int cx; + int dx; enc = (struct rfxencode *) malloc(sizeof(struct rfxencode)); if (enc == 0) { - return 0; + return 1; } memset(enc, 0, sizeof(struct rfxencode)); - cpuid(1, &ax, &bx, &cx, &dx); + + enc->dwt_buffer = (sint16*)(((size_t)(enc->dwt_buffer_a)) & ~15); + enc->dwt_buffer1 = (sint16*)(((size_t)(enc->dwt_buffer1_a)) & ~15); + enc->dwt_buffer2 = (sint16*)(((size_t)(enc->dwt_buffer2_a)) & ~15); + +#if defined(RFX_USE_ACCEL_X86) + cpuid_x86(1, 0, &ax, &bx, &cx, &dx); +#elif defined(RFX_USE_ACCEL_AMD64) + cpuid_amd64(1, 0, &ax, &bx, &cx, &dx); +#else + ax = 0; + bx = 0; + cx = 0; + dx = 0; +#endif if (dx & (1 << 26)) /* SSE 2 */ { printf("rfxcodec_encode_create: got sse2\n"); @@ -150,7 +93,16 @@ rfxcodec_encode_create(int width, int height, int format, int flags) printf("rfxcodec_encode_create: got popcnt\n"); enc->got_popcnt = 1; } - cpuid(0x80000001, &ax, &bx, &cx, &dx); +#if defined(RFX_USE_ACCEL_X86) + cpuid_x86(0x80000001, 0, &ax, &bx, &cx, &dx); +#elif defined(RFX_USE_ACCEL_AMD64) + cpuid_amd64(0x80000001, 0, &ax, &bx, &cx, &dx); +#else + ax = 0; + bx = 0; + cx = 0; + dx = 0; +#endif if (cx & (1 << 5)) /* lzcnt */ { printf("rfxcodec_encode_create: got lzcnt\n"); @@ -169,7 +121,7 @@ rfxcodec_encode_create(int width, int height, int format, int flags) { enc->mode = RLGR1; } - switch (format) + switch (format) { case RFX_FORMAT_BGRA: enc->bits_per_pixel = 32; @@ -188,7 +140,7 @@ rfxcodec_encode_create(int width, int height, 
int format, int flags) break; default: free(enc); - return NULL; + return 2; } enc->format = format; /* assign encoding functions */ @@ -196,29 +148,133 @@ rfxcodec_encode_create(int width, int height, int format, int flags) { if (enc->mode == RLGR3) { + printf("rfxcodec_encode_create: rfx_encode set to rfx_encode_component_rlgr3\n"); enc->rfx_encode = rfx_encode_component_rlgr3; /* rfxencode_tile.c */ } else { + printf("rfxcodec_encode_create: rfx_encode set to rfx_encode_component_rlgr1\n"); enc->rfx_encode = rfx_encode_component_rlgr1; /* rfxencode_tile.c */ } } else { -#if defined(RFX_USE_ACCEL) && RFX_USE_ACCEL - enc->rfx_encode = rfx_encode_component_x86_sse4; /* rfxencode_tile.c */ +#if defined(RFX_USE_ACCEL_X86) + if (enc->got_sse41) + { + if (enc->mode == RLGR3) + { + printf("rfxcodec_encode_create: rfx_encode set to rfx_encode_component_rlgr3_x86_sse41\n"); + enc->rfx_encode = rfx_encode_component_rlgr3_x86_sse41; /* rfxencode_tile.c */ + } + else + { + printf("rfxcodec_encode_create: rfx_encode set to rfx_encode_component_rlgr1_x86_sse41\n"); + enc->rfx_encode = rfx_encode_component_rlgr1_x86_sse41; /* rfxencode_tile.c */ + } + } + else if (enc->got_sse2) + { + if (enc->mode == RLGR3) + { + printf("rfxcodec_encode_create: rfx_encode set to rfx_encode_component_rlgr3_x86_sse2\n"); + enc->rfx_encode = rfx_encode_component_rlgr3_x86_sse2; /* rfxencode_tile.c */ + } + else + { + printf("rfxcodec_encode_create: rfx_encode set to rfx_encode_component_rlgr1_x86_sse2\n"); + enc->rfx_encode = rfx_encode_component_rlgr1_x86_sse2; /* rfxencode_tile.c */ + } + } + else + { + if (enc->mode == RLGR3) + { + printf("rfxcodec_encode_create: rfx_encode set to rfx_encode_component_rlgr3\n"); + enc->rfx_encode = rfx_encode_component_rlgr3; /* rfxencode_tile.c */ + } + else + { + printf("rfxcodec_encode_create: rfx_encode set to rfx_encode_component_rlgr1\n"); + enc->rfx_encode = rfx_encode_component_rlgr1; /* rfxencode_tile.c */ + } + } +#elif defined(RFX_USE_ACCEL_AMD64) + 
if (enc->got_sse41) + { + if (enc->mode == RLGR3) + { + printf("rfxcodec_encode_create: rfx_encode set to rfx_encode_component_rlgr3_amd64_sse41\n"); + enc->rfx_encode = rfx_encode_component_rlgr3_amd64_sse41; /* rfxencode_tile.c */ + } + else + { + printf("rfxcodec_encode_create: rfx_encode set to rfx_encode_component_rlgr1_amd64_sse41\n"); + enc->rfx_encode = rfx_encode_component_rlgr1_amd64_sse41; /* rfxencode_tile.c */ + } + } + else if (enc->got_sse2) + { + if (enc->mode == RLGR3) + { + printf("rfxcodec_encode_create: rfx_encode set to rfx_encode_component_rlgr3_amd64_sse2\n"); + enc->rfx_encode = rfx_encode_component_rlgr3_amd64_sse2; /* rfxencode_tile.c */ + } + else + { + printf("rfxcodec_encode_create: rfx_encode set to rfx_encode_component_rlgr1_amd64_sse2\n"); + enc->rfx_encode = rfx_encode_component_rlgr1_amd64_sse2; /* rfxencode_tile.c */ + } + } + else + { + if (enc->mode == RLGR3) + { + printf("rfxcodec_encode_create: rfx_encode set to rfx_encode_component_rlgr3\n"); + enc->rfx_encode = rfx_encode_component_rlgr3; /* rfxencode_tile.c */ + } + else + { + printf("rfxcodec_encode_create: rfx_encode set to rfx_encode_component_rlgr1\n"); + enc->rfx_encode = rfx_encode_component_rlgr1; /* rfxencode_tile.c */ + } + } #else if (enc->mode == RLGR3) { + printf("rfxcodec_encode_create: rfx_encode set to rfx_encode_component_rlgr3\n"); enc->rfx_encode = rfx_encode_component_rlgr3; /* rfxencode_tile.c */ } else { + printf("rfxcodec_encode_create: rfx_encode set to rfx_encode_component_rlgr1\n"); enc->rfx_encode = rfx_encode_component_rlgr1; /* rfxencode_tile.c */ } #endif } - return enc; + if (ax == 0) + { + } + if (bx == 0) + { + } + *handle = enc; + return 0; +} + +/******************************************************************************/ +void * +rfxcodec_encode_create(int width, int height, int format, int flags) +{ + int error; + void *handle; + + error = rfxcodec_encode_create_ex(width, height, format, flags, &handle); + if (error == 0) + { + return 
handle; + } + return 0; } /******************************************************************************/ @@ -238,11 +294,11 @@ rfxcodec_encode_destroy(void * handle) /******************************************************************************/ int -rfxcodec_encode(void *handle, char *cdata, int *cdata_bytes, - char *buf, int width, int height, int stride_bytes, - struct rfx_rect *regions, int num_regions, - struct rfx_tile *tiles, int num_tiles, - const int *quants, int num_quants) +rfxcodec_encode_ex(void *handle, char *cdata, int *cdata_bytes, + char *buf, int width, int height, int stride_bytes, + const struct rfx_rect *regions, int num_regions, + const struct rfx_tile *tiles, int num_tiles, + const char *quants, int num_quants, int flags) { struct rfxencode *enc; STREAM s; @@ -263,10 +319,25 @@ rfxcodec_encode(void *handle, char *cdata, int *cdata_bytes, } if (rfx_compose_message_data(enc, &s, regions, num_regions, buf, width, height, stride_bytes, - tiles, num_tiles, quants, num_quants) != 0) + tiles, num_tiles, quants, num_quants, + flags) != 0) { return 1; } *cdata_bytes = (int) (s.p - s.data); return 0; } + +/******************************************************************************/ +int +rfxcodec_encode(void *handle, char *cdata, int *cdata_bytes, + char *buf, int width, int height, int stride_bytes, + const struct rfx_rect *regions, int num_regions, + const struct rfx_tile *tiles, int num_tiles, + const char *quants, int num_quants) +{ + return rfxcodec_encode_ex(handle, cdata, cdata_bytes, buf, width, height, + stride_bytes, regions, num_regions, tiles, + num_tiles, quants, num_quants, 0); +} + diff --git a/src/rfxencode.h b/src/rfxencode.h index 4db6a01..c9fc5d0 100644 --- a/src/rfxencode.h +++ b/src/rfxencode.h @@ -1,7 +1,7 @@ /** * RFX codec encoder * - * Copyright 2014 Jay Sorg + * Copyright 2014-2015 Jay Sorg * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the 
License. @@ -21,8 +21,7 @@ struct rfxencode; -typedef int (*rfx_encode_proc)(struct rfxencode *enc, - const int *quantization_values, +typedef int (*rfx_encode_proc)(struct rfxencode *enc, const char *qtable, uint8 *data, uint8 *buffer, int buffer_size, int *size); @@ -39,13 +38,18 @@ struct rfxencode int format; int pad0[7]; + uint8 a_buffer[4096]; uint8 y_r_buffer[4096]; - uint8 cb_g_buffer[4096]; - uint8 cr_b_buffer[4096]; - - sint16 dwt_buffer[4096]; - sint16 dwt_buffer1[4096]; - + uint8 u_g_buffer[4096]; + uint8 v_b_buffer[4096]; + uint8 pad1[16]; + sint16 dwt_buffer_a[4096]; + sint16 dwt_buffer1_a[4096]; + sint16 dwt_buffer2_a[4096]; + uint8 pad2[16]; + sint16* dwt_buffer; + sint16* dwt_buffer1; + sint16* dwt_buffer2; rfx_encode_proc rfx_encode; int got_sse2; @@ -56,7 +60,6 @@ struct rfxencode int got_popcnt; int got_lzcnt; int got_neon; - }; #endif diff --git a/src/rfxencode_alpha.c b/src/rfxencode_alpha.c new file mode 100644 index 0000000..58d8e10 --- /dev/null +++ b/src/rfxencode_alpha.c @@ -0,0 +1,279 @@ +/** + * librfxcodec: A Remote Desktop Protocol client. + * RemoteFX Codec Library + * + * Copyright 2015 Jay Sorg + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include +#include +#include + +#include + +#include "rfxcommon.h" +#include "rfxencode.h" +#include "rfxconstants.h" +#include "rfxencode_tile.h" + +#define LLOG_LEVEL 1 +#define LLOGLN(_level, _args) \ + do { if (_level < LLOG_LEVEL) { printf _args ; printf("\n"); } } while (0) + +#if 1 +/*****************************************************************************/ +static int +fdelta(char *in_plane, char *out_plane, int cx, int cy) +{ + char delta; + char *src8; + char *dst8; + int index; + int jndex; + + memcpy(out_plane, in_plane, cx); + src8 = in_plane; + dst8 = out_plane; + for (jndex = 1; jndex < cy; jndex++) + { + for (index = 0; index < cx; index++) + { + delta = src8[cx] - src8[0]; + if (delta & 0x80) + { + delta = (((~delta) + 1) << 1) - 1; + } + else + { + delta = delta << 1; + } + dst8[cx] = delta; + src8++; + dst8++; + } + } + return 0; +} +#endif + +#if 0 +/*****************************************************************************/ +#define DELTA_ONE \ +do { \ + delta = src8[cx] - src8[0]; \ + is_neg = (delta >> 7) & 1; \ + dst8[cx] = (((delta ^ -is_neg) + is_neg) << 1) - is_neg; \ + src8++; \ + dst8++; \ +} while (0) + +/*****************************************************************************/ +static int +fdelta(char *in_plane, char *out_plane, int cx, int cy) +{ + char delta; + char is_neg; + char *src8; + char *dst8; + char *src8_end; + + memcpy(out_plane, in_plane, cx); + src8 = in_plane; + dst8 = out_plane; + src8_end = src8 + (cx * cy - cx); + while (src8 + 8 <= src8_end) + { + DELTA_ONE; + DELTA_ONE; + DELTA_ONE; + DELTA_ONE; + DELTA_ONE; + DELTA_ONE; + DELTA_ONE; + DELTA_ONE; + } + while (src8 < src8_end) + { + DELTA_ONE; + } + return 0; +} +#endif + +/*****************************************************************************/ +static int +fout(int collen, int replen, char *colptr, STREAM *s) +{ + int code; + int lcollen; + int lreplen; + int cont; + + LLOGLN(10, ("fout: collen %d replen %d", collen, replen)); + cont = 
collen > 13; + while (cont) + { + lcollen = collen; + if (lcollen > 15) + { + lcollen = 15; + } + code = lcollen << 4; + stream_write_uint8(s, code); + memcpy(s->p, colptr, lcollen); + s->p += lcollen; + colptr += lcollen; + collen -= lcollen; + cont = collen > 13; + } + cont = (collen > 0) || (replen > 0); + while (cont) + { + lreplen = replen; + if ((collen == 0) && (lreplen > 15)) + { + /* big run */ + if (lreplen > 47) + { + lreplen = 47; + } + LLOGLN(10, ("fout: big run lreplen %d", lreplen)); + replen -= lreplen; + code = ((lreplen & 0xF) << 4) | ((lreplen & 0xF0) >> 4); + stream_write_uint8(s, code); + colptr += lreplen; + } + else + { + if (lreplen > 15) + { + lreplen = 15; + } + replen -= lreplen; + if (lreplen < 3) + { + collen += lreplen; + lreplen = 0; + } + code = (collen << 4) | lreplen; + stream_write_uint8(s, code); + memcpy(s->p, colptr, collen); + s->p += collen; + colptr += collen + lreplen; + collen = 0; + } + cont = replen > 0; + } + return 0; +} + +/*****************************************************************************/ +static int +fpack(char *plane, int cx, int cy, STREAM *s) +{ + char *ptr8; + char *colptr; + char *lend; + uint8 *holdp; + int jndex; + int collen; + int replen; + + LLOGLN(10, ("fpack:")); + holdp = s->p; + for (jndex = 0; jndex < cy; jndex++) + { + LLOGLN(10, ("line start line %d cx %d cy %d", jndex, cx, cy)); + ptr8 = (char *) (plane + jndex * cx); + lend = ptr8 + (cx - 1); + colptr = ptr8; + if (colptr[0] == 0) + { + collen = 0; + replen = 1; + } + else + { + collen = 1; + replen = 0; + } + while (ptr8 < lend) + { + if (ptr8[0] == ptr8[1]) + { + replen++; + } + else + { + if (replen > 0) + { + if (replen < 3) + { + collen += replen + 1; + replen = 0; + } + else + { + fout(collen, replen, colptr, s); + colptr = ptr8 + 1; + replen = 0; + collen = 1; + } + } + else + { + collen++; + } + } + ptr8++; + } + /* end of line */ + fout(collen, replen, colptr, s); + } + return (int) (s->p - holdp); +} + 
+/*****************************************************************************/ +int +rfx_encode_plane(struct rfxencode *enc, uint8 *plane, int cx, int cy, + STREAM *s) +{ + char *org_plane; + char *delta_plane; + int bytes; + uint8 *holdp; + + org_plane = (char *) plane; + delta_plane = (char *) (enc->dwt_buffer1); + fdelta(org_plane, delta_plane, cx, cy); + holdp = s->p; + stream_write_uint8(s, 0x10); /* flags, RLE */ + bytes = fpack(delta_plane, cx, cy, s); + if (bytes > cx * cy) + { + LLOGLN(10, ("rfx_encode_plane: too big bytes %d", bytes)); + s->p = holdp; + stream_write_uint8(s, 0); /* flags */ + memcpy(s->p, plane, cx * cy); + s->p += cx * cy; + stream_write_uint8(s, 0); /* pad if not RLE */ + bytes = cx * cy + 2; + } + else + { + LLOGLN(10, ("rfx_encode_plane: ok bytes %d", bytes)); + } + return bytes; +} diff --git a/src/rfxencode_alpha.h b/src/rfxencode_alpha.h new file mode 100644 index 0000000..3f01218 --- /dev/null +++ b/src/rfxencode_alpha.h @@ -0,0 +1,28 @@ +/** + * librfxcodec: A Remote Desktop Protocol client. + * RemoteFX Codec Library + * + * Copyright 2015 Jay Sorg + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#ifndef __RFXCODEC_ENCODE_ALPHA_H +#define __RFXCODEC_ENCODE_ALPHA_H + +int +rfx_encode_plane(struct rfxencode *enc, uint8 *plane, int cx, int cy, + STREAM *s); + +#endif + diff --git a/src/rfxencode_dwt.c b/src/rfxencode_dwt.c index b68b765..36c8e93 100644 --- a/src/rfxencode_dwt.c +++ b/src/rfxencode_dwt.c @@ -3,7 +3,7 @@ * RemoteFX Codec Library - DWT * * Copyright 2011 Vic Lee - * Copyright 2014 Jay Sorg + * Copyright 2014-2015 Jay Sorg * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -151,6 +151,7 @@ rfx_dwt_2d_encode_block8(uint8 *in_buffer, { uint8 *src; sint16 *l, *h; + sint16 s1, s2, s3; int total_width; int x, y; int n; @@ -166,8 +167,12 @@ rfx_dwt_2d_encode_block8(uint8 *in_buffer, l = dwt + x; h = l + subband_width * total_width; src = in_buffer + x; - *h = ((src[total_width] - 128) - (((src[0] - 128) + (src[2 * total_width] - 128)) >> 1)) >> 1; - *l = (src[0] - 128) + *h; + s1 = (src[total_width] - 128) << DWT_FACTOR; + s2 = (src[0] - 128) << DWT_FACTOR; + s3 = (src[2 * total_width] - 128) << DWT_FACTOR; + *h = (s1 - ((s2 + s3) >> 1)) >> 1; + s1 = (src[0] - 128) << DWT_FACTOR; + *l = s1 + *h; /* loop */ for (n = 1; n < subband_width - 1; n++) @@ -176,8 +181,12 @@ rfx_dwt_2d_encode_block8(uint8 *in_buffer, l = dwt + n * total_width + x; h = l + subband_width * total_width; src = in_buffer + y * total_width + x; - *h = ((src[total_width] - 128) - (((src[0] - 128) + (src[2 * total_width] - 128)) >> 1)) >> 1; - *l = (src[0] - 128) + ((*(h - total_width) + *h) >> 1); + s1 = (src[total_width] - 128) << DWT_FACTOR; + s2 = (src[0] - 128) << DWT_FACTOR; + s3 = (src[2 * total_width] - 128) << DWT_FACTOR; + *h = (s1 - ((s2 + s3) >> 1)) >> 1; + s1 = (src[0] - 128) << DWT_FACTOR; + *l = s1 + ((*(h - total_width) + *h) >> 1); } /* post */ @@ -186,8 +195,12 @@ rfx_dwt_2d_encode_block8(uint8 *in_buffer, l = dwt + n * total_width + x; h = l + subband_width * total_width; 
src = in_buffer + y * total_width + x; - *h = ((src[total_width] - 128) - (((src[0] - 128) + (src[0] - 128)) >> 1)) >> 1; - *l = (src[0] - 128) + ((*(h - total_width) + *h) >> 1); + s1 = (src[total_width] - 128) << DWT_FACTOR; + s2 = (src[0] - 128) << DWT_FACTOR; + s3 = (src[0] - 128) << DWT_FACTOR; + *h = (s1 - ((s2 + s3) >> 1)) >> 1; + s1 = (src[0] - 128) << DWT_FACTOR; + *l = s1 + ((*(h - total_width) + *h) >> 1); } diff --git a/src/rfxencode_dwt.h b/src/rfxencode_dwt.h index 248edc1..36a62ed 100644 --- a/src/rfxencode_dwt.h +++ b/src/rfxencode_dwt.h @@ -1,7 +1,7 @@ /** * RFX codec encoder * - * Copyright 2014 Jay Sorg + * Copyright 2014-2015 Jay Sorg * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. diff --git a/src/rfxencode_quantization.c b/src/rfxencode_quantization.c index 9c65b40..6e3a577 100644 --- a/src/rfxencode_quantization.c +++ b/src/rfxencode_quantization.c @@ -3,7 +3,7 @@ * RemoteFX Codec Library - Quantization * * Copyright 2011 Vic Lee - * Copyright 2014 Jay Sorg + * Copyright 2014-2015 Jay Sorg * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
@@ -88,7 +88,7 @@ rfx_quantization_encode_block(sint16* buffer, int buffer_size, uint32 factor) } #endif -#if 1 +#if 0 /******************************************************************************/ static int rfx_quantization_encode_block(sint16* buffer, int buffer_size, uint32 factor) @@ -110,20 +110,54 @@ rfx_quantization_encode_block(sint16* buffer, int buffer_size, uint32 factor) } #endif +#if 1 +/******************************************************************************/ +static int +rfx_quantization_encode_block(sint16* buffer, int buffer_size, uint32 factor) +{ + sint16* dst; + sint16 half; + + factor += DWT_FACTOR; + if (factor == 0) + { + return 1; + } + half = (1 << (factor - 1)); + for (dst = buffer; buffer_size > 0; dst++, buffer_size--) + { + *dst = (*dst + half) >> factor; + } + return 0; +} +#endif + /******************************************************************************/ int -rfx_quantization_encode(sint16* buffer, const int* quantization_values) +rfx_quantization_encode(sint16* buffer, const char* qtable) { - rfx_quantization_encode_block(buffer, 1024, quantization_values[8] - 6); /* HL1 */ - rfx_quantization_encode_block(buffer + 1024, 1024, quantization_values[7] - 6); /* LH1 */ - rfx_quantization_encode_block(buffer + 2048, 1024, quantization_values[9] - 6); /* HH1 */ - rfx_quantization_encode_block(buffer + 3072, 256, quantization_values[5] - 6); /* HL2 */ - rfx_quantization_encode_block(buffer + 3328, 256, quantization_values[4] - 6); /* LH2 */ - rfx_quantization_encode_block(buffer + 3584, 256, quantization_values[6] - 6); /* HH2 */ - rfx_quantization_encode_block(buffer + 3840, 64, quantization_values[2] - 6); /* HL3 */ - rfx_quantization_encode_block(buffer + 3904, 64, quantization_values[1] - 6); /* LH3 */ - rfx_quantization_encode_block(buffer + 3968, 64, quantization_values[3] - 6); /* HH3 */ - rfx_quantization_encode_block(buffer + 4032, 64, quantization_values[0] - 6); /* LL3 */ + uint32 factor; + + factor = ((qtable[4] 
>> 0) & 0xf) - 6; + rfx_quantization_encode_block(buffer, 1024, factor); /* HL1 */ + factor = ((qtable[3] >> 4) & 0xf) - 6; + rfx_quantization_encode_block(buffer + 1024, 1024, factor); /* LH1 */ + factor = ((qtable[4] >> 4) & 0xf) - 6; + rfx_quantization_encode_block(buffer + 2048, 1024, factor); /* HH1 */ + factor = ((qtable[2] >> 4) & 0xf) - 6; + rfx_quantization_encode_block(buffer + 3072, 256, factor); /* HL2 */ + factor = ((qtable[2] >> 0) & 0xf) - 6; + rfx_quantization_encode_block(buffer + 3328, 256, factor); /* LH2 */ + factor = ((qtable[3] >> 0) & 0xf) - 6; + rfx_quantization_encode_block(buffer + 3584, 256, factor); /* HH2 */ + factor = ((qtable[1] >> 0) & 0xf) - 6; + rfx_quantization_encode_block(buffer + 3840, 64, factor); /* HL3 */ + factor = ((qtable[0] >> 4) & 0xf) - 6; + rfx_quantization_encode_block(buffer + 3904, 64, factor); /* LH3 */ + factor = ((qtable[1] >> 4) & 0xf) - 6; + rfx_quantization_encode_block(buffer + 3968, 64, factor); /* HH3 */ + factor = ((qtable[0] >> 0) & 0xf) - 6; + rfx_quantization_encode_block(buffer + 4032, 64, factor); /* LL3 */ return 0; } diff --git a/src/rfxencode_quantization.h b/src/rfxencode_quantization.h index a0cd802..d246889 100644 --- a/src/rfxencode_quantization.h +++ b/src/rfxencode_quantization.h @@ -23,6 +23,6 @@ #include "rfxcommon.h" int -rfx_quantization_encode(sint16* buffer, const int* quantization_values); +rfx_quantization_encode(sint16 *buffer, const char *quantization_values); #endif /* __RFX_QUANTIZATION_H */ diff --git a/src/rfxencode_rlgr1.c b/src/rfxencode_rlgr1.c index e4b9867..638b535 100644 --- a/src/rfxencode_rlgr1.c +++ b/src/rfxencode_rlgr1.c @@ -124,7 +124,7 @@ do { \ } while (0) int -rfx_rlgr1_encode(const sint16* data, int data_size, uint8* buffer, int buffer_size) +rfx_rlgr1_encode(const sint16* data, uint8* buffer, int buffer_size) { int k; int kp; @@ -137,6 +137,7 @@ rfx_rlgr1_encode(const sint16* data, int data_size, uint8* buffer, int buffer_si int sign; int processed_size; int 
lmag; + int data_size; RFX_BITSTREAM bs; @@ -150,6 +151,7 @@ rfx_rlgr1_encode(const sint16* data, int data_size, uint8* buffer, int buffer_si krp = 1 << LSGR; /* process all the input coefficients */ + data_size = 4096; while (data_size > 0) { if (k) diff --git a/src/rfxencode_rlgr1.h b/src/rfxencode_rlgr1.h index a08e637..f941e06 100644 --- a/src/rfxencode_rlgr1.h +++ b/src/rfxencode_rlgr1.h @@ -23,6 +23,6 @@ #include "rfxcommon.h" int -rfx_rlgr1_encode(const sint16* data, int data_size, uint8* buffer, int buffer_size); +rfx_rlgr1_encode(const sint16* data, uint8* buffer, int buffer_size); #endif /* __RFX_RLGR_H */ diff --git a/src/rfxencode_rlgr3.c b/src/rfxencode_rlgr3.c index 3b1666d..809767d 100644 --- a/src/rfxencode_rlgr3.c +++ b/src/rfxencode_rlgr3.c @@ -124,7 +124,7 @@ do { \ } while (0) int -rfx_rlgr3_encode(const sint16* data, int data_size, uint8* buffer, int buffer_size) +rfx_rlgr3_encode(const sint16* data, uint8* buffer, int buffer_size) { int k; int kp; @@ -137,6 +137,7 @@ rfx_rlgr3_encode(const sint16* data, int data_size, uint8* buffer, int buffer_si int sign; int processed_size; int lmag; + int data_size; RFX_BITSTREAM bs; @@ -153,6 +154,7 @@ rfx_rlgr3_encode(const sint16* data, int data_size, uint8* buffer, int buffer_si krp = 1 << LSGR; /* process all the input coefficients */ + data_size = 4096; while (data_size > 0) { if (k) diff --git a/src/rfxencode_rlgr3.h b/src/rfxencode_rlgr3.h index 1efdc4c..2743e39 100644 --- a/src/rfxencode_rlgr3.h +++ b/src/rfxencode_rlgr3.h @@ -23,6 +23,6 @@ #include "rfxcommon.h" int -rfx_rlgr3_encode(const sint16* data, int data_size, uint8* buffer, int buffer_size); +rfx_rlgr3_encode(const sint16* data, uint8* buffer, int buffer_size); #endif /* __RFX_RLGR_H */ diff --git a/src/rfxencode_tile.c b/src/rfxencode_tile.c index 409121c..e78b746 100644 --- a/src/rfxencode_tile.c +++ b/src/rfxencode_tile.c @@ -3,7 +3,7 @@ * RemoteFX Codec Library - Encode * * Copyright 2011 Vic Lee - * Copyright 2014 Jay Sorg + * 
Copyright 2014-2015 Jay Sorg * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -33,6 +33,15 @@ #include "rfxencode_differential.h" #include "rfxencode_rlgr1.h" #include "rfxencode_rlgr3.h" +#include "rfxencode_alpha.h" + +#ifdef RFX_USE_ACCEL_X86 +#include "x86/funcs_x86.h" +#endif + +#ifdef RFX_USE_ACCEL_AMD64 +#include "amd64/funcs_amd64.h" +#endif #define LLOG_LEVEL 1 #define LLOGLN(_level, _args) \ @@ -50,69 +59,337 @@ rfx_encode_format_rgb(char *rgb_data, int width, int height, uint8 r; uint8 g; uint8 b; + uint8 *lr_buf; + uint8 *lg_buf; + uint8 *lb_buf; + LLOGLN(10, ("rfx_encode_format_rgb: pixel_format %d", pixel_format)); + b = 0; + g = 0; + r = 0; switch (pixel_format) { case RFX_FORMAT_BGRA: for (y = 0; y < height; y++) { src = (uint8*) (rgb_data + y * stride_bytes); + lr_buf = r_buf + y * 64; + lg_buf = g_buf + y * 64; + lb_buf = b_buf + y * 64; for (x = 0; x < width; x++) { b = *src++; - *b_buf++ = b; + *lb_buf++ = b; g = *src++; - *g_buf++ = g; + *lg_buf++ = g; r = *src++; - *r_buf++ = r; + *lr_buf++ = r; src++; } + while (x < 64) + { + *lr_buf++ = r; + *lg_buf++ = g; + *lb_buf++ = b; /* pad the blue plane with the last blue sample */ + x++; + } + } + while (y < 64) + { + lr_buf = r_buf + y * 64; + lg_buf = g_buf + y * 64; + lb_buf = b_buf + y * 64; + memcpy(lr_buf, lr_buf - 64, 64); + memcpy(lg_buf, lg_buf - 64, 64); + memcpy(lb_buf, lb_buf - 64, 64); + y++; } break; case RFX_FORMAT_RGBA: for (y = 0; y < height; y++) { src = (uint8*) (rgb_data + y * stride_bytes); + lr_buf = r_buf + y * 64; + lg_buf = g_buf + y * 64; + lb_buf = b_buf + y * 64; for (x = 0; x < width; x++) { r = *src++; - *r_buf++ = r; + *lr_buf++ = r; g = *src++; - *g_buf++ = g; + *lg_buf++ = g; b = *src++; - *b_buf++ = b; + *lb_buf++ = b; src++; } + while (x < 64) + { + *lr_buf++ = r; + *lg_buf++ = g; + *lb_buf++ = b; + x++; + } + } + while (y < 64) + { + lr_buf = r_buf + y * 64; + lg_buf = g_buf + y * 64; + lb_buf = b_buf + y * 64; +
memcpy(lr_buf, lr_buf - 64, 64); + memcpy(lg_buf, lg_buf - 64, 64); + memcpy(lb_buf, lb_buf - 64, 64); + y++; } break; case RFX_FORMAT_BGR: for (y = 0; y < height; y++) { src = (uint8*) (rgb_data + y * stride_bytes); + lr_buf = r_buf + y * 64; + lg_buf = g_buf + y * 64; + lb_buf = b_buf + y * 64; for (x = 0; x < width; x++) { b = *src++; - *b_buf++ = b; + *lb_buf++ = b; g = *src++; - *g_buf++ = g; + *lg_buf++ = g; r = *src++; - *r_buf++ = r; + *lr_buf++ = r; + } + while (x < 64) + { + *lr_buf++ = r; + *lg_buf++ = g; + *lb_buf++ = b; + x++; } } + while (y < 64) + { + lr_buf = r_buf + y * 64; + lg_buf = g_buf + y * 64; + lb_buf = b_buf + y * 64; + memcpy(lr_buf, lr_buf - 64, 64); + memcpy(lg_buf, lg_buf - 64, 64); + memcpy(lb_buf, lb_buf - 64, 64); + y++; + } break; case RFX_FORMAT_RGB: for (y = 0; y < height; y++) { src = (uint8*) (rgb_data + y * stride_bytes); + lr_buf = r_buf + y * 64; + lg_buf = g_buf + y * 64; + lb_buf = b_buf + y * 64; + for (x = 0; x < width; x++) + { + r = *src++; + *lr_buf++ = r; + g = *src++; + *lg_buf++ = g; + b = *src++; + *lb_buf++ = b; + } + while (x < 64) + { + *lr_buf++ = r; + *lg_buf++ = g; + *lb_buf++ = b; + x++; + } + } + while (y < 64) + { + lr_buf = r_buf + y * 64; + lg_buf = g_buf + y * 64; + lb_buf = b_buf + y * 64; + memcpy(lr_buf, lr_buf - 64, 64); + memcpy(lg_buf, lg_buf - 64, 64); + memcpy(lb_buf, lb_buf - 64, 64); + y++; + } + break; + } + return 0; +} + +/******************************************************************************/ +static int +rfx_encode_format_argb(char *argb_data, int width, int height, + int stride_bytes, int pixel_format, + uint8 *a_buf, uint8 *r_buf, uint8 *g_buf, uint8 *b_buf) +{ + int x; + int y; + const uint8 *src; + uint8 a; + uint8 r; + uint8 g; + uint8 b; + uint8 *la_buf; + uint8 *lr_buf; + uint8 *lg_buf; + uint8 *lb_buf; + + LLOGLN(10, ("rfx_encode_format_argb: pixel_format %d", pixel_format)); + b = 0; + g = 0; + r = 0; + a = 0; + switch (pixel_format) + { + case RFX_FORMAT_BGRA: + for (y 
= 0; y < height; y++) + { + src = (uint8*) (argb_data + y * stride_bytes); + la_buf = a_buf + y * 64; + lr_buf = r_buf + y * 64; + lg_buf = g_buf + y * 64; + lb_buf = b_buf + y * 64; + for (x = 0; x < width; x++) + { + b = *src++; + *lb_buf++ = b; + g = *src++; + *lg_buf++ = g; + r = *src++; + *lr_buf++ = r; + a = *src++; + *la_buf++ = a; + } + while (x < 64) + { + *la_buf++ = a; + *lr_buf++ = r; + *lg_buf++ = g; + *lb_buf++ = b; /* pad the blue plane with the last blue sample */ + x++; + } + } + while (y < 64) + { + la_buf = a_buf + y * 64; + lr_buf = r_buf + y * 64; + lg_buf = g_buf + y * 64; + lb_buf = b_buf + y * 64; + memcpy(la_buf, la_buf - 64, 64); + memcpy(lr_buf, lr_buf - 64, 64); + memcpy(lg_buf, lg_buf - 64, 64); + memcpy(lb_buf, lb_buf - 64, 64); + y++; + } + break; + case RFX_FORMAT_RGBA: + for (y = 0; y < height; y++) + { + src = (uint8*) (argb_data + y * stride_bytes); + la_buf = a_buf + y * 64; + lr_buf = r_buf + y * 64; + lg_buf = g_buf + y * 64; + lb_buf = b_buf + y * 64; + for (x = 0; x < width; x++) + { + r = *src++; + *lr_buf++ = r; + g = *src++; + *lg_buf++ = g; + b = *src++; + *lb_buf++ = b; + a = *src++; + *la_buf++ = a; + } + while (x < 64) + { + *la_buf++ = a; + *lr_buf++ = r; + *lg_buf++ = g; + *lb_buf++ = b; + x++; + } + } + while (y < 64) + { + la_buf = a_buf + y * 64; + lr_buf = r_buf + y * 64; + lg_buf = g_buf + y * 64; + lb_buf = b_buf + y * 64; + memcpy(la_buf, la_buf - 64, 64); + memcpy(lr_buf, lr_buf - 64, 64); + memcpy(lg_buf, lg_buf - 64, 64); + memcpy(lb_buf, lb_buf - 64, 64); + y++; + } + break; + case RFX_FORMAT_BGR: + for (y = 0; y < height; y++) + { + src = (uint8*) (argb_data + y * stride_bytes); + lr_buf = r_buf + y * 64; + lg_buf = g_buf + y * 64; + lb_buf = b_buf + y * 64; for (x = 0; x < width; x++) { + b = *src++; + *lb_buf++ = b; + g = *src++; + *lg_buf++ = g; r = *src++; - *r_buf++ = r; + *lr_buf++ = r; + } + while (x < 64) + { + *lr_buf++ = r; + *lg_buf++ = g; + *lb_buf++ = b; + x++; + } + } + while (y < 64) + { + lr_buf = r_buf + y * 64; + lg_buf = g_buf + y * 64;
+ lb_buf = b_buf + y * 64; + memcpy(lr_buf, lr_buf - 64, 64); + memcpy(lg_buf, lg_buf - 64, 64); + memcpy(lb_buf, lb_buf - 64, 64); + y++; + } + break; + case RFX_FORMAT_RGB: + for (y = 0; y < height; y++) + { + src = (uint8*) (argb_data + y * stride_bytes); + lr_buf = r_buf + y * 64; + lg_buf = g_buf + y * 64; + lb_buf = b_buf + y * 64; + for (x = 0; x < width; x++) + { + r = *src++; + *lr_buf++ = r; g = *src++; - *g_buf++ = g; + *lg_buf++ = g; b = *src++; - *b_buf++ = b; + *lb_buf++ = b; } + while (x < 64) + { + *lr_buf++ = r; + *lg_buf++ = g; + *lb_buf++ = b; + x++; + } + } + while (y < 64) + { + lr_buf = r_buf + y * 64; + lg_buf = g_buf + y * 64; + lb_buf = b_buf + y * 64; + memcpy(lr_buf, lr_buf - 64, 64); + memcpy(lg_buf, lg_buf - 64, 64); + memcpy(lb_buf, lb_buf - 64, 64); + y++; } break; } @@ -131,25 +408,25 @@ rfx_encode_format_rgb(char *rgb_data, int width, int height, -11071 -21736 32807 32756 -27429 -5327 */ static int -rfx_encode_rgb_to_ycbcr(uint8 *y_r_buf, uint8 *cb_g_buf, uint8 *cr_b_buf) +rfx_encode_rgb_to_yuv(uint8 *y_r_buf, uint8 *u_g_buf, uint8 *v_b_buf) { int i; sint32 r, g, b; - sint32 y, cb, cr; + sint32 y, u, v; for (i = 0; i < 4096; i++) { r = y_r_buf[i]; - g = cb_g_buf[i]; - b = cr_b_buf[i]; + g = u_g_buf[i]; + b = v_b_buf[i]; - y = (r * 19595 + g * 38470 + b * 7471) >> 16; - cb = (r * -11071 + g * -21736 + b * 32807) >> 16; - cr = (r * 32756 + g * -27429 + b * -5327) >> 16; + y = (r * 19595 + g * 38470 + b * 7471) >> 16; + u = (r * -11071 + g * -21736 + b * 32807) >> 16; + v = (r * 32756 + g * -27429 + b * -5327) >> 16; y_r_buf[i] = MINMAX(y, 0, 255); - cb_g_buf[i] = MINMAX(cb + 128, 0, 255); - cr_b_buf[i] = MINMAX(cr + 128, 0, 255); + u_g_buf[i] = MINMAX(u + 128, 0, 255); + v_b_buf[i] = MINMAX(v + 128, 0, 255); } return 0; @@ -157,14 +434,15 @@ rfx_encode_rgb_to_ycbcr(uint8 *y_r_buf, uint8 *cb_g_buf, uint8 *cr_b_buf) /******************************************************************************/ int -rfx_encode_component_rlgr1(struct 
rfxencode *enc, const int *quantization_values, +rfx_encode_component_rlgr1(struct rfxencode *enc, const char *qtable, uint8 *data, uint8 *buffer, int buffer_size, int *size) { + LLOGLN(10, ("rfx_encode_component_rlgr1:")); if (rfx_dwt_2d_encode(data, enc->dwt_buffer1, enc->dwt_buffer) != 0) { return 1; } - if (rfx_quantization_encode(enc->dwt_buffer1, quantization_values) != 0) + if (rfx_quantization_encode(enc->dwt_buffer1, qtable) != 0) { return 1; } @@ -172,47 +450,220 @@ rfx_encode_component_rlgr1(struct rfxencode *enc, const int *quantization_values { return 1; } - *size = rfx_rlgr1_encode(enc->dwt_buffer1, 4096, buffer, buffer_size); + *size = rfx_rlgr1_encode(enc->dwt_buffer1, buffer, buffer_size); return 0; } /******************************************************************************/ int -rfx_encode_component_rlgr3(struct rfxencode *enc, const int *quantization_values, +rfx_encode_component_rlgr3(struct rfxencode *enc, const char *qtable, uint8 *data, uint8 *buffer, int buffer_size, int *size) { + LLOGLN(10, ("rfx_encode_component_rlgr3:")); if (rfx_dwt_2d_encode(data, enc->dwt_buffer1, enc->dwt_buffer) != 0) { return 1; } - if (rfx_quantization_encode(enc->dwt_buffer1, quantization_values) != 0) + if (rfx_quantization_encode(enc->dwt_buffer1, qtable) != 0) + { + return 1; + } + if (rfx_differential_encode(enc->dwt_buffer1 + 4032, 64) != 0) + { + return 1; + } + *size = rfx_rlgr3_encode(enc->dwt_buffer1, buffer, buffer_size); + return 0; +} + +/******************************************************************************/ +int +rfx_encode_component_rlgr1_x86_sse2(struct rfxencode *enc, const char *qtable, + uint8 *data, + uint8 *buffer, int buffer_size, int *size) +{ + LLOGLN(10, ("rfx_encode_component_rlgr1_x86_sse2:")); +#if defined(RFX_USE_ACCEL_X86) + if (rfxcodec_encode_dwt_shift_x86_sse2(qtable, data, enc->dwt_buffer1, + enc->dwt_buffer) != 0) + { + return 1; + } + //*size = rfxcodec_encode_diff_rlgr1_x86_sse2(enc->dwt_buffer1, + // buffer, 
buffer_size); + if (rfx_differential_encode(enc->dwt_buffer1 + 4032, 64) != 0) + { + return 1; + } + *size = rfx_rlgr1_encode(enc->dwt_buffer1, buffer, buffer_size); +#endif + return 0; +} + +/******************************************************************************/ +int +rfx_encode_component_rlgr3_x86_sse2(struct rfxencode *enc, const char *qtable, + uint8 *data, + uint8 *buffer, int buffer_size, int *size) +{ + LLOGLN(10, ("rfx_encode_component_rlgr3_x86_sse2:")); +#if defined(RFX_USE_ACCEL_X86) + if (rfxcodec_encode_dwt_shift_x86_sse2(qtable, data, enc->dwt_buffer1, + enc->dwt_buffer) != 0) + { + return 1; + } + //*size = rfxcodec_encode_diff_rlgr3_x86_sse2(enc->dwt_buffer1, + // buffer, buffer_size); + if (rfx_differential_encode(enc->dwt_buffer1 + 4032, 64) != 0) + { + return 1; + } + *size = rfx_rlgr3_encode(enc->dwt_buffer1, buffer, buffer_size); +#endif + return 0; +} + +/******************************************************************************/ +int +rfx_encode_component_rlgr1_x86_sse41(struct rfxencode *enc, const char *qtable, + uint8 *data, + uint8 *buffer, int buffer_size, int *size) +{ + LLOGLN(10, ("rfx_encode_component_rlgr1_x86_sse41:")); +#if defined(RFX_USE_ACCEL_X86) + if (rfxcodec_encode_dwt_shift_x86_sse41(qtable, data, enc->dwt_buffer1, + enc->dwt_buffer) != 0) + { + return 1; + } + //*size = rfxcodec_encode_diff_rlgr1_x86_sse2(enc->dwt_buffer1, + // buffer, buffer_size); + if (rfx_differential_encode(enc->dwt_buffer1 + 4032, 64) != 0) + { + return 1; + } + *size = rfx_rlgr1_encode(enc->dwt_buffer1, buffer, buffer_size); +#endif + return 0; +} + +/******************************************************************************/ +int +rfx_encode_component_rlgr3_x86_sse41(struct rfxencode *enc, const char *qtable, + uint8 *data, + uint8 *buffer, int buffer_size, int *size) +{ + LLOGLN(10, ("rfx_encode_component_rlgr3_x86_sse41:")); +#if defined(RFX_USE_ACCEL_X86) + if (rfxcodec_encode_dwt_shift_x86_sse41(qtable, data, enc->dwt_buffer1, 
+ enc->dwt_buffer) != 0) { return 1; } + //*size = rfxcodec_encode_diff_rlgr3_x86_sse(enc->dwt_buffer1, + // buffer, buffer_size); if (rfx_differential_encode(enc->dwt_buffer1 + 4032, 64) != 0) { return 1; } - *size = rfx_rlgr3_encode(enc->dwt_buffer1, 4096, buffer, buffer_size); + *size = rfx_rlgr3_encode(enc->dwt_buffer1, buffer, buffer_size); +#endif return 0; } /******************************************************************************/ int -rfx_encode_component_x86_sse2(struct rfxencode *enc, - const int *quantization_values, - uint8 *data, - uint8 *buffer, int buffer_size, int *size) +rfx_encode_component_rlgr1_amd64_sse2(struct rfxencode *enc, const char *qtable, + uint8 *data, + uint8 *buffer, int buffer_size, int *size) { - LLOGLN(10, ("rfx_encode_component_x86_sse2:")); -#if defined(RFX_USE_ACCEL) && RFX_USE_ACCEL - /* put asm calls here */ - if (dwt_shift_x86_sse2(quantization_values, data, enc->dwt_buffer1, - enc->dwt_buffer) != 0) + LLOGLN(10, ("rfx_encode_component_rlgr1_amd64_sse2:")); +#if defined(RFX_USE_ACCEL_AMD64) + if (rfxcodec_encode_dwt_shift_amd64_sse2(qtable, data, enc->dwt_buffer1, + enc->dwt_buffer) != 0) + { + return 1; + } + //*size = rfxcodec_encode_diff_rlgr1_amd64_sse2(enc->dwt_buffer1, + // buffer, buffer_size); + if (rfx_differential_encode(enc->dwt_buffer1 + 4032, 64) != 0) { return 1; } - *size = diff_rlgr3_x86(enc->dwt_buffer1, 4096, buffer, buffer_size); + *size = rfx_rlgr1_encode(enc->dwt_buffer1, buffer, buffer_size); +#endif + return 0; +} + +/******************************************************************************/ +int +rfx_encode_component_rlgr3_amd64_sse2(struct rfxencode *enc, const char *qtable, + uint8 *data, + uint8 *buffer, int buffer_size, int *size) +{ + LLOGLN(10, ("rfx_encode_component_rlgr3_amd64_sse2:")); +#if defined(RFX_USE_ACCEL_AMD64) + if (rfxcodec_encode_dwt_shift_amd64_sse2(qtable, data, enc->dwt_buffer1, + enc->dwt_buffer) != 0) + { + return 1; + } + //*size = 
rfxcodec_encode_diff_rlgr3_amd64_sse2(enc->dwt_buffer1, + // buffer, buffer_size); + if (rfx_differential_encode(enc->dwt_buffer1 + 4032, 64) != 0) + { + return 1; + } + *size = rfx_rlgr3_encode(enc->dwt_buffer1, buffer, buffer_size); +#endif + return 0; +} + +/******************************************************************************/ +int +rfx_encode_component_rlgr1_amd64_sse41(struct rfxencode *enc, const char *qtable, + uint8 *data, + uint8 *buffer, int buffer_size, int *size) +{ + LLOGLN(10, ("rfx_encode_component_rlgr1_amd64_sse41:")); +#if defined(RFX_USE_ACCEL_AMD64) + if (rfxcodec_encode_dwt_shift_amd64_sse41(qtable, data, enc->dwt_buffer1, + enc->dwt_buffer) != 0) + { + return 1; + } + //*size = rfxcodec_encode_diff_rlgr1_amd64_sse2(enc->dwt_buffer1, + // buffer, buffer_size); + if (rfx_differential_encode(enc->dwt_buffer1 + 4032, 64) != 0) + { + return 1; + } + *size = rfx_rlgr1_encode(enc->dwt_buffer1, buffer, buffer_size); +#endif + return 0; +} + +/******************************************************************************/ +int +rfx_encode_component_rlgr3_amd64_sse41(struct rfxencode *enc, const char *qtable, + uint8 *data, + uint8 *buffer, int buffer_size, int *size) +{ + LLOGLN(10, ("rfx_encode_component_rlgr3_amd64_sse41:")); +#if defined(RFX_USE_ACCEL_AMD64) + if (rfxcodec_encode_dwt_shift_amd64_sse41(qtable, data, enc->dwt_buffer1, + enc->dwt_buffer) != 0) + { + return 1; + } + //*size = rfxcodec_encode_diff_rlgr3_amd64_sse2(enc->dwt_buffer1, + // buffer, buffer_size); + if (rfx_differential_encode(enc->dwt_buffer1 + 4032, 64) != 0) + { + return 1; + } + *size = rfx_rlgr3_encode(enc->dwt_buffer1, buffer, buffer_size); +#endif + return 0; } @@ -221,23 +672,84 @@ rfx_encode_component_x86_sse2(struct rfxencode *enc, int rfx_encode_rgb(struct rfxencode *enc, char *rgb_data, int width, int height, int stride_bytes, - const int *y_quants, const int *cb_quants, const int *cr_quants, - STREAM *data_out, int *y_size, int *cb_size, int *cr_size) + const
char *y_quants, const char *u_quants, + const char *v_quants, + STREAM *data_out, int *y_size, int *u_size, int *v_size) { uint8 *y_r_buffer; - uint8 *cb_g_buffer; - uint8 *cr_b_buffer; + uint8 *u_g_buffer; + uint8 *v_b_buffer; y_r_buffer = enc->y_r_buffer; - cb_g_buffer = enc->cb_g_buffer; - cr_b_buffer = enc->cr_b_buffer; + u_g_buffer = enc->u_g_buffer; + v_b_buffer = enc->v_b_buffer; if (rfx_encode_format_rgb(rgb_data, width, height, stride_bytes, enc->format, - y_r_buffer, cb_g_buffer, cr_b_buffer) != 0) + y_r_buffer, u_g_buffer, v_b_buffer) != 0) + { + return 1; + } + if (rfx_encode_rgb_to_yuv(y_r_buffer, u_g_buffer, v_b_buffer) != 0) + { + return 1; + } + if (enc->rfx_encode(enc, y_quants, y_r_buffer, + stream_get_tail(data_out), + stream_get_left(data_out), + y_size) != 0) + { + return 1; + } + LLOGLN(10, ("rfx_encode_rgb: y_size %d", *y_size)); + stream_seek(data_out, *y_size); + if (enc->rfx_encode(enc, u_quants, u_g_buffer, + stream_get_tail(data_out), + stream_get_left(data_out), + u_size) != 0) { return 1; } - if (rfx_encode_rgb_to_ycbcr(y_r_buffer, cb_g_buffer, cr_b_buffer) != 0) + LLOGLN(10, ("rfx_encode_rgb: u_size %d", *u_size)); + stream_seek(data_out, *u_size); + if (enc->rfx_encode(enc, v_quants, v_b_buffer, + stream_get_tail(data_out), + stream_get_left(data_out), + v_size) != 0) + { + return 1; + } + LLOGLN(10, ("rfx_encode_rgb: v_size %d", *v_size)); + stream_seek(data_out, *v_size); + return 0; +} + +/******************************************************************************/ +int +rfx_encode_argb(struct rfxencode *enc, char *rgb_data, + int width, int height, int stride_bytes, + const char *y_quants, const char *u_quants, + const char *v_quants, + STREAM *data_out, int *y_size, int *u_size, + int *v_size, int *a_size) +{ + uint8 *a_buffer; + uint8 *y_r_buffer; + uint8 *u_g_buffer; + uint8 *v_b_buffer; + + LLOGLN(10, ("rfx_encode_argb:")); + a_buffer = enc->a_buffer; + y_r_buffer = enc->y_r_buffer; + u_g_buffer = enc->u_g_buffer; + 
v_b_buffer = enc->v_b_buffer; + if (rfx_encode_format_argb(rgb_data, width, height, stride_bytes, + enc->format, + a_buffer, y_r_buffer, + u_g_buffer, v_b_buffer) != 0) + { + return 1; + } + if (rfx_encode_rgb_to_yuv(y_r_buffer, u_g_buffer, v_b_buffer) != 0) { return 1; } @@ -250,24 +762,25 @@ rfx_encode_rgb(struct rfxencode *enc, char *rgb_data, } LLOGLN(10, ("rfx_encode_rgb: y_size %d", *y_size)); stream_seek(data_out, *y_size); - if (enc->rfx_encode(enc, cb_quants, cb_g_buffer, + if (enc->rfx_encode(enc, u_quants, u_g_buffer, stream_get_tail(data_out), stream_get_left(data_out), - cb_size) != 0) + u_size) != 0) { return 1; } - LLOGLN(10, ("rfx_encode_rgb: cb_size %d", *cb_size)); - stream_seek(data_out, *cb_size); - if (enc->rfx_encode(enc, cr_quants, cr_b_buffer, + LLOGLN(10, ("rfx_encode_rgb: u_size %d", *u_size)); + stream_seek(data_out, *u_size); + if (enc->rfx_encode(enc, v_quants, v_b_buffer, stream_get_tail(data_out), stream_get_left(data_out), - cr_size) != 0) + v_size) != 0) { return 1; } - LLOGLN(10, ("rfx_encode_rgb: cr_size %d", *cr_size)); - stream_seek(data_out, *cr_size); + LLOGLN(10, ("rfx_encode_rgb: v_size %d", *v_size)); + stream_seek(data_out, *v_size); + *a_size = rfx_encode_plane(enc, a_buffer, 64, 64, data_out); return 0; } @@ -275,7 +788,8 @@ rfx_encode_rgb(struct rfxencode *enc, char *rgb_data, int rfx_encode_yuv(struct rfxencode *enc, char *yuv_data, int width, int height, int stride_bytes, - const int *y_quants, const int *u_quants, const int *v_quants, + const char *y_quants, const char *u_quants, + const char *v_quants, STREAM *data_out, int *y_size, int *u_size, int *v_size) { uint8 *y_buffer; @@ -311,3 +825,50 @@ rfx_encode_yuv(struct rfxencode *enc, char *yuv_data, stream_seek(data_out, *v_size); return 0; } + +/******************************************************************************/ +int +rfx_encode_yuva(struct rfxencode *enc, char *yuva_data, + int width, int height, int stride_bytes, + const char *y_quants, const char 
*u_quants, + const char *v_quants, + STREAM *data_out, int *y_size, int *u_size, + int *v_size, int *a_size) +{ + uint8 *y_buffer; + uint8 *u_buffer; + uint8 *v_buffer; + uint8 *a_buffer; + + y_buffer = (uint8 *) yuva_data; + u_buffer = (uint8 *) (yuva_data + RFX_YUV_BTES); + v_buffer = (uint8 *) (yuva_data + RFX_YUV_BTES * 2); + a_buffer = (uint8 *) (yuva_data + RFX_YUV_BTES * 3); + if (enc->rfx_encode(enc, y_quants, y_buffer, + stream_get_tail(data_out), + stream_get_left(data_out), + y_size) != 0) + { + return 1; + } + stream_seek(data_out, *y_size); + if (enc->rfx_encode(enc, u_quants, u_buffer, + stream_get_tail(data_out), + stream_get_left(data_out), + u_size) != 0) + { + return 1; + } + stream_seek(data_out, *u_size); + if (enc->rfx_encode(enc, v_quants, v_buffer, + stream_get_tail(data_out), + stream_get_left(data_out), + v_size) != 0) + { + return 1; + } + stream_seek(data_out, *v_size); + *a_size = rfx_encode_plane(enc, a_buffer, 64, 64, data_out); + return 0; +} + diff --git a/src/rfxencode_tile.h b/src/rfxencode_tile.h index 01604c6..6195d8d 100644 --- a/src/rfxencode_tile.h +++ b/src/rfxencode_tile.h @@ -3,7 +3,7 @@ * RemoteFX Codec Library - Encode * * Copyright 2011 Vic Lee - * Copyright 2014 Jay Sorg + * Copyright 2014-2015 Jay Sorg * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
@@ -27,34 +27,71 @@ #define RFX_YUV_BTES (64 * 64) int -rfx_encode_component_rlgr1(struct rfxencode *enc, - const int *quantization_values, +rfx_encode_component_rlgr1(struct rfxencode *enc, const char *qtable, uint8 *data, uint8 *buffer, int buffer_size, int *size); int -rfx_encode_component_rlgr3(struct rfxencode *enc, - const int *quantization_values, +rfx_encode_component_rlgr3(struct rfxencode *enc, const char *qtable, uint8 *data, uint8 *buffer, int buffer_size, int *size); int -rfx_encode_component_x86_sse2(struct rfxencode *enc, - const int *quantization_values, - uint8 *data, - uint8 *buffer, int buffer_size, int *size); -int -rfx_encode_component_amd64_sse2(struct rfxencode *enc, - const int *quantization_values, - uint8 *data, - uint8 *buffer, int buffer_size, int *size); -int rfx_encode_rgb(struct rfxencode *enc, char *rgb_data, int width, int height, int stride_bytes, - const int *y_quants, const int *cb_quants, const int *cr_quants, + const char *y_quants, const char *u_quants, + const char *v_quants, STREAM *data_out, int *y_size, int *cb_size, int *cr_size); int +rfx_encode_argb(struct rfxencode *enc, char *argb_data, + int width, int height, int stride_bytes, + const char *y_quants, const char *u_quants, + const char *v_quants, + STREAM *data_out, int *y_size, int *u_size, + int *v_size, int *a_size); /* encoded sizes of the Y, U, V and alpha parts are returned in y_size, u_size, v_size and a_size */ +int rfx_encode_yuv(struct rfxencode *enc, char *yuv_data, int width, int height, int stride_bytes, - const int *y_quants, const int *u_quants, const int *v_quants, + const char *y_quants, const char *u_quants, + const char *v_quants, STREAM *data_out, int *y_size, int *u_size, int *v_size); +int +rfx_encode_yuva(struct rfxencode *enc, char *yuva_data, + int width, int height, int stride_bytes, + const char *y_quants, const char *u_quants, + const char *v_quants, + STREAM *data_out, int *y_size, int *u_size, + int *v_size, int *a_size); /* yuva_data holds the Y, U, V and A planes back to back, RFX_YUV_BTES bytes each */ + +int +rfx_encode_component_rlgr1_x86_sse2(struct rfxencode *enc, const char *qtable, + uint8 *data, + uint8
*buffer, int buffer_size, int *size); +int +rfx_encode_component_rlgr3_x86_sse2(struct rfxencode *enc, const char *qtable, + uint8 *data, + uint8 *buffer, int buffer_size, int *size); +int +rfx_encode_component_rlgr1_x86_sse41(struct rfxencode *enc, const char *qtable, + uint8 *data, + uint8 *buffer, int buffer_size, int *size); +int +rfx_encode_component_rlgr3_x86_sse41(struct rfxencode *enc, const char *qtable, + uint8 *data, + uint8 *buffer, int buffer_size, int *size); +int +rfx_encode_component_rlgr1_amd64_sse2(struct rfxencode *enc, const char *qtable, + uint8 *data, + uint8 *buffer, int buffer_size, int *size); +int +rfx_encode_component_rlgr3_amd64_sse2(struct rfxencode *enc, const char *qtable, + uint8 *data, + uint8 *buffer, int buffer_size, int *size); +int +rfx_encode_component_rlgr1_amd64_sse41(struct rfxencode *enc, const char *qtable, + uint8 *data, + uint8 *buffer, int buffer_size, int *size); +int +rfx_encode_component_rlgr3_amd64_sse41(struct rfxencode *enc, const char *qtable, + uint8 *data, + uint8 *buffer, int buffer_size, int *size); #endif diff --git a/src/x86/cpuid_x86.asm b/src/x86/cpuid_x86.asm index 6f9e8c2..fe19a90 100644 --- a/src/x86/cpuid_x86.asm +++ b/src/x86/cpuid_x86.asm @@ -1,3 +1,6 @@ +%ifidn __OUTPUT_FORMAT__,elf +SECTION .note.GNU-stack noalloc noexec nowrite progbits +%endif SECTION .text @@ -10,7 +13,11 @@ SECTION .text ;int ;cpuid_x86(int eax_in, int ecx_in, int *eax, int *ebx, int *ecx, int *edx) +%ifidn __OUTPUT_FORMAT__,elf PROC cpuid_x86 +%else +PROC _cpuid_x86 +%endif ; save registers push ebx push ecx diff --git a/src/x86/funcs_x86.h b/src/x86/funcs_x86.h index 6025d0a..858bc5c 100644 --- a/src/x86/funcs_x86.h +++ b/src/x86/funcs_x86.h @@ -1,5 +1,5 @@ /* -Copyright 2014 Jay Sorg +Copyright 2014-2015 Jay Sorg Permission to use, copy, modify, distribute, and sell this software and its documentation for any purpose is hereby granted without fee, provided that @@ -24,12 +24,49 @@ x86 asm files #ifndef __FUNCS_X86_H #define 
__FUNCS_X86_H +#ifdef __cplusplus +extern "C" { +#endif + int cpuid_x86(int eax_in, int ecx_in, int *eax, int *ebx, int *ecx, int *edx); + int -dwt_shift_x86_sse2(unsigned char *yuvs, int width, int height, int *rgbs); +rfxcodec_encode_dwt_shift_x86_sse2(const char *qtable, + unsigned char *data, + short *dwt_buffer1, + short *dwt_buffer); int -diff_rlgr3_x86(sint16 *co, int num_co, uint8 *dst, int dst_bytes); +rfxcodec_encode_dwt_shift_x86_sse41(const char *qtable, + unsigned char *data, + short *dwt_buffer1, + short *dwt_buffer); +int +rfxcodec_encode_diff_rlgr1_x86_sse2(short *co, + void *dst, int dst_bytes); +int +rfxcodec_encode_diff_rlgr3_x86_sse2(short *co, + void *dst, int dst_bytes); + +int +rfxcodec_decode_rlgr1_diff_x86_sse2(void *data, int data_bytes, + short *out_data); +int +rfxcodec_decode_rlgr3_diff_x86_sse2(void *data, int data_bytes, + short *out_data); +int +rfxcodec_decode_shift_idwt_x86_sse2(const char *qtable, short *src, short *dst); +int +rfxcodec_decode_yuv2rgb_x86_sse2(short *ydata, short *udata, short *vdata, + unsigned int *rgbdata, int stride); +int +rfxcodec_decode_yuva2argb_x86_sse2(short *ydata, short *udata, + short *vdata, char *adata, + unsigned int *rgbdata, int stride); + +#ifdef __cplusplus +} #endif +#endif diff --git a/src/x86/readme.txt b/src/x86/readme.txt deleted file mode 100644 index e69de29..0000000 diff --git a/src/x86/rfxcodec_encode_diff_rlgr1_x86_sse2.asm b/src/x86/rfxcodec_encode_diff_rlgr1_x86_sse2.asm new file mode 100644 index 0000000..13d10e9 --- /dev/null +++ b/src/x86/rfxcodec_encode_diff_rlgr1_x86_sse2.asm @@ -0,0 +1,35 @@ +%ifidn __OUTPUT_FORMAT__,elf +section .note.GNU-stack noalloc noexec nowrite progbits +%endif + +section .data + const1 times 8 dw 1 + +section .text + +%macro PROC 1 + align 16 + global %1 + %1: +%endmacro + +;int +;rfxcodec_encode_diff_rlgr1_x86_sse2(short *co, +; void *dst, int dst_bytes); + +%ifidn __OUTPUT_FORMAT__,elf +PROC rfxcodec_encode_diff_rlgr1_x86_sse2 +%else +PROC 
_rfxcodec_encode_diff_rlgr1_x86_sse2 +%endif + push ebx + push esi + push edi + + mov eax, 0 + pop edi + pop esi + pop ebx + ret + align 16 + diff --git a/src/x86/rfxcodec_encode_diff_rlgr3_x86_sse2.asm b/src/x86/rfxcodec_encode_diff_rlgr3_x86_sse2.asm new file mode 100644 index 0000000..a8588f2 --- /dev/null +++ b/src/x86/rfxcodec_encode_diff_rlgr3_x86_sse2.asm @@ -0,0 +1,35 @@ +%ifidn __OUTPUT_FORMAT__,elf +section .note.GNU-stack noalloc noexec nowrite progbits +%endif + +section .data + const1 times 8 dw 1 + +section .text + +%macro PROC 1 + align 16 + global %1 + %1: +%endmacro + +;int +;rfxcodec_encode_diff_rlgr3_x86_sse2(short *co, +; void *dst, int dst_bytes); + +%ifidn __OUTPUT_FORMAT__,elf +PROC rfxcodec_encode_diff_rlgr3_x86_sse2 +%else +PROC _rfxcodec_encode_diff_rlgr3_x86_sse2 +%endif + push ebx + push esi + push edi + + mov eax, 0 + pop edi + pop esi + pop ebx + ret + align 16 + diff --git a/src/x86/rfxcodec_encode_dwt_shift_x86_sse2.asm b/src/x86/rfxcodec_encode_dwt_shift_x86_sse2.asm new file mode 100644 index 0000000..f6b71b2 --- /dev/null +++ b/src/x86/rfxcodec_encode_dwt_shift_x86_sse2.asm @@ -0,0 +1,1533 @@ +; +;Copyright 2016 Jay Sorg +; +;Permission to use, copy, modify, distribute, and sell this software and its +;documentation for any purpose is hereby granted without fee, provided that +;the above copyright notice appear in all copies and that both that +;copyright notice and this permission notice appear in supporting +;documentation. +; +;The above copyright notice and this permission notice shall be included in +;all copies or substantial portions of the Software. +; +;THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +;IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +;FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE +;OPEN GROUP BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN +;AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN +;CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. +; +;x86 asm dwt + +%ifidn __OUTPUT_FORMAT__,elf +section .note.GNU-stack noalloc noexec nowrite progbits +%endif + +section .data + align 16 + cw128 times 8 dw 128 + cdFFFF times 4 dd 65535 + ; these are 1 << (factor - 1) 0 to 15 is factor + cwa0 times 8 dw 0 ; 0 + cwa1 times 8 dw 1 ; 1 + cwa2 times 8 dw 2 ; 2 + cwa4 times 8 dw 4 ; 3 + cwa8 times 8 dw 8 ; 4 + cwa16 times 8 dw 16 ; 5 + cwa32 times 8 dw 32 ; 6 + cwa64 times 8 dw 64 ; 7 + cwa128 times 8 dw 128 ; 8 + cwa256 times 8 dw 256 ; 9 + cwa512 times 8 dw 512 ; 10 + cwa1024 times 8 dw 1024 ; 11 + cwa2048 times 8 dw 2048 ; 12 + cwa4096 times 8 dw 4096 ; 13 + cwa8192 times 8 dw 8192 ; 14 + cwa16384 times 8 dw 16384 ; 15 + +section .text + +%macro PROC 1 + align 16 + global %1 + %1: +%endmacro + +%define LHI_ADD [esp + 1 * 16 + 4] +%define LHI_SFT [esp + 2 * 16 + 4] +%define LLO_ADD [esp + 3 * 16 + 4] +%define LLO_SFT [esp + 4 * 16 + 4] + +;****************************************************************************** +; source 16 bit signed, 16 pixel width +rfx_dwt_2d_encode_block_horiz_16_16: + mov ecx, 8 +loop1a: + ; pre / post + movdqa xmm1, [esi] ; src[2n] + movdqa xmm2, [esi + 16] + movdqa xmm6, xmm1 + movdqa xmm7, xmm2 + pand xmm1, [cdFFFF] + pand xmm2, [cdFFFF] + pslld xmm1, 16 + pslld xmm2, 16 + psrad xmm1, 16 + psrad xmm2, 16 + packssdw xmm1, xmm2 + movdqa xmm2, xmm6 ; src[2n + 1] + movdqa xmm3, xmm7 + psrldq xmm2, 2 + psrldq xmm3, 2 + pand xmm2, [cdFFFF] + pand xmm3, [cdFFFF] + pslld xmm2, 16 + pslld xmm3, 16 + psrad xmm2, 16 + psrad xmm3, 16 + packssdw xmm2, xmm3 + movdqa xmm3, xmm6 ; src[2n + 2] + movdqa xmm4, xmm7 + psrldq xmm3, 4 + psrldq xmm4, 4 + movd eax, xmm7 + movd xmm5, eax + pslldq xmm5, 12 + por xmm3, xmm5 + movdqa xmm5, xmm7 + psrldq xmm5, 12 + 
pslldq xmm5, 12 + por xmm4, xmm5 + pand xmm3, [cdFFFF] + pand xmm4, [cdFFFF] + pslld xmm3, 16 + pslld xmm4, 16 + psrad xmm3, 16 + psrad xmm4, 16 + packssdw xmm3, xmm4 + movdqa xmm4, xmm1 + movdqa xmm5, xmm2 + movdqa xmm6, xmm3 + ; h[n] = (src[2n + 1] - ((src[2n] + src[2n + 2]) >> 1)) >> 1 + paddw xmm4, xmm6 + psraw xmm4, 1 + psubw xmm5, xmm4 + psraw xmm5, 1 + movdqa xmm6, xmm5 ; out hi + paddw xmm6, LHI_ADD + psraw xmm6, LHI_SFT + movdqa [edi], xmm6 + ; l[n] = src[2n] + ((h[n - 1] + h[n]) >> 1) + movdqa xmm7, xmm5 + movd eax, xmm7 + pslldq xmm7, 2 + and eax, 0xFFFF + movd xmm6, eax + por xmm7, xmm6 + paddw xmm5, xmm7 + psraw xmm5, 1 + paddw xmm5, xmm1 + + movdqa xmm6, xmm5 ; out lo + paddw xmm6, LLO_ADD + psraw xmm6, LLO_SFT + movdqa [edx], xmm6 + + ; move right + lea esi, [esi + 16 * 2] + lea edi, [edi + 8 * 2] + lea edx, [edx + 8 * 2] + + ; move left + lea esi, [esi - 16 * 2] + lea edi, [edi - 8 * 2] + lea edx, [edx - 8 * 2] + + ; move down + lea esi, [esi + 16 * 2] + lea edi, [edi + 8 * 2] + lea edx, [edx + 8 * 2] + + dec ecx + jnz loop1a + + ret + +;****************************************************************************** +; source 16 bit signed, 16 pixel width +rfx_dwt_2d_encode_block_verti_16_16: + mov ecx, 2 +loop1b: + ; pre + movdqa xmm1, [esi] ; src[2n] + movdqa xmm2, [esi + 16 * 2] ; src[2n + 1] + movdqa xmm3, [esi + 16 * 2 * 2] ; src[2n + 2] + movdqa xmm4, xmm1 + movdqa xmm5, xmm2 + movdqa xmm6, xmm3 + ; h[n] = (src[2n + 1] - ((src[2n] + src[2n + 2]) >> 1)) >> 1 + paddw xmm4, xmm6 + psraw xmm4, 1 + psubw xmm5, xmm4 + psraw xmm5, 1 + movdqa [edi], xmm5 ; out hi + movdqa xmm6, xmm5 ; save hi + ; l[n] = src[2n] + ((h[n - 1] + h[n]) >> 1) + paddw xmm5, xmm1 + movdqa [edx], xmm5 ; out lo + movdqa xmm7, xmm6 ; save hi + ; move down + lea esi, [esi + 16 * 2 * 2] ; 2 rows + lea edi, [edi + 16 * 2] ; 1 row + lea edx, [edx + 16 * 2] ; 1 row + + ; loop + shl ecx, 16 + mov cx, 6 +loop2b: + movdqa xmm1, xmm3 ; src[2n] + movdqa xmm2, [esi + 16 * 2] ; src[2n + 1] 
+ movdqa xmm3, [esi + 16 * 2 * 2] ; src[2n + 2] + movdqa xmm4, xmm1 + movdqa xmm5, xmm2 + movdqa xmm6, xmm3 + ; h[n] = (src[2n + 1] - ((src[2n] + src[2n + 2]) >> 1)) >> 1 + paddw xmm4, xmm6 + psraw xmm4, 1 + psubw xmm5, xmm4 + psraw xmm5, 1 + movdqa [edi], xmm5 ; out hi + movdqa xmm6, xmm5 ; save hi + ; l[n] = src[2n] + ((h[n - 1] + h[n]) >> 1) + paddw xmm5, xmm7 + psraw xmm5, 1 + paddw xmm5, xmm1 + movdqa [edx], xmm5 ; out lo + movdqa xmm7, xmm6 ; save hi + ; move down + lea esi, [esi + 16 * 2 * 2] ; 2 rows + lea edi, [edi + 16 * 2] ; 1 row + lea edx, [edx + 16 * 2] ; 1 row + + dec cx + jnz loop2b + shr ecx, 16 + + ; post + movdqa xmm1, xmm3 ; src[2n] + movdqa xmm2, [esi + 16 * 2] ; src[2n + 1] + movdqa xmm4, xmm1 + movdqa xmm5, xmm2 + movdqa xmm6, xmm3 + ; h[n] = (src[2n + 1] - ((src[2n] + src[2n + 2]) >> 1)) >> 1 + paddw xmm4, xmm6 + psraw xmm4, 1 + psubw xmm5, xmm4 + psraw xmm5, 1 + movdqa [edi], xmm5 ; out hi + ; l[n] = src[2n] + ((h[n - 1] + h[n]) >> 1) + paddw xmm5, xmm7 + psraw xmm5, 1 + paddw xmm5, xmm1 + movdqa [edx], xmm5 ; out lo + ; move down + lea esi, [esi + 16 * 2 * 2] ; 2 row + lea edi, [edi + 16 * 2] ; 1 row + lea edx, [edx + 16 * 2] ; 1 row + + ; move up + lea esi, [esi - 16 * 16 * 2] + lea edi, [edi - 8 * 16 * 2] + lea edx, [edx - 8 * 16 * 2] + + ; move right + lea esi, [esi + 16] + lea edi, [edi + 16] + lea edx, [edx + 16] + + dec ecx + jnz loop1b + + ret + +;****************************************************************************** +; source 16 bit signed, 32 pixel width +rfx_dwt_2d_encode_block_horiz_16_32: + mov ecx, 16 +loop1c: + ; pre + movdqa xmm1, [esi] ; src[2n] + movdqa xmm2, [esi + 16] + movdqa xmm6, xmm1 + movdqa xmm7, xmm2 + pand xmm1, [cdFFFF] + pand xmm2, [cdFFFF] + pslld xmm1, 16 + pslld xmm2, 16 + psrad xmm1, 16 + psrad xmm2, 16 + packssdw xmm1, xmm2 + movdqa xmm2, xmm6 ; src[2n + 1] + movdqa xmm3, xmm7 + psrldq xmm2, 2 + psrldq xmm3, 2 + pand xmm2, [cdFFFF] + pand xmm3, [cdFFFF] + pslld xmm2, 16 + pslld xmm3, 16 + psrad 
xmm2, 16 + psrad xmm3, 16 + packssdw xmm2, xmm3 + movdqa xmm3, xmm6 ; src[2n + 2] + movdqa xmm4, xmm7 + psrldq xmm3, 4 + psrldq xmm4, 4 + movd eax, xmm7 + movd xmm5, eax + pslldq xmm5, 12 + por xmm3, xmm5 + mov eax, [esi + 32] + movd xmm5, eax + pslldq xmm5, 12 + por xmm4, xmm5 + pand xmm3, [cdFFFF] + pand xmm4, [cdFFFF] + pslld xmm3, 16 + pslld xmm4, 16 + psrad xmm3, 16 + psrad xmm4, 16 + packssdw xmm3, xmm4 + movdqa xmm4, xmm1 + movdqa xmm5, xmm2 + movdqa xmm6, xmm3 + ; h[n] = (src[2n + 1] - ((src[2n] + src[2n + 2]) >> 1)) >> 1 + paddw xmm4, xmm6 + psraw xmm4, 1 + psubw xmm5, xmm4 + psraw xmm5, 1 + + movdqa xmm6, xmm5 ; out hi + paddw xmm6, LHI_ADD + psraw xmm6, LHI_SFT + movdqa [edi], xmm6 + movdqa xmm2, xmm5 ; save hi + + ; l[n] = src[2n] + ((h[n - 1] + h[n]) >> 1) + movdqa xmm7, xmm5 + movd eax, xmm7 + pslldq xmm7, 2 + and eax, 0xFFFF + movd xmm6, eax + por xmm7, xmm6 + paddw xmm5, xmm7 + psraw xmm5, 1 + paddw xmm5, xmm1 + + psrldq xmm2, 14 + movd ebx, xmm2 ; save hi + + movdqa xmm6, xmm5 ; out lo + paddw xmm6, LLO_ADD + psraw xmm6, LLO_SFT + movdqa [edx], xmm6 + + ; move right + lea esi, [esi + 16 * 2] + lea edi, [edi + 8 * 2] + lea edx, [edx + 8 * 2] + + ; post + movdqa xmm1, [esi] ; src[2n] + movdqa xmm2, [esi + 16] + movdqa xmm6, xmm1 + movdqa xmm7, xmm2 + pand xmm1, [cdFFFF] + pand xmm2, [cdFFFF] + pslld xmm1, 16 + pslld xmm2, 16 + psrad xmm1, 16 + psrad xmm2, 16 + packssdw xmm1, xmm2 + movdqa xmm2, xmm6 ; src[2n + 1] + movdqa xmm3, xmm7 + psrldq xmm2, 2 + psrldq xmm3, 2 + pand xmm2, [cdFFFF] + pand xmm3, [cdFFFF] + pslld xmm2, 16 + pslld xmm3, 16 + psrad xmm2, 16 + psrad xmm3, 16 + packssdw xmm2, xmm3 + movdqa xmm3, xmm6 ; src[2n + 2] + movdqa xmm4, xmm7 + psrldq xmm3, 4 + psrldq xmm4, 4 + movd eax, xmm7 + movd xmm5, eax + pslldq xmm5, 12 + por xmm3, xmm5 + movdqa xmm5, xmm7 + psrldq xmm5, 12 + pslldq xmm5, 12 + por xmm4, xmm5 + pand xmm3, [cdFFFF] + pand xmm4, [cdFFFF] + pslld xmm3, 16 + pslld xmm4, 16 + psrad xmm3, 16 + psrad xmm4, 16 + packssdw xmm3, 
xmm4 + movdqa xmm4, xmm1 + movdqa xmm5, xmm2 + movdqa xmm6, xmm3 + ; h[n] = (src[2n + 1] - ((src[2n] + src[2n + 2]) >> 1)) >> 1 + paddw xmm4, xmm6 + psraw xmm4, 1 + psubw xmm5, xmm4 + psraw xmm5, 1 + + movdqa xmm6, xmm5 ; out hi + paddw xmm6, LHI_ADD + psraw xmm6, LHI_SFT + movdqa [edi], xmm6 + + ; l[n] = src[2n] + ((h[n - 1] + h[n]) >> 1) + movdqa xmm7, xmm5 + pslldq xmm7, 2 + movd xmm6, ebx + por xmm7, xmm6 + paddw xmm5, xmm7 + psraw xmm5, 1 + paddw xmm5, xmm1 + + movdqa xmm6, xmm5 ; out lo + paddw xmm6, LLO_ADD + psraw xmm6, LLO_SFT + movdqa [edx], xmm6 + + ; move right + lea esi, [esi + 16 * 2] + lea edi, [edi + 8 * 2] + lea edx, [edx + 8 * 2] + + ; move left + lea esi, [esi - 32 * 2] + lea edi, [edi - 16 * 2] + lea edx, [edx - 16 * 2] + + ; move down + lea esi, [esi + 32 * 2] + lea edi, [edi + 16 * 2] + lea edx, [edx + 16 * 2] + + dec ecx + jnz loop1c + + ret + +;****************************************************************************** +; source 16 bit signed, 32 pixel width +rfx_dwt_2d_encode_block_horiz_16_32_no_lo: + mov ecx, 16 +loop1c1: + ; pre + movdqa xmm1, [esi] ; src[2n] + movdqa xmm2, [esi + 16] + movdqa xmm6, xmm1 + movdqa xmm7, xmm2 + pand xmm1, [cdFFFF] + pand xmm2, [cdFFFF] + pslld xmm1, 16 + pslld xmm2, 16 + psrad xmm1, 16 + psrad xmm2, 16 + packssdw xmm1, xmm2 + movdqa xmm2, xmm6 ; src[2n + 1] + movdqa xmm3, xmm7 + psrldq xmm2, 2 + psrldq xmm3, 2 + pand xmm2, [cdFFFF] + pand xmm3, [cdFFFF] + pslld xmm2, 16 + pslld xmm3, 16 + psrad xmm2, 16 + psrad xmm3, 16 + packssdw xmm2, xmm3 + movdqa xmm3, xmm6 ; src[2n + 2] + movdqa xmm4, xmm7 + psrldq xmm3, 4 + psrldq xmm4, 4 + movd eax, xmm7 + movd xmm5, eax + pslldq xmm5, 12 + por xmm3, xmm5 + mov eax, [esi + 32] + movd xmm5, eax + pslldq xmm5, 12 + por xmm4, xmm5 + pand xmm3, [cdFFFF] + pand xmm4, [cdFFFF] + pslld xmm3, 16 + pslld xmm4, 16 + psrad xmm3, 16 + psrad xmm4, 16 + packssdw xmm3, xmm4 + movdqa xmm4, xmm1 + movdqa xmm5, xmm2 + movdqa xmm6, xmm3 + ; h[n] = (src[2n + 1] - ((src[2n] + src[2n 
+ 2]) >> 1)) >> 1 + paddw xmm4, xmm6 + psraw xmm4, 1 + psubw xmm5, xmm4 + psraw xmm5, 1 + + movdqa xmm6, xmm5 ; out hi + paddw xmm6, LHI_ADD + psraw xmm6, LHI_SFT + movdqa [edi], xmm6 + movdqa xmm2, xmm5 ; save hi + + ; l[n] = src[2n] + ((h[n - 1] + h[n]) >> 1) + movdqa xmm7, xmm5 + movd eax, xmm7 + pslldq xmm7, 2 + and eax, 0xFFFF + movd xmm6, eax + por xmm7, xmm6 + paddw xmm5, xmm7 + psraw xmm5, 1 + paddw xmm5, xmm1 + + psrldq xmm2, 14 + movd ebx, xmm2 ; save hi + + movdqa [edx], xmm5 ; out lo + + ; move right + lea esi, [esi + 16 * 2] + lea edi, [edi + 8 * 2] + lea edx, [edx + 8 * 2] + + ; post + movdqa xmm1, [esi] ; src[2n] + movdqa xmm2, [esi + 16] + movdqa xmm6, xmm1 + movdqa xmm7, xmm2 + pand xmm1, [cdFFFF] + pand xmm2, [cdFFFF] + pslld xmm1, 16 + pslld xmm2, 16 + psrad xmm1, 16 + psrad xmm2, 16 + packssdw xmm1, xmm2 + movdqa xmm2, xmm6 ; src[2n + 1] + movdqa xmm3, xmm7 + psrldq xmm2, 2 + psrldq xmm3, 2 + pand xmm2, [cdFFFF] + pand xmm3, [cdFFFF] + pslld xmm2, 16 + pslld xmm3, 16 + psrad xmm2, 16 + psrad xmm3, 16 + packssdw xmm2, xmm3 + movdqa xmm3, xmm6 ; src[2n + 2] + movdqa xmm4, xmm7 + psrldq xmm3, 4 + psrldq xmm4, 4 + movd eax, xmm7 + movd xmm5, eax + pslldq xmm5, 12 + por xmm3, xmm5 + movdqa xmm5, xmm7 + psrldq xmm5, 12 + pslldq xmm5, 12 + por xmm4, xmm5 + pand xmm3, [cdFFFF] + pand xmm4, [cdFFFF] + pslld xmm3, 16 + pslld xmm4, 16 + psrad xmm3, 16 + psrad xmm4, 16 + packssdw xmm3, xmm4 + movdqa xmm4, xmm1 + movdqa xmm5, xmm2 + movdqa xmm6, xmm3 + ; h[n] = (src[2n + 1] - ((src[2n] + src[2n + 2]) >> 1)) >> 1 + paddw xmm4, xmm6 + psraw xmm4, 1 + psubw xmm5, xmm4 + psraw xmm5, 1 + + movdqa xmm6, xmm5 ; out hi + paddw xmm6, LHI_ADD + psraw xmm6, LHI_SFT + movdqa [edi], xmm6 + + ; l[n] = src[2n] + ((h[n - 1] + h[n]) >> 1) + movdqa xmm7, xmm5 + pslldq xmm7, 2 + movd xmm6, ebx + por xmm7, xmm6 + paddw xmm5, xmm7 + psraw xmm5, 1 + paddw xmm5, xmm1 + + movdqa [edx], xmm5 ; out lo + + ; move right + lea esi, [esi + 16 * 2] + lea edi, [edi + 8 * 2] + lea edx, [edx 
+ 8 * 2] + + ; move left + lea esi, [esi - 32 * 2] + lea edi, [edi - 16 * 2] + lea edx, [edx - 16 * 2] + + ; move down + lea esi, [esi + 32 * 2] + lea edi, [edi + 16 * 2] + lea edx, [edx + 16 * 2] + + dec ecx + jnz loop1c1 + + ret + +;****************************************************************************** +; source 16 bit signed, 32 pixel width +rfx_dwt_2d_encode_block_verti_16_32: + mov ecx, 4 +loop1d: + ; pre + movdqa xmm1, [esi] ; src[2n] + movdqa xmm2, [esi + 32 * 2] ; src[2n + 1] + movdqa xmm3, [esi + 32 * 2 * 2] ; src[2n + 2] + movdqa xmm4, xmm1 + movdqa xmm5, xmm2 + movdqa xmm6, xmm3 + ; h[n] = (src[2n + 1] - ((src[2n] + src[2n + 2]) >> 1)) >> 1 + paddw xmm4, xmm6 + psraw xmm4, 1 + psubw xmm5, xmm4 + psraw xmm5, 1 + movdqa [edi], xmm5 ; out hi + movdqa xmm6, xmm5 ; save hi + ; l[n] = src[2n] + ((h[n - 1] + h[n]) >> 1) + paddw xmm5, xmm1 + movdqa [edx], xmm5 ; out lo + movdqa xmm7, xmm6 ; save hi + ; move down + lea esi, [esi + 32 * 2 * 2] ; 2 rows + lea edi, [edi + 32 * 2] ; 1 row + lea edx, [edx + 32 * 2] ; 1 row + + ; loop + shl ecx, 16 + mov cx, 14 +loop2d: + movdqa xmm1, xmm3 ; src[2n] + movdqa xmm2, [esi + 32 * 2] ; src[2n + 1] + movdqa xmm3, [esi + 32 * 2 * 2] ; src[2n + 2] + movdqa xmm4, xmm1 + movdqa xmm5, xmm2 + movdqa xmm6, xmm3 + ; h[n] = (src[2n + 1] - ((src[2n] + src[2n + 2]) >> 1)) >> 1 + paddw xmm4, xmm6 + psraw xmm4, 1 + psubw xmm5, xmm4 + psraw xmm5, 1 + movdqa [edi], xmm5 ; out hi + movdqa xmm6, xmm5 ; save hi + ; l[n] = src[2n] + ((h[n - 1] + h[n]) >> 1) + paddw xmm5, xmm7 + psraw xmm5, 1 + paddw xmm5, xmm1 + movdqa [edx], xmm5 ; out lo + movdqa xmm7, xmm6 ; save hi + ; move down + lea esi, [esi + 32 * 2 * 2] ; 2 rows + lea edi, [edi + 32 * 2] ; 1 row + lea edx, [edx + 32 * 2] ; 1 row + + dec cx + jnz loop2d + shr ecx, 16 + + ; post + movdqa xmm1, xmm3 ; src[2n] + movdqa xmm2, [esi + 32 * 2] ; src[2n + 1] + movdqa xmm4, xmm1 + movdqa xmm5, xmm2 + movdqa xmm6, xmm3 + ; h[n] = (src[2n + 1] - ((src[2n] + src[2n + 2]) >> 1)) >> 1 + 
paddw xmm4, xmm6 + psraw xmm4, 1 + psubw xmm5, xmm4 + psraw xmm5, 1 + movdqa [edi], xmm5 ; out hi + ; l[n] = src[2n] + ((h[n - 1] + h[n]) >> 1) + paddw xmm5, xmm7 + psraw xmm5, 1 + paddw xmm5, xmm1 + movdqa [edx], xmm5 ; out lo + ; move down + lea esi, [esi + 32 * 2 * 2] ; 2 row + lea edi, [edi + 32 * 2] ; 1 row + lea edx, [edx + 32 * 2] ; 1 row + + ; move up + lea esi, [esi - 32 * 32 * 2] + lea edi, [edi - 16 * 32 * 2] + lea edx, [edx - 16 * 32 * 2] + + ; move right + lea esi, [esi + 16] + lea edi, [edi + 16] + lea edx, [edx + 16] + + dec ecx + jnz loop1d + + ret + +;****************************************************************************** +; source 16 bit signed, 64 pixel width +rfx_dwt_2d_encode_block_horiz_16_64: + mov ecx, 32 +loop1e: + ; pre + movdqa xmm1, [esi] ; src[2n] + movdqa xmm2, [esi + 16] + movdqa xmm6, xmm1 + movdqa xmm7, xmm2 + pand xmm1, [cdFFFF] + pand xmm2, [cdFFFF] + pslld xmm1, 16 + pslld xmm2, 16 + psrad xmm1, 16 + psrad xmm2, 16 + packssdw xmm1, xmm2 + movdqa xmm2, xmm6 ; src[2n + 1] + movdqa xmm3, xmm7 + psrldq xmm2, 2 + psrldq xmm3, 2 + pand xmm2, [cdFFFF] + pand xmm3, [cdFFFF] + pslld xmm2, 16 + pslld xmm3, 16 + psrad xmm2, 16 + psrad xmm3, 16 + packssdw xmm2, xmm3 + movdqa xmm3, xmm6 ; src[2n + 2] + movdqa xmm4, xmm7 + psrldq xmm3, 4 + psrldq xmm4, 4 + movd eax, xmm7 + movd xmm5, eax + pslldq xmm5, 12 + por xmm3, xmm5 + mov eax, [esi + 32] + movd xmm5, eax + pslldq xmm5, 12 + por xmm4, xmm5 + pand xmm3, [cdFFFF] + pand xmm4, [cdFFFF] + pslld xmm3, 16 + pslld xmm4, 16 + psrad xmm3, 16 + psrad xmm4, 16 + packssdw xmm3, xmm4 + movdqa xmm4, xmm1 + movdqa xmm5, xmm2 + movdqa xmm6, xmm3 + ; h[n] = (src[2n + 1] - ((src[2n] + src[2n + 2]) >> 1)) >> 1 + paddw xmm4, xmm6 + psraw xmm4, 1 + psubw xmm5, xmm4 + psraw xmm5, 1 + + movdqa xmm6, xmm5 ; out hi + paddw xmm6, LHI_ADD + psraw xmm6, LHI_SFT + movdqa [edi], xmm6 + movdqa xmm2, xmm5 ; save hi + + ; l[n] = src[2n] + ((h[n - 1] + h[n]) >> 1) + movdqa xmm7, xmm5 + movd eax, xmm7 + pslldq xmm7, 
2 + and eax, 0xFFFF + movd xmm6, eax + por xmm7, xmm6 + paddw xmm5, xmm7 + psraw xmm5, 1 + paddw xmm5, xmm1 + + psrldq xmm2, 14 + movd ebx, xmm2 ; save hi + + movdqa xmm6, xmm5 ; out lo + paddw xmm6, LLO_ADD + psraw xmm6, LLO_SFT + movdqa [edx], xmm6 + + ; move right + lea esi, [esi + 16 * 2] + lea edi, [edi + 8 * 2] + lea edx, [edx + 8 * 2] + + ; loop + shl ecx, 16 + mov cx, 2 +loop2e: + movdqa xmm1, [esi] ; src[2n] + movdqa xmm2, [esi + 16] + movdqa xmm6, xmm1 + movdqa xmm7, xmm2 + pand xmm1, [cdFFFF] + pand xmm2, [cdFFFF] + pslld xmm1, 16 + pslld xmm2, 16 + psrad xmm1, 16 + psrad xmm2, 16 + packssdw xmm1, xmm2 + movdqa xmm2, xmm6 ; src[2n + 1] + movdqa xmm3, xmm7 + psrldq xmm2, 2 + psrldq xmm3, 2 + pand xmm2, [cdFFFF] + pand xmm3, [cdFFFF] + pslld xmm2, 16 + pslld xmm3, 16 + psrad xmm2, 16 + psrad xmm3, 16 + packssdw xmm2, xmm3 + movdqa xmm3, xmm6 ; src[2n + 2] + movdqa xmm4, xmm7 + psrldq xmm3, 4 + psrldq xmm4, 4 + movd eax, xmm7 + movd xmm5, eax + pslldq xmm5, 12 + por xmm3, xmm5 + mov eax, [esi + 32] + movd xmm5, eax + pslldq xmm5, 12 + por xmm4, xmm5 + pand xmm3, [cdFFFF] + pand xmm4, [cdFFFF] + pslld xmm3, 16 + pslld xmm4, 16 + psrad xmm3, 16 + psrad xmm4, 16 + packssdw xmm3, xmm4 + movdqa xmm4, xmm1 + movdqa xmm5, xmm2 + movdqa xmm6, xmm3 + ; h[n] = (src[2n + 1] - ((src[2n] + src[2n + 2]) >> 1)) >> 1 + paddw xmm4, xmm6 + psraw xmm4, 1 + psubw xmm5, xmm4 + psraw xmm5, 1 + + movdqa xmm6, xmm5 ; out hi + paddw xmm6, LHI_ADD + psraw xmm6, LHI_SFT + movdqa [edi], xmm6 + movdqa xmm2, xmm5 ; save hi + + ; l[n] = src[2n] + ((h[n - 1] + h[n]) >> 1) + movdqa xmm7, xmm5 + pslldq xmm7, 2 + movd xmm6, ebx + por xmm7, xmm6 + paddw xmm5, xmm7 + psraw xmm5, 1 + paddw xmm5, xmm1 + + psrldq xmm2, 14 + movd ebx, xmm2 ; save hi + + movdqa xmm6, xmm5 ; out lo + paddw xmm6, LLO_ADD + psraw xmm6, LLO_SFT + movdqa [edx], xmm6 + + ; move right + lea esi, [esi + 16 * 2] + lea edi, [edi + 8 * 2] + lea edx, [edx + 8 * 2] + + dec cx + jnz loop2e + shr ecx, 16 + + ; post + movdqa xmm1, 
[esi] ; src[2n] + movdqa xmm2, [esi + 16] + movdqa xmm6, xmm1 + movdqa xmm7, xmm2 + pand xmm1, [cdFFFF] + pand xmm2, [cdFFFF] + pslld xmm1, 16 + pslld xmm2, 16 + psrad xmm1, 16 + psrad xmm2, 16 + packssdw xmm1, xmm2 + movdqa xmm2, xmm6 ; src[2n + 1] + movdqa xmm3, xmm7 + psrldq xmm2, 2 + psrldq xmm3, 2 + pand xmm2, [cdFFFF] + pand xmm3, [cdFFFF] + pslld xmm2, 16 + pslld xmm3, 16 + psrad xmm2, 16 + psrad xmm3, 16 + packssdw xmm2, xmm3 + movdqa xmm3, xmm6 ; src[2n + 2] + movdqa xmm4, xmm7 + psrldq xmm3, 4 + psrldq xmm4, 4 + movd eax, xmm7 + movd xmm5, eax + pslldq xmm5, 12 + por xmm3, xmm5 + movdqa xmm5, xmm7 + psrldq xmm5, 12 + pslldq xmm5, 12 + por xmm4, xmm5 + pand xmm3, [cdFFFF] + pand xmm4, [cdFFFF] + pslld xmm3, 16 + pslld xmm4, 16 + psrad xmm3, 16 + psrad xmm4, 16 + packssdw xmm3, xmm4 + movdqa xmm4, xmm1 + movdqa xmm5, xmm2 + movdqa xmm6, xmm3 + ; h[n] = (src[2n + 1] - ((src[2n] + src[2n + 2]) >> 1)) >> 1 + paddw xmm4, xmm6 + psraw xmm4, 1 + psubw xmm5, xmm4 + psraw xmm5, 1 + + movdqa xmm6, xmm5 ; out hi + paddw xmm6, LHI_ADD + psraw xmm6, LHI_SFT + movdqa [edi], xmm6 + + ; l[n] = src[2n] + ((h[n - 1] + h[n]) >> 1) + movdqa xmm7, xmm5 + pslldq xmm7, 2 + movd xmm6, ebx + por xmm7, xmm6 + paddw xmm5, xmm7 + psraw xmm5, 1 + paddw xmm5, xmm1 + + movdqa xmm6, xmm5 ; out lo + paddw xmm6, LLO_ADD + psraw xmm6, LLO_SFT + movdqa [edx], xmm6 + + ; move right + lea esi, [esi + 16 * 2] + lea edi, [edi + 8 * 2] + lea edx, [edx + 8 * 2] + + ; move left + lea esi, [esi - 64 * 2] + lea edi, [edi - 32 * 2] + lea edx, [edx - 32 * 2] + + ; move down + lea esi, [esi + 64 * 2] + lea edi, [edi + 32 * 2] + lea edx, [edx + 32 * 2] + + dec ecx + jnz loop1e + + ret + +;****************************************************************************** +; source 16 bit signed, 64 pixel width +rfx_dwt_2d_encode_block_horiz_16_64_no_lo: + mov ecx, 32 +loop1e1: + ; pre + movdqa xmm1, [esi] ; src[2n] + movdqa xmm2, [esi + 16] + movdqa xmm6, xmm1 + movdqa xmm7, xmm2 + pand xmm1, [cdFFFF] + pand 
xmm2, [cdFFFF] + pslld xmm1, 16 + pslld xmm2, 16 + psrad xmm1, 16 + psrad xmm2, 16 + packssdw xmm1, xmm2 + movdqa xmm2, xmm6 ; src[2n + 1] + movdqa xmm3, xmm7 + psrldq xmm2, 2 + psrldq xmm3, 2 + pand xmm2, [cdFFFF] + pand xmm3, [cdFFFF] + pslld xmm2, 16 + pslld xmm3, 16 + psrad xmm2, 16 + psrad xmm3, 16 + packssdw xmm2, xmm3 + movdqa xmm3, xmm6 ; src[2n + 2] + movdqa xmm4, xmm7 + psrldq xmm3, 4 + psrldq xmm4, 4 + movd eax, xmm7 + movd xmm5, eax + pslldq xmm5, 12 + por xmm3, xmm5 + mov eax, [esi + 32] + movd xmm5, eax + pslldq xmm5, 12 + por xmm4, xmm5 + pand xmm3, [cdFFFF] + pand xmm4, [cdFFFF] + pslld xmm3, 16 + pslld xmm4, 16 + psrad xmm3, 16 + psrad xmm4, 16 + packssdw xmm3, xmm4 + movdqa xmm4, xmm1 + movdqa xmm5, xmm2 + movdqa xmm6, xmm3 + ; h[n] = (src[2n + 1] - ((src[2n] + src[2n + 2]) >> 1)) >> 1 + paddw xmm4, xmm6 + psraw xmm4, 1 + psubw xmm5, xmm4 + psraw xmm5, 1 + + movdqa xmm6, xmm5 ; out hi + paddw xmm6, LHI_ADD + psraw xmm6, LHI_SFT + movdqa [edi], xmm6 + movdqa xmm2, xmm5 ; save hi + + ; l[n] = src[2n] + ((h[n - 1] + h[n]) >> 1) + movdqa xmm7, xmm5 + movd eax, xmm7 + pslldq xmm7, 2 + and eax, 0xFFFF + movd xmm6, eax + por xmm7, xmm6 + paddw xmm5, xmm7 + psraw xmm5, 1 + paddw xmm5, xmm1 + + psrldq xmm2, 14 + movd ebx, xmm2 ; save hi + + movdqa [edx], xmm5 ; out lo + + ; move right + lea esi, [esi + 16 * 2] + lea edi, [edi + 8 * 2] + lea edx, [edx + 8 * 2] + + ; loop + shl ecx, 16 + mov cx, 2 +loop2e1: + movdqa xmm1, [esi] ; src[2n] + movdqa xmm2, [esi + 16] + movdqa xmm6, xmm1 + movdqa xmm7, xmm2 + pand xmm1, [cdFFFF] + pand xmm2, [cdFFFF] + pslld xmm1, 16 + pslld xmm2, 16 + psrad xmm1, 16 + psrad xmm2, 16 + packssdw xmm1, xmm2 + movdqa xmm2, xmm6 ; src[2n + 1] + movdqa xmm3, xmm7 + psrldq xmm2, 2 + psrldq xmm3, 2 + pand xmm2, [cdFFFF] + pand xmm3, [cdFFFF] + pslld xmm2, 16 + pslld xmm3, 16 + psrad xmm2, 16 + psrad xmm3, 16 + packssdw xmm2, xmm3 + movdqa xmm3, xmm6 ; src[2n + 2] + movdqa xmm4, xmm7 + psrldq xmm3, 4 + psrldq xmm4, 4 + movd eax, xmm7 + 
movd xmm5, eax + pslldq xmm5, 12 + por xmm3, xmm5 + mov eax, [esi + 32] + movd xmm5, eax + pslldq xmm5, 12 + por xmm4, xmm5 + pand xmm3, [cdFFFF] + pand xmm4, [cdFFFF] + pslld xmm3, 16 + pslld xmm4, 16 + psrad xmm3, 16 + psrad xmm4, 16 + packssdw xmm3, xmm4 + movdqa xmm4, xmm1 + movdqa xmm5, xmm2 + movdqa xmm6, xmm3 + ; h[n] = (src[2n + 1] - ((src[2n] + src[2n + 2]) >> 1)) >> 1 + paddw xmm4, xmm6 + psraw xmm4, 1 + psubw xmm5, xmm4 + psraw xmm5, 1 + + movdqa xmm6, xmm5 ; out hi + paddw xmm6, LHI_ADD + psraw xmm6, LHI_SFT + movdqa [edi], xmm6 + movdqa xmm2, xmm5 ; save hi + + ; l[n] = src[2n] + ((h[n - 1] + h[n]) >> 1) + movdqa xmm7, xmm5 + pslldq xmm7, 2 + movd xmm6, ebx + por xmm7, xmm6 + paddw xmm5, xmm7 + psraw xmm5, 1 + paddw xmm5, xmm1 + + psrldq xmm2, 14 + movd ebx, xmm2 ; save hi + + movdqa [edx], xmm5 ; out lo + + ; move right + lea esi, [esi + 16 * 2] + lea edi, [edi + 8 * 2] + lea edx, [edx + 8 * 2] + + dec cx + jnz loop2e1 + shr ecx, 16 + + ; post + movdqa xmm1, [esi] ; src[2n] + movdqa xmm2, [esi + 16] + movdqa xmm6, xmm1 + movdqa xmm7, xmm2 + pand xmm1, [cdFFFF] + pand xmm2, [cdFFFF] + pslld xmm1, 16 + pslld xmm2, 16 + psrad xmm1, 16 + psrad xmm2, 16 + packssdw xmm1, xmm2 + movdqa xmm2, xmm6 ; src[2n + 1] + movdqa xmm3, xmm7 + psrldq xmm2, 2 + psrldq xmm3, 2 + pand xmm2, [cdFFFF] + pand xmm3, [cdFFFF] + pslld xmm2, 16 + pslld xmm3, 16 + psrad xmm2, 16 + psrad xmm3, 16 + packssdw xmm2, xmm3 + movdqa xmm3, xmm6 ; src[2n + 2] + movdqa xmm4, xmm7 + psrldq xmm3, 4 + psrldq xmm4, 4 + movd eax, xmm7 + movd xmm5, eax + pslldq xmm5, 12 + por xmm3, xmm5 + movdqa xmm5, xmm7 + psrldq xmm5, 12 + pslldq xmm5, 12 + por xmm4, xmm5 + pand xmm3, [cdFFFF] + pand xmm4, [cdFFFF] + pslld xmm3, 16 + pslld xmm4, 16 + psrad xmm3, 16 + psrad xmm4, 16 + packssdw xmm3, xmm4 + movdqa xmm4, xmm1 + movdqa xmm5, xmm2 + movdqa xmm6, xmm3 + ; h[n] = (src[2n + 1] - ((src[2n] + src[2n + 2]) >> 1)) >> 1 + paddw xmm4, xmm6 + psraw xmm4, 1 + psubw xmm5, xmm4 + psraw xmm5, 1 + + movdqa xmm6, 
xmm5 ; out hi + paddw xmm6, LHI_ADD + psraw xmm6, LHI_SFT + movdqa [edi], xmm6 + + ; l[n] = src[2n] + ((h[n - 1] + h[n]) >> 1) + movdqa xmm7, xmm5 + pslldq xmm7, 2 + movd xmm6, ebx + por xmm7, xmm6 + paddw xmm5, xmm7 + psraw xmm5, 1 + paddw xmm5, xmm1 + + movdqa [edx], xmm5 ; out lo + + ; move right + lea esi, [esi + 16 * 2] + lea edi, [edi + 8 * 2] + lea edx, [edx + 8 * 2] + + ; move left + lea esi, [esi - 64 * 2] + lea edi, [edi - 32 * 2] + lea edx, [edx - 32 * 2] + + ; move down + lea esi, [esi + 64 * 2] + lea edi, [edi + 32 * 2] + lea edx, [edx + 32 * 2] + + dec ecx + jnz loop1e1 + + ret + +;****************************************************************************** +; source 8 bit unsigned, 64 pixel width +; in: esi = src bytes (rows 64 bytes apart), edi = dst hi, edx = dst lo +; (dst rows are 64 words apart); xmm0 is the punpcklbw partner and is +; expected to hold zero so that bytes widen to words +; every pixel is turned into (pixel - 128) << 5 before the lifting steps +; 8 strips of 8 columns; per strip: pre (first row pair), loop2f (30 row +; pairs), post (last row pair); changes ecx and xmm1 - xmm7 +rfx_dwt_2d_encode_block_verti_8_64: + mov ecx, 8 +loop1f: + ; pre + movq xmm1, [esi] ; src[2n] + movq xmm2, [esi + 64 * 1] ; src[2n + 1] + movq xmm3, [esi + 64 * 1 * 2] ; src[2n + 2] + punpcklbw xmm1, xmm0 + punpcklbw xmm2, xmm0 + punpcklbw xmm3, xmm0 + psubw xmm1, [cw128] + psubw xmm2, [cw128] + psubw xmm3, [cw128] + psllw xmm1, 5 + psllw xmm2, 5 + psllw xmm3, 5 + movdqa xmm4, xmm1 + movdqa xmm5, xmm2 + movdqa xmm6, xmm3 + ; h[n] = (src[2n + 1] - ((src[2n] + src[2n + 2]) >> 1)) >> 1 + paddw xmm4, xmm6 + psraw xmm4, 1 + psubw xmm5, xmm4 + psraw xmm5, 1 + movdqa [edi], xmm5 ; out hi + movdqa xmm6, xmm5 ; save hi + ; l[n] = src[2n] + ((h[n - 1] + h[n]) >> 1) + ; first row pair: h[n - 1] is replaced by h[n], so lo is src[2n] plus h[n] + paddw xmm5, xmm1 + movdqa [edx], xmm5 ; out lo + movdqa xmm7, xmm6 ; save hi + ; move down + lea esi, [esi + 64 * 1 * 2] ; 2 rows + lea edi, [edi + 64 * 2] ; 1 row + lea edx, [edx + 64 * 2] ; 1 row + + ; loop + ; the strip count is parked in the high word of ecx while cx counts 30 row pairs + shl ecx, 16 + mov cx, 30 +loop2f: + movdqa xmm1, xmm3 ; src[2n] + movq xmm2, [esi + 64 * 1] ; src[2n + 1] + movq xmm3, [esi + 64 * 1 * 2] ; src[2n + 2] + punpcklbw xmm2, xmm0 + punpcklbw xmm3, xmm0 + psubw xmm2, [cw128] + psubw xmm3, [cw128] + psllw xmm2, 5 + psllw xmm3, 5 + movdqa xmm4, xmm1 + movdqa xmm5, xmm2 + movdqa xmm6, xmm3 + ; h[n] = (src[2n + 1] - ((src[2n] + src[2n + 2]) >> 1)) >> 1
+ paddw xmm4, xmm6 + psraw xmm4, 1 + psubw xmm5, xmm4 + psraw xmm5, 1 + movdqa [edi], xmm5 ; out hi + movdqa xmm6, xmm5 ; save hi + ; l[n] = src[2n] + ((h[n - 1] + h[n]) >> 1) + paddw xmm5, xmm7 + psraw xmm5, 1 + paddw xmm5, xmm1 + movdqa [edx], xmm5 ; out lo + movdqa xmm7, xmm6 ; save hi + ; move down + lea esi, [esi + 64 * 1 * 2] ; 2 rows + lea edi, [edi + 64 * 2] ; 1 row + lea edx, [edx + 64 * 2] ; 1 row + + dec cx + jnz loop2f + shr ecx, 16 + + ; post + ; last row pair: no src[2n + 2] row is read, xmm3 (same data as src[2n]) is used in its place + movdqa xmm1, xmm3 ; src[2n] + movq xmm2, [esi + 64 * 1] ; src[2n + 1] + punpcklbw xmm2, xmm0 + psubw xmm2, [cw128] + psllw xmm2, 5 + movdqa xmm4, xmm1 + movdqa xmm5, xmm2 + movdqa xmm6, xmm3 + ; h[n] = (src[2n + 1] - ((src[2n] + src[2n + 2]) >> 1)) >> 1 + paddw xmm4, xmm6 + psraw xmm4, 1 + psubw xmm5, xmm4 + psraw xmm5, 1 + movdqa [edi], xmm5 ; out hi + ; l[n] = src[2n] + ((h[n - 1] + h[n]) >> 1) + paddw xmm5, xmm7 + psraw xmm5, 1 + paddw xmm5, xmm1 + movdqa [edx], xmm5 ; out lo + ; move down + lea esi, [esi + 64 * 1 * 2] ; 2 rows + lea edi, [edi + 64 * 2] ; 1 row + lea edx, [edx + 64 * 2] ; 1 row + + ; move up + ; back to the top row: 64 source rows of 64 bytes, 32 dst rows of 64 words + lea esi, [esi - 64 * 1 * 64] + lea edi, [edi - 32 * 64 * 2] + lea edx, [edx - 32 * 64 * 2] + + ; move right + ; next strip: 8 source bytes, 8 dst words + lea esi, [esi + 8] + lea edi, [edi + 16] + lea edx, [edx + 16] + + dec ecx + jnz loop1f + + ret + +; set_quants_hi - in: eax = quant value +; stores eax - 1 (zero extended to 16 bytes) in LHI_SFT and copies the 16 byte +; table entry number eax - 1, counted from cwa0, into LHI_ADD +; changes eax, edx and xmm1 +; NOTE(review): the 5 in "6 - 5" presumably accounts for the << 5 applied in +; rfx_dwt_2d_encode_block_verti_8_64 - confirm +set_quants_hi: + sub eax, 6 - 5 + movd xmm1, eax + movdqa LHI_SFT, xmm1 + imul eax, 16 + lea edx, [cwa0] + add edx, eax + movdqa xmm1, [edx] + movdqa LHI_ADD, xmm1 + ret + +; set_quants_lo - same steps as set_quants_hi, but fills LLO_SFT and LLO_ADD +; changes eax, edx and xmm1 +set_quants_lo: + sub eax, 6 - 5 + movd xmm1, eax + movdqa LLO_SFT, xmm1 + imul eax, 16 + lea edx, [cwa0] + add edx, eax + movdqa xmm1, [edx] + movdqa LLO_ADD, xmm1 + ret + +; copies of the four arguments as addressed from the body of the entry point: +; 16 * 8 scratch bytes and 4 saved registers (16 bytes) lie below them +%define LQTABLE [esp + 144] ; qtable +%define LIN_BUFFER [esp + 148] ; in_buffer +%define LOUT_BUFFER [esp + 152] ; out_buffer +%define LWORK_BUFFER [esp + 156] ; work_buffer + +;int +;rfxcodec_encode_dwt_shift_x86_sse2(const char *qtable, +; unsigned char *in_buffer, +; short *out_buffer, +; short *work_buffer); + 
+;****************************************************************************** +; Entry point. Reads its four arguments from the stack (see the prototype +; comment above), runs three DWT levels over a 64 x 64 tile of 8 bit samples +; and always returns 0 in eax. +; Each level is one vertical pass into work_buffer followed by two +; horizontal passes into out_buffer; the values handed to set_quants_hi and +; set_quants_lo are nibbles of qtable[0] .. qtable[4]. +%ifidn __OUTPUT_FORMAT__,elf +PROC rfxcodec_encode_dwt_shift_x86_sse2 +%else +PROC _rfxcodec_encode_dwt_shift_x86_sse2 +%endif + ; align stack + ; eax = esp & 15 = bytes removed to reach a 16 byte boundary, pushed for the epilogue + mov eax, esp + sub eax, 0x10 + and eax, 0x0F + sub esp, eax + push eax + sub esp, 3 * 4 + sub esp, 4 * 4 + ; copy params to after align + ; one unaligned 16 byte load covers all four 4 byte arguments + movdqu xmm0, [esp + eax + 4 * 4 + 3 * 4 + 4 + 4] + movdqu [esp], xmm0 + ; save registers + push ebx + push esi + push edi + push ebp + sub esp, 16 * 8 ; scratch, released under "quants" in the epilogue + pxor xmm0, xmm0 ; zero for punpcklbw in rfx_dwt_2d_encode_block_verti_8_64 + + ; vertical DWT to work buffer, level 1 + mov esi, LIN_BUFFER ; src + mov edi, LWORK_BUFFER ; dst hi + lea edi, [edi + 64 * 32 * 2] ; dst hi + mov edx, LWORK_BUFFER ; dst lo + call rfx_dwt_2d_encode_block_verti_8_64 + + ; horizontal DWT to out buffer, level 1, part 1 + ; hi quant = low nibble of qtable[4]; the _no_lo routine stores lo without quant add/shift + xor eax, eax + mov edx, LQTABLE + mov al, [edx + 4] + and al, 0xF + call set_quants_hi + mov esi, LWORK_BUFFER ; src + mov edi, LOUT_BUFFER ; dst hi - HL1 + mov edx, LOUT_BUFFER ; dst lo - LL1 + lea edx, [edx + 32 * 32 * 6] ; dst lo - LL1 + call rfx_dwt_2d_encode_block_horiz_16_64_no_lo + + ; horizontal DWT to out buffer, level 1, part 2 + ; hi quant = high nibble of qtable[4], lo quant = high nibble of qtable[3] + xor eax, eax + mov edx, LQTABLE + mov al, [edx + 4] + shr al, 4 + call set_quants_hi + xor eax, eax + mov edx, LQTABLE + mov al, [edx + 3] + shr al, 4 + call set_quants_lo + mov esi, LWORK_BUFFER ; src + lea esi, [esi + 64 * 32 * 2] ; src + mov edi, LOUT_BUFFER ; dst hi - HH1 + lea edi, [edi + 32 * 32 * 4] ; dst hi - HH1 + mov edx, LOUT_BUFFER ; dst lo - LH1 + lea edx, [edx + 32 * 32 * 2] ; dst lo - LH1 + call rfx_dwt_2d_encode_block_horiz_16_64 + + ; vertical DWT to work buffer, level 2 + ; source is the LL1 data written at offset 32 * 32 * 6 of out_buffer by level 1, part 1 + mov esi, LOUT_BUFFER ; src + lea esi, [esi + 32 * 32 * 6] ; src + mov edi, LWORK_BUFFER ; dst hi + lea edi, [edi + 32 * 16 * 2] ; dst hi + mov edx, LWORK_BUFFER ; dst lo + call rfx_dwt_2d_encode_block_verti_16_32 + + ; horizontal DWT to out buffer, level 2, part 1 + ; hi quant = high nibble of qtable[2] + xor eax, eax + mov edx, LQTABLE + mov al, [edx + 2] + shr al, 4 + call set_quants_hi + mov
esi, LWORK_BUFFER ; src + ; 32 * 32 * 6 + 16 * 16 * 0 = 6144 + mov edi, LOUT_BUFFER ; dst hi - HL2 + lea edi, [edi + 6144] ; dst hi - HL2 + ; 32 * 32 * 6 + 16 * 16 * 6 = 7680 + mov edx, LOUT_BUFFER ; dst lo - LL2 + lea edx, [edx + 7680] ; dst lo - LL2 + call rfx_dwt_2d_encode_block_horiz_16_32_no_lo + + ; horizontal DWT to out buffer, level 2, part 2 + ; hi quant = low nibble of qtable[3], lo quant = low nibble of qtable[2] + xor eax, eax + mov edx, LQTABLE + mov al, [edx + 3] + and al, 0xF + call set_quants_hi + xor eax, eax + mov edx, LQTABLE + mov al, [edx + 2] + and al, 0xF + call set_quants_lo + mov esi, LWORK_BUFFER ; src + lea esi, [esi + 32 * 16 * 2] ; src + ; 32 * 32 * 6 + 16 * 16 * 4 = 7168 + mov edi, LOUT_BUFFER ; dst hi - HH2 + lea edi, [edi + 7168] ; dst hi - HH2 + ; 32 * 32 * 6 + 16 * 16 * 2 = 6656 + mov edx, LOUT_BUFFER ; dst lo - LH2 + lea edx, [edx + 6656] ; dst lo - LH2 + call rfx_dwt_2d_encode_block_horiz_16_32 + + ; vertical DWT to work buffer, level 3 + ; 32 * 32 * 6 + 16 * 16 * 6 = 7680 + mov esi, LOUT_BUFFER ; src + lea esi, [esi + 7680] ; src + mov edi, LWORK_BUFFER ; dst hi + lea edi, [edi + 16 * 8 * 2] ; dst hi + mov edx, LWORK_BUFFER ; dst lo + call rfx_dwt_2d_encode_block_verti_16_16 + + ; horizontal DWT to out buffer, level 3, part 1 + ; hi quant = low nibble of qtable[1], lo quant = low nibble of qtable[0] + xor eax, eax + mov edx, LQTABLE + mov al, [edx + 1] + and al, 0xF + call set_quants_hi + xor eax, eax + mov edx, LQTABLE + mov al, [edx + 0] + and al, 0xF + call set_quants_lo + mov esi, LWORK_BUFFER ; src + ; 32 * 32 * 6 + 16 * 16 * 6 + 8 * 8 * 0 = 7680 + mov edi, LOUT_BUFFER ; dst hi - HL3 + lea edi, [edi + 7680] ; dst hi - HL3 + ; 32 * 32 * 6 + 16 * 16 * 6 + 8 * 8 * 6 = 8064 + mov edx, LOUT_BUFFER ; dst lo - LL3 + lea edx, [edx + 8064] ; dst lo - LL3 + call rfx_dwt_2d_encode_block_horiz_16_16 + + ; horizontal DWT to out buffer, level 3, part 2 + ; hi quant = high nibble of qtable[1], lo quant = high nibble of qtable[0] + xor eax, eax + mov edx, LQTABLE + mov al, [edx + 1] + shr al, 4 + call set_quants_hi + xor eax, eax + mov edx, LQTABLE + mov al, [edx + 0] + shr al, 4 + call set_quants_lo + mov esi, LWORK_BUFFER ; src + lea esi, [esi + 16 * 8 * 2] ;
src + ; 32 * 32 * 6 + 16 * 16 * 6 + 8 * 8 * 4 = 7936 + mov edi, LOUT_BUFFER ; dst hi - HH3 + lea edi, [edi + 7936] ; dst hi - HH3 + ; 32 * 32 * 6 + 16 * 16 * 6 + 8 * 8 * 2 = 7808 + mov edx, LOUT_BUFFER ; dst lo - LH3 + lea edx, [edx + 7808] ; dst lo - LH3 + call rfx_dwt_2d_encode_block_horiz_16_16 + + ; quants + add esp, 16 * 8 + ; restore registers + pop ebp + pop edi + pop esi + pop ebx + ; params + add esp, 3 * 4 + add esp, 4 * 4 + ; align + pop eax ; padding saved by the prologue + add esp, eax + ; return value + mov eax, 0 + ret + align 16 + diff --git a/src/x86/rfxcodec_encode_dwt_shift_x86_sse41.asm b/src/x86/rfxcodec_encode_dwt_shift_x86_sse41.asm new file mode 100644 index 0000000..cb117da --- /dev/null +++ b/src/x86/rfxcodec_encode_dwt_shift_x86_sse41.asm @@ -0,0 +1,1401 @@ +; +;Copyright 2016 Jay Sorg +; +;Permission to use, copy, modify, distribute, and sell this software and its +;documentation for any purpose is hereby granted without fee, provided that +;the above copyright notice appear in all copies and that both that +;copyright notice and this permission notice appear in supporting +;documentation. +; +;The above copyright notice and this permission notice shall be included in +;all copies or substantial portions of the Software. +; +;THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +;IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +;FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +;OPEN GROUP BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN +;AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN +;CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+; +;x86 asm dwt + +%ifidn __OUTPUT_FORMAT__,elf +section .note.GNU-stack noalloc noexec nowrite progbits +%endif + +section .data + align 16 + cw128 times 8 dw 128 + cdFFFF times 4 dd 65535 + ; these are 1 << (factor - 1) 0 to 15 is factor + cwa0 times 8 dw 0 ; 0 + cwa1 times 8 dw 1 ; 1 + cwa2 times 8 dw 2 ; 2 + cwa4 times 8 dw 4 ; 3 + cwa8 times 8 dw 8 ; 4 + cwa16 times 8 dw 16 ; 5 + cwa32 times 8 dw 32 ; 6 + cwa64 times 8 dw 64 ; 7 + cwa128 times 8 dw 128 ; 8 + cwa256 times 8 dw 256 ; 9 + cwa512 times 8 dw 512 ; 10 + cwa1024 times 8 dw 1024 ; 11 + cwa2048 times 8 dw 2048 ; 12 + cwa4096 times 8 dw 4096 ; 13 + cwa8192 times 8 dw 8192 ; 14 + cwa16384 times 8 dw 16384 ; 15 + +section .text + +%macro PROC 1 + align 16 + global %1 + %1: +%endmacro + +%define LHI_ADD [esp + 1 * 16 + 4] +%define LHI_SFT [esp + 2 * 16 + 4] +%define LLO_ADD [esp + 3 * 16 + 4] +%define LLO_SFT [esp + 4 * 16 + 4] + +;****************************************************************************** +; source 16 bit signed, 16 pixel width +rfx_dwt_2d_encode_block_horiz_16_16: + mov ecx, 8 +loop1a: + ; pre / post + movdqa xmm1, [esi] ; src[2n] + movdqa xmm2, [esi + 16] + movdqa xmm6, xmm1 + movdqa xmm7, xmm2 + pand xmm1, [cdFFFF] + pand xmm2, [cdFFFF] + packusdw xmm1, xmm2 + movdqa xmm2, xmm6 ; src[2n + 1] + movdqa xmm3, xmm7 + psrldq xmm2, 2 + psrldq xmm3, 2 + pand xmm2, [cdFFFF] + pand xmm3, [cdFFFF] + packusdw xmm2, xmm3 + movdqa xmm3, xmm6 ; src[2n + 2] + movdqa xmm4, xmm7 + psrldq xmm3, 4 + psrldq xmm4, 4 + movd eax, xmm7 + movd xmm5, eax + pslldq xmm5, 12 + por xmm3, xmm5 + movdqa xmm5, xmm7 + psrldq xmm5, 12 + pslldq xmm5, 12 + por xmm4, xmm5 + pand xmm3, [cdFFFF] + pand xmm4, [cdFFFF] + packusdw xmm3, xmm4 + movdqa xmm4, xmm1 + movdqa xmm5, xmm2 + movdqa xmm6, xmm3 + ; h[n] = (src[2n + 1] - ((src[2n] + src[2n + 2]) >> 1)) >> 1 + paddw xmm4, xmm6 + psraw xmm4, 1 + psubw xmm5, xmm4 + psraw xmm5, 1 + movdqa xmm6, xmm5 ; out hi + paddw xmm6, LHI_ADD + psraw xmm6, LHI_SFT + movdqa [edi], xmm6 
+ ; l[n] = src[2n] + ((h[n - 1] + h[n]) >> 1) + movdqa xmm7, xmm5 + movd eax, xmm7 + pslldq xmm7, 2 + and eax, 0xFFFF + movd xmm6, eax + por xmm7, xmm6 + paddw xmm5, xmm7 + psraw xmm5, 1 + paddw xmm5, xmm1 + + movdqa xmm6, xmm5 ; out lo + paddw xmm6, LLO_ADD + psraw xmm6, LLO_SFT + movdqa [edx], xmm6 + + ; move right + lea esi, [esi + 16 * 2] + lea edi, [edi + 8 * 2] + lea edx, [edx + 8 * 2] + + ; move left + lea esi, [esi - 16 * 2] + lea edi, [edi - 8 * 2] + lea edx, [edx - 8 * 2] + + ; move down + lea esi, [esi + 16 * 2] + lea edi, [edi + 8 * 2] + lea edx, [edx + 8 * 2] + + dec ecx + jnz loop1a + + ret + +;****************************************************************************** +; source 16 bit signed, 16 pixel width +rfx_dwt_2d_encode_block_verti_16_16: + mov ecx, 2 +loop1b: + ; pre + movdqa xmm1, [esi] ; src[2n] + movdqa xmm2, [esi + 16 * 2] ; src[2n + 1] + movdqa xmm3, [esi + 16 * 2 * 2] ; src[2n + 2] + movdqa xmm4, xmm1 + movdqa xmm5, xmm2 + movdqa xmm6, xmm3 + ; h[n] = (src[2n + 1] - ((src[2n] + src[2n + 2]) >> 1)) >> 1 + paddw xmm4, xmm6 + psraw xmm4, 1 + psubw xmm5, xmm4 + psraw xmm5, 1 + movdqa [edi], xmm5 ; out hi + movdqa xmm6, xmm5 ; save hi + ; l[n] = src[2n] + ((h[n - 1] + h[n]) >> 1) + paddw xmm5, xmm1 + movdqa [edx], xmm5 ; out lo + movdqa xmm7, xmm6 ; save hi + ; move down + lea esi, [esi + 16 * 2 * 2] ; 2 rows + lea edi, [edi + 16 * 2] ; 1 row + lea edx, [edx + 16 * 2] ; 1 row + + ; loop + shl ecx, 16 + mov cx, 6 +loop2b: + movdqa xmm1, xmm3 ; src[2n] + movdqa xmm2, [esi + 16 * 2] ; src[2n + 1] + movdqa xmm3, [esi + 16 * 2 * 2] ; src[2n + 2] + movdqa xmm4, xmm1 + movdqa xmm5, xmm2 + movdqa xmm6, xmm3 + ; h[n] = (src[2n + 1] - ((src[2n] + src[2n + 2]) >> 1)) >> 1 + paddw xmm4, xmm6 + psraw xmm4, 1 + psubw xmm5, xmm4 + psraw xmm5, 1 + movdqa [edi], xmm5 ; out hi + movdqa xmm6, xmm5 ; save hi + ; l[n] = src[2n] + ((h[n - 1] + h[n]) >> 1) + paddw xmm5, xmm7 + psraw xmm5, 1 + paddw xmm5, xmm1 + movdqa [edx], xmm5 ; out lo + movdqa xmm7, xmm6 ; 
save hi + ; move down + lea esi, [esi + 16 * 2 * 2] ; 2 rows + lea edi, [edi + 16 * 2] ; 1 row + lea edx, [edx + 16 * 2] ; 1 row + + dec cx + jnz loop2b + shr ecx, 16 + + ; post + movdqa xmm1, xmm3 ; src[2n] + movdqa xmm2, [esi + 16 * 2] ; src[2n + 1] + movdqa xmm4, xmm1 + movdqa xmm5, xmm2 + movdqa xmm6, xmm3 + ; h[n] = (src[2n + 1] - ((src[2n] + src[2n + 2]) >> 1)) >> 1 + paddw xmm4, xmm6 + psraw xmm4, 1 + psubw xmm5, xmm4 + psraw xmm5, 1 + movdqa [edi], xmm5 ; out hi + ; l[n] = src[2n] + ((h[n - 1] + h[n]) >> 1) + paddw xmm5, xmm7 + psraw xmm5, 1 + paddw xmm5, xmm1 + movdqa [edx], xmm5 ; out lo + ; move down + lea esi, [esi + 16 * 2 * 2] ; 2 row + lea edi, [edi + 16 * 2] ; 1 row + lea edx, [edx + 16 * 2] ; 1 row + + ; move up + lea esi, [esi - 16 * 16 * 2] + lea edi, [edi - 8 * 16 * 2] + lea edx, [edx - 8 * 16 * 2] + + ; move right + lea esi, [esi + 16] + lea edi, [edi + 16] + lea edx, [edx + 16] + + dec ecx + jnz loop1b + + ret + +;****************************************************************************** +; source 16 bit signed, 32 pixel width +rfx_dwt_2d_encode_block_horiz_16_32: + mov ecx, 16 +loop1c: + ; pre + movdqa xmm1, [esi] ; src[2n] + movdqa xmm2, [esi + 16] + movdqa xmm6, xmm1 + movdqa xmm7, xmm2 + pand xmm1, [cdFFFF] + pand xmm2, [cdFFFF] + packusdw xmm1, xmm2 + movdqa xmm2, xmm6 ; src[2n + 1] + movdqa xmm3, xmm7 + psrldq xmm2, 2 + psrldq xmm3, 2 + pand xmm2, [cdFFFF] + pand xmm3, [cdFFFF] + packusdw xmm2, xmm3 + movdqa xmm3, xmm6 ; src[2n + 2] + movdqa xmm4, xmm7 + psrldq xmm3, 4 + psrldq xmm4, 4 + movd eax, xmm7 + movd xmm5, eax + pslldq xmm5, 12 + por xmm3, xmm5 + mov eax, [esi + 32] + movd xmm5, eax + pslldq xmm5, 12 + por xmm4, xmm5 + pand xmm3, [cdFFFF] + pand xmm4, [cdFFFF] + packusdw xmm3, xmm4 + movdqa xmm4, xmm1 + movdqa xmm5, xmm2 + movdqa xmm6, xmm3 + ; h[n] = (src[2n + 1] - ((src[2n] + src[2n + 2]) >> 1)) >> 1 + paddw xmm4, xmm6 + psraw xmm4, 1 + psubw xmm5, xmm4 + psraw xmm5, 1 + + movdqa xmm6, xmm5 ; out hi + paddw xmm6, LHI_ADD + 
psraw xmm6, LHI_SFT + movdqa [edi], xmm6 + movdqa xmm2, xmm5 ; save hi + + ; l[n] = src[2n] + ((h[n - 1] + h[n]) >> 1) + movdqa xmm7, xmm5 + movd eax, xmm7 + pslldq xmm7, 2 + and eax, 0xFFFF + movd xmm6, eax + por xmm7, xmm6 + paddw xmm5, xmm7 + psraw xmm5, 1 + paddw xmm5, xmm1 + + psrldq xmm2, 14 + movd ebx, xmm2 ; save hi + + movdqa xmm6, xmm5 ; out lo + paddw xmm6, LLO_ADD + psraw xmm6, LLO_SFT + movdqa [edx], xmm6 + + ; move right + lea esi, [esi + 16 * 2] + lea edi, [edi + 8 * 2] + lea edx, [edx + 8 * 2] + + ; post + movdqa xmm1, [esi] ; src[2n] + movdqa xmm2, [esi + 16] + movdqa xmm6, xmm1 + movdqa xmm7, xmm2 + pand xmm1, [cdFFFF] + pand xmm2, [cdFFFF] + packusdw xmm1, xmm2 + movdqa xmm2, xmm6 ; src[2n + 1] + movdqa xmm3, xmm7 + psrldq xmm2, 2 + psrldq xmm3, 2 + pand xmm2, [cdFFFF] + pand xmm3, [cdFFFF] + packusdw xmm2, xmm3 + movdqa xmm3, xmm6 ; src[2n + 2] + movdqa xmm4, xmm7 + psrldq xmm3, 4 + psrldq xmm4, 4 + movd eax, xmm7 + movd xmm5, eax + pslldq xmm5, 12 + por xmm3, xmm5 + movdqa xmm5, xmm7 + psrldq xmm5, 12 + pslldq xmm5, 12 + por xmm4, xmm5 + pand xmm3, [cdFFFF] + pand xmm4, [cdFFFF] + packusdw xmm3, xmm4 + movdqa xmm4, xmm1 + movdqa xmm5, xmm2 + movdqa xmm6, xmm3 + ; h[n] = (src[2n + 1] - ((src[2n] + src[2n + 2]) >> 1)) >> 1 + paddw xmm4, xmm6 + psraw xmm4, 1 + psubw xmm5, xmm4 + psraw xmm5, 1 + + movdqa xmm6, xmm5 ; out hi + paddw xmm6, LHI_ADD + psraw xmm6, LHI_SFT + movdqa [edi], xmm6 + + ; l[n] = src[2n] + ((h[n - 1] + h[n]) >> 1) + movdqa xmm7, xmm5 + pslldq xmm7, 2 + movd xmm6, ebx + por xmm7, xmm6 + paddw xmm5, xmm7 + psraw xmm5, 1 + paddw xmm5, xmm1 + + movdqa xmm6, xmm5 ; out lo + paddw xmm6, LLO_ADD + psraw xmm6, LLO_SFT + movdqa [edx], xmm6 + + ; move right + lea esi, [esi + 16 * 2] + lea edi, [edi + 8 * 2] + lea edx, [edx + 8 * 2] + + ; move left + lea esi, [esi - 32 * 2] + lea edi, [edi - 16 * 2] + lea edx, [edx - 16 * 2] + + ; move down + lea esi, [esi + 32 * 2] + lea edi, [edi + 16 * 2] + lea edx, [edx + 16 * 2] + + dec ecx + jnz 
loop1c + + ret + +;****************************************************************************** +; source 16 bit signed, 32 pixel width +rfx_dwt_2d_encode_block_horiz_16_32_no_lo: + mov ecx, 16 +loop1c1: + ; pre + movdqa xmm1, [esi] ; src[2n] + movdqa xmm2, [esi + 16] + movdqa xmm6, xmm1 + movdqa xmm7, xmm2 + pand xmm1, [cdFFFF] + pand xmm2, [cdFFFF] + packusdw xmm1, xmm2 + movdqa xmm2, xmm6 ; src[2n + 1] + movdqa xmm3, xmm7 + psrldq xmm2, 2 + psrldq xmm3, 2 + pand xmm2, [cdFFFF] + pand xmm3, [cdFFFF] + packusdw xmm2, xmm3 + movdqa xmm3, xmm6 ; src[2n + 2] + movdqa xmm4, xmm7 + psrldq xmm3, 4 + psrldq xmm4, 4 + movd eax, xmm7 + movd xmm5, eax + pslldq xmm5, 12 + por xmm3, xmm5 + mov eax, [esi + 32] + movd xmm5, eax + pslldq xmm5, 12 + por xmm4, xmm5 + pand xmm3, [cdFFFF] + pand xmm4, [cdFFFF] + packusdw xmm3, xmm4 + movdqa xmm4, xmm1 + movdqa xmm5, xmm2 + movdqa xmm6, xmm3 + ; h[n] = (src[2n + 1] - ((src[2n] + src[2n + 2]) >> 1)) >> 1 + paddw xmm4, xmm6 + psraw xmm4, 1 + psubw xmm5, xmm4 + psraw xmm5, 1 + + movdqa xmm6, xmm5 ; out hi + paddw xmm6, LHI_ADD + psraw xmm6, LHI_SFT + movdqa [edi], xmm6 + movdqa xmm2, xmm5 ; save hi + + ; l[n] = src[2n] + ((h[n - 1] + h[n]) >> 1) + movdqa xmm7, xmm5 + movd eax, xmm7 + pslldq xmm7, 2 + and eax, 0xFFFF + movd xmm6, eax + por xmm7, xmm6 + paddw xmm5, xmm7 + psraw xmm5, 1 + paddw xmm5, xmm1 + + psrldq xmm2, 14 + movd ebx, xmm2 ; save hi + + movdqa [edx], xmm5 ; out lo + + ; move right + lea esi, [esi + 16 * 2] + lea edi, [edi + 8 * 2] + lea edx, [edx + 8 * 2] + + ; post + movdqa xmm1, [esi] ; src[2n] + movdqa xmm2, [esi + 16] + movdqa xmm6, xmm1 + movdqa xmm7, xmm2 + pand xmm1, [cdFFFF] + pand xmm2, [cdFFFF] + packusdw xmm1, xmm2 + movdqa xmm2, xmm6 ; src[2n + 1] + movdqa xmm3, xmm7 + psrldq xmm2, 2 + psrldq xmm3, 2 + pand xmm2, [cdFFFF] + pand xmm3, [cdFFFF] + packusdw xmm2, xmm3 + movdqa xmm3, xmm6 ; src[2n + 2] + movdqa xmm4, xmm7 + psrldq xmm3, 4 + psrldq xmm4, 4 + movd eax, xmm7 + movd xmm5, eax + pslldq xmm5, 12 + 
por xmm3, xmm5 + movdqa xmm5, xmm7 + psrldq xmm5, 12 + pslldq xmm5, 12 + por xmm4, xmm5 + pand xmm3, [cdFFFF] + pand xmm4, [cdFFFF] + packusdw xmm3, xmm4 + movdqa xmm4, xmm1 + movdqa xmm5, xmm2 + movdqa xmm6, xmm3 + ; h[n] = (src[2n + 1] - ((src[2n] + src[2n + 2]) >> 1)) >> 1 + paddw xmm4, xmm6 + psraw xmm4, 1 + psubw xmm5, xmm4 + psraw xmm5, 1 + + movdqa xmm6, xmm5 ; out hi + paddw xmm6, LHI_ADD + psraw xmm6, LHI_SFT + movdqa [edi], xmm6 + + ; l[n] = src[2n] + ((h[n - 1] + h[n]) >> 1) + movdqa xmm7, xmm5 + pslldq xmm7, 2 + movd xmm6, ebx + por xmm7, xmm6 + paddw xmm5, xmm7 + psraw xmm5, 1 + paddw xmm5, xmm1 + + movdqa [edx], xmm5 ; out lo + + ; move right + lea esi, [esi + 16 * 2] + lea edi, [edi + 8 * 2] + lea edx, [edx + 8 * 2] + + ; move left + lea esi, [esi - 32 * 2] + lea edi, [edi - 16 * 2] + lea edx, [edx - 16 * 2] + + ; move down + lea esi, [esi + 32 * 2] + lea edi, [edi + 16 * 2] + lea edx, [edx + 16 * 2] + + dec ecx + jnz loop1c1 + + ret + +;****************************************************************************** +; source 16 bit signed, 32 pixel width +rfx_dwt_2d_encode_block_verti_16_32: + mov ecx, 4 +loop1d: + ; pre + movdqa xmm1, [esi] ; src[2n] + movdqa xmm2, [esi + 32 * 2] ; src[2n + 1] + movdqa xmm3, [esi + 32 * 2 * 2] ; src[2n + 2] + movdqa xmm4, xmm1 + movdqa xmm5, xmm2 + movdqa xmm6, xmm3 + ; h[n] = (src[2n + 1] - ((src[2n] + src[2n + 2]) >> 1)) >> 1 + paddw xmm4, xmm6 + psraw xmm4, 1 + psubw xmm5, xmm4 + psraw xmm5, 1 + movdqa [edi], xmm5 ; out hi + movdqa xmm6, xmm5 ; save hi + ; l[n] = src[2n] + ((h[n - 1] + h[n]) >> 1) + paddw xmm5, xmm1 + movdqa [edx], xmm5 ; out lo + movdqa xmm7, xmm6 ; save hi + ; move down + lea esi, [esi + 32 * 2 * 2] ; 2 rows + lea edi, [edi + 32 * 2] ; 1 row + lea edx, [edx + 32 * 2] ; 1 row + + ; loop + shl ecx, 16 + mov cx, 14 +loop2d: + movdqa xmm1, xmm3 ; src[2n] + movdqa xmm2, [esi + 32 * 2] ; src[2n + 1] + movdqa xmm3, [esi + 32 * 2 * 2] ; src[2n + 2] + movdqa xmm4, xmm1 + movdqa xmm5, xmm2 + movdqa xmm6, 
xmm3 + ; h[n] = (src[2n + 1] - ((src[2n] + src[2n + 2]) >> 1)) >> 1 + paddw xmm4, xmm6 + psraw xmm4, 1 + psubw xmm5, xmm4 + psraw xmm5, 1 + movdqa [edi], xmm5 ; out hi + movdqa xmm6, xmm5 ; save hi + ; l[n] = src[2n] + ((h[n - 1] + h[n]) >> 1) + paddw xmm5, xmm7 + psraw xmm5, 1 + paddw xmm5, xmm1 + movdqa [edx], xmm5 ; out lo + movdqa xmm7, xmm6 ; save hi + ; move down + lea esi, [esi + 32 * 2 * 2] ; 2 rows + lea edi, [edi + 32 * 2] ; 1 row + lea edx, [edx + 32 * 2] ; 1 row + + dec cx + jnz loop2d + shr ecx, 16 + + ; post + movdqa xmm1, xmm3 ; src[2n] + movdqa xmm2, [esi + 32 * 2] ; src[2n + 1] + movdqa xmm4, xmm1 + movdqa xmm5, xmm2 + movdqa xmm6, xmm3 + ; h[n] = (src[2n + 1] - ((src[2n] + src[2n + 2]) >> 1)) >> 1 + paddw xmm4, xmm6 + psraw xmm4, 1 + psubw xmm5, xmm4 + psraw xmm5, 1 + movdqa [edi], xmm5 ; out hi + ; l[n] = src[2n] + ((h[n - 1] + h[n]) >> 1) + paddw xmm5, xmm7 + psraw xmm5, 1 + paddw xmm5, xmm1 + movdqa [edx], xmm5 ; out lo + ; move down + lea esi, [esi + 32 * 2 * 2] ; 2 row + lea edi, [edi + 32 * 2] ; 1 row + lea edx, [edx + 32 * 2] ; 1 row + + ; move up + lea esi, [esi - 32 * 32 * 2] + lea edi, [edi - 16 * 32 * 2] + lea edx, [edx - 16 * 32 * 2] + + ; move right + lea esi, [esi + 16] + lea edi, [edi + 16] + lea edx, [edx + 16] + + dec ecx + jnz loop1d + + ret + +;****************************************************************************** +; source 16 bit signed, 64 pixel width +rfx_dwt_2d_encode_block_horiz_16_64: + mov ecx, 32 +loop1e: + ; pre + movdqa xmm1, [esi] ; src[2n] + movdqa xmm2, [esi + 16] + movdqa xmm6, xmm1 + movdqa xmm7, xmm2 + pand xmm1, [cdFFFF] + pand xmm2, [cdFFFF] + packusdw xmm1, xmm2 + movdqa xmm2, xmm6 ; src[2n + 1] + movdqa xmm3, xmm7 + psrldq xmm2, 2 + psrldq xmm3, 2 + pand xmm2, [cdFFFF] + pand xmm3, [cdFFFF] + packusdw xmm2, xmm3 + movdqa xmm3, xmm6 ; src[2n + 2] + movdqa xmm4, xmm7 + psrldq xmm3, 4 + psrldq xmm4, 4 + movd eax, xmm7 + movd xmm5, eax + pslldq xmm5, 12 + por xmm3, xmm5 + mov eax, [esi + 32] + movd xmm5, 
eax + pslldq xmm5, 12 + por xmm4, xmm5 + pand xmm3, [cdFFFF] + pand xmm4, [cdFFFF] + packusdw xmm3, xmm4 + movdqa xmm4, xmm1 + movdqa xmm5, xmm2 + movdqa xmm6, xmm3 + ; h[n] = (src[2n + 1] - ((src[2n] + src[2n + 2]) >> 1)) >> 1 + paddw xmm4, xmm6 + psraw xmm4, 1 + psubw xmm5, xmm4 + psraw xmm5, 1 + + movdqa xmm6, xmm5 ; out hi + paddw xmm6, LHI_ADD + psraw xmm6, LHI_SFT + movdqa [edi], xmm6 + movdqa xmm2, xmm5 ; save hi + + ; l[n] = src[2n] + ((h[n - 1] + h[n]) >> 1) + movdqa xmm7, xmm5 + movd eax, xmm7 + pslldq xmm7, 2 + and eax, 0xFFFF + movd xmm6, eax + por xmm7, xmm6 + paddw xmm5, xmm7 + psraw xmm5, 1 + paddw xmm5, xmm1 + + psrldq xmm2, 14 + movd ebx, xmm2 ; save hi + + movdqa xmm6, xmm5 ; out lo + paddw xmm6, LLO_ADD + psraw xmm6, LLO_SFT + movdqa [edx], xmm6 + + ; move right + lea esi, [esi + 16 * 2] + lea edi, [edi + 8 * 2] + lea edx, [edx + 8 * 2] + + ; loop + shl ecx, 16 + mov cx, 2 +loop2e: + movdqa xmm1, [esi] ; src[2n] + movdqa xmm2, [esi + 16] + movdqa xmm6, xmm1 + movdqa xmm7, xmm2 + pand xmm1, [cdFFFF] + pand xmm2, [cdFFFF] + packusdw xmm1, xmm2 + movdqa xmm2, xmm6 ; src[2n + 1] + movdqa xmm3, xmm7 + psrldq xmm2, 2 + psrldq xmm3, 2 + pand xmm2, [cdFFFF] + pand xmm3, [cdFFFF] + packusdw xmm2, xmm3 + movdqa xmm3, xmm6 ; src[2n + 2] + movdqa xmm4, xmm7 + psrldq xmm3, 4 + psrldq xmm4, 4 + movd eax, xmm7 + movd xmm5, eax + pslldq xmm5, 12 + por xmm3, xmm5 + mov eax, [esi + 32] + movd xmm5, eax + pslldq xmm5, 12 + por xmm4, xmm5 + pand xmm3, [cdFFFF] + pand xmm4, [cdFFFF] + packusdw xmm3, xmm4 + movdqa xmm4, xmm1 + movdqa xmm5, xmm2 + movdqa xmm6, xmm3 + ; h[n] = (src[2n + 1] - ((src[2n] + src[2n + 2]) >> 1)) >> 1 + paddw xmm4, xmm6 + psraw xmm4, 1 + psubw xmm5, xmm4 + psraw xmm5, 1 + + movdqa xmm6, xmm5 ; out hi + paddw xmm6, LHI_ADD + psraw xmm6, LHI_SFT + movdqa [edi], xmm6 + movdqa xmm2, xmm5 ; save hi + + ; l[n] = src[2n] + ((h[n - 1] + h[n]) >> 1) + movdqa xmm7, xmm5 + pslldq xmm7, 2 + movd xmm6, ebx + por xmm7, xmm6 + paddw xmm5, xmm7 + psraw xmm5, 
1 + paddw xmm5, xmm1 + + psrldq xmm2, 14 + movd ebx, xmm2 ; save hi + + movdqa xmm6, xmm5 ; out lo + paddw xmm6, LLO_ADD + psraw xmm6, LLO_SFT + movdqa [edx], xmm6 + + ; move right + lea esi, [esi + 16 * 2] + lea edi, [edi + 8 * 2] + lea edx, [edx + 8 * 2] + + dec cx + jnz loop2e + shr ecx, 16 + + ; post + movdqa xmm1, [esi] ; src[2n] + movdqa xmm2, [esi + 16] + movdqa xmm6, xmm1 + movdqa xmm7, xmm2 + pand xmm1, [cdFFFF] + pand xmm2, [cdFFFF] + packusdw xmm1, xmm2 + movdqa xmm2, xmm6 ; src[2n + 1] + movdqa xmm3, xmm7 + psrldq xmm2, 2 + psrldq xmm3, 2 + pand xmm2, [cdFFFF] + pand xmm3, [cdFFFF] + packusdw xmm2, xmm3 + movdqa xmm3, xmm6 ; src[2n + 2] + movdqa xmm4, xmm7 + psrldq xmm3, 4 + psrldq xmm4, 4 + movd eax, xmm7 + movd xmm5, eax + pslldq xmm5, 12 + por xmm3, xmm5 + movdqa xmm5, xmm7 + psrldq xmm5, 12 + pslldq xmm5, 12 + por xmm4, xmm5 + pand xmm3, [cdFFFF] + pand xmm4, [cdFFFF] + packusdw xmm3, xmm4 + movdqa xmm4, xmm1 + movdqa xmm5, xmm2 + movdqa xmm6, xmm3 + ; h[n] = (src[2n + 1] - ((src[2n] + src[2n + 2]) >> 1)) >> 1 + paddw xmm4, xmm6 + psraw xmm4, 1 + psubw xmm5, xmm4 + psraw xmm5, 1 + + movdqa xmm6, xmm5 ; out hi + paddw xmm6, LHI_ADD + psraw xmm6, LHI_SFT + movdqa [edi], xmm6 + + ; l[n] = src[2n] + ((h[n - 1] + h[n]) >> 1) + movdqa xmm7, xmm5 + pslldq xmm7, 2 + movd xmm6, ebx + por xmm7, xmm6 + paddw xmm5, xmm7 + psraw xmm5, 1 + paddw xmm5, xmm1 + + movdqa xmm6, xmm5 ; out lo + paddw xmm6, LLO_ADD + psraw xmm6, LLO_SFT + movdqa [edx], xmm6 + + ; move right + lea esi, [esi + 16 * 2] + lea edi, [edi + 8 * 2] + lea edx, [edx + 8 * 2] + + ; move left + lea esi, [esi - 64 * 2] + lea edi, [edi - 32 * 2] + lea edx, [edx - 32 * 2] + + ; move down + lea esi, [esi + 64 * 2] + lea edi, [edi + 32 * 2] + lea edx, [edx + 32 * 2] + + dec ecx + jnz loop1e + + ret + +;****************************************************************************** +; source 16 bit signed, 64 pixel width +rfx_dwt_2d_encode_block_horiz_16_64_no_lo: + mov ecx, 32 +loop1e1: + ; pre + movdqa 
xmm1, [esi] ; src[2n] + movdqa xmm2, [esi + 16] + movdqa xmm6, xmm1 + movdqa xmm7, xmm2 + pand xmm1, [cdFFFF] + pand xmm2, [cdFFFF] + packusdw xmm1, xmm2 + movdqa xmm2, xmm6 ; src[2n + 1] + movdqa xmm3, xmm7 + psrldq xmm2, 2 + psrldq xmm3, 2 + pand xmm2, [cdFFFF] + pand xmm3, [cdFFFF] + packusdw xmm2, xmm3 + movdqa xmm3, xmm6 ; src[2n + 2] + movdqa xmm4, xmm7 + psrldq xmm3, 4 + psrldq xmm4, 4 + movd eax, xmm7 + movd xmm5, eax + pslldq xmm5, 12 + por xmm3, xmm5 + mov eax, [esi + 32] + movd xmm5, eax + pslldq xmm5, 12 + por xmm4, xmm5 + pand xmm3, [cdFFFF] + pand xmm4, [cdFFFF] + packusdw xmm3, xmm4 + movdqa xmm4, xmm1 + movdqa xmm5, xmm2 + movdqa xmm6, xmm3 + ; h[n] = (src[2n + 1] - ((src[2n] + src[2n + 2]) >> 1)) >> 1 + paddw xmm4, xmm6 + psraw xmm4, 1 + psubw xmm5, xmm4 + psraw xmm5, 1 + + movdqa xmm6, xmm5 ; out hi + paddw xmm6, LHI_ADD + psraw xmm6, LHI_SFT + movdqa [edi], xmm6 + movdqa xmm2, xmm5 ; save hi + + ; l[n] = src[2n] + ((h[n - 1] + h[n]) >> 1) + movdqa xmm7, xmm5 + movd eax, xmm7 + pslldq xmm7, 2 + and eax, 0xFFFF + movd xmm6, eax + por xmm7, xmm6 + paddw xmm5, xmm7 + psraw xmm5, 1 + paddw xmm5, xmm1 + + psrldq xmm2, 14 + movd ebx, xmm2 ; save hi + + movdqa [edx], xmm5 ; out lo + + ; move right + lea esi, [esi + 16 * 2] + lea edi, [edi + 8 * 2] + lea edx, [edx + 8 * 2] + + ; loop + shl ecx, 16 + mov cx, 2 +loop2e1: + movdqa xmm1, [esi] ; src[2n] + movdqa xmm2, [esi + 16] + movdqa xmm6, xmm1 + movdqa xmm7, xmm2 + pand xmm1, [cdFFFF] + pand xmm2, [cdFFFF] + packusdw xmm1, xmm2 + movdqa xmm2, xmm6 ; src[2n + 1] + movdqa xmm3, xmm7 + psrldq xmm2, 2 + psrldq xmm3, 2 + pand xmm2, [cdFFFF] + pand xmm3, [cdFFFF] + packusdw xmm2, xmm3 + movdqa xmm3, xmm6 ; src[2n + 2] + movdqa xmm4, xmm7 + psrldq xmm3, 4 + psrldq xmm4, 4 + movd eax, xmm7 + movd xmm5, eax + pslldq xmm5, 12 + por xmm3, xmm5 + mov eax, [esi + 32] + movd xmm5, eax + pslldq xmm5, 12 + por xmm4, xmm5 + pand xmm3, [cdFFFF] + pand xmm4, [cdFFFF] + packusdw xmm3, xmm4 + movdqa xmm4, xmm1 + movdqa xmm5, 
xmm2 + movdqa xmm6, xmm3 + ; h[n] = (src[2n + 1] - ((src[2n] + src[2n + 2]) >> 1)) >> 1 + paddw xmm4, xmm6 + psraw xmm4, 1 + psubw xmm5, xmm4 + psraw xmm5, 1 + + movdqa xmm6, xmm5 ; out hi + paddw xmm6, LHI_ADD + psraw xmm6, LHI_SFT + movdqa [edi], xmm6 + movdqa xmm2, xmm5 ; save hi + + ; l[n] = src[2n] + ((h[n - 1] + h[n]) >> 1) + movdqa xmm7, xmm5 + pslldq xmm7, 2 + movd xmm6, ebx + por xmm7, xmm6 + paddw xmm5, xmm7 + psraw xmm5, 1 + paddw xmm5, xmm1 + + psrldq xmm2, 14 + movd ebx, xmm2 ; save hi + + movdqa [edx], xmm5 ; out lo + + ; move right + lea esi, [esi + 16 * 2] + lea edi, [edi + 8 * 2] + lea edx, [edx + 8 * 2] + + dec cx + jnz loop2e1 + shr ecx, 16 + + ; post + movdqa xmm1, [esi] ; src[2n] + movdqa xmm2, [esi + 16] + movdqa xmm6, xmm1 + movdqa xmm7, xmm2 + pand xmm1, [cdFFFF] + pand xmm2, [cdFFFF] + packusdw xmm1, xmm2 + movdqa xmm2, xmm6 ; src[2n + 1] + movdqa xmm3, xmm7 + psrldq xmm2, 2 + psrldq xmm3, 2 + pand xmm2, [cdFFFF] + pand xmm3, [cdFFFF] + packusdw xmm2, xmm3 + movdqa xmm3, xmm6 ; src[2n + 2] + movdqa xmm4, xmm7 + psrldq xmm3, 4 + psrldq xmm4, 4 + movd eax, xmm7 + movd xmm5, eax + pslldq xmm5, 12 + por xmm3, xmm5 + movdqa xmm5, xmm7 + psrldq xmm5, 12 + pslldq xmm5, 12 + por xmm4, xmm5 + pand xmm3, [cdFFFF] + pand xmm4, [cdFFFF] + packusdw xmm3, xmm4 + movdqa xmm4, xmm1 + movdqa xmm5, xmm2 + movdqa xmm6, xmm3 + ; h[n] = (src[2n + 1] - ((src[2n] + src[2n + 2]) >> 1)) >> 1 + paddw xmm4, xmm6 + psraw xmm4, 1 + psubw xmm5, xmm4 + psraw xmm5, 1 + + movdqa xmm6, xmm5 ; out hi + paddw xmm6, LHI_ADD + psraw xmm6, LHI_SFT + movdqa [edi], xmm6 + + ; l[n] = src[2n] + ((h[n - 1] + h[n]) >> 1) + movdqa xmm7, xmm5 + pslldq xmm7, 2 + movd xmm6, ebx + por xmm7, xmm6 + paddw xmm5, xmm7 + psraw xmm5, 1 + paddw xmm5, xmm1 + + movdqa [edx], xmm5 ; out lo + + ; move right + lea esi, [esi + 16 * 2] + lea edi, [edi + 8 * 2] + lea edx, [edx + 8 * 2] + + ; move left + lea esi, [esi - 64 * 2] + lea edi, [edi - 32 * 2] + lea edx, [edx - 32 * 2] + + ; move down + lea esi, 
[esi + 64 * 2] + lea edi, [edi + 32 * 2] + lea edx, [edx + 32 * 2] + + dec ecx + jnz loop1e1 + + ret + +;****************************************************************************** +; source 8 bit unsigned, 64 pixel width +rfx_dwt_2d_encode_block_verti_8_64: + mov ecx, 8 +loop1f: + ; pre + movq xmm1, [esi] ; src[2n] + movq xmm2, [esi + 64 * 1] ; src[2n + 1] + movq xmm3, [esi + 64 * 1 * 2] ; src[2n + 2] + punpcklbw xmm1, xmm0 + punpcklbw xmm2, xmm0 + punpcklbw xmm3, xmm0 + psubw xmm1, [cw128] + psubw xmm2, [cw128] + psubw xmm3, [cw128] + psllw xmm1, 5 + psllw xmm2, 5 + psllw xmm3, 5 + movdqa xmm4, xmm1 + movdqa xmm5, xmm2 + movdqa xmm6, xmm3 + ; h[n] = (src[2n + 1] - ((src[2n] + src[2n + 2]) >> 1)) >> 1 + paddw xmm4, xmm6 + psraw xmm4, 1 + psubw xmm5, xmm4 + psraw xmm5, 1 + movdqa [edi], xmm5 ; out hi + movdqa xmm6, xmm5 ; save hi + ; l[n] = src[2n] + ((h[n - 1] + h[n]) >> 1) + paddw xmm5, xmm1 + movdqa [edx], xmm5 ; out lo + movdqa xmm7, xmm6 ; save hi + ; move down + lea esi, [esi + 64 * 1 * 2] ; 2 rows + lea edi, [edi + 64 * 2] ; 1 row + lea edx, [edx + 64 * 2] ; 1 row + + ; loop + shl ecx, 16 + mov cx, 30 +loop2f: + movdqa xmm1, xmm3 ; src[2n] + movq xmm2, [esi + 64 * 1] ; src[2n + 1] + movq xmm3, [esi + 64 * 1 * 2] ; src[2n + 2] + punpcklbw xmm2, xmm0 + punpcklbw xmm3, xmm0 + psubw xmm2, [cw128] + psubw xmm3, [cw128] + psllw xmm2, 5 + psllw xmm3, 5 + movdqa xmm4, xmm1 + movdqa xmm5, xmm2 + movdqa xmm6, xmm3 + ; h[n] = (src[2n + 1] - ((src[2n] + src[2n + 2]) >> 1)) >> 1 + paddw xmm4, xmm6 + psraw xmm4, 1 + psubw xmm5, xmm4 + psraw xmm5, 1 + movdqa [edi], xmm5 ; out hi + movdqa xmm6, xmm5 ; save hi + ; l[n] = src[2n] + ((h[n - 1] + h[n]) >> 1) + paddw xmm5, xmm7 + psraw xmm5, 1 + paddw xmm5, xmm1 + movdqa [edx], xmm5 ; out lo + movdqa xmm7, xmm6 ; save hi + ; move down + lea esi, [esi + 64 * 1 * 2] ; 2 rows + lea edi, [edi + 64 * 2] ; 1 row + lea edx, [edx + 64 * 2] ; 1 row + + dec cx + jnz loop2f + shr ecx, 16 + + ; post + movdqa xmm1, xmm3 ; src[2n] + movq 
xmm2, [esi + 64 * 1] ; src[2n + 1] + punpcklbw xmm2, xmm0 + psubw xmm2, [cw128] + psllw xmm2, 5 + movdqa xmm4, xmm1 + movdqa xmm5, xmm2 + movdqa xmm6, xmm3 + ; h[n] = (src[2n + 1] - ((src[2n] + src[2n + 2]) >> 1)) >> 1 + paddw xmm4, xmm6 + psraw xmm4, 1 + psubw xmm5, xmm4 + psraw xmm5, 1 + movdqa [edi], xmm5 ; out hi + ; l[n] = src[2n] + ((h[n - 1] + h[n]) >> 1) + paddw xmm5, xmm7 + psraw xmm5, 1 + paddw xmm5, xmm1 + movdqa [edx], xmm5 ; out lo + ; move down + lea esi, [esi + 64 * 1 * 2] ; 2 rows + lea edi, [edi + 64 * 2] ; 1 row + lea edx, [edx + 64 * 2] ; 1 row + + ; move up + lea esi, [esi - 64 * 1 * 64] + lea edi, [edi - 32 * 64 * 2] + lea edx, [edx - 32 * 64 * 2] + + ; move right + lea esi, [esi + 8] + lea edi, [edi + 16] + lea edx, [edx + 16] + + dec ecx + jnz loop1f + + ret + +set_quants_hi: ; in: eax = 4-bit quant value the caller extracted from qtable; out: LHI_SFT and LHI_ADD are written (both defined outside this chunk); overwrites eax, edx, xmm1 + sub eax, 6 - 5 ; eax = quant - 1; NOTE(review): presumably base quant 6 minus the 5-bit pre-shift (psllw 5) seen in the vertical pass - confirm + movd xmm1, eax ; low dword = quant - 1, remaining bits of xmm1 are zeroed + movdqa LHI_SFT, xmm1 ; store that value as a full 16-byte operand + imul eax, 16 ; byte offset of a 16-byte table entry + lea edx, [cwa0] + add edx, eax ; edx = address of entry (quant - 1) in the cwa0 table + movdqa xmm1, [edx] + movdqa LHI_ADD, xmm1 ; NOTE(review): presumably the rounding addend paired with the shift - confirm against the cwa0 definition + ret + +set_quants_lo: ; same steps as set_quants_hi, but the results go to LLO_SFT and LLO_ADD; in: eax = 4-bit quant value; overwrites eax, edx, xmm1 + sub eax, 6 - 5 ; eax = quant - 1 + movd xmm1, eax ; low dword = quant - 1, remaining bits of xmm1 are zeroed + movdqa LLO_SFT, xmm1 + imul eax, 16 ; byte offset of a 16-byte table entry + lea edx, [cwa0] + add edx, eax ; edx = address of entry (quant - 1) in the cwa0 table + movdqa xmm1, [edx] + movdqa LLO_ADD, xmm1 + ret + +%define LQTABLE [esp + 144] ; qtable +%define LIN_BUFFER [esp + 148] ; in_buffer +%define LOUT_BUFFER [esp + 152] ; out_buffer +%define LWORK_BUFFER [esp + 156] ; work_buffer + +;int - three-level 2D DWT of in_buffer into out_buffer, using work_buffer as the intermediate between the vertical and horizontal passes and per-band quant nibbles read from qtable; always returns 0 +;rfxcodec_encode_dwt_shift_x86_sse41(const char *qtable, +; unsigned char *in_buffer, +; short *out_buffer, +; short *work_buffer); + +;****************************************************************************** +%ifidn __OUTPUT_FORMAT__,elf +PROC rfxcodec_encode_dwt_shift_x86_sse41 +%else +PROC _rfxcodec_encode_dwt_shift_x86_sse41 +%endif + ; align stack: eax = esp & 0x0F (subtracting 0x10 leaves the low 4 bits unchanged); esp is lowered by eax to a 16-byte boundary and eax is pushed so the epilogue can undo the adjustment + mov eax, esp + sub eax, 0x10 + and eax, 0x0F + sub esp, eax + push eax + sub esp, 3 * 4 ; pad the pushed dword to 16 bytes + sub esp, 4 * 4 ; room for a copy of the 4 dword arguments + ; copy params to after align: 16 bytes of arguments are copied to [esp]; the source offset steps over the 16 and 12 bytes just reserved, the pushed dword, the return address, and the eax adjustment + movdqu xmm0, [esp + eax + 4 * 4 + 3 * 4 + 4 + 4] + movdqu [esp], xmm0 + ; save registers + push ebx + push esi + push edi + push ebp + sub esp, 16 * 8 ; 128 bytes of scratch, released under '; quants' in the epilogue; together with the 4 pushes this places the argument copy at esp plus 144, which is what LQTABLE addresses + pxor xmm0, xmm0 ; xmm0 = 0; the vertical-pass tail visible in this chunk uses xmm0 as the second operand of punpcklbw + +
; vertical DWT to work buffer, level 1 + mov esi, LIN_BUFFER ; src + mov edi, LWORK_BUFFER ; dst hi + lea edi, [edi + 64 * 32 * 2] ; dst hi + mov edx, LWORK_BUFFER ; dst lo + call rfx_dwt_2d_encode_block_verti_8_64 + + ; horizontal DWT to out buffer, level 1, part 1 (hi quant = low nibble of qtable[4]; set_quants_lo is not called before the _no_lo variant) + xor eax, eax ; clear eax so the byte loaded into al is zero-extended + mov edx, LQTABLE + mov al, [edx + 4] + and al, 0xF + call set_quants_hi + mov esi, LWORK_BUFFER ; src + mov edi, LOUT_BUFFER ; dst hi - HL1 + mov edx, LOUT_BUFFER ; dst lo - LL1 + lea edx, [edx + 32 * 32 * 6] ; dst lo - LL1 + call rfx_dwt_2d_encode_block_horiz_16_64_no_lo + + ; horizontal DWT to out buffer, level 1, part 2 (hi quant = high nibble of qtable[4], lo quant = high nibble of qtable[3]) + xor eax, eax + mov edx, LQTABLE + mov al, [edx + 4] + shr al, 4 + call set_quants_hi + xor eax, eax + mov edx, LQTABLE + mov al, [edx + 3] + shr al, 4 + call set_quants_lo + mov esi, LWORK_BUFFER ; src + lea esi, [esi + 64 * 32 * 2] ; src + mov edi, LOUT_BUFFER ; dst hi - HH1 + lea edi, [edi + 32 * 32 * 4] ; dst hi - HH1 + mov edx, LOUT_BUFFER ; dst lo - LH1 + lea edx, [edx + 32 * 32 * 2] ; dst lo - LH1 + call rfx_dwt_2d_encode_block_horiz_16_64 + + ; vertical DWT to work buffer, level 2 (src is the LL1 area written above at out_buffer offset 32 * 32 * 6) + mov esi, LOUT_BUFFER ; src + lea esi, [esi + 32 * 32 * 6] ; src + mov edi, LWORK_BUFFER ; dst hi + lea edi, [edi + 32 * 16 * 2] ; dst hi + mov edx, LWORK_BUFFER ; dst lo + call rfx_dwt_2d_encode_block_verti_16_32 + + ; horizontal DWT to out buffer, level 2, part 1 (hi quant = high nibble of qtable[2]; set_quants_lo is not called before the _no_lo variant) + xor eax, eax + mov edx, LQTABLE + mov al, [edx + 2] + shr al, 4 + call set_quants_hi + mov esi, LWORK_BUFFER ; src + ; 32 * 32 * 6 + 16 * 16 * 0 = 6144 + mov edi, LOUT_BUFFER ; dst hi - HL2 + lea edi, [edi + 6144] ; dst hi - HL2 + ; 32 * 32 * 6 + 16 * 16 * 6 = 7680 + mov edx, LOUT_BUFFER ; dst lo - LL2 + lea edx, [edx + 7680] ; dst lo - LL2 + call rfx_dwt_2d_encode_block_horiz_16_32_no_lo + + ; horizontal DWT to out buffer, level 2, part 2 (hi quant = low nibble of qtable[3], lo quant = low nibble of qtable[2]) + xor eax, eax + mov edx, LQTABLE + mov al, [edx + 3] + and al, 0xF + call set_quants_hi + xor eax, eax + mov edx, LQTABLE + mov al, [edx + 2] + and al, 0xF + call set_quants_lo + mov
esi, LWORK_BUFFER ; src + lea esi, [esi + 32 * 16 * 2] ; src + ; 32 * 32 * 6 + 16 * 16 * 4 = 7168 + mov edi, LOUT_BUFFER ; dst hi - HH2 + lea edi, [edi + 7168] ; dst hi - HH2 + ; 32 * 32 * 6 + 16 * 16 * 2 = 6656 + mov edx, LOUT_BUFFER ; dst lo - LH2 + lea edx, [edx + 6656] ; dst lo - LH2 + call rfx_dwt_2d_encode_block_horiz_16_32 + + ; vertical DWT to work buffer, level 3 (src is the LL2 area written above at out_buffer offset 7680) + ; 32 * 32 * 6 + 16 * 16 * 6 = 7680 + mov esi, LOUT_BUFFER ; src + lea esi, [esi + 7680] ; src + mov edi, LWORK_BUFFER ; dst hi + lea edi, [edi + 16 * 8 * 2] ; dst hi + mov edx, LWORK_BUFFER ; dst lo + call rfx_dwt_2d_encode_block_verti_16_16 + + ; horizontal DWT to out buffer, level 3, part 1 (hi quant = low nibble of qtable[1], lo quant = low nibble of qtable[0]) + xor eax, eax + mov edx, LQTABLE + mov al, [edx + 1] + and al, 0xF + call set_quants_hi + xor eax, eax + mov edx, LQTABLE + mov al, [edx + 0] + and al, 0xF + call set_quants_lo + mov esi, LWORK_BUFFER ; src + ; 32 * 32 * 6 + 16 * 16 * 6 + 8 * 8 * 0 = 7680 + mov edi, LOUT_BUFFER ; dst hi - HL3 + lea edi, [edi + 7680] ; dst hi - HL3 + ; 32 * 32 * 6 + 16 * 16 * 6 + 8 * 8 * 6 = 8064 + mov edx, LOUT_BUFFER ; dst lo - LL3 + lea edx, [edx + 8064] ; dst lo - LL3 + call rfx_dwt_2d_encode_block_horiz_16_16 + + ; horizontal DWT to out buffer, level 3, part 2 (hi quant = high nibble of qtable[1], lo quant = high nibble of qtable[0]) + xor eax, eax + mov edx, LQTABLE + mov al, [edx + 1] + shr al, 4 + call set_quants_hi + xor eax, eax + mov edx, LQTABLE + mov al, [edx + 0] + shr al, 4 + call set_quants_lo + mov esi, LWORK_BUFFER ; src + lea esi, [esi + 16 * 8 * 2] ; src + ; 32 * 32 * 6 + 16 * 16 * 6 + 8 * 8 * 4 = 7936 + mov edi, LOUT_BUFFER ; dst hi - HH3 + lea edi, [edi + 7936] ; dst hi - HH3 + ; 32 * 32 * 6 + 16 * 16 * 6 + 8 * 8 * 2 = 7808 + mov edx, LOUT_BUFFER ; dst lo - LH3 + lea edx, [edx + 7808] ; dst lo - LH3 + call rfx_dwt_2d_encode_block_horiz_16_16 + + ; quants: release the 16 * 8 byte scratch area reserved in the prologue + add esp, 16 * 8 + ; restore registers (reverse order of the prologue pushes) + pop ebp + pop edi + pop esi + pop ebx + ; params: release the 12 bytes of padding and the 16-byte argument copy + add esp, 3 * 4 + add esp, 4 * 4 + ; align: pop the adjustment saved by the prologue and add it back to esp + pop eax + add esp, eax + ; return value: always 0 + mov eax, 0 + ret + align 16 + diff --git
a/src/x86/rfxdwt_x86_sse2.asm b/src/x86/rfxdwt_x86_sse2.asm deleted file mode 100644 index dd2a2d9..0000000 --- a/src/x86/rfxdwt_x86_sse2.asm +++ /dev/null @@ -1,25 +0,0 @@ - -section .data - const1 times 8 dw 1 - -%macro PROC 1 - align 16 - global %1 - %1: -%endmacro - -;int -;dwt_shift_x86_sse2(const int* qtable, sint8* src, sint16* dst, sint16* temp) - -PROC dwt_shift_x86_sse2 - push ebx - push esi - push edi - - mov eax, 0 - pop edi - pop esi - pop ebx - ret - align 16 - diff --git a/src/x86/rfxrlgr1_x86.asm b/src/x86/rfxrlgr1_x86.asm deleted file mode 100644 index 8441051..0000000 --- a/src/x86/rfxrlgr1_x86.asm +++ /dev/null @@ -1,25 +0,0 @@ - -section .data - const1 times 8 dw 1 - -%macro PROC 1 - align 16 - global %1 - %1: -%endmacro - -;int -;diff_rlgr1_x86(sint16 *co, int num_co, uint8 *dst, int dst_bytes); - -PROC diff_rlgr1_x86 - push ebx - push esi - push edi - - mov eax, 0 - pop edi - pop esi - pop ebx - ret - align 16 - diff --git a/src/x86/rfxrlgr3_x86.asm b/src/x86/rfxrlgr3_x86.asm deleted file mode 100644 index 08b278d..0000000 --- a/src/x86/rfxrlgr3_x86.asm +++ /dev/null @@ -1,25 +0,0 @@ - -section .data - const1 times 8 dw 1 - -%macro PROC 1 - align 16 - global %1 - %1: -%endmacro - -;int -;diff_rlgr3_x86(sint16 *co, int num_co, uint8 *dst, int dst_bytes); - -PROC diff_rlgr3_x86 - push ebx - push esi - push edi - - mov eax, 0 - pop edi - pop esi - pop ebx - ret - align 16 - diff --git a/tests/Makefile b/tests/Makefile deleted file mode 100644 index 36cd57d..0000000 --- a/tests/Makefile +++ /dev/null @@ -1,22 +0,0 @@ - -OBJS = rfxcodectest.o - -CFLAGS = -g -O2 -Wall -fPIC -I../include - -# this for linking to .so -#LDFLAGS = $(PROFIL) -L../src -Wl,-rpath=../src -# this if using .a -LDFLAGS = $(PROFIL) - -# this for linking to .so -#LIBS = -lrfxencode -# this for using .a -LIBS = ../src/librfxencode.a - -all: rfxcodectest - -rfxcodectest: $(OBJS) Makefile - $(CC) -o rfxcodectest $(LDFLAGS) $(OBJS) $(LIBS) - -clean: - rm -f $(OBJS) rfxcodectest diff 
--git a/tests/Makefile.am b/tests/Makefile.am new file mode 100644 index 0000000..8e24edc --- /dev/null +++ b/tests/Makefile.am @@ -0,0 +1,11 @@ +EXTRA_DIST = readme.txt + +AM_CPPFLAGS = \ + -I$(top_srcdir)/include + +bin_PROGRAMS = rfxcodectest + +rfxcodectest_SOURCES = rfxcodectest.c + +rfxcodectest_LDADD = \ + $(top_builddir)/src/librfxencode.la diff --git a/tests/rfxcodectest.c b/tests/rfxcodectest.c index f959185..6733db8 100644 --- a/tests/rfxcodectest.c +++ b/tests/rfxcodectest.c @@ -1,7 +1,7 @@ /** * RFX codec encoder test * - * Copyright 2014 Jay Sorg + * Copyright 2014-2015 Jay Sorg * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -27,11 +27,11 @@ #include -static const int g_rfx_default_quantization_values[] = +static const unsigned char g_rfx_default_quantization_values[] = { /* LL3 LH3 HL3 HH3 LH2 HL2 HH2 LH1 HL1 HH1 */ - 6, 6, 6, 6, 7, 7, 8, 8, 8, 9, - 9, 9, 9, 9, 10, 10, 12, 12, 12, 13 + 0x66, 0x66, 0x77, 0x88, 0x98, + 0x99, 0x99, 0xaa, 0xcc, 0xdc }; /*****************************************************************************/ @@ -46,7 +46,7 @@ get_mstime(void) /******************************************************************************/ static int -speed_random(int count, const int *quants) +speed_random(int count, const char *quants) { void *han; int error; @@ -56,34 +56,44 @@ speed_random(int count, const int *quants) char *cdata; char *buf; struct rfx_rect regions[1]; - struct rfx_tile tiles[1]; + struct rfx_tile tiles[2]; int stime; int etime; int tiles_per_second; int num_regions; int num_tiles; int num_quants; + int flags; printf("speed_random:\n"); - han = rfxcodec_encode_create(1920, 1024, RFX_FORMAT_BGRA, RFX_FLAGS_RLGR1); - if (han == 0) + //flags = RFX_FLAGS_RLGR1 | RFX_FLAGS_NOACCEL; + flags = RFX_FLAGS_RLGR1; + //flags = RFX_FLAGS_RLGR3; + //flags = RFX_FLAGS_RLGR1 | RFX_FLAGS_ALPHAV1; + error = rfxcodec_encode_create_ex(1920, 1024, 
RFX_FORMAT_BGRA, flags, &han); + if (error != 0) { - printf("speed_random: rfxcodec_encode_create failed\n"); + printf("speed_random: rfxcodec_encode_create_ex failed\n"); return 1; } - printf("speed_random: rfxcodec_encode_create ok\n"); - cdata = (char *) malloc(64 * 64 * 4); - cdata_bytes = 64 * 64 * 4; - buf = (char *) malloc(64 * 64 * 4); + printf("speed_random: rfxcodec_encode_create_ex ok\n"); + cdata = (char *) malloc(128 * 64 * 4); + cdata_bytes = 128 * 64 * 4; + buf = (char *) malloc(128 * 64 * 4); +#if 1 fd = open("/dev/urandom", O_RDONLY); - if (read(fd, buf, 64 * 64 * 4) != 64 * 64 * 4) + //fd = open("/dev/zero", O_RDONLY); + if (read(fd, buf, 128 * 64 * 4) != 128 * 64 * 4) { printf("speed_random: read error\n"); } close(fd); +#else + memset(buf, 0x7f, 128 * 64 * 4); +#endif regions[0].x = 0; regions[0].y = 0; - regions[0].cx = 64; + regions[0].cx = 128; regions[0].cy = 64; num_regions = 1; tiles[0].x = 0; @@ -93,22 +103,31 @@ speed_random(int count, const int *quants) tiles[0].quant_y = 0; tiles[0].quant_cb = 0; tiles[0].quant_cr = 0; + tiles[1].x = 64; + tiles[1].y = 0; + tiles[1].cx = 64; + tiles[1].cy = 64; + tiles[1].quant_y = 0; + tiles[1].quant_cb = 0; + tiles[1].quant_cr = 0; num_tiles = 1; num_quants = 1; error = 0; stime = get_mstime(); + flags = 0; + //flags = RFX_FLAGS_ALPHAV1; for (index = 0; index < count; index++) { - error = rfxcodec_encode(han, cdata, &cdata_bytes, buf, 64, 64, 64 * 4, - regions, num_regions, tiles, num_tiles, - quants, num_quants); + error = rfxcodec_encode_ex(han, cdata, &cdata_bytes, buf, 64, 64, 64 * 4, + regions, num_regions, tiles, num_tiles, + quants, num_quants, flags); if (error != 0) { break; } } etime = get_mstime(); - tiles_per_second = count * 1000 / (etime - stime); + tiles_per_second = count * num_tiles * 1000 / (etime - stime + 1); printf("speed_random: cdata_bytes %d count %d ms time %d " "tiles_per_second %d\n", cdata_bytes, count, etime - stime, tiles_per_second); @@ -221,7 +240,7 @@ 
load_bmp_file(int in_fd, char **data, int *width, int *height) /******************************************************************************/ static int encode_file(char *data, int width, int height, char *cdata, int *cdata_bytes, - const int *quants, int num_quants) + const char *quants, int num_quants) { int awidth; int aheight; @@ -235,10 +254,10 @@ encode_file(char *data, int width, int height, char *cdata, int *cdata_bytes, void *han; struct rfx_rect regions[1]; - han = rfxcodec_encode_create(1920, 1024, RFX_FORMAT_BGRA, RFX_FLAGS_RLGR1); - if (han == 0) + error = rfxcodec_encode_create_ex(1920, 1024, RFX_FORMAT_BGRA, RFX_FLAGS_RLGR1, &han); + if (error != 0) { - printf("encode_file: rfxcodec_encode_create failed\n"); + printf("encode_file: rfxcodec_encode_create_ex failed\n"); return 1; } @@ -269,9 +288,9 @@ encode_file(char *data, int width, int height, char *cdata, int *cdata_bytes, regions[0].cy = height; num_regions = 1; - error = rfxcodec_encode(han, cdata, cdata_bytes, data, width, height, width * 4, - regions, num_regions, tiles, num_tiles, - quants, num_quants); + error = rfxcodec_encode_ex(han, cdata, cdata_bytes, data, width, height, width * 4, + regions, num_regions, tiles, num_tiles, + quants, num_quants, 0); if (error != 0) { printf("encode_file: rfxcodec_encode failed error %d\n", error); @@ -287,7 +306,7 @@ encode_file(char *data, int width, int height, char *cdata, int *cdata_bytes, /******************************************************************************/ static int -read_file(int count, const int *quants, int num_quants, +read_file(int count, const char *quants, int num_quants, const char *in_file, const char *out_file) { int in_fd; @@ -380,7 +399,7 @@ main(int argc, char **argv) int count; char in_file[256]; char out_file[256]; - const int *quants = g_rfx_default_quantization_values; + const char *quants = (const char *) g_rfx_default_quantization_values; do_speed = 0; do_read = 0;