diff --git a/.gitignore b/.gitignore
new file mode 100644
index 0000000..70869b3
--- /dev/null
+++ b/.gitignore
@@ -0,0 +1,32 @@
+*~
+aclocal.m4
+AUTHORS
+autom4te.cache/
+ChangeLog
+config_ac.h
+config_ac-h.in
+config.c
+config.guess
+config.log
+config.status
+config.sub
+configure
+compile
+depcomp
+.deps/
+install-sh
+*.la
+.libs
+libtool
+*.lo
+ltmain.sh
+Makefile
+Makefile.in
+missing
+NEWS
+*.o
+*.pc
+README
+stamp-h1
+rfxcodectest
+.dirstamp
diff --git a/Makefile b/Makefile
deleted file mode 100644
index 0d2e4ce..0000000
--- a/Makefile
+++ /dev/null
@@ -1,12 +0,0 @@
-
-all: allmake
-
-allmake:
-	cd src; $(MAKE) $(MFLAGS)
-	cd tests; $(MAKE) $(MFLAGS)
-
-clean: allclean
-
-allclean:
-	cd src; $(MAKE) clean
-	cd tests; $(MAKE) clean
diff --git a/Makefile.am b/Makefile.am
new file mode 100644
index 0000000..d0034d0
--- /dev/null
+++ b/Makefile.am
@@ -0,0 +1,14 @@
+ACLOCAL_AMFLAGS = -I m4
+
+pkgconfig_DATA = rfxcodec.pc
+
+EXTRA_DIST = bootstrap readme.txt
+
+SUBDIRS = \
+  src \
+  tests
+
+include_HEADERS = \
+  include/rfxcodec_encode.h \
+  include/rfxcodec_decode.h \
+  include/rfxcodec_common.h
diff --git a/acinclude.m4 b/acinclude.m4
new file mode 100644
index 0000000..fbfc98d
--- /dev/null
+++ b/acinclude.m4
@@ -0,0 +1,137 @@
+# AC_PROG_NASM
+# --------------------------
+# Check that NASM exists and determine flags
+AC_DEFUN([AC_PROG_NASM],[
+
+AC_CHECK_PROGS(NASM, [nasm nasmw yasm])
+test -z "$NASM" && AC_MSG_ERROR([no nasm (Netwide Assembler) found])
+
+AC_MSG_CHECKING([for object file format of host system])
+case "$host_os" in
+  cygwin* | mingw* | pw32* | interix*)
+    case "$host_cpu" in
+      x86_64)
+        objfmt='Win64-COFF'
+        ;;
+      *)
+        objfmt='Win32-COFF'
+        ;;
+    esac
+  ;;
+  msdosdjgpp* | go32*)
+    objfmt='COFF'
+  ;;
+  os2-emx*)			# not tested
+    objfmt='MSOMF'		# obj
+  ;;
+  linux*coff* | linux*oldld*)
+    objfmt='COFF'		# ???
+  ;;
+  linux*aout*)
+    objfmt='a.out'
+  ;;
+  linux*)
+    case "$host_cpu" in
+      x86_64)
+        objfmt='ELF64'
+        ;;
+      *)
+        objfmt='ELF'
+        ;;
+    esac
+  ;;
+  freebsd* | netbsd* | openbsd*)
+    if echo __ELF__ | $CC -E - | grep __ELF__ > /dev/null; then
+      objfmt='BSD-a.out'
+    else
+      case "$host_cpu" in
+        x86_64 | amd64)
+          objfmt='ELF64'
+          ;;
+        *)
+          objfmt='ELF'
+          ;;
+      esac
+    fi
+  ;;
+  solaris* | sunos* | sysv* | sco*)
+    case "$host_cpu" in
+      x86_64)
+        objfmt='ELF64'
+        ;;
+      *)
+        objfmt='ELF'
+        ;;
+    esac
+  ;;
+  darwin* | rhapsody* | nextstep* | openstep* | macos*)
+    case "$host_cpu" in
+      x86_64)
+        objfmt='Mach-O64'
+        ;;
+      *)
+        objfmt='Mach-O'
+        ;;
+    esac
+  ;;
+  *)
+    objfmt='ELF ?'
+  ;;
+esac
+
+AC_MSG_RESULT([$objfmt])
+if test "$objfmt" = 'ELF ?'; then
+  objfmt='ELF'
+  AC_MSG_WARN([unexpected host system. assumed that the format is $objfmt.])
+fi
+
+AC_MSG_CHECKING([for object file format specifier (NAFLAGS) ])
+case "$objfmt" in
+  MSOMF)      NAFLAGS='-fobj -DOBJ32';;
+  Win32-COFF) NAFLAGS='-fwin32 -DWIN32';;
+  Win64-COFF) NAFLAGS='-fwin64 -DWIN64 -D__x86_64__';;
+  COFF)       NAFLAGS='-fcoff -DCOFF';;
+  a.out)      NAFLAGS='-faout -DAOUT';;
+  BSD-a.out)  NAFLAGS='-faoutb -DAOUT';;
+  ELF)        NAFLAGS='-felf -DELF';;
+  ELF64)      NAFLAGS='-felf64 -DELF -D__x86_64__';;
+  RDF)        NAFLAGS='-frdf -DRDF';;
+  Mach-O)     NAFLAGS='-fmacho -DMACHO';;
+  Mach-O64)   NAFLAGS='-fmacho64 -DMACHO -D__x86_64__';;
+esac
+AC_MSG_RESULT([$NAFLAGS])
+AC_SUBST([NAFLAGS])
+
+AC_MSG_CHECKING([whether the assembler ($NASM $NAFLAGS) works])
+cat > conftest.asm <<EOF
+[%line __oline__ "configure"
+        section .text
+        global  _main,main
+_main:
+main:   xor     eax,eax
+        ret
+]EOF
+try_nasm='$NASM $NAFLAGS -o conftest.o conftest.asm'
+if AC_TRY_EVAL(try_nasm) && test -s conftest.o; then
+  AC_MSG_RESULT(yes)
+else
+  echo "configure: failed program was:" >&AC_FD_CC
+  cat conftest.asm >&AC_FD_CC
+  rm -rf conftest*
+  AC_MSG_RESULT(no)
+  AC_MSG_ERROR([installation or configuration problem: assembler cannot create object files.])
+fi
+
+AC_MSG_CHECKING([whether the linker accepts assembler output])
+try_nasm='${CC-cc} -o conftest${ac_exeext} $LDFLAGS conftest.o $LIBS 1>&AC_FD_CC'
+if AC_TRY_EVAL(try_nasm) && test -s conftest${ac_exeext}; then
+  rm -rf conftest*
+  AC_MSG_RESULT(yes)
+else
+  rm -rf conftest*
+  AC_MSG_RESULT(no)
+  AC_MSG_ERROR([configuration problem: maybe object file format mismatch.])
+fi
+
+])
+
diff --git a/bootstrap b/bootstrap
new file mode 100755
index 0000000..a5ef9dd
--- /dev/null
+++ b/bootstrap
@@ -0,0 +1,32 @@
+#!/bin/sh
+
+which autoconf
+if ! test $? -eq 0
+then
+  echo "error, install autoconf"
+  exit 1
+fi
+
+which automake
+if ! test $? -eq 0
+then
+  echo "error, install automake"
+  exit 1
+fi
+
+which libtool || which libtoolize
+if ! test $? -eq 0
+then
+  echo "error, install libtool"
+  exit 1
+fi
+
+which pkg-config
+if ! test $? -eq 0
+then
+  echo "error, install pkg-config"
+  exit 1
+fi
+
+touch configure.ac
+autoreconf -fvi
diff --git a/configure.ac b/configure.ac
new file mode 100644
index 0000000..e5bc1d8
--- /dev/null
+++ b/configure.ac
@@ -0,0 +1,53 @@
+# Process this file with autoconf to produce a configure script
+
+AC_PREREQ(2.59)
+AC_INIT([rfxcodec], [0.1.0], [jay.sorg@gmail.com])
+AC_CONFIG_HEADERS(config_ac.h:config_ac-h.in)
+AM_INIT_AUTOMAKE([1.6 foreign])
+AC_CONFIG_MACRO_DIR([m4])
+m4_ifdef([AM_SILENT_RULES], [AM_SILENT_RULES])
+AC_PROG_CC
+AC_C_CONST
+AC_PROG_LIBTOOL
+PKG_INSTALLDIR
+
+# SIMD is optional
+AC_ARG_WITH([simd],
+    AC_HELP_STRING([--without-simd],[Omit SIMD extensions.]))
+if test "x${with_simd}" != "xno"; then
+  # Check if we're on a supported CPU
+  AC_MSG_CHECKING([if we have SIMD optimisations for cpu type])
+  case "$host_cpu" in
+    x86_64 | amd64)
+      AC_MSG_RESULT([yes (x86_64)])
+      AC_PROG_NASM
+      simd_arch=x86_64
+    ;;
+    i*86 | x86 | ia32)
+      AC_MSG_RESULT([yes (i386)])
+      AC_PROG_NASM
+      simd_arch=i386
+    ;;
+    *)
+      AC_MSG_RESULT([no ("$host_cpu")])
+      AC_MSG_WARN([SIMD support not available for this CPU.  Performance will suffer.])
+      with_simd=no;
+    ;;
+  esac
+  if test "x${with_simd}" != "xno"; then
+    AC_DEFINE([WITH_SIMD], [1], [Use accelerated SIMD routines.])
+  fi
+fi
+
+AM_CONDITIONAL(WITH_SIMD_AMD64, [test x$simd_arch = xx86_64])
+AM_CONDITIONAL(WITH_SIMD_X86, [test x$simd_arch = xi386])
+
+AC_CONFIG_FILES([Makefile
+                 src/Makefile
+                 tests/Makefile
+                 rfxcodec.pc
+                 rfxcodec-uninstalled.pc
+])
+
+AC_OUTPUT
+
diff --git a/include/rfxcodec_common.h b/include/rfxcodec_common.h
new file mode 100644
index 0000000..0411c73
--- /dev/null
+++ b/include/rfxcodec_common.h
@@ -0,0 +1,40 @@
+/**
+ * RFX codec
+ *
+ * Copyright 2015 Jay Sorg <jay.sorg@gmail.com>
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef __RFXCODEC_COMMON_H
+#define __RFXCODEC_COMMON_H
+
+#define RFX_FORMAT_BGRA 0
+#define RFX_FORMAT_RGBA 1
+#define RFX_FORMAT_BGR  2
+#define RFX_FORMAT_RGB  3
+#define RFX_FORMAT_YUV  4 /* YUV444 linear tiled mode */
+
+#define RFX_FLAGS_NONE  0 /* default RFX_FLAGS_RLGR3 and RFX_FLAGS_SAFE */
+
+#define RFX_FLAGS_SAFE     0 /* default */
+#define RFX_FLAGS_OPT1    (1 << 3)
+#define RFX_FLAGS_OPT2    (1 << 4)
+#define RFX_FLAGS_NOACCEL (1 << 6)
+
+#define RFX_FLAGS_RLGR3 0 /* default */
+#define RFX_FLAGS_RLGR1 1
+
+#define RFX_FLAGS_ALPHAV1 1 /* used in flags for rfxcodec_encode_ex */
+
+#endif
diff --git a/include/rfxcodec_decode.h b/include/rfxcodec_decode.h
new file mode 100644
index 0000000..ed4b0a8
--- /dev/null
+++ b/include/rfxcodec_decode.h
@@ -0,0 +1,33 @@
+/**
+ * RFX codec decoder
+ *
+ * Copyright 2014-2015 Jay Sorg <jay.sorg@gmail.com>
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef __RFXCODEC_DECODE_H
+#define __RFXCODEC_DECODE_H
+
+#include <rfxcodec_common.h>
+
+int
+rfxcodec_decode_create(int width, int height, int format, int flags,
+                       void **handle);
+int
+rfxcodec_decode_destroy(void *handle);
+int
+rfxcodec_decode(void *handle, char *cdata, int cdata_bytes,
+                char *data, int width, int height, int stride_bytes);
+
+#endif
diff --git a/include/rfxcodec_encode.h b/include/rfxcodec_encode.h
index f082dd2..04112b0 100644
--- a/include/rfxcodec_encode.h
+++ b/include/rfxcodec_encode.h
@@ -1,7 +1,7 @@
 /**
  * RFX codec encoder
  *
- * Copyright 2014 Jay Sorg <jay.sorg@gmail.com>
+ * Copyright 2014-2015 Jay Sorg <jay.sorg@gmail.com>
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -19,23 +19,7 @@
 #ifndef __RFXCODEC_ENCODE_H
 #define __RFXCODEC_ENCODE_H
 
-#define RFX_USE_ACCEL 0
-
-#define RFX_FORMAT_BGRA 0
-#define RFX_FORMAT_RGBA 1
-#define RFX_FORMAT_BGR  2
-#define RFX_FORMAT_RGB  3
-#define RFX_FORMAT_YUV  4 /* YUV444 linear tiled mode */
-
-#define RFX_FLAGS_NONE  0 /* default RFX_FLAGS_RLGR3 and RFX_FLAGS_SAFE */
-
-#define RFX_FLAGS_RLGR3 0 /* default */
-#define RFX_FLAGS_RLGR1 1
-
-#define RFX_FLAGS_SAFE  0 /* default */
-#define RFX_FLAGS_OPT1    (1 << 3)
-#define RFX_FLAGS_OPT2    (1 << 4)
-#define RFX_FLAGS_NOACCEL (1 << 6)
+#include <rfxcodec_common.h>
 
 struct rfx_rect
 {
@@ -49,8 +33,8 @@ struct rfx_tile
 {
     int x; /* multiple of 64 */
     int y; /* multiple of 64 */
-    int cx; /* must be 64 */
-    int cy; /* must be 64 */
+    int cx; /* must be 64 or less */
+    int cy; /* must be 64 or less */
     int quant_y;
     int quant_cb;
     int quant_cr;
@@ -59,8 +43,12 @@ struct rfx_tile
 void *
 rfxcodec_encode_create(int width, int height, int format, int flags);
 int
-rfxcodec_encode_destroy(void * handle);
-/* quants, 10 ints per set, should be num_quants * 10 ints in quants)
+rfxcodec_encode_create_ex(int width, int height, int format, int flags,
+                          void **handle);
+int
+rfxcodec_encode_destroy(void *handle);
+/* quants, 5 chars per set (should be num_quants * 5 chars in quants)
+ * each char is 2 quant values
  * quantizer order is
  * 0 - LL3
  * 1 - LH3
@@ -75,8 +63,14 @@ rfxcodec_encode_destroy(void * handle);
 int
 rfxcodec_encode(void *handle, char *cdata, int *cdata_bytes,
                 char *buf, int width, int height, int stride_bytes,
-                struct rfx_rect *region, int num_region,
-                struct rfx_tile *tiles, int num_tiles,
-                const int *quants, int num_quants);
+                const struct rfx_rect *region, int num_region,
+                const struct rfx_tile *tiles, int num_tiles,
+                const char *quants, int num_quants);
+int
+rfxcodec_encode_ex(void *handle, char *cdata, int *cdata_bytes,
+                   char *buf, int width, int height, int stride_bytes,
+                   const struct rfx_rect *region, int num_region,
+                   const struct rfx_tile *tiles, int num_tiles,
+                   const char *quants, int num_quants, int flags);
 
 #endif
diff --git a/m4/pkg.m4 b/m4/pkg.m4
new file mode 100644
index 0000000..82bea96
--- /dev/null
+++ b/m4/pkg.m4
@@ -0,0 +1,275 @@
+dnl pkg.m4 - Macros to locate and utilise pkg-config.   -*- Autoconf -*-
+dnl serial 11 (pkg-config-0.29.1)
+dnl
+dnl Copyright © 2004 Scott James Remnant <scott@netsplit.com>.
+dnl Copyright © 2012-2015 Dan Nicholson <dbn.lists@gmail.com>
+dnl
+dnl This program is free software; you can redistribute it and/or modify
+dnl it under the terms of the GNU General Public License as published by
+dnl the Free Software Foundation; either version 2 of the License, or
+dnl (at your option) any later version.
+dnl
+dnl This program is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of
+dnl MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+dnl General Public License for more details.
+dnl
+dnl You should have received a copy of the GNU General Public License
+dnl along with this program; if not, write to the Free Software
+dnl Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA
+dnl 02111-1307, USA.
+dnl
+dnl As a special exception to the GNU General Public License, if you
+dnl distribute this file as part of a program that contains a
+dnl configuration script generated by Autoconf, you may include it under
+dnl the same distribution terms that you use for the rest of that
+dnl program.
+
+dnl PKG_PREREQ(MIN-VERSION)
+dnl -----------------------
+dnl Since: 0.29
+dnl
+dnl Verify that the version of the pkg-config macros are at least
+dnl MIN-VERSION. Unlike PKG_PROG_PKG_CONFIG, which checks the user's
+dnl installed version of pkg-config, this checks the developer's version
+dnl of pkg.m4 when generating configure.
+dnl
+dnl To ensure that this macro is defined, also add:
+dnl m4_ifndef([PKG_PREREQ],
+dnl     [m4_fatal([must install pkg-config 0.29 or later before running autoconf/autogen])])
+dnl
+dnl See the "Since" comment for each macro you use to see what version
+dnl of the macros you require.
+m4_defun([PKG_PREREQ],
+[m4_define([PKG_MACROS_VERSION], [0.29.1])
+m4_if(m4_version_compare(PKG_MACROS_VERSION, [$1]), -1,
+    [m4_fatal([pkg.m4 version $1 or higher is required but ]PKG_MACROS_VERSION[ found])])
+])dnl PKG_PREREQ
+
+dnl PKG_PROG_PKG_CONFIG([MIN-VERSION])
+dnl ----------------------------------
+dnl Since: 0.16
+dnl
+dnl Search for the pkg-config tool and set the PKG_CONFIG variable to
+dnl first found in the path. Checks that the version of pkg-config found
+dnl is at least MIN-VERSION. If MIN-VERSION is not specified, 0.9.0 is
+dnl used since that's the first version where most current features of
+dnl pkg-config existed.
+AC_DEFUN([PKG_PROG_PKG_CONFIG],
+[m4_pattern_forbid([^_?PKG_[A-Z_]+$])
+m4_pattern_allow([^PKG_CONFIG(_(PATH|LIBDIR|SYSROOT_DIR|ALLOW_SYSTEM_(CFLAGS|LIBS)))?$])
+m4_pattern_allow([^PKG_CONFIG_(DISABLE_UNINSTALLED|TOP_BUILD_DIR|DEBUG_SPEW)$])
+AC_ARG_VAR([PKG_CONFIG], [path to pkg-config utility])
+AC_ARG_VAR([PKG_CONFIG_PATH], [directories to add to pkg-config's search path])
+AC_ARG_VAR([PKG_CONFIG_LIBDIR], [path overriding pkg-config's built-in search path])
+
+if test "x$ac_cv_env_PKG_CONFIG_set" != "xset"; then
+	AC_PATH_TOOL([PKG_CONFIG], [pkg-config])
+fi
+if test -n "$PKG_CONFIG"; then
+	_pkg_min_version=m4_default([$1], [0.9.0])
+	AC_MSG_CHECKING([pkg-config is at least version $_pkg_min_version])
+	if $PKG_CONFIG --atleast-pkgconfig-version $_pkg_min_version; then
+		AC_MSG_RESULT([yes])
+	else
+		AC_MSG_RESULT([no])
+		PKG_CONFIG=""
+	fi
+fi[]dnl
+])dnl PKG_PROG_PKG_CONFIG
+
+dnl PKG_CHECK_EXISTS(MODULES, [ACTION-IF-FOUND], [ACTION-IF-NOT-FOUND])
+dnl -------------------------------------------------------------------
+dnl Since: 0.18
+dnl
+dnl Check to see whether a particular set of modules exists. Similar to
+dnl PKG_CHECK_MODULES(), but does not set variables or print errors.
+dnl
+dnl Please remember that m4 expands AC_REQUIRE([PKG_PROG_PKG_CONFIG])
+dnl only at the first occurrence in configure.ac, so if the first place
+dnl it's called might be skipped (such as if it is within an "if"), you
+dnl have to call PKG_CHECK_EXISTS manually
+AC_DEFUN([PKG_CHECK_EXISTS],
+[AC_REQUIRE([PKG_PROG_PKG_CONFIG])dnl
+if test -n "$PKG_CONFIG" && \
+    AC_RUN_LOG([$PKG_CONFIG --exists --print-errors "$1"]); then
+  m4_default([$2], [:])
+m4_ifvaln([$3], [else
+  $3])dnl
+fi])
+
+dnl _PKG_CONFIG([VARIABLE], [COMMAND], [MODULES])
+dnl ---------------------------------------------
+dnl Internal wrapper calling pkg-config via PKG_CONFIG and setting
+dnl pkg_failed based on the result.
+m4_define([_PKG_CONFIG],
+[if test -n "$$1"; then
+    pkg_cv_[]$1="$$1"
+ elif test -n "$PKG_CONFIG"; then
+    PKG_CHECK_EXISTS([$3],
+                     [pkg_cv_[]$1=`$PKG_CONFIG --[]$2 "$3" 2>/dev/null`
+		      test "x$?" != "x0" && pkg_failed=yes ],
+		     [pkg_failed=yes])
+ else
+    pkg_failed=untried
+fi[]dnl
+])dnl _PKG_CONFIG
+
+dnl _PKG_SHORT_ERRORS_SUPPORTED
+dnl ---------------------------
+dnl Internal check to see if pkg-config supports short errors.
+AC_DEFUN([_PKG_SHORT_ERRORS_SUPPORTED],
+[AC_REQUIRE([PKG_PROG_PKG_CONFIG])
+if $PKG_CONFIG --atleast-pkgconfig-version 0.20; then
+        _pkg_short_errors_supported=yes
+else
+        _pkg_short_errors_supported=no
+fi[]dnl
+])dnl _PKG_SHORT_ERRORS_SUPPORTED
+
+
+dnl PKG_CHECK_MODULES(VARIABLE-PREFIX, MODULES, [ACTION-IF-FOUND],
+dnl   [ACTION-IF-NOT-FOUND])
+dnl --------------------------------------------------------------
+dnl Since: 0.4.0
+dnl
+dnl Note that if there is a possibility the first call to
+dnl PKG_CHECK_MODULES might not happen, you should be sure to include an
+dnl explicit call to PKG_PROG_PKG_CONFIG in your configure.ac
+AC_DEFUN([PKG_CHECK_MODULES],
+[AC_REQUIRE([PKG_PROG_PKG_CONFIG])dnl
+AC_ARG_VAR([$1][_CFLAGS], [C compiler flags for $1, overriding pkg-config])dnl
+AC_ARG_VAR([$1][_LIBS], [linker flags for $1, overriding pkg-config])dnl
+
+pkg_failed=no
+AC_MSG_CHECKING([for $1])
+
+_PKG_CONFIG([$1][_CFLAGS], [cflags], [$2])
+_PKG_CONFIG([$1][_LIBS], [libs], [$2])
+
+m4_define([_PKG_TEXT], [Alternatively, you may set the environment variables $1[]_CFLAGS
+and $1[]_LIBS to avoid the need to call pkg-config.
+See the pkg-config man page for more details.])
+
+if test $pkg_failed = yes; then
+   	AC_MSG_RESULT([no])
+        _PKG_SHORT_ERRORS_SUPPORTED
+        if test $_pkg_short_errors_supported = yes; then
+	        $1[]_PKG_ERRORS=`$PKG_CONFIG --short-errors --print-errors --cflags --libs "$2" 2>&1`
+        else 
+	        $1[]_PKG_ERRORS=`$PKG_CONFIG --print-errors --cflags --libs "$2" 2>&1`
+        fi
+	# Put the nasty error message in config.log where it belongs
+	echo "$$1[]_PKG_ERRORS" >&AS_MESSAGE_LOG_FD
+
+	m4_default([$4], [AC_MSG_ERROR(
+[Package requirements ($2) were not met:
+
+$$1_PKG_ERRORS
+
+Consider adjusting the PKG_CONFIG_PATH environment variable if you
+installed software in a non-standard prefix.
+
+_PKG_TEXT])[]dnl
+        ])
+elif test $pkg_failed = untried; then
+     	AC_MSG_RESULT([no])
+	m4_default([$4], [AC_MSG_FAILURE(
+[The pkg-config script could not be found or is too old.  Make sure it
+is in your PATH or set the PKG_CONFIG environment variable to the full
+path to pkg-config.
+
+_PKG_TEXT
+
+To get pkg-config, see <http://pkg-config.freedesktop.org/>.])[]dnl
+        ])
+else
+	$1[]_CFLAGS=$pkg_cv_[]$1[]_CFLAGS
+	$1[]_LIBS=$pkg_cv_[]$1[]_LIBS
+        AC_MSG_RESULT([yes])
+	$3
+fi[]dnl
+])dnl PKG_CHECK_MODULES
+
+
+dnl PKG_CHECK_MODULES_STATIC(VARIABLE-PREFIX, MODULES, [ACTION-IF-FOUND],
+dnl   [ACTION-IF-NOT-FOUND])
+dnl ---------------------------------------------------------------------
+dnl Since: 0.29
+dnl
+dnl Checks for existence of MODULES and gathers its build flags with
+dnl static libraries enabled. Sets VARIABLE-PREFIX_CFLAGS from --cflags
+dnl and VARIABLE-PREFIX_LIBS from --libs.
+dnl
+dnl Note that if there is a possibility the first call to
+dnl PKG_CHECK_MODULES_STATIC might not happen, you should be sure to
+dnl include an explicit call to PKG_PROG_PKG_CONFIG in your
+dnl configure.ac.
+AC_DEFUN([PKG_CHECK_MODULES_STATIC],
+[AC_REQUIRE([PKG_PROG_PKG_CONFIG])dnl
+_save_PKG_CONFIG=$PKG_CONFIG
+PKG_CONFIG="$PKG_CONFIG --static"
+PKG_CHECK_MODULES($@)
+PKG_CONFIG=$_save_PKG_CONFIG[]dnl
+])dnl PKG_CHECK_MODULES_STATIC
+
+
+dnl PKG_INSTALLDIR([DIRECTORY])
+dnl -------------------------
+dnl Since: 0.27
+dnl
+dnl Substitutes the variable pkgconfigdir as the location where a module
+dnl should install pkg-config .pc files. By default the directory is
+dnl $libdir/pkgconfig, but the default can be changed by passing
+dnl DIRECTORY. The user can override through the --with-pkgconfigdir
+dnl parameter.
+AC_DEFUN([PKG_INSTALLDIR],
+[m4_pushdef([pkg_default], [m4_default([$1], ['${libdir}/pkgconfig'])])
+m4_pushdef([pkg_description],
+    [pkg-config installation directory @<:@]pkg_default[@:>@])
+AC_ARG_WITH([pkgconfigdir],
+    [AS_HELP_STRING([--with-pkgconfigdir], pkg_description)],,
+    [with_pkgconfigdir=]pkg_default)
+AC_SUBST([pkgconfigdir], [$with_pkgconfigdir])
+m4_popdef([pkg_default])
+m4_popdef([pkg_description])
+])dnl PKG_INSTALLDIR
+
+
+dnl PKG_NOARCH_INSTALLDIR([DIRECTORY])
+dnl --------------------------------
+dnl Since: 0.27
+dnl
+dnl Substitutes the variable noarch_pkgconfigdir as the location where a
+dnl module should install arch-independent pkg-config .pc files. By
+dnl default the directory is $datadir/pkgconfig, but the default can be
+dnl changed by passing DIRECTORY. The user can override through the
+dnl --with-noarch-pkgconfigdir parameter.
+AC_DEFUN([PKG_NOARCH_INSTALLDIR],
+[m4_pushdef([pkg_default], [m4_default([$1], ['${datadir}/pkgconfig'])])
+m4_pushdef([pkg_description],
+    [pkg-config arch-independent installation directory @<:@]pkg_default[@:>@])
+AC_ARG_WITH([noarch-pkgconfigdir],
+    [AS_HELP_STRING([--with-noarch-pkgconfigdir], pkg_description)],,
+    [with_noarch_pkgconfigdir=]pkg_default)
+AC_SUBST([noarch_pkgconfigdir], [$with_noarch_pkgconfigdir])
+m4_popdef([pkg_default])
+m4_popdef([pkg_description])
+])dnl PKG_NOARCH_INSTALLDIR
+
+
+dnl PKG_CHECK_VAR(VARIABLE, MODULE, CONFIG-VARIABLE,
+dnl [ACTION-IF-FOUND], [ACTION-IF-NOT-FOUND])
+dnl -------------------------------------------
+dnl Since: 0.28
+dnl
+dnl Retrieves the value of the pkg-config variable for the given module.
+AC_DEFUN([PKG_CHECK_VAR],
+[AC_REQUIRE([PKG_PROG_PKG_CONFIG])dnl
+AC_ARG_VAR([$1], [value of $3 for $2, overriding pkg-config])dnl
+
+_PKG_CONFIG([$1], [variable="][$3]["], [$2])
+AS_VAR_COPY([$1], [pkg_cv_][$1])
+
+AS_VAR_IF([$1], [""], [$5], [$4])dnl
+])dnl PKG_CHECK_VAR
diff --git a/rfxcodec-uninstalled.pc.in b/rfxcodec-uninstalled.pc.in
new file mode 100644
index 0000000..9aed766
--- /dev/null
+++ b/rfxcodec-uninstalled.pc.in
@@ -0,0 +1,5 @@
+Name: rfxcodec
+Description: Fast jpeg2000 codec compatible with MS RDP servers and xrdp
+Version: @PACKAGE_VERSION@
+Cflags: -I${pc_top_builddir}/${pcfiledir}/include
+Libs: ${pc_top_builddir}/${pcfiledir}/src/librfxencode.la
diff --git a/rfxcodec.pc.in b/rfxcodec.pc.in
new file mode 100644
index 0000000..8bd611c
--- /dev/null
+++ b/rfxcodec.pc.in
@@ -0,0 +1,10 @@
+prefix=@prefix@
+exec_prefix=@exec_prefix@
+libdir=@libdir@
+includedir=@includedir@
+
+Name: rfxcodec
+Description: Fast jpeg2000 codec compatible with MS RDP servers and xrdp
+Version: @PACKAGE_VERSION@
+Cflags: -I${includedir}
+Libs: -L${libdir} -lrfxencode
diff --git a/src/Makefile b/src/Makefile
deleted file mode 100644
index 4c657c7..0000000
--- a/src/Makefile
+++ /dev/null
@@ -1,47 +0,0 @@
-
-OBJS = rfxencode.o rfxcompose.o rfxencode_tile.o rfxencode_dwt.o \
-rfxencode_quantization.o rfxencode_differential.o \
-rfxencode_rlgr1.o rfxencode_rlgr3.o
-
-#OBJS += cpuid_x86.o rfxrlgr1_x86.o rfxrlgr3_x86.o rfxdwt_x86_sse2.o
-#OBJS += cpuid_amd64.o rfxrlgr1_amd64.o rfxrlgr3_amd64.o rfxdwt_amd64_sse2.o
-
-CFLAGS = $(PROFIL) -g -O2 -Wall -fPIC -I../include
-#-DRFX_USE_ACCEL
-
-LDFLAGS =
-
-LIBS =
-
-all: librfxencode.so
-
-librfxencode.so: $(OBJS) Makefile
-	$(CC) -shared -o librfxencode.so $(LDFLAGS) $(OBJS) $(LIBS)
-	$(AR) -rv librfxencode.a $(OBJS)
-
-cpuid_x86.o: x86/cpuid_x86.asm
-	yasm -f elf32 -g dwarf2 x86/cpuid_x86.asm
-
-rfxrlgr1_x86.o: x86/rfxrlgr1_x86.asm
-	yasm -f elf32 -g dwarf2 x86/rfxrlgr1_x86.asm
-
-rfxrlgr3_x86.o: x86/rfxrlgr3_x86.asm
-	yasm -f elf32 -g dwarf2 x86/rfxrlgr3_x86.asm
-
-rfxdwt_x86_sse2.o: x86/rfxdwt_x86_sse2.asm
-	yasm -f elf32 -g dwarf2 x86/rfxdwt_x86_sse2.asm
-
-cpuid_amd64.o: amd64/cpuid_amd64.asm
-	yasm -f elf64 -g dwarf2 amd64/cpuid_amd64.asm
-
-rfxrlgr1_amd64.o: amd64/rfxrlgr1_amd64.asm
-	yasm -f elf64 -g dwarf2 amd64/rfxrlgr1_amd64.asm
-
-rfxrlgr3_amd64.o: amd64/rfxrlgr3_amd64.asm
-	yasm -f elf64 -g dwarf2 amd64/rfxrlgr3_amd64.asm
-
-rfxdwt_amd64_sse2.o: amd64/rfxdwt_amd64_sse2.asm
-	yasm -f elf64 -g dwarf2 amd64/rfxdwt_amd64_sse2.asm
-
-clean:
-	rm -f $(OBJS) librfxencode.so librfxencode.a
diff --git a/src/Makefile.am b/src/Makefile.am
new file mode 100644
index 0000000..54eb6fe
--- /dev/null
+++ b/src/Makefile.am
@@ -0,0 +1,57 @@
+EXTRA_DIST = $(AMD64_ASM) $(X86_ASM) nasm_lt.sh
+
+AMD64_ASM = \
+  amd64/cpuid_amd64.asm \
+  amd64/rfxcodec_encode_diff_rlgr1_amd64_sse2.asm \
+  amd64/rfxcodec_encode_diff_rlgr3_amd64_sse2.asm \
+  amd64/rfxcodec_encode_dwt_shift_amd64_sse2.asm \
+  amd64/rfxcodec_encode_dwt_shift_amd64_sse41.asm
+
+X86_ASM = \
+  x86/cpuid_x86.asm \
+  x86/rfxcodec_encode_diff_rlgr1_x86_sse2.asm \
+  x86/rfxcodec_encode_diff_rlgr3_x86_sse2.asm \
+  x86/rfxcodec_encode_dwt_shift_x86_sse2.asm \
+  x86/rfxcodec_encode_dwt_shift_x86_sse41.asm
+
+ASM_SOURCES =
+
+AM_CPPFLAGS = \
+  -I$(top_srcdir)/include \
+  -I../include
+
+if WITH_SIMD_AMD64
+ASM_SOURCES += $(AMD64_ASM)
+AM_CPPFLAGS += -DSIMD_USE_ACCEL=1 -DRFX_USE_ACCEL_AMD64=1
+endif
+
+if WITH_SIMD_X86
+ASM_SOURCES += $(X86_ASM)
+AM_CPPFLAGS += -DSIMD_USE_ACCEL=1 -DRFX_USE_ACCEL_X86=1
+endif
+
+noinst_HEADERS = \
+  rfx_bitstream.h \
+  rfxcommon.h \
+  rfxcompose.h \
+  rfxconstants.h \
+  rfxencode_alpha.h \
+  rfxencode_differential.h \
+  rfxencode_dwt.h \
+  rfxencode.h \
+  rfxencode_quantization.h \
+  rfxencode_rlgr1.h \
+  rfxencode_rlgr3.h \
+  rfxencode_tile.h \
+  amd64/funcs_amd64.h \
+  x86/funcs_x86.h
+
+lib_LTLIBRARIES = librfxencode.la
+
+librfxencode_la_SOURCES = $(noinst_HEADERS) rfxencode.c \
+  rfxcompose.c rfxencode_tile.c rfxencode_dwt.c \
+  rfxencode_quantization.c rfxencode_differential.c \
+  rfxencode_rlgr1.c rfxencode_rlgr3.c rfxencode_alpha.c $(ASM_SOURCES)
+
+.asm.lo:
+	$(LIBTOOL) --mode=compile $(srcdir)/nasm_lt.sh $(NASM) $(NAFLAGS) -I$(srcdir) -I. $< -o $@
diff --git a/src/amd64/cpuid_amd64.asm b/src/amd64/cpuid_amd64.asm
index b97937a..e561b2d 100644
--- a/src/amd64/cpuid_amd64.asm
+++ b/src/amd64/cpuid_amd64.asm
@@ -1,3 +1,6 @@
+%ifidn __OUTPUT_FORMAT__,elf64
+SECTION .note.GNU-stack noalloc noexec nowrite progbits
+%endif
 
 SECTION .text
 
@@ -13,10 +16,14 @@ SECTION .text
 ;int
 ;cpuid_amd64(int eax_in, int ecx_in, int *eax, int *ebx, int *ecx, int *edx)
 
+%ifidn __OUTPUT_FORMAT__,elf64
 PROC cpuid_amd64
+%else
+PROC _cpuid_amd64
+%endif
     ; save registers
     push rbx
-    
+
     push rdx
     push rcx
     push r8
@@ -33,9 +40,9 @@ PROC cpuid_amd64
     mov [rdi], ebx
     pop rdi
     mov [rdi], eax
-    mov eax, 0
+    mov rax, 0
     ; restore registers
     pop rbx
-    ret;
+    ret
     align 16
 
diff --git a/src/amd64/funcs_amd64.h b/src/amd64/funcs_amd64.h
index 02cf6c8..124f838 100644
--- a/src/amd64/funcs_amd64.h
+++ b/src/amd64/funcs_amd64.h
@@ -1,5 +1,5 @@
 /*
-Copyright 2014 Jay Sorg
+Copyright 2014-2015 Jay Sorg
 
 Permission to use, copy, modify, distribute, and sell this software and its
 documentation for any purpose is hereby granted without fee, provided that
@@ -24,12 +24,48 @@ amd64 asm files
 #ifndef __FUNCS_AMD64_H
 #define __FUNCS_AMD64_H
 
+#ifdef __cplusplus
+extern "C" {
+#endif
+
 int
 cpuid_amd64(int eax_in, int ecx_in, int *eax, int *ebx, int *ecx, int *edx);
+
+int
+rfxcodec_encode_dwt_shift_amd64_sse2(const char *qtable,
+                                     unsigned char *data,
+                                     short *dwt_buffer1,
+                                     short *dwt_buffer);
+int
+rfxcodec_encode_dwt_shift_amd64_sse41(const char *qtable,
+                                      unsigned char *data,
+                                      short *dwt_buffer1,
+                                      short *dwt_buffer);
+int
+rfxcodec_encode_diff_rlgr1_amd64_sse2(short *co,
+                                      void *dst, int dst_bytes);
 int
-dwt_shift_amd64_sse2(unsigned char *yuvs, int width, int height, int *rgbs);
+rfxcodec_encode_diff_rlgr3_amd64_sse2(short *co,
+                                      void *dst, int dst_bytes);
+
+int
+rfxcodec_decode_rlgr1_diff_amd64_sse2(void *data, int data_bytes,
+                                      short *out_data);
+int
+rfxcodec_decode_rlgr3_diff_amd64_sse2(void *data, int data_bytes,
+                                      short *out_data);
 int
-diff_rlgr3_amd64(sint16 *co, int num_co, uint8 *dst, int dst_bytes);
+rfxcodec_decode_shift_idwt_amd64_sse2(char *qtable, short *src, short *dst);
+int
+rfxcodec_decode_yuv2rgb_amd64_sse2(short *ydata, short *udata, short *vdata,
+                                   unsigned int *rgbdata, int stride);
+int
+rfxcodec_decode_yuva2argb_amd64_sse2(short *ydata, short *udata,
+                                     short *vdata, char *adata,
+                                     unsigned int *rgbdata, int stride);
 
+#ifdef __cplusplus
+}
 #endif
 
+#endif
diff --git a/src/amd64/rfxcodec_encode_diff_rlgr1_amd64_sse2.asm b/src/amd64/rfxcodec_encode_diff_rlgr1_amd64_sse2.asm
new file mode 100644
index 0000000..b2de84f
--- /dev/null
+++ b/src/amd64/rfxcodec_encode_diff_rlgr1_amd64_sse2.asm
@@ -0,0 +1,36 @@
+%ifidn __OUTPUT_FORMAT__,elf64
+section .note.GNU-stack noalloc noexec nowrite progbits
+%endif
+
+section .data
+    const1 times 8 dw 1
+
+section .text
+
+%macro PROC 1
+    align 16
+    global %1
+    %1:
+%endmacro
+
+;The first six integer or pointer arguments are passed in registers
+;RDI, RSI, RDX, RCX, R8, and R9
+
+;int
+;rfxcodec_encode_diff_rlgr1_amd64_sse2(short *co,
+;                                      void *dst, int dst_bytes);
+
+%ifidn __OUTPUT_FORMAT__,elf64
+PROC rfxcodec_encode_diff_rlgr1_amd64_sse2
+%else
+PROC _rfxcodec_encode_diff_rlgr1_amd64_sse2
+%endif
+    ; save registers
+    push rbx
+
+    mov rax, 0
+    ; restore registers
+    pop rbx
+    ret
+    align 16
+
diff --git a/src/amd64/rfxcodec_encode_diff_rlgr3_amd64_sse2.asm b/src/amd64/rfxcodec_encode_diff_rlgr3_amd64_sse2.asm
new file mode 100644
index 0000000..f5712be
--- /dev/null
+++ b/src/amd64/rfxcodec_encode_diff_rlgr3_amd64_sse2.asm
@@ -0,0 +1,31 @@
+%ifidn __OUTPUT_FORMAT__,elf64
+section .note.GNU-stack noalloc noexec nowrite progbits
+%endif
+
+section .data
+    const1 times 8 dw 1
+
+section .text
+
+%macro PROC 1
+    align 16
+    global %1
+    %1:
+%endmacro
+
+;int
+;rfxcodec_encode_diff_rlgr3_amd64_sse2(short *co,
+;                                      void *dst, int dst_bytes);
+
+%ifidn __OUTPUT_FORMAT__,elf64
+PROC rfxcodec_encode_diff_rlgr3_amd64_sse2
+%else
+PROC _rfxcodec_encode_diff_rlgr3_amd64_sse2
+%endif
+    ; save registers
+    push rbx
+    mov rax, 0
+    pop rbx
+    ret
+    align 16
+
diff --git a/src/amd64/rfxcodec_encode_dwt_shift_amd64_sse2.asm b/src/amd64/rfxcodec_encode_dwt_shift_amd64_sse2.asm
new file mode 100644
index 0000000..ee97588
--- /dev/null
+++ b/src/amd64/rfxcodec_encode_dwt_shift_amd64_sse2.asm
@@ -0,0 +1,1503 @@
+;
+;Copyright 2016 Jay Sorg
+;
+;Permission to use, copy, modify, distribute, and sell this software and its
+;documentation for any purpose is hereby granted without fee, provided that
+;the above copyright notice appear in all copies and that both that
+;copyright notice and this permission notice appear in supporting
+;documentation.
+;
+;The above copyright notice and this permission notice shall be included in
+;all copies or substantial portions of the Software.
+;
+;THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+;IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+;FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
+;OPEN GROUP BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN
+;AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+;CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+;
+;amd64 asm dwt
+
+%ifidn __OUTPUT_FORMAT__,elf64
+section .note.GNU-stack noalloc noexec nowrite progbits
+%endif
+
+section .data
+    align 16
+    cw128    times 8 dw 128             ; subtracted from unpacked 8 bit samples in the 8 bit vertical pass
+    cdFFFF   times 4 dd 65535           ; dword mask that keeps the low word of each dword
+    ; rounding addends, 16 bytes per entry, indexed by factor (0 to 15): 1 << (factor - 1), and 0 for factor 0
+    cwa0     times 8 dw 0     ; 0
+    cwa1     times 8 dw 1     ; 1
+    cwa2     times 8 dw 2     ; 2
+    cwa4     times 8 dw 4     ; 3
+    cwa8     times 8 dw 8     ; 4
+    cwa16    times 8 dw 16    ; 5
+    cwa32    times 8 dw 32    ; 6
+    cwa64    times 8 dw 64    ; 7
+    cwa128   times 8 dw 128   ; 8
+    cwa256   times 8 dw 256   ; 9
+    cwa512   times 8 dw 512   ; 10
+    cwa1024  times 8 dw 1024  ; 11
+    cwa2048  times 8 dw 2048  ; 12
+    cwa4096  times 8 dw 4096  ; 13
+    cwa8192  times 8 dw 8192  ; 14
+    cwa16384 times 8 dw 16384 ; 15
+
+section .text
+; PROC name: align to 16 bytes, export 'name' as a global symbol, define its label
+%macro PROC 1
+    align 16
+    global %1
+    %1:
+%endmacro
+
+;******************************************************************************
+; horizontal pass, source 16 bit signed, 16 pixel width; rsi = src, rdi = hi out, rdx = lo out
+rfx_dwt_2d_encode_block_horiz_16_16:
+    mov ecx, 8                          ; 8 rows, one per loop1a pass
+loop1a:
+    ; pre / post
+    movdqa xmm1, [rsi]                  ; src[2n]
+    movdqa xmm2, [rsi + 16]
+    movdqa xmm6, xmm1                   ; keep raw words 0-7
+    movdqa xmm7, xmm2                   ; keep raw words 8-15
+    pand xmm1, [rel cdFFFF]             ; low word of each dword = even samples
+    pand xmm2, [rel cdFFFF]
+    pslld xmm1, 16                      ; shift up, then arithmetic shift down:
+    pslld xmm2, 16                      ; sign extends each kept word to a dword
+    psrad xmm1, 16
+    psrad xmm2, 16
+    packssdw xmm1, xmm2                 ; 8 even samples as words
+    movdqa xmm2, xmm6                   ; src[2n + 1]
+    movdqa xmm3, xmm7
+    psrldq xmm2, 2                      ; bring the odd samples into the low words
+    psrldq xmm3, 2
+    pand xmm2, [rel cdFFFF]
+    pand xmm3, [rel cdFFFF]
+    pslld xmm2, 16
+    pslld xmm3, 16
+    psrad xmm2, 16
+    psrad xmm3, 16
+    packssdw xmm2, xmm3                 ; 8 odd samples as words
+    movdqa xmm3, xmm6                   ; src[2n + 2]
+    movdqa xmm4, xmm7
+    psrldq xmm3, 4                      ; drop the first sample pair
+    psrldq xmm4, 4
+    movd eax, xmm7                      ; words 8 and 9
+    movd xmm5, eax
+    pslldq xmm5, 12                     ; move them to the top dword
+    por xmm3, xmm5                      ; xmm3 = words 2-9
+    movdqa xmm5, xmm7
+    psrldq xmm5, 12                     ; isolate words 14 and 15
+    pslldq xmm5, 12                     ; and put them back in the top dword
+    por xmm4, xmm5                      ; xmm4 = words 10-15, 14, 15 (word 14 reused at the row end)
+    pand xmm3, [rel cdFFFF]
+    pand xmm4, [rel cdFFFF]
+    pslld xmm3, 16
+    pslld xmm4, 16
+    psrad xmm3, 16
+    psrad xmm4, 16
+    packssdw xmm3, xmm4                 ; words 2, 4, ..., 14, 14
+    movdqa xmm4, xmm1
+    movdqa xmm5, xmm2
+    movdqa xmm6, xmm3
+    ; h[n] = (src[2n + 1] - ((src[2n] + src[2n + 2]) >> 1)) >> 1
+    paddw xmm4, xmm6
+    psraw xmm4, 1
+    psubw xmm5, xmm4
+    psraw xmm5, 1
+    movdqa xmm6, xmm5                   ; out hi
+    paddw xmm6, xmm8                    ; xmm8: addend, not loaded in this routine (see set_quants_hi)
+    psraw xmm6, xmm9                    ; xmm9: shift count, not loaded in this routine
+    movdqa [rdi], xmm6
+    ; l[n] = src[2n] + ((h[n - 1] + h[n]) >> 1)
+    movdqa xmm7, xmm5
+    movd eax, xmm7
+    pslldq xmm7, 2                      ; h[n - 1] for n = 1..7
+    and eax, 0xFFFF                     ; h[0]
+    movd xmm6, eax
+    por xmm7, xmm6                      ; h[0] stands in for h[-1]
+    paddw xmm5, xmm7
+    psraw xmm5, 1
+    paddw xmm5, xmm1
+
+    movdqa xmm6, xmm5                   ; out lo
+    paddw xmm6, xmm10                   ; xmm10: addend, not loaded in this routine
+    psraw xmm6, xmm11                   ; xmm11: shift count, not loaded in this routine
+    movdqa [rdx], xmm6
+
+    ; move right
+    lea rsi, [rsi + 16 * 2]
+    lea rdi, [rdi + 8 * 2]
+    lea rdx, [rdx + 8 * 2]
+
+    ; move left (undoes the move right: a row here is a single 16 pixel chunk)
+    lea rsi, [rsi - 16 * 2]
+    lea rdi, [rdi - 8 * 2]
+    lea rdx, [rdx - 8 * 2]
+
+    ; move down
+    lea rsi, [rsi + 16 * 2]
+    lea rdi, [rdi + 8 * 2]
+    lea rdx, [rdx + 8 * 2]
+
+    dec ecx
+    jnz loop1a
+
+    ret
+
+;******************************************************************************
+; vertical pass, source 16 bit signed, 16 pixel width; rsi = src, rdi = hi out, rdx = lo out
+rfx_dwt_2d_encode_block_verti_16_16:
+    mov ecx, 2                          ; 2 strips of 8 columns
+loop1b:
+    ; pre
+    movdqa xmm1, [rsi]                  ; src[2n]
+    movdqa xmm2, [rsi + 16 * 2]         ; src[2n + 1]
+    movdqa xmm3, [rsi + 16 * 2 * 2]     ; src[2n + 2]
+    movdqa xmm4, xmm1
+    movdqa xmm5, xmm2
+    movdqa xmm6, xmm3
+    ; h[n] = (src[2n + 1] - ((src[2n] + src[2n + 2]) >> 1)) >> 1
+    paddw xmm4, xmm6
+    psraw xmm4, 1
+    psubw xmm5, xmm4
+    psraw xmm5, 1
+    movdqa [rdi], xmm5                  ; out hi
+    movdqa xmm6, xmm5                   ; save hi
+    ; l[n] = src[2n] + ((h[n - 1] + h[n]) >> 1), first row: h[0] is added directly, no h[-1] exists
+    paddw xmm5, xmm1
+    movdqa [rdx], xmm5                  ; out lo
+    movdqa xmm7, xmm6                   ; save hi
+    ; move down
+    lea rsi, [rsi + 16 * 2 * 2]         ; 2 rows
+    lea rdi, [rdi + 16 * 2]             ; 1 row
+    lea rdx, [rdx + 16 * 2]             ; 1 row
+
+    ; loop
+    shl ecx, 16                         ; park the strip count in the upper half of ecx
+    mov cx, 6                           ; 6 middle output rows
+loop2b:
+    movdqa xmm1, xmm3                   ; src[2n]
+    movdqa xmm2, [rsi + 16 * 2]         ; src[2n + 1]
+    movdqa xmm3, [rsi + 16 * 2 * 2]     ; src[2n + 2]
+    movdqa xmm4, xmm1
+    movdqa xmm5, xmm2
+    movdqa xmm6, xmm3
+    ; h[n] = (src[2n + 1] - ((src[2n] + src[2n + 2]) >> 1)) >> 1
+    paddw xmm4, xmm6
+    psraw xmm4, 1
+    psubw xmm5, xmm4
+    psraw xmm5, 1
+    movdqa [rdi], xmm5                  ; out hi
+    movdqa xmm6, xmm5                   ; save hi
+    ; l[n] = src[2n] + ((h[n - 1] + h[n]) >> 1)
+    paddw xmm5, xmm7                    ; xmm7 = h[n - 1] from the previous row pair
+    psraw xmm5, 1
+    paddw xmm5, xmm1
+    movdqa [rdx], xmm5                  ; out lo
+    movdqa xmm7, xmm6                   ; save hi
+    ; move down
+    lea rsi, [rsi + 16 * 2 * 2]         ; 2 rows
+    lea rdi, [rdi + 16 * 2]             ; 1 row
+    lea rdx, [rdx + 16 * 2]             ; 1 row
+
+    dec cx
+    jnz loop2b
+    shr ecx, 16                         ; bring the strip count back down
+
+    ; post
+    movdqa xmm1, xmm3                   ; src[2n]
+    movdqa xmm2, [rsi + 16 * 2]         ; src[2n + 1]
+    movdqa xmm4, xmm1
+    movdqa xmm5, xmm2
+    movdqa xmm6, xmm3                   ; no row is loaded below: src[2n] reused as src[2n + 2]
+    ; h[n] = (src[2n + 1] - ((src[2n] + src[2n + 2]) >> 1)) >> 1
+    paddw xmm4, xmm6
+    psraw xmm4, 1
+    psubw xmm5, xmm4
+    psraw xmm5, 1
+    movdqa [rdi], xmm5                  ; out hi
+    ; l[n] = src[2n] + ((h[n - 1] + h[n]) >> 1)
+    paddw xmm5, xmm7
+    psraw xmm5, 1
+    paddw xmm5, xmm1
+    movdqa [rdx], xmm5                  ; out lo
+    ; move down
+    lea rsi, [rsi + 16 * 2 * 2]         ; 2 row
+    lea rdi, [rdi + 16 * 2]             ; 1 row
+    lea rdx, [rdx + 16 * 2]             ; 1 row
+
+    ; move up
+    lea rsi, [rsi - 16 * 16 * 2]
+    lea rdi, [rdi - 8 * 16 * 2]
+    lea rdx, [rdx - 8 * 16 * 2]
+
+    ; move right
+    lea rsi, [rsi + 16]
+    lea rdi, [rdi + 16]
+    lea rdx, [rdx + 16]
+
+    dec ecx
+    jnz loop1b
+
+    ret
+
+;******************************************************************************
+; horizontal pass, source 16 bit signed, 32 pixel width; rsi = src, rdi = hi out, rdx = lo out
+rfx_dwt_2d_encode_block_horiz_16_32:
+    mov ecx, 16                         ; 16 rows, one per loop1c pass
+loop1c:
+    ; pre (first 16 pixel chunk of the row)
+    movdqa xmm1, [rsi]                  ; src[2n]
+    movdqa xmm2, [rsi + 16]
+    movdqa xmm6, xmm1                   ; keep raw words 0-7
+    movdqa xmm7, xmm2                   ; keep raw words 8-15
+    pand xmm1, [rel cdFFFF]             ; low word of each dword = even samples
+    pand xmm2, [rel cdFFFF]
+    pslld xmm1, 16                      ; shift up, then arithmetic shift down:
+    pslld xmm2, 16                      ; sign extends each kept word to a dword
+    psrad xmm1, 16
+    psrad xmm2, 16
+    packssdw xmm1, xmm2                 ; 8 even samples as words
+    movdqa xmm2, xmm6                   ; src[2n + 1]
+    movdqa xmm3, xmm7
+    psrldq xmm2, 2                      ; bring the odd samples into the low words
+    psrldq xmm3, 2
+    pand xmm2, [rel cdFFFF]
+    pand xmm3, [rel cdFFFF]
+    pslld xmm2, 16
+    pslld xmm3, 16
+    psrad xmm2, 16
+    psrad xmm3, 16
+    packssdw xmm2, xmm3                 ; 8 odd samples as words
+    movdqa xmm3, xmm6                   ; src[2n + 2]
+    movdqa xmm4, xmm7
+    psrldq xmm3, 4
+    psrldq xmm4, 4
+    movd eax, xmm7                      ; words 8 and 9
+    movd xmm5, eax
+    pslldq xmm5, 12
+    por xmm3, xmm5                      ; xmm3 = words 2-9
+    mov eax, [rsi + 32]                 ; first two words of the next chunk
+    movd xmm5, eax
+    pslldq xmm5, 12
+    por xmm4, xmm5                      ; xmm4 = words 10-17
+    pand xmm3, [rel cdFFFF]
+    pand xmm4, [rel cdFFFF]
+    pslld xmm3, 16
+    pslld xmm4, 16
+    psrad xmm3, 16
+    psrad xmm4, 16
+    packssdw xmm3, xmm4
+    movdqa xmm4, xmm1
+    movdqa xmm5, xmm2
+    movdqa xmm6, xmm3
+    ; h[n] = (src[2n + 1] - ((src[2n] + src[2n + 2]) >> 1)) >> 1
+    paddw xmm4, xmm6
+    psraw xmm4, 1
+    psubw xmm5, xmm4
+    psraw xmm5, 1
+
+    movdqa xmm6, xmm5                   ; out hi
+    paddw xmm6, xmm8                    ; xmm8: addend, not loaded in this routine (see set_quants_hi)
+    psraw xmm6, xmm9                    ; xmm9: shift count, not loaded in this routine
+    movdqa [rdi], xmm6
+    movdqa xmm2, xmm5                   ; save hi
+
+    ; l[n] = src[2n] + ((h[n - 1] + h[n]) >> 1)
+    movdqa xmm7, xmm5
+    movd eax, xmm7
+    pslldq xmm7, 2                      ; h[n - 1] for n = 1..7
+    and eax, 0xFFFF                     ; h[0]
+    movd xmm6, eax
+    por xmm7, xmm6                      ; h[0] stands in for h[-1]
+    paddw xmm5, xmm7
+    psraw xmm5, 1
+    paddw xmm5, xmm1
+
+    psrldq xmm2, 14                     ; last hi word of this chunk
+    movd ebx, xmm2                      ; save hi (overwrites ebx)
+
+    movdqa xmm6, xmm5                   ; out lo
+    paddw xmm6, xmm10                   ; xmm10: addend, not loaded in this routine
+    psraw xmm6, xmm11                   ; xmm11: shift count, not loaded in this routine
+    movdqa [rdx], xmm6
+
+    ; move right
+    lea rsi, [rsi + 16 * 2]
+    lea rdi, [rdi + 8 * 2]
+    lea rdx, [rdx + 8 * 2]
+
+    ; post (second 16 pixel chunk of the row)
+    movdqa xmm1, [rsi]                  ; src[2n]
+    movdqa xmm2, [rsi + 16]
+    movdqa xmm6, xmm1
+    movdqa xmm7, xmm2
+    pand xmm1, [rel cdFFFF]
+    pand xmm2, [rel cdFFFF]
+    pslld xmm1, 16
+    pslld xmm2, 16
+    psrad xmm1, 16
+    psrad xmm2, 16
+    packssdw xmm1, xmm2
+    movdqa xmm2, xmm6                   ; src[2n + 1]
+    movdqa xmm3, xmm7
+    psrldq xmm2, 2
+    psrldq xmm3, 2
+    pand xmm2, [rel cdFFFF]
+    pand xmm3, [rel cdFFFF]
+    pslld xmm2, 16
+    pslld xmm3, 16
+    psrad xmm2, 16
+    psrad xmm3, 16
+    packssdw xmm2, xmm3
+    movdqa xmm3, xmm6                   ; src[2n + 2]
+    movdqa xmm4, xmm7
+    psrldq xmm3, 4
+    psrldq xmm4, 4
+    movd eax, xmm7
+    movd xmm5, eax
+    pslldq xmm5, 12
+    por xmm3, xmm5
+    movdqa xmm5, xmm7
+    psrldq xmm5, 12                     ; isolate words 14 and 15 of this chunk
+    pslldq xmm5, 12
+    por xmm4, xmm5                      ; word 14 reused at the row end
+    pand xmm3, [rel cdFFFF]
+    pand xmm4, [rel cdFFFF]
+    pslld xmm3, 16
+    pslld xmm4, 16
+    psrad xmm3, 16
+    psrad xmm4, 16
+    packssdw xmm3, xmm4
+    movdqa xmm4, xmm1
+    movdqa xmm5, xmm2
+    movdqa xmm6, xmm3
+    ; h[n] = (src[2n + 1] - ((src[2n] + src[2n + 2]) >> 1)) >> 1
+    paddw xmm4, xmm6
+    psraw xmm4, 1
+    psubw xmm5, xmm4
+    psraw xmm5, 1
+
+    movdqa xmm6, xmm5                   ; out hi
+    paddw xmm6, xmm8
+    psraw xmm6, xmm9
+    movdqa [rdi], xmm6
+
+    ; l[n] = src[2n] + ((h[n - 1] + h[n]) >> 1)
+    movdqa xmm7, xmm5
+    pslldq xmm7, 2
+    movd xmm6, ebx                      ; h[n - 1] carried over from the previous chunk
+    por xmm7, xmm6
+    paddw xmm5, xmm7
+    psraw xmm5, 1
+    paddw xmm5, xmm1
+
+    movdqa xmm6, xmm5                   ; out lo
+    paddw xmm6, xmm10
+    psraw xmm6, xmm11
+    movdqa [rdx], xmm6
+
+    ; move right
+    lea rsi, [rsi + 16 * 2]
+    lea rdi, [rdi + 8 * 2]
+    lea rdx, [rdx + 8 * 2]
+
+    ; move left
+    lea rsi, [rsi - 32 * 2]
+    lea rdi, [rdi - 16 * 2]
+    lea rdx, [rdx - 16 * 2]
+
+    ; move down
+    lea rsi, [rsi + 32 * 2]
+    lea rdi, [rdi + 16 * 2]
+    lea rdx, [rdx + 16 * 2]
+
+    dec ecx
+    jnz loop1c
+
+    ret
+
+;******************************************************************************
+; horizontal pass, source 16 bit signed, 32 pixel width; lo is stored without the add / shift step
+rfx_dwt_2d_encode_block_horiz_16_32_no_lo:
+    mov ecx, 16                         ; 16 rows; rsi = src, rdi = hi out, rdx = lo out
+loop1c1:
+    ; pre (first 16 pixel chunk of the row)
+    movdqa xmm1, [rsi]                  ; src[2n]
+    movdqa xmm2, [rsi + 16]
+    movdqa xmm6, xmm1                   ; keep raw words 0-7
+    movdqa xmm7, xmm2                   ; keep raw words 8-15
+    pand xmm1, [rel cdFFFF]             ; low word of each dword = even samples
+    pand xmm2, [rel cdFFFF]
+    pslld xmm1, 16                      ; shift up, then arithmetic shift down:
+    pslld xmm2, 16                      ; sign extends each kept word to a dword
+    psrad xmm1, 16
+    psrad xmm2, 16
+    packssdw xmm1, xmm2                 ; 8 even samples as words
+    movdqa xmm2, xmm6                   ; src[2n + 1]
+    movdqa xmm3, xmm7
+    psrldq xmm2, 2                      ; bring the odd samples into the low words
+    psrldq xmm3, 2
+    pand xmm2, [rel cdFFFF]
+    pand xmm3, [rel cdFFFF]
+    pslld xmm2, 16
+    pslld xmm3, 16
+    psrad xmm2, 16
+    psrad xmm3, 16
+    packssdw xmm2, xmm3                 ; 8 odd samples as words
+    movdqa xmm3, xmm6                   ; src[2n + 2]
+    movdqa xmm4, xmm7
+    psrldq xmm3, 4
+    psrldq xmm4, 4
+    movd eax, xmm7                      ; words 8 and 9
+    movd xmm5, eax
+    pslldq xmm5, 12
+    por xmm3, xmm5                      ; xmm3 = words 2-9
+    mov eax, [rsi + 32]                 ; first two words of the next chunk
+    movd xmm5, eax
+    pslldq xmm5, 12
+    por xmm4, xmm5                      ; xmm4 = words 10-17
+    pand xmm3, [rel cdFFFF]
+    pand xmm4, [rel cdFFFF]
+    pslld xmm3, 16
+    pslld xmm4, 16
+    psrad xmm3, 16
+    psrad xmm4, 16
+    packssdw xmm3, xmm4
+    movdqa xmm4, xmm1
+    movdqa xmm5, xmm2
+    movdqa xmm6, xmm3
+    ; h[n] = (src[2n + 1] - ((src[2n] + src[2n + 2]) >> 1)) >> 1
+    paddw xmm4, xmm6
+    psraw xmm4, 1
+    psubw xmm5, xmm4
+    psraw xmm5, 1
+
+    movdqa xmm6, xmm5                   ; out hi
+    paddw xmm6, xmm8                    ; xmm8: addend, not loaded in this routine (see set_quants_hi)
+    psraw xmm6, xmm9                    ; xmm9: shift count, not loaded in this routine
+    movdqa [rdi], xmm6
+    movdqa xmm2, xmm5                   ; save hi
+
+    ; l[n] = src[2n] + ((h[n - 1] + h[n]) >> 1)
+    movdqa xmm7, xmm5
+    movd eax, xmm7
+    pslldq xmm7, 2                      ; h[n - 1] for n = 1..7
+    and eax, 0xFFFF                     ; h[0]
+    movd xmm6, eax
+    por xmm7, xmm6                      ; h[0] stands in for h[-1]
+    paddw xmm5, xmm7
+    psraw xmm5, 1
+    paddw xmm5, xmm1
+
+    psrldq xmm2, 14                     ; last hi word of this chunk
+    movd ebx, xmm2                      ; save hi (overwrites ebx)
+
+    movdqa [rdx], xmm5                  ; out lo
+
+    ; move right
+    lea rsi, [rsi + 16 * 2]
+    lea rdi, [rdi + 8 * 2]
+    lea rdx, [rdx + 8 * 2]
+
+    ; post (second 16 pixel chunk of the row)
+    movdqa xmm1, [rsi]                  ; src[2n]
+    movdqa xmm2, [rsi + 16]
+    movdqa xmm6, xmm1
+    movdqa xmm7, xmm2
+    pand xmm1, [rel cdFFFF]
+    pand xmm2, [rel cdFFFF]
+    pslld xmm1, 16
+    pslld xmm2, 16
+    psrad xmm1, 16
+    psrad xmm2, 16
+    packssdw xmm1, xmm2
+    movdqa xmm2, xmm6                   ; src[2n + 1]
+    movdqa xmm3, xmm7
+    psrldq xmm2, 2
+    psrldq xmm3, 2
+    pand xmm2, [rel cdFFFF]
+    pand xmm3, [rel cdFFFF]
+    pslld xmm2, 16
+    pslld xmm3, 16
+    psrad xmm2, 16
+    psrad xmm3, 16
+    packssdw xmm2, xmm3
+    movdqa xmm3, xmm6                   ; src[2n + 2]
+    movdqa xmm4, xmm7
+    psrldq xmm3, 4
+    psrldq xmm4, 4
+    movd eax, xmm7
+    movd xmm5, eax
+    pslldq xmm5, 12
+    por xmm3, xmm5
+    movdqa xmm5, xmm7
+    psrldq xmm5, 12                     ; isolate words 14 and 15 of this chunk
+    pslldq xmm5, 12
+    por xmm4, xmm5                      ; word 14 reused at the row end
+    pand xmm3, [rel cdFFFF]
+    pand xmm4, [rel cdFFFF]
+    pslld xmm3, 16
+    pslld xmm4, 16
+    psrad xmm3, 16
+    psrad xmm4, 16
+    packssdw xmm3, xmm4
+    movdqa xmm4, xmm1
+    movdqa xmm5, xmm2
+    movdqa xmm6, xmm3
+    ; h[n] = (src[2n + 1] - ((src[2n] + src[2n + 2]) >> 1)) >> 1
+    paddw xmm4, xmm6
+    psraw xmm4, 1
+    psubw xmm5, xmm4
+    psraw xmm5, 1
+
+    movdqa xmm6, xmm5                   ; out hi
+    paddw xmm6, xmm8
+    psraw xmm6, xmm9
+    movdqa [rdi], xmm6
+
+    ; l[n] = src[2n] + ((h[n - 1] + h[n]) >> 1)
+    movdqa xmm7, xmm5
+    pslldq xmm7, 2
+    movd xmm6, ebx                      ; h[n - 1] carried over from the previous chunk
+    por xmm7, xmm6
+    paddw xmm5, xmm7
+    psraw xmm5, 1
+    paddw xmm5, xmm1
+
+    movdqa [rdx], xmm5                  ; out lo
+
+    ; move right
+    lea rsi, [rsi + 16 * 2]
+    lea rdi, [rdi + 8 * 2]
+    lea rdx, [rdx + 8 * 2]
+
+    ; move left
+    lea rsi, [rsi - 32 * 2]
+    lea rdi, [rdi - 16 * 2]
+    lea rdx, [rdx - 16 * 2]
+
+    ; move down
+    lea rsi, [rsi + 32 * 2]
+    lea rdi, [rdi + 16 * 2]
+    lea rdx, [rdx + 16 * 2]
+
+    dec ecx
+    jnz loop1c1
+
+    ret
+
+;******************************************************************************
+; vertical pass, source 16 bit signed, 32 pixel width; rsi = src, rdi = hi out, rdx = lo out
+rfx_dwt_2d_encode_block_verti_16_32:
+    mov ecx, 4                          ; 4 strips of 8 columns
+loop1d:
+    ; pre
+    movdqa xmm1, [rsi]                  ; src[2n]
+    movdqa xmm2, [rsi + 32 * 2]         ; src[2n + 1]
+    movdqa xmm3, [rsi + 32 * 2 * 2]     ; src[2n + 2]
+    movdqa xmm4, xmm1
+    movdqa xmm5, xmm2
+    movdqa xmm6, xmm3
+    ; h[n] = (src[2n + 1] - ((src[2n] + src[2n + 2]) >> 1)) >> 1
+    paddw xmm4, xmm6
+    psraw xmm4, 1
+    psubw xmm5, xmm4
+    psraw xmm5, 1
+    movdqa [rdi], xmm5                  ; out hi
+    movdqa xmm6, xmm5                   ; save hi
+    ; l[n] = src[2n] + ((h[n - 1] + h[n]) >> 1), first row: h[0] is added directly, no h[-1] exists
+    paddw xmm5, xmm1
+    movdqa [rdx], xmm5                  ; out lo
+    movdqa xmm7, xmm6                   ; save hi
+    ; move down
+    lea rsi, [rsi + 32 * 2 * 2]         ; 2 rows
+    lea rdi, [rdi + 32 * 2]             ; 1 row
+    lea rdx, [rdx + 32 * 2]             ; 1 row
+
+    ; loop
+    shl ecx, 16                         ; park the strip count in the upper half of ecx
+    mov cx, 14                          ; 14 middle output rows
+loop2d:
+    movdqa xmm1, xmm3                   ; src[2n]
+    movdqa xmm2, [rsi + 32 * 2]         ; src[2n + 1]
+    movdqa xmm3, [rsi + 32 * 2 * 2]     ; src[2n + 2]
+    movdqa xmm4, xmm1
+    movdqa xmm5, xmm2
+    movdqa xmm6, xmm3
+    ; h[n] = (src[2n + 1] - ((src[2n] + src[2n + 2]) >> 1)) >> 1
+    paddw xmm4, xmm6
+    psraw xmm4, 1
+    psubw xmm5, xmm4
+    psraw xmm5, 1
+    movdqa [rdi], xmm5                  ; out hi
+    movdqa xmm6, xmm5                   ; save hi
+    ; l[n] = src[2n] + ((h[n - 1] + h[n]) >> 1)
+    paddw xmm5, xmm7                    ; xmm7 = h[n - 1] from the previous row pair
+    psraw xmm5, 1
+    paddw xmm5, xmm1
+    movdqa [rdx], xmm5                  ; out lo
+    movdqa xmm7, xmm6                   ; save hi
+    ; move down
+    lea rsi, [rsi + 32 * 2 * 2]         ; 2 rows
+    lea rdi, [rdi + 32 * 2]             ; 1 row
+    lea rdx, [rdx + 32 * 2]             ; 1 row
+
+    dec cx
+    jnz loop2d
+    shr ecx, 16                         ; bring the strip count back down
+
+    ; post
+    movdqa xmm1, xmm3                   ; src[2n]
+    movdqa xmm2, [rsi + 32 * 2]         ; src[2n + 1]
+    movdqa xmm4, xmm1
+    movdqa xmm5, xmm2
+    movdqa xmm6, xmm3                   ; no row is loaded below: src[2n] reused as src[2n + 2]
+    ; h[n] = (src[2n + 1] - ((src[2n] + src[2n + 2]) >> 1)) >> 1
+    paddw xmm4, xmm6
+    psraw xmm4, 1
+    psubw xmm5, xmm4
+    psraw xmm5, 1
+    movdqa [rdi], xmm5                  ; out hi
+    ; l[n] = src[2n] + ((h[n - 1] + h[n]) >> 1)
+    paddw xmm5, xmm7
+    psraw xmm5, 1
+    paddw xmm5, xmm1
+    movdqa [rdx], xmm5                  ; out lo
+    ; move down
+    lea rsi, [rsi + 32 * 2 * 2]         ; 2 row
+    lea rdi, [rdi + 32 * 2]             ; 1 row
+    lea rdx, [rdx + 32 * 2]             ; 1 row
+
+    ; move up
+    lea rsi, [rsi - 32 * 32 * 2]
+    lea rdi, [rdi - 16 * 32 * 2]
+    lea rdx, [rdx - 16 * 32 * 2]
+
+    ; move right
+    lea rsi, [rsi + 16]
+    lea rdi, [rdi + 16]
+    lea rdx, [rdx + 16]
+
+    dec ecx
+    jnz loop1d
+
+    ret
+
+;******************************************************************************
+; horizontal pass, source 16 bit signed, 64 pixel width; rsi = src, rdi = hi out, rdx = lo out
+rfx_dwt_2d_encode_block_horiz_16_64:
+    mov ecx, 32                         ; 32 rows, one per loop1e pass
+loop1e:
+    ; pre (first 16 pixel chunk of the row)
+    movdqa xmm1, [rsi]                  ; src[2n]
+    movdqa xmm2, [rsi + 16]
+    movdqa xmm6, xmm1                   ; keep raw words 0-7
+    movdqa xmm7, xmm2                   ; keep raw words 8-15
+    pand xmm1, [rel cdFFFF]             ; low word of each dword = even samples
+    pand xmm2, [rel cdFFFF]
+    pslld xmm1, 16                      ; shift up, then arithmetic shift down:
+    pslld xmm2, 16                      ; sign extends each kept word to a dword
+    psrad xmm1, 16
+    psrad xmm2, 16
+    packssdw xmm1, xmm2                 ; 8 even samples as words
+    movdqa xmm2, xmm6                   ; src[2n + 1]
+    movdqa xmm3, xmm7
+    psrldq xmm2, 2                      ; bring the odd samples into the low words
+    psrldq xmm3, 2
+    pand xmm2, [rel cdFFFF]
+    pand xmm3, [rel cdFFFF]
+    pslld xmm2, 16
+    pslld xmm3, 16
+    psrad xmm2, 16
+    psrad xmm3, 16
+    packssdw xmm2, xmm3                 ; 8 odd samples as words
+    movdqa xmm3, xmm6                   ; src[2n + 2]
+    movdqa xmm4, xmm7
+    psrldq xmm3, 4
+    psrldq xmm4, 4
+    movd eax, xmm7                      ; words 8 and 9
+    movd xmm5, eax
+    pslldq xmm5, 12
+    por xmm3, xmm5                      ; xmm3 = words 2-9
+    mov eax, [rsi + 32]                 ; first two words of the next chunk
+    movd xmm5, eax
+    pslldq xmm5, 12
+    por xmm4, xmm5                      ; xmm4 = words 10-17
+    pand xmm3, [rel cdFFFF]
+    pand xmm4, [rel cdFFFF]
+    pslld xmm3, 16
+    pslld xmm4, 16
+    psrad xmm3, 16
+    psrad xmm4, 16
+    packssdw xmm3, xmm4
+    movdqa xmm4, xmm1
+    movdqa xmm5, xmm2
+    movdqa xmm6, xmm3
+    ; h[n] = (src[2n + 1] - ((src[2n] + src[2n + 2]) >> 1)) >> 1
+    paddw xmm4, xmm6
+    psraw xmm4, 1
+    psubw xmm5, xmm4
+    psraw xmm5, 1
+
+    movdqa xmm6, xmm5                   ; out hi
+    paddw xmm6, xmm8                    ; xmm8: addend, not loaded in this routine (see set_quants_hi)
+    psraw xmm6, xmm9                    ; xmm9: shift count, not loaded in this routine
+    movdqa [rdi], xmm6
+    movdqa xmm2, xmm5                   ; save hi
+
+    ; l[n] = src[2n] + ((h[n - 1] + h[n]) >> 1)
+    movdqa xmm7, xmm5
+    movd eax, xmm7
+    pslldq xmm7, 2                      ; h[n - 1] for n = 1..7
+    and eax, 0xFFFF                     ; h[0]
+    movd xmm6, eax
+    por xmm7, xmm6                      ; h[0] stands in for h[-1]
+    paddw xmm5, xmm7
+    psraw xmm5, 1
+    paddw xmm5, xmm1
+
+    psrldq xmm2, 14                     ; last hi word of this chunk
+    movd ebx, xmm2                      ; save hi (overwrites ebx)
+
+    movdqa xmm6, xmm5                   ; out lo
+    paddw xmm6, xmm10                   ; xmm10: addend, not loaded in this routine
+    psraw xmm6, xmm11                   ; xmm11: shift count, not loaded in this routine
+    movdqa [rdx], xmm6
+
+    ; move right
+    lea rsi, [rsi + 16 * 2]
+    lea rdi, [rdi + 8 * 2]
+    lea rdx, [rdx + 8 * 2]
+
+    ; loop (the two middle 16 pixel chunks of the row)
+    shl ecx, 16                         ; park the row count in the upper half of ecx
+    mov cx, 2
+loop2e:
+    movdqa xmm1, [rsi]                  ; src[2n]
+    movdqa xmm2, [rsi + 16]
+    movdqa xmm6, xmm1
+    movdqa xmm7, xmm2
+    pand xmm1, [rel cdFFFF]
+    pand xmm2, [rel cdFFFF]
+    pslld xmm1, 16
+    pslld xmm2, 16
+    psrad xmm1, 16
+    psrad xmm2, 16
+    packssdw xmm1, xmm2
+    movdqa xmm2, xmm6                   ; src[2n + 1]
+    movdqa xmm3, xmm7
+    psrldq xmm2, 2
+    psrldq xmm3, 2
+    pand xmm2, [rel cdFFFF]
+    pand xmm3, [rel cdFFFF]
+    pslld xmm2, 16
+    pslld xmm3, 16
+    psrad xmm2, 16
+    psrad xmm3, 16
+    packssdw xmm2, xmm3
+    movdqa xmm3, xmm6                   ; src[2n + 2]
+    movdqa xmm4, xmm7
+    psrldq xmm3, 4
+    psrldq xmm4, 4
+    movd eax, xmm7
+    movd xmm5, eax
+    pslldq xmm5, 12
+    por xmm3, xmm5
+    mov eax, [rsi + 32]                 ; first two words of the next chunk
+    movd xmm5, eax
+    pslldq xmm5, 12
+    por xmm4, xmm5
+    pand xmm3, [rel cdFFFF]
+    pand xmm4, [rel cdFFFF]
+    pslld xmm3, 16
+    pslld xmm4, 16
+    psrad xmm3, 16
+    psrad xmm4, 16
+    packssdw xmm3, xmm4
+    movdqa xmm4, xmm1
+    movdqa xmm5, xmm2
+    movdqa xmm6, xmm3
+    ; h[n] = (src[2n + 1] - ((src[2n] + src[2n + 2]) >> 1)) >> 1
+    paddw xmm4, xmm6
+    psraw xmm4, 1
+    psubw xmm5, xmm4
+    psraw xmm5, 1
+
+    movdqa xmm6, xmm5                   ; out hi
+    paddw xmm6, xmm8
+    psraw xmm6, xmm9
+    movdqa [rdi], xmm6
+    movdqa xmm2, xmm5                   ; save hi
+
+    ; l[n] = src[2n] + ((h[n - 1] + h[n]) >> 1)
+    movdqa xmm7, xmm5
+    pslldq xmm7, 2
+    movd xmm6, ebx                      ; h[n - 1] carried over from the previous chunk
+    por xmm7, xmm6
+    paddw xmm5, xmm7
+    psraw xmm5, 1
+    paddw xmm5, xmm1
+
+    psrldq xmm2, 14                     ; last hi word of this chunk
+    movd ebx, xmm2                      ; save hi
+
+    movdqa xmm6, xmm5                   ; out lo
+    paddw xmm6, xmm10
+    psraw xmm6, xmm11
+    movdqa [rdx], xmm6
+
+    ; move right
+    lea rsi, [rsi + 16 * 2]
+    lea rdi, [rdi + 8 * 2]
+    lea rdx, [rdx + 8 * 2]
+
+    dec cx
+    jnz loop2e
+    shr ecx, 16                         ; bring the row count back down
+
+    ; post (last 16 pixel chunk of the row)
+    movdqa xmm1, [rsi]                  ; src[2n]
+    movdqa xmm2, [rsi + 16]
+    movdqa xmm6, xmm1
+    movdqa xmm7, xmm2
+    pand xmm1, [rel cdFFFF]
+    pand xmm2, [rel cdFFFF]
+    pslld xmm1, 16
+    pslld xmm2, 16
+    psrad xmm1, 16
+    psrad xmm2, 16
+    packssdw xmm1, xmm2
+    movdqa xmm2, xmm6                   ; src[2n + 1]
+    movdqa xmm3, xmm7
+    psrldq xmm2, 2
+    psrldq xmm3, 2
+    pand xmm2, [rel cdFFFF]
+    pand xmm3, [rel cdFFFF]
+    pslld xmm2, 16
+    pslld xmm3, 16
+    psrad xmm2, 16
+    psrad xmm3, 16
+    packssdw xmm2, xmm3
+    movdqa xmm3, xmm6                   ; src[2n + 2]
+    movdqa xmm4, xmm7
+    psrldq xmm3, 4
+    psrldq xmm4, 4
+    movd eax, xmm7
+    movd xmm5, eax
+    pslldq xmm5, 12
+    por xmm3, xmm5
+    movdqa xmm5, xmm7
+    psrldq xmm5, 12                     ; isolate words 14 and 15 of this chunk
+    pslldq xmm5, 12
+    por xmm4, xmm5                      ; word 14 reused at the row end
+    pand xmm3, [rel cdFFFF]
+    pand xmm4, [rel cdFFFF]
+    pslld xmm3, 16
+    pslld xmm4, 16
+    psrad xmm3, 16
+    psrad xmm4, 16
+    packssdw xmm3, xmm4
+    movdqa xmm4, xmm1
+    movdqa xmm5, xmm2
+    movdqa xmm6, xmm3
+    ; h[n] = (src[2n + 1] - ((src[2n] + src[2n + 2]) >> 1)) >> 1
+    paddw xmm4, xmm6
+    psraw xmm4, 1
+    psubw xmm5, xmm4
+    psraw xmm5, 1
+
+    movdqa xmm6, xmm5                   ; out hi
+    paddw xmm6, xmm8
+    psraw xmm6, xmm9
+    movdqa [rdi], xmm6
+
+    ; l[n] = src[2n] + ((h[n - 1] + h[n]) >> 1)
+    movdqa xmm7, xmm5
+    pslldq xmm7, 2
+    movd xmm6, ebx                      ; h[n - 1] carried over from the previous chunk
+    por xmm7, xmm6
+    paddw xmm5, xmm7
+    psraw xmm5, 1
+    paddw xmm5, xmm1
+
+    movdqa xmm6, xmm5                   ; out lo
+    paddw xmm6, xmm10
+    psraw xmm6, xmm11
+    movdqa [rdx], xmm6
+
+    ; move right
+    lea rsi, [rsi + 16 * 2]
+    lea rdi, [rdi + 8 * 2]
+    lea rdx, [rdx + 8 * 2]
+
+    ; move left
+    lea rsi, [rsi - 64 * 2]
+    lea rdi, [rdi - 32 * 2]
+    lea rdx, [rdx - 32 * 2]
+
+    ; move down
+    lea rsi, [rsi + 64 * 2]
+    lea rdi, [rdi + 32 * 2]
+    lea rdx, [rdx + 32 * 2]
+
+    dec ecx
+    jnz loop1e
+
+    ret
+
+;******************************************************************************
+; horizontal pass, source 16 bit signed, 64 pixel width; lo is stored without the add / shift step
+rfx_dwt_2d_encode_block_horiz_16_64_no_lo:
+    mov ecx, 32                         ; 32 rows; rsi = src, rdi = hi out, rdx = lo out
+loop1e1:
+    ; pre (first 16 pixel chunk of the row)
+    movdqa xmm1, [rsi]                  ; src[2n]
+    movdqa xmm2, [rsi + 16]
+    movdqa xmm6, xmm1                   ; keep raw words 0-7
+    movdqa xmm7, xmm2                   ; keep raw words 8-15
+    pand xmm1, [rel cdFFFF]             ; low word of each dword = even samples
+    pand xmm2, [rel cdFFFF]
+    pslld xmm1, 16                      ; shift up, then arithmetic shift down:
+    pslld xmm2, 16                      ; sign extends each kept word to a dword
+    psrad xmm1, 16
+    psrad xmm2, 16
+    packssdw xmm1, xmm2                 ; 8 even samples as words
+    movdqa xmm2, xmm6                   ; src[2n + 1]
+    movdqa xmm3, xmm7
+    psrldq xmm2, 2                      ; bring the odd samples into the low words
+    psrldq xmm3, 2
+    pand xmm2, [rel cdFFFF]
+    pand xmm3, [rel cdFFFF]
+    pslld xmm2, 16
+    pslld xmm3, 16
+    psrad xmm2, 16
+    psrad xmm3, 16
+    packssdw xmm2, xmm3                 ; 8 odd samples as words
+    movdqa xmm3, xmm6                   ; src[2n + 2]
+    movdqa xmm4, xmm7
+    psrldq xmm3, 4
+    psrldq xmm4, 4
+    movd eax, xmm7                      ; words 8 and 9
+    movd xmm5, eax
+    pslldq xmm5, 12
+    por xmm3, xmm5                      ; xmm3 = words 2-9
+    mov eax, [rsi + 32]                 ; first two words of the next chunk
+    movd xmm5, eax
+    pslldq xmm5, 12
+    por xmm4, xmm5                      ; xmm4 = words 10-17
+    pand xmm3, [rel cdFFFF]
+    pand xmm4, [rel cdFFFF]
+    pslld xmm3, 16
+    pslld xmm4, 16
+    psrad xmm3, 16
+    psrad xmm4, 16
+    packssdw xmm3, xmm4
+    movdqa xmm4, xmm1
+    movdqa xmm5, xmm2
+    movdqa xmm6, xmm3
+    ; h[n] = (src[2n + 1] - ((src[2n] + src[2n + 2]) >> 1)) >> 1
+    paddw xmm4, xmm6
+    psraw xmm4, 1
+    psubw xmm5, xmm4
+    psraw xmm5, 1
+
+    movdqa xmm6, xmm5                   ; out hi
+    paddw xmm6, xmm8                    ; xmm8: addend, not loaded in this routine (see set_quants_hi)
+    psraw xmm6, xmm9                    ; xmm9: shift count, not loaded in this routine
+    movdqa [rdi], xmm6
+    movdqa xmm2, xmm5                   ; save hi
+
+    ; l[n] = src[2n] + ((h[n - 1] + h[n]) >> 1)
+    movdqa xmm7, xmm5
+    movd eax, xmm7
+    pslldq xmm7, 2                      ; h[n - 1] for n = 1..7
+    and eax, 0xFFFF                     ; h[0]
+    movd xmm6, eax
+    por xmm7, xmm6                      ; h[0] stands in for h[-1]
+    paddw xmm5, xmm7
+    psraw xmm5, 1
+    paddw xmm5, xmm1
+
+    psrldq xmm2, 14                     ; last hi word of this chunk
+    movd ebx, xmm2                      ; save hi (overwrites ebx)
+
+    movdqa [rdx], xmm5                  ; out lo
+
+    ; move right
+    lea rsi, [rsi + 16 * 2]
+    lea rdi, [rdi + 8 * 2]
+    lea rdx, [rdx + 8 * 2]
+
+    ; loop (the two middle 16 pixel chunks of the row)
+    shl ecx, 16                         ; park the row count in the upper half of ecx
+    mov cx, 2
+loop2e1:
+    movdqa xmm1, [rsi]                  ; src[2n]
+    movdqa xmm2, [rsi + 16]
+    movdqa xmm6, xmm1
+    movdqa xmm7, xmm2
+    pand xmm1, [rel cdFFFF]
+    pand xmm2, [rel cdFFFF]
+    pslld xmm1, 16
+    pslld xmm2, 16
+    psrad xmm1, 16
+    psrad xmm2, 16
+    packssdw xmm1, xmm2
+    movdqa xmm2, xmm6                   ; src[2n + 1]
+    movdqa xmm3, xmm7
+    psrldq xmm2, 2
+    psrldq xmm3, 2
+    pand xmm2, [rel cdFFFF]
+    pand xmm3, [rel cdFFFF]
+    pslld xmm2, 16
+    pslld xmm3, 16
+    psrad xmm2, 16
+    psrad xmm3, 16
+    packssdw xmm2, xmm3
+    movdqa xmm3, xmm6                   ; src[2n + 2]
+    movdqa xmm4, xmm7
+    psrldq xmm3, 4
+    psrldq xmm4, 4
+    movd eax, xmm7
+    movd xmm5, eax
+    pslldq xmm5, 12
+    por xmm3, xmm5
+    mov eax, [rsi + 32]                 ; first two words of the next chunk
+    movd xmm5, eax
+    pslldq xmm5, 12
+    por xmm4, xmm5
+    pand xmm3, [rel cdFFFF]
+    pand xmm4, [rel cdFFFF]
+    pslld xmm3, 16
+    pslld xmm4, 16
+    psrad xmm3, 16
+    psrad xmm4, 16
+    packssdw xmm3, xmm4
+    movdqa xmm4, xmm1
+    movdqa xmm5, xmm2
+    movdqa xmm6, xmm3
+    ; h[n] = (src[2n + 1] - ((src[2n] + src[2n + 2]) >> 1)) >> 1
+    paddw xmm4, xmm6
+    psraw xmm4, 1
+    psubw xmm5, xmm4
+    psraw xmm5, 1
+
+    movdqa xmm6, xmm5                   ; out hi
+    paddw xmm6, xmm8
+    psraw xmm6, xmm9
+    movdqa [rdi], xmm6
+    movdqa xmm2, xmm5                   ; save hi
+
+    ; l[n] = src[2n] + ((h[n - 1] + h[n]) >> 1)
+    movdqa xmm7, xmm5
+    pslldq xmm7, 2
+    movd xmm6, ebx                      ; h[n - 1] carried over from the previous chunk
+    por xmm7, xmm6
+    paddw xmm5, xmm7
+    psraw xmm5, 1
+    paddw xmm5, xmm1
+
+    psrldq xmm2, 14                     ; last hi word of this chunk
+    movd ebx, xmm2                      ; save hi
+
+    movdqa [rdx], xmm5                  ; out lo
+
+    ; move right
+    lea rsi, [rsi + 16 * 2]
+    lea rdi, [rdi + 8 * 2]
+    lea rdx, [rdx + 8 * 2]
+
+    dec cx
+    jnz loop2e1
+    shr ecx, 16                         ; bring the row count back down
+
+    ; post (last 16 pixel chunk of the row)
+    movdqa xmm1, [rsi]                  ; src[2n]
+    movdqa xmm2, [rsi + 16]
+    movdqa xmm6, xmm1
+    movdqa xmm7, xmm2
+    pand xmm1, [rel cdFFFF]
+    pand xmm2, [rel cdFFFF]
+    pslld xmm1, 16
+    pslld xmm2, 16
+    psrad xmm1, 16
+    psrad xmm2, 16
+    packssdw xmm1, xmm2
+    movdqa xmm2, xmm6                   ; src[2n + 1]
+    movdqa xmm3, xmm7
+    psrldq xmm2, 2
+    psrldq xmm3, 2
+    pand xmm2, [rel cdFFFF]
+    pand xmm3, [rel cdFFFF]
+    pslld xmm2, 16
+    pslld xmm3, 16
+    psrad xmm2, 16
+    psrad xmm3, 16
+    packssdw xmm2, xmm3
+    movdqa xmm3, xmm6                   ; src[2n + 2]
+    movdqa xmm4, xmm7
+    psrldq xmm3, 4
+    psrldq xmm4, 4
+    movd eax, xmm7
+    movd xmm5, eax
+    pslldq xmm5, 12
+    por xmm3, xmm5
+    movdqa xmm5, xmm7
+    psrldq xmm5, 12                     ; isolate words 14 and 15 of this chunk
+    pslldq xmm5, 12
+    por xmm4, xmm5                      ; word 14 reused at the row end
+    pand xmm3, [rel cdFFFF]
+    pand xmm4, [rel cdFFFF]
+    pslld xmm3, 16
+    pslld xmm4, 16
+    psrad xmm3, 16
+    psrad xmm4, 16
+    packssdw xmm3, xmm4
+    movdqa xmm4, xmm1
+    movdqa xmm5, xmm2
+    movdqa xmm6, xmm3
+    ; h[n] = (src[2n + 1] - ((src[2n] + src[2n + 2]) >> 1)) >> 1
+    paddw xmm4, xmm6
+    psraw xmm4, 1
+    psubw xmm5, xmm4
+    psraw xmm5, 1
+
+    movdqa xmm6, xmm5                   ; out hi
+    paddw xmm6, xmm8
+    psraw xmm6, xmm9
+    movdqa [rdi], xmm6
+
+    ; l[n] = src[2n] + ((h[n - 1] + h[n]) >> 1)
+    movdqa xmm7, xmm5
+    pslldq xmm7, 2
+    movd xmm6, ebx                      ; h[n - 1] carried over from the previous chunk
+    por xmm7, xmm6
+    paddw xmm5, xmm7
+    psraw xmm5, 1
+    paddw xmm5, xmm1
+
+    movdqa [rdx], xmm5                  ; out lo
+
+    ; move right
+    lea rsi, [rsi + 16 * 2]
+    lea rdi, [rdi + 8 * 2]
+    lea rdx, [rdx + 8 * 2]
+
+    ; move left
+    lea rsi, [rsi - 64 * 2]
+    lea rdi, [rdi - 32 * 2]
+    lea rdx, [rdx - 32 * 2]
+
+    ; move down
+    lea rsi, [rsi + 64 * 2]
+    lea rdi, [rdi + 32 * 2]
+    lea rdx, [rdx + 32 * 2]
+
+    dec ecx
+    jnz loop1e1
+
+    ret
+
+;******************************************************************************
+; vertical pass, source 8 bit unsigned, 64 pixel width; rsi = src, rdi = hi out, rdx = lo out (16 bit)
+rfx_dwt_2d_encode_block_verti_8_64:
+    mov ecx, 8                          ; 8 strips of 8 columns
+loop1f:
+    ; pre
+    movq xmm1, [rsi]                    ; src[2n]
+    movq xmm2, [rsi + 64 * 1]           ; src[2n + 1]
+    movq xmm3, [rsi + 64 * 1 * 2]       ; src[2n + 2]
+    punpcklbw xmm1, xmm0                ; bytes to words; xmm0 is not set here, presumably zeroed by the caller - TODO confirm
+    punpcklbw xmm2, xmm0
+    punpcklbw xmm3, xmm0
+    psubw xmm1, [rel cw128]             ; subtract 128 from every sample
+    psubw xmm2, [rel cw128]
+    psubw xmm3, [rel cw128]
+    psllw xmm1, 5                       ; then shift every sample left by 5
+    psllw xmm2, 5
+    psllw xmm3, 5
+    movdqa xmm4, xmm1
+    movdqa xmm5, xmm2
+    movdqa xmm6, xmm3
+    ; h[n] = (src[2n + 1] - ((src[2n] + src[2n + 2]) >> 1)) >> 1
+    paddw xmm4, xmm6
+    psraw xmm4, 1
+    psubw xmm5, xmm4
+    psraw xmm5, 1
+    movdqa [rdi], xmm5                  ; out hi
+    movdqa xmm6, xmm5                   ; save hi
+    ; l[n] = src[2n] + ((h[n - 1] + h[n]) >> 1), first row: h[0] is added directly, no h[-1] exists
+    paddw xmm5, xmm1
+    movdqa [rdx], xmm5                  ; out lo
+    movdqa xmm7, xmm6                   ; save hi
+    ; move down
+    lea rsi, [rsi + 64 * 1 * 2]         ; 2 rows
+    lea rdi, [rdi + 64 * 2]             ; 1 row
+    lea rdx, [rdx + 64 * 2]             ; 1 row
+
+    ; loop
+    shl ecx, 16                         ; park the strip count in the upper half of ecx
+    mov cx, 30                          ; 30 middle output rows
+loop2f:
+    movdqa xmm1, xmm3                   ; src[2n]
+    movq xmm2, [rsi + 64 * 1]           ; src[2n + 1]
+    movq xmm3, [rsi + 64 * 1 * 2]       ; src[2n + 2]
+    punpcklbw xmm2, xmm0
+    punpcklbw xmm3, xmm0
+    psubw xmm2, [rel cw128]
+    psubw xmm3, [rel cw128]
+    psllw xmm2, 5
+    psllw xmm3, 5
+    movdqa xmm4, xmm1
+    movdqa xmm5, xmm2
+    movdqa xmm6, xmm3
+    ; h[n] = (src[2n + 1] - ((src[2n] + src[2n + 2]) >> 1)) >> 1
+    paddw xmm4, xmm6
+    psraw xmm4, 1
+    psubw xmm5, xmm4
+    psraw xmm5, 1
+    movdqa [rdi], xmm5                  ; out hi
+    movdqa xmm6, xmm5                   ; save hi
+    ; l[n] = src[2n] + ((h[n - 1] + h[n]) >> 1)
+    paddw xmm5, xmm7                    ; xmm7 = h[n - 1] from the previous row pair
+    psraw xmm5, 1
+    paddw xmm5, xmm1
+    movdqa [rdx], xmm5                  ; out lo
+    movdqa xmm7, xmm6                   ; save hi
+    ; move down
+    lea rsi, [rsi + 64 * 1 * 2]         ; 2 rows
+    lea rdi, [rdi + 64 * 2]             ; 1 row
+    lea rdx, [rdx + 64 * 2]             ; 1 row
+
+    dec cx
+    jnz loop2f
+    shr ecx, 16                         ; bring the strip count back down
+
+    ; post
+    movdqa xmm1, xmm3                   ; src[2n]
+    movq xmm2, [rsi + 64 * 1]           ; src[2n + 1]
+    punpcklbw xmm2, xmm0
+    psubw xmm2, [rel cw128]
+    psllw xmm2, 5
+    movdqa xmm4, xmm1
+    movdqa xmm5, xmm2
+    movdqa xmm6, xmm3                   ; no row is loaded below: src[2n] reused as src[2n + 2]
+    ; h[n] = (src[2n + 1] - ((src[2n] + src[2n + 2]) >> 1)) >> 1
+    paddw xmm4, xmm6
+    psraw xmm4, 1
+    psubw xmm5, xmm4
+    psraw xmm5, 1
+    movdqa [rdi], xmm5                  ; out hi
+    ; l[n] = src[2n] + ((h[n - 1] + h[n]) >> 1)
+    paddw xmm5, xmm7
+    psraw xmm5, 1
+    paddw xmm5, xmm1
+    movdqa [rdx], xmm5                  ; out lo
+    ; move down
+    lea rsi, [rsi + 64 * 1 * 2]         ; 2 rows
+    lea rdi, [rdi + 64 * 2]             ; 1 row
+    lea rdx, [rdx + 64 * 2]             ; 1 row
+
+    ; move up
+    lea rsi, [rsi - 64 * 1 * 64]
+    lea rdi, [rdi - 32 * 64 * 2]
+    lea rdx, [rdx - 32 * 64 * 2]
+
+    ; move right
+    lea rsi, [rsi + 8]                  ; 8 source pixels, 1 byte each
+    lea rdi, [rdi + 16]                 ; 8 output words
+    lea rdx, [rdx + 16]
+
+    dec ecx
+    jnz loop1f
+
+    ret
+
+set_quants_hi:                          ; in: rax = quant value; out: xmm8, xmm9; overwrites rax, rdx
+    sub rax, 6 - 5                      ; factor = quant - 1
+    movd xmm9, eax                      ; xmm9 = factor in the low dword
+    imul rax, 16                        ; byte offset = factor * 16
+    lea rdx, [rel cwa0]
+    add rdx, rax                        ; rdx = cwa0 + factor * 16
+    movdqa xmm8, [rdx]                  ; xmm8 = the 16 bytes at that offset
+    ret
+
+set_quants_lo:                          ; in: rax = quant value; out: xmm10, xmm11; overwrites rax, rdx
+    sub rax, 6 - 5                      ; factor = quant - 1
+    movd xmm11, eax                     ; xmm11 = factor in the low dword
+    imul rax, 16                        ; byte offset = factor * 16
+    lea rdx, [rel cwa0]
+    add rdx, rax                        ; rdx = cwa0 + factor * 16
+    movdqa xmm10, [rdx]                 ; xmm10 = the 16 bytes at that offset
+    ret
+
+;The first six integer or pointer arguments are passed in registers
+;RDI, RSI, RDX, RCX, R8, and R9; here RDI = qtable, RSI = in_buffer, RDX = out_buffer, RCX = work_buffer
+
+;int
+;rfxcodec_encode_dwt_shift_amd64_sse2(const char *qtable,
+;                                     unsigned char *in_buffer,
+;                                     short *out_buffer,
+;                                     short *work_buffer);
+
+;******************************************************************************
+%ifidn __OUTPUT_FORMAT__,elf64
+PROC rfxcodec_encode_dwt_shift_amd64_sse2
+%else
+PROC _rfxcodec_encode_dwt_shift_amd64_sse2
+%endif
+    ; save registers, the saved arguments are read back from the stack below
+    push rbx
+    push rdx                            ; out_buffer, at [rsp + 24] after the pushes
+    push rcx                            ; work_buffer, at [rsp + 16]
+    push rsi                            ; in_buffer, at [rsp + 8]
+    push rdi                            ; qtable, at [rsp]
+    pxor xmm0, xmm0                     ; zero register for the byte to word unpack
+
+    ; vertical DWT to work buffer, level 1
+    mov rsi, [rsp + 8]                  ; src
+    mov rdi, [rsp + 16]                 ; dst hi
+    lea rdi, [rdi + 64 * 32 * 2]        ; dst hi
+    mov rdx, [rsp + 16]                 ; dst lo
+    call rfx_dwt_2d_encode_block_verti_8_64
+
+    ; horizontal DWT to out buffer, level 1, part 1
+    xor rax, rax
+    mov rdx, [rsp]
+    mov al, [rdx + 4]                   ; qtable byte 4
+    and al, 0xF                         ; low nibble
+    call set_quants_hi
+    mov rsi, [rsp + 16]                 ; src
+    mov rdi, [rsp + 24]                 ; dst hi - HL1
+    mov rdx, [rsp + 24]                 ; dst lo - LL1
+    lea rdx, [rdx + 32 * 32 * 6]        ; dst lo - LL1
+    call rfx_dwt_2d_encode_block_horiz_16_64_no_lo
+
+    ; horizontal DWT to out buffer, level 1, part 2
+    xor rax, rax
+    mov rdx, [rsp]
+    mov al, [rdx + 4]                   ; qtable byte 4
+    shr al, 4                           ; high nibble
+    call set_quants_hi
+    xor rax, rax
+    mov rdx, [rsp]
+    mov al, [rdx + 3]                   ; qtable byte 3
+    shr al, 4                           ; high nibble
+    call set_quants_lo
+    mov rsi, [rsp + 16]                 ; src
+    lea rsi, [rsi + 64 * 32 * 2]        ; src
+    mov rdi, [rsp + 24]                 ; dst hi - HH1
+    lea rdi, [rdi + 32 * 32 * 4]        ; dst hi - HH1
+    mov rdx, [rsp + 24]                 ; dst lo - LH1
+    lea rdx, [rdx + 32 * 32 * 2]        ; dst lo - LH1
+    call rfx_dwt_2d_encode_block_horiz_16_64
+
+    ; vertical DWT to work buffer, level 2
+    mov rsi, [rsp + 24]                 ; src
+    lea rsi, [rsi + 32 * 32 * 6]        ; src
+    mov rdi, [rsp + 16]                 ; dst hi
+    lea rdi, [rdi + 32 * 16 * 2]        ; dst hi
+    mov rdx, [rsp + 16]                 ; dst lo
+    call rfx_dwt_2d_encode_block_verti_16_32
+
+    ; horizontal DWT to out buffer, level 2, part 1
+    xor rax, rax
+    mov rdx, [rsp]
+    mov al, [rdx + 2]                   ; qtable byte 2
+    shr al, 4                           ; high nibble
+    call set_quants_hi
+    mov rsi, [rsp + 16]                 ; src
+    ; 32 * 32 * 6 + 16 * 16 * 0 = 6144
+    mov rdi, [rsp + 24]                 ; dst hi - HL2
+    lea rdi, [rdi + 6144]               ; dst hi - HL2
+    ; 32 * 32 * 6 + 16 * 16 * 6 = 7680
+    mov rdx, [rsp + 24]                 ; dst lo - LL2
+    lea rdx, [rdx + 7680]               ; dst lo - LL2
+    call rfx_dwt_2d_encode_block_horiz_16_32_no_lo
+
+    ; horizontal DWT to out buffer, level 2, part 2
+    xor rax, rax
+    mov rdx, [rsp]
+    mov al, [rdx + 3]                   ; qtable byte 3
+    and al, 0xF                         ; low nibble
+    call set_quants_hi
+    xor rax, rax
+    mov rdx, [rsp]
+    mov al, [rdx + 2]                   ; qtable byte 2
+    and al, 0xF                         ; low nibble
+    call set_quants_lo
+    mov rsi, [rsp + 16]                 ; src
+    lea rsi, [rsi + 32 * 16 * 2]        ; src
+    ; 32 * 32 * 6 + 16 * 16 * 4 = 7168
+    mov rdi, [rsp + 24]                 ; dst hi - HH2
+    lea rdi, [rdi + 7168]               ; dst hi - HH2
+    ; 32 * 32 * 6 + 16 * 16 * 2 = 6656
+    mov rdx, [rsp + 24]                 ; dst lo - LH2
+    lea rdx, [rdx + 6656]               ; dst lo - LH2
+    call rfx_dwt_2d_encode_block_horiz_16_32
+
+    ; vertical DWT to work buffer, level 3
+    ; 32 * 32 * 6 + 16 * 16 * 6 = 7680
+    mov rsi, [rsp + 24]                 ; src
+    lea rsi, [rsi + 7680]               ; src
+    mov rdi, [rsp + 16]                 ; dst hi
+    lea rdi, [rdi + 16 * 8 * 2]         ; dst hi
+    mov rdx, [rsp + 16]                 ; dst lo
+    call rfx_dwt_2d_encode_block_verti_16_16
+
+    ; horizontal DWT to out buffer, level 3, part 1
+    xor rax, rax
+    mov rdx, [rsp]
+    mov al, [rdx + 1]                   ; qtable byte 1
+    and al, 0xF                         ; low nibble
+    call set_quants_hi
+    xor rax, rax
+    mov rdx, [rsp]
+    mov al, [rdx + 0]                   ; qtable byte 0
+    and al, 0xF                         ; low nibble
+    call set_quants_lo
+    mov rsi, [rsp + 16]                 ; src
+    ; 32 * 32 * 6 + 16 * 16 * 6 + 8 * 8 * 0 = 7680
+    mov rdi, [rsp + 24]                 ; dst hi - HL3
+    lea rdi, [rdi + 7680]               ; dst hi - HL3
+    ; 32 * 32 * 6 + 16 * 16 * 6 + 8 * 8 * 6 = 8064
+    mov rdx, [rsp + 24]                 ; dst lo - LL3
+    lea rdx, [rdx + 8064]               ; dst lo - LL3
+    call rfx_dwt_2d_encode_block_horiz_16_16
+
+    ; horizontal DWT to out buffer, level 3, part 2
+    xor rax, rax
+    mov rdx, [rsp]
+    mov al, [rdx + 1]                   ; qtable byte 1
+    shr al, 4                           ; high nibble
+    call set_quants_hi
+    xor rax, rax
+    mov rdx, [rsp]
+    mov al, [rdx + 0]                   ; qtable byte 0
+    shr al, 4                           ; high nibble
+    call set_quants_lo
+    mov rsi, [rsp + 16]                 ; src
+    lea rsi, [rsi + 16 * 8 * 2]         ; src
+    ; 32 * 32 * 6 + 16 * 16 * 6 + 8 * 8 * 4 = 7936
+    mov rdi, [rsp + 24]                 ; dst hi - HH3
+    lea rdi, [rdi + 7936]               ; dst hi - HH3
+    ; 32 * 32 * 6 + 16 * 16 * 6 + 8 * 8 * 2 = 7808
+    mov rdx, [rsp + 24]                 ; dst lo - LH3
+    lea rdx, [rdx + 7808]               ; dst lo - LH3
+    call rfx_dwt_2d_encode_block_horiz_16_16
+
+    mov rax, 0                          ; return value 0
+    ; restore registers, reverse order of the pushes
+    pop rdi
+    pop rsi
+    pop rcx
+    pop rdx
+    pop rbx
+    ret
+    align 16
+
diff --git a/src/amd64/rfxcodec_encode_dwt_shift_amd64_sse41.asm b/src/amd64/rfxcodec_encode_dwt_shift_amd64_sse41.asm
new file mode 100644
index 0000000..ab52808
--- /dev/null
+++ b/src/amd64/rfxcodec_encode_dwt_shift_amd64_sse41.asm
@@ -0,0 +1,1371 @@
+;
+;Copyright 2016 Jay Sorg
+;
+;Permission to use, copy, modify, distribute, and sell this software and its
+;documentation for any purpose is hereby granted without fee, provided that
+;the above copyright notice appear in all copies and that both that
+;copyright notice and this permission notice appear in supporting
+;documentation.
+;
+;The above copyright notice and this permission notice shall be included in
+;all copies or substantial portions of the Software.
+;
+;THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+;IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+;FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
+;OPEN GROUP BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN
+;AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+;CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+;
+;amd64 asm dwt
+
+%ifidn __OUTPUT_FORMAT__,elf64
+section .note.GNU-stack noalloc noexec nowrite progbits
+%endif
+
+section .data
+    align 16
+    cw128    times 8 dw 128   ; 8 words of 128
+    cdFFFF   times 4 dd 65535 ; 4 dwords of 0x0000FFFF, with pand keeps the low word of each dword
+    ; 16 byte entries of 8 equal words, value is 1 << (factor - 1), 0 for factor 0; trailing number is the factor
+    cwa0     times 8 dw 0     ; 0
+    cwa1     times 8 dw 1     ; 1
+    cwa2     times 8 dw 2     ; 2
+    cwa4     times 8 dw 4     ; 3
+    cwa8     times 8 dw 8     ; 4
+    cwa16    times 8 dw 16    ; 5
+    cwa32    times 8 dw 32    ; 6
+    cwa64    times 8 dw 64    ; 7
+    cwa128   times 8 dw 128   ; 8
+    cwa256   times 8 dw 256   ; 9
+    cwa512   times 8 dw 512   ; 10
+    cwa1024  times 8 dw 1024  ; 11
+    cwa2048  times 8 dw 2048  ; 12
+    cwa4096  times 8 dw 4096  ; 13
+    cwa8192  times 8 dw 8192  ; 14
+    cwa16384 times 8 dw 16384 ; 15
+
+section .text
+
+%macro PROC 1
+    align 16                  ; start the symbol on a 16 byte boundary
+    global %1                 ; export the name given as the macro argument
+    %1:                       ; and define it as a label here
+%endmacro
+
+;******************************************************************************
+; horizontal DWT, source 16 bit signed, 16 pixel width; in: rsi = src, rdi = dst hi, rdx = dst lo, xmm8 to xmm11
+rfx_dwt_2d_encode_block_horiz_16_16:
+    mov ecx, 8                          ; 8 rows
+loop1a:
+    ; pre / post, one 16 word chunk is the whole row so both edges are handled here
+    movdqa xmm1, [rsi]                  ; src[2n]
+    movdqa xmm2, [rsi + 16]             ; second 8 words of the row
+    movdqa xmm6, xmm1                   ; keep unmasked copies
+    movdqa xmm7, xmm2
+    pand xmm1, [rel cdFFFF]             ; keep the low word of each dword
+    pand xmm2, [rel cdFFFF]
+    packusdw xmm1, xmm2                 ; 8 even indexed words
+    movdqa xmm2, xmm6                   ; src[2n + 1]
+    movdqa xmm3, xmm7
+    psrldq xmm2, 2                      ; shift right one word
+    psrldq xmm3, 2
+    pand xmm2, [rel cdFFFF]
+    pand xmm3, [rel cdFFFF]
+    packusdw xmm2, xmm3                 ; 8 odd indexed words
+    movdqa xmm3, xmm6                   ; src[2n + 2]
+    movdqa xmm4, xmm7
+    psrldq xmm3, 4                      ; shift right two words
+    psrldq xmm4, 4
+    movd eax, xmm7                      ; first dword of the second half
+    movd xmm5, eax
+    pslldq xmm5, 12                     ; moved to the top dword
+    por xmm3, xmm5                      ; fills the dword vacated by the shift
+    movdqa xmm5, xmm7
+    psrldq xmm5, 12
+    pslldq xmm5, 12                     ; only the top dword of the row remains
+    por xmm4, xmm5                      ; right edge, src[14] is used again as the last src[2n + 2]
+    pand xmm3, [rel cdFFFF]
+    pand xmm4, [rel cdFFFF]
+    packusdw xmm3, xmm4
+    movdqa xmm4, xmm1
+    movdqa xmm5, xmm2
+    movdqa xmm6, xmm3
+    ; h[n] = (src[2n + 1] - ((src[2n] + src[2n + 2]) >> 1)) >> 1
+    paddw xmm4, xmm6
+    psraw xmm4, 1
+    psubw xmm5, xmm4
+    psraw xmm5, 1
+    movdqa xmm6, xmm5                   ; out hi
+    paddw xmm6, xmm8                    ; add the words in xmm8
+    psraw xmm6, xmm9                    ; arithmetic shift right by the count in xmm9
+    movdqa [rdi], xmm6
+    ; l[n] = src[2n] + ((h[n - 1] + h[n]) >> 1)
+    movdqa xmm7, xmm5
+    movd eax, xmm7
+    pslldq xmm7, 2                      ; h shifted up one word, h[n - 1] in slot n
+    and eax, 0xFFFF                     ; h[0]
+    movd xmm6, eax
+    por xmm7, xmm6                      ; left edge, h[0] is used in place of h[-1]
+    paddw xmm5, xmm7
+    psraw xmm5, 1
+    paddw xmm5, xmm1
+
+    movdqa xmm6, xmm5                   ; out lo
+    paddw xmm6, xmm10                   ; add the words in xmm10
+    psraw xmm6, xmm11                   ; arithmetic shift right by the count in xmm11
+    movdqa [rdx], xmm6
+
+    ; move right
+    lea rsi, [rsi + 16 * 2]
+    lea rdi, [rdi + 8 * 2]
+    lea rdx, [rdx + 8 * 2]
+
+    ; move left, back to the start of the row
+    lea rsi, [rsi - 16 * 2]
+    lea rdi, [rdi - 8 * 2]
+    lea rdx, [rdx - 8 * 2]
+
+    ; move down, next row
+    lea rsi, [rsi + 16 * 2]
+    lea rdi, [rdi + 8 * 2]
+    lea rdx, [rdx + 8 * 2]
+
+    dec ecx
+    jnz loop1a
+
+    ret
+
+;******************************************************************************
+; vertical DWT, source 16 bit signed, 16 pixel width; in: rsi = src, rdi = dst hi, rdx = dst lo
+rfx_dwt_2d_encode_block_verti_16_16:
+    mov ecx, 2                          ; 2 strips, each 8 words wide
+loop1b:
+    ; pre, first row pair of the strip
+    movdqa xmm1, [rsi]                  ; src[2n]
+    movdqa xmm2, [rsi + 16 * 2]         ; src[2n + 1]
+    movdqa xmm3, [rsi + 16 * 2 * 2]     ; src[2n + 2]
+    movdqa xmm4, xmm1
+    movdqa xmm5, xmm2
+    movdqa xmm6, xmm3
+    ; h[n] = (src[2n + 1] - ((src[2n] + src[2n + 2]) >> 1)) >> 1
+    paddw xmm4, xmm6
+    psraw xmm4, 1
+    psubw xmm5, xmm4
+    psraw xmm5, 1
+    movdqa [rdi], xmm5                  ; out hi
+    movdqa xmm6, xmm5                   ; save hi
+    ; l[n] = src[2n] + ((h[n - 1] + h[n]) >> 1), first row adds h[n] itself (no h[n - 1] yet)
+    paddw xmm5, xmm1
+    movdqa [rdx], xmm5                  ; out lo
+    movdqa xmm7, xmm6                   ; save hi, becomes h[n - 1] for the next row pair
+    ; move down
+    lea rsi, [rsi + 16 * 2 * 2]         ; 2 rows
+    lea rdi, [rdi + 16 * 2]             ; 1 row
+    lea rdx, [rdx + 16 * 2]             ; 1 row
+
+    ; loop, cx counts the inner loop while the outer count waits in the high half of ecx
+    shl ecx, 16
+    mov cx, 6                           ; 6 middle row pairs
+loop2b:
+    movdqa xmm1, xmm3                   ; src[2n], reuses the previous src[2n + 2]
+    movdqa xmm2, [rsi + 16 * 2]         ; src[2n + 1]
+    movdqa xmm3, [rsi + 16 * 2 * 2]     ; src[2n + 2]
+    movdqa xmm4, xmm1
+    movdqa xmm5, xmm2
+    movdqa xmm6, xmm3
+    ; h[n] = (src[2n + 1] - ((src[2n] + src[2n + 2]) >> 1)) >> 1
+    paddw xmm4, xmm6
+    psraw xmm4, 1
+    psubw xmm5, xmm4
+    psraw xmm5, 1
+    movdqa [rdi], xmm5                  ; out hi
+    movdqa xmm6, xmm5                   ; save hi
+    ; l[n] = src[2n] + ((h[n - 1] + h[n]) >> 1)
+    paddw xmm5, xmm7                    ; xmm7 holds h[n - 1]
+    psraw xmm5, 1
+    paddw xmm5, xmm1
+    movdqa [rdx], xmm5                  ; out lo
+    movdqa xmm7, xmm6                   ; save hi
+    ; move down
+    lea rsi, [rsi + 16 * 2 * 2]         ; 2 rows
+    lea rdi, [rdi + 16 * 2]             ; 1 row
+    lea rdx, [rdx + 16 * 2]             ; 1 row
+
+    dec cx
+    jnz loop2b
+    shr ecx, 16                         ; outer count back in the low half
+
+    ; post, last row pair of the strip
+    movdqa xmm1, xmm3                   ; src[2n]
+    movdqa xmm2, [rsi + 16 * 2]         ; src[2n + 1]
+    movdqa xmm4, xmm1
+    movdqa xmm5, xmm2
+    movdqa xmm6, xmm3                   ; no row below, src[2n] is used again as src[2n + 2]
+    ; h[n] = (src[2n + 1] - ((src[2n] + src[2n + 2]) >> 1)) >> 1
+    paddw xmm4, xmm6
+    psraw xmm4, 1
+    psubw xmm5, xmm4
+    psraw xmm5, 1
+    movdqa [rdi], xmm5                  ; out hi
+    ; l[n] = src[2n] + ((h[n - 1] + h[n]) >> 1)
+    paddw xmm5, xmm7
+    psraw xmm5, 1
+    paddw xmm5, xmm1
+    movdqa [rdx], xmm5                  ; out lo
+    ; move down
+    lea rsi, [rsi + 16 * 2 * 2]         ; 2 row
+    lea rdi, [rdi + 16 * 2]             ; 1 row
+    lea rdx, [rdx + 16 * 2]             ; 1 row
+
+    ; move up, back to the first row
+    lea rsi, [rsi - 16 * 16 * 2]        ; 16 source rows of 16 words
+    lea rdi, [rdi - 8 * 16 * 2]         ; 8 output rows of 16 words
+    lea rdx, [rdx - 8 * 16 * 2]         ; 8 output rows of 16 words
+
+    ; move right, next strip of 8 words
+    lea rsi, [rsi + 16]
+    lea rdi, [rdi + 16]
+    lea rdx, [rdx + 16]
+
+    dec ecx
+    jnz loop1b
+
+    ret
+
+;******************************************************************************
+; horizontal DWT, source 16 bit signed, 32 pixel width; in: rsi = src, rdi = dst hi, rdx = dst lo, xmm8 to xmm11
+rfx_dwt_2d_encode_block_horiz_16_32:
+    mov ecx, 16                         ; 16 rows
+loop1c:
+    ; pre, first 16 word chunk of the row
+    movdqa xmm1, [rsi]                  ; src[2n]
+    movdqa xmm2, [rsi + 16]             ; second 8 words of the chunk
+    movdqa xmm6, xmm1                   ; keep unmasked copies
+    movdqa xmm7, xmm2
+    pand xmm1, [rel cdFFFF]             ; keep the low word of each dword
+    pand xmm2, [rel cdFFFF]
+    packusdw xmm1, xmm2                 ; 8 even indexed words
+    movdqa xmm2, xmm6                   ; src[2n + 1]
+    movdqa xmm3, xmm7
+    psrldq xmm2, 2                      ; shift right one word
+    psrldq xmm3, 2
+    pand xmm2, [rel cdFFFF]
+    pand xmm3, [rel cdFFFF]
+    packusdw xmm2, xmm3                 ; 8 odd indexed words
+    movdqa xmm3, xmm6                   ; src[2n + 2]
+    movdqa xmm4, xmm7
+    psrldq xmm3, 4                      ; shift right two words
+    psrldq xmm4, 4
+    movd eax, xmm7                      ; first dword of the second half
+    movd xmm5, eax
+    pslldq xmm5, 12                     ; moved to the top dword
+    por xmm3, xmm5                      ; fills the dword vacated by the shift
+    mov eax, [rsi + 32]                 ; first dword of the next chunk
+    movd xmm5, eax
+    pslldq xmm5, 12
+    por xmm4, xmm5                      ; supplies the last src[2n + 2] of this chunk
+    pand xmm3, [rel cdFFFF]
+    pand xmm4, [rel cdFFFF]
+    packusdw xmm3, xmm4
+    movdqa xmm4, xmm1
+    movdqa xmm5, xmm2
+    movdqa xmm6, xmm3
+    ; h[n] = (src[2n + 1] - ((src[2n] + src[2n + 2]) >> 1)) >> 1
+    paddw xmm4, xmm6
+    psraw xmm4, 1
+    psubw xmm5, xmm4
+    psraw xmm5, 1
+
+    movdqa xmm6, xmm5                   ; out hi
+    paddw xmm6, xmm8                    ; add the words in xmm8
+    psraw xmm6, xmm9                    ; arithmetic shift right by the count in xmm9
+    movdqa [rdi], xmm6
+    movdqa xmm2, xmm5                   ; save hi
+
+    ; l[n] = src[2n] + ((h[n - 1] + h[n]) >> 1)
+    movdqa xmm7, xmm5
+    movd eax, xmm7
+    pslldq xmm7, 2                      ; h shifted up one word, h[n - 1] in slot n
+    and eax, 0xFFFF                     ; h[0]
+    movd xmm6, eax
+    por xmm7, xmm6                      ; left edge, h[0] is used in place of h[-1]
+    paddw xmm5, xmm7
+    psraw xmm5, 1
+    paddw xmm5, xmm1
+
+    psrldq xmm2, 14                     ; last h word of this chunk moves to the low word
+    movd ebx, xmm2                      ; save hi, ebx carries it to the next chunk
+
+    movdqa xmm6, xmm5                   ; out lo
+    paddw xmm6, xmm10                   ; add the words in xmm10
+    psraw xmm6, xmm11                   ; arithmetic shift right by the count in xmm11
+    movdqa [rdx], xmm6
+
+    ; move right
+    lea rsi, [rsi + 16 * 2]
+    lea rdi, [rdi + 8 * 2]
+    lea rdx, [rdx + 8 * 2]
+
+    ; post, last 16 word chunk of the row
+    movdqa xmm1, [rsi]                  ; src[2n]
+    movdqa xmm2, [rsi + 16]
+    movdqa xmm6, xmm1
+    movdqa xmm7, xmm2
+    pand xmm1, [rel cdFFFF]
+    pand xmm2, [rel cdFFFF]
+    packusdw xmm1, xmm2
+    movdqa xmm2, xmm6                   ; src[2n + 1]
+    movdqa xmm3, xmm7
+    psrldq xmm2, 2
+    psrldq xmm3, 2
+    pand xmm2, [rel cdFFFF]
+    pand xmm3, [rel cdFFFF]
+    packusdw xmm2, xmm3
+    movdqa xmm3, xmm6                   ; src[2n + 2]
+    movdqa xmm4, xmm7
+    psrldq xmm3, 4
+    psrldq xmm4, 4
+    movd eax, xmm7
+    movd xmm5, eax
+    pslldq xmm5, 12
+    por xmm3, xmm5
+    movdqa xmm5, xmm7
+    psrldq xmm5, 12
+    pslldq xmm5, 12                     ; only the top dword of the chunk remains
+    por xmm4, xmm5                      ; right edge, the last even word is used again as src[2n + 2]
+    pand xmm3, [rel cdFFFF]
+    pand xmm4, [rel cdFFFF]
+    packusdw xmm3, xmm4
+    movdqa xmm4, xmm1
+    movdqa xmm5, xmm2
+    movdqa xmm6, xmm3
+    ; h[n] = (src[2n + 1] - ((src[2n] + src[2n + 2]) >> 1)) >> 1
+    paddw xmm4, xmm6
+    psraw xmm4, 1
+    psubw xmm5, xmm4
+    psraw xmm5, 1
+
+    movdqa xmm6, xmm5                   ; out hi
+    paddw xmm6, xmm8
+    psraw xmm6, xmm9
+    movdqa [rdi], xmm6
+
+    ; l[n] = src[2n] + ((h[n - 1] + h[n]) >> 1)
+    movdqa xmm7, xmm5
+    pslldq xmm7, 2
+    movd xmm6, ebx                      ; last h word of the previous chunk
+    por xmm7, xmm6                      ; becomes h[n - 1] for the first slot
+    paddw xmm5, xmm7
+    psraw xmm5, 1
+    paddw xmm5, xmm1
+
+    movdqa xmm6, xmm5                   ; out lo
+    paddw xmm6, xmm10
+    psraw xmm6, xmm11
+    movdqa [rdx], xmm6
+
+    ; move right
+    lea rsi, [rsi + 16 * 2]
+    lea rdi, [rdi + 8 * 2]
+    lea rdx, [rdx + 8 * 2]
+
+    ; move left, back to the start of the row
+    lea rsi, [rsi - 32 * 2]
+    lea rdi, [rdi - 16 * 2]
+    lea rdx, [rdx - 16 * 2]
+
+    ; move down, next row
+    lea rsi, [rsi + 32 * 2]
+    lea rdi, [rdi + 16 * 2]
+    lea rdx, [rdx + 16 * 2]
+
+    dec ecx
+    jnz loop1c
+
+    ret
+
+;******************************************************************************
+; horizontal DWT, source 16 bit signed, 32 pixel width, lo stored unshifted; in: rsi = src, rdi = dst hi, rdx = dst lo, xmm8, xmm9
+rfx_dwt_2d_encode_block_horiz_16_32_no_lo:
+    mov ecx, 16                         ; 16 rows
+loop1c1:
+    ; pre, first 16 word chunk of the row
+    movdqa xmm1, [rsi]                  ; src[2n]
+    movdqa xmm2, [rsi + 16]             ; second 8 words of the chunk
+    movdqa xmm6, xmm1                   ; keep unmasked copies
+    movdqa xmm7, xmm2
+    pand xmm1, [rel cdFFFF]             ; keep the low word of each dword
+    pand xmm2, [rel cdFFFF]
+    packusdw xmm1, xmm2                 ; 8 even indexed words
+    movdqa xmm2, xmm6                   ; src[2n + 1]
+    movdqa xmm3, xmm7
+    psrldq xmm2, 2                      ; shift right one word
+    psrldq xmm3, 2
+    pand xmm2, [rel cdFFFF]
+    pand xmm3, [rel cdFFFF]
+    packusdw xmm2, xmm3                 ; 8 odd indexed words
+    movdqa xmm3, xmm6                   ; src[2n + 2]
+    movdqa xmm4, xmm7
+    psrldq xmm3, 4                      ; shift right two words
+    psrldq xmm4, 4
+    movd eax, xmm7                      ; first dword of the second half
+    movd xmm5, eax
+    pslldq xmm5, 12                     ; moved to the top dword
+    por xmm3, xmm5                      ; fills the dword vacated by the shift
+    mov eax, [rsi + 32]                 ; first dword of the next chunk
+    movd xmm5, eax
+    pslldq xmm5, 12
+    por xmm4, xmm5                      ; supplies the last src[2n + 2] of this chunk
+    pand xmm3, [rel cdFFFF]
+    pand xmm4, [rel cdFFFF]
+    packusdw xmm3, xmm4
+    movdqa xmm4, xmm1
+    movdqa xmm5, xmm2
+    movdqa xmm6, xmm3
+    ; h[n] = (src[2n + 1] - ((src[2n] + src[2n + 2]) >> 1)) >> 1
+    paddw xmm4, xmm6
+    psraw xmm4, 1
+    psubw xmm5, xmm4
+    psraw xmm5, 1
+
+    movdqa xmm6, xmm5                   ; out hi
+    paddw xmm6, xmm8                    ; add the words in xmm8
+    psraw xmm6, xmm9                    ; arithmetic shift right by the count in xmm9
+    movdqa [rdi], xmm6
+    movdqa xmm2, xmm5                   ; save hi
+
+    ; l[n] = src[2n] + ((h[n - 1] + h[n]) >> 1)
+    movdqa xmm7, xmm5
+    movd eax, xmm7
+    pslldq xmm7, 2                      ; h shifted up one word, h[n - 1] in slot n
+    and eax, 0xFFFF                     ; h[0]
+    movd xmm6, eax
+    por xmm7, xmm6                      ; left edge, h[0] is used in place of h[-1]
+    paddw xmm5, xmm7
+    psraw xmm5, 1
+    paddw xmm5, xmm1
+
+    psrldq xmm2, 14                     ; last h word of this chunk moves to the low word
+    movd ebx, xmm2                      ; save hi, ebx carries it to the next chunk
+
+    movdqa [rdx], xmm5                  ; out lo, stored without the add and shift used for hi
+
+    ; move right
+    lea rsi, [rsi + 16 * 2]
+    lea rdi, [rdi + 8 * 2]
+    lea rdx, [rdx + 8 * 2]
+
+    ; post, last 16 word chunk of the row
+    movdqa xmm1, [rsi]                  ; src[2n]
+    movdqa xmm2, [rsi + 16]
+    movdqa xmm6, xmm1
+    movdqa xmm7, xmm2
+    pand xmm1, [rel cdFFFF]
+    pand xmm2, [rel cdFFFF]
+    packusdw xmm1, xmm2
+    movdqa xmm2, xmm6                   ; src[2n + 1]
+    movdqa xmm3, xmm7
+    psrldq xmm2, 2
+    psrldq xmm3, 2
+    pand xmm2, [rel cdFFFF]
+    pand xmm3, [rel cdFFFF]
+    packusdw xmm2, xmm3
+    movdqa xmm3, xmm6                   ; src[2n + 2]
+    movdqa xmm4, xmm7
+    psrldq xmm3, 4
+    psrldq xmm4, 4
+    movd eax, xmm7
+    movd xmm5, eax
+    pslldq xmm5, 12
+    por xmm3, xmm5
+    movdqa xmm5, xmm7
+    psrldq xmm5, 12
+    pslldq xmm5, 12                     ; only the top dword of the chunk remains
+    por xmm4, xmm5                      ; right edge, the last even word is used again as src[2n + 2]
+    pand xmm3, [rel cdFFFF]
+    pand xmm4, [rel cdFFFF]
+    packusdw xmm3, xmm4
+    movdqa xmm4, xmm1
+    movdqa xmm5, xmm2
+    movdqa xmm6, xmm3
+    ; h[n] = (src[2n + 1] - ((src[2n] + src[2n + 2]) >> 1)) >> 1
+    paddw xmm4, xmm6
+    psraw xmm4, 1
+    psubw xmm5, xmm4
+    psraw xmm5, 1
+
+    movdqa xmm6, xmm5                   ; out hi
+    paddw xmm6, xmm8
+    psraw xmm6, xmm9
+    movdqa [rdi], xmm6
+
+    ; l[n] = src[2n] + ((h[n - 1] + h[n]) >> 1)
+    movdqa xmm7, xmm5
+    pslldq xmm7, 2
+    movd xmm6, ebx                      ; last h word of the previous chunk
+    por xmm7, xmm6                      ; becomes h[n - 1] for the first slot
+    paddw xmm5, xmm7
+    psraw xmm5, 1
+    paddw xmm5, xmm1
+
+    movdqa [rdx], xmm5                  ; out lo
+
+    ; move right
+    lea rsi, [rsi + 16 * 2]
+    lea rdi, [rdi + 8 * 2]
+    lea rdx, [rdx + 8 * 2]
+
+    ; move left, back to the start of the row
+    lea rsi, [rsi - 32 * 2]
+    lea rdi, [rdi - 16 * 2]
+    lea rdx, [rdx - 16 * 2]
+
+    ; move down, next row
+    lea rsi, [rsi + 32 * 2]
+    lea rdi, [rdi + 16 * 2]
+    lea rdx, [rdx + 16 * 2]
+
+    dec ecx
+    jnz loop1c1
+
+    ret
+
+;******************************************************************************
+; vertical DWT, source 16 bit signed, 32 pixel width; in: rsi = src, rdi = dst hi, rdx = dst lo
+rfx_dwt_2d_encode_block_verti_16_32:
+    mov ecx, 4                          ; 4 strips, each 8 words wide
+loop1d:
+    ; pre, first row pair of the strip
+    movdqa xmm1, [rsi]                  ; src[2n]
+    movdqa xmm2, [rsi + 32 * 2]         ; src[2n + 1]
+    movdqa xmm3, [rsi + 32 * 2 * 2]     ; src[2n + 2]
+    movdqa xmm4, xmm1
+    movdqa xmm5, xmm2
+    movdqa xmm6, xmm3
+    ; h[n] = (src[2n + 1] - ((src[2n] + src[2n + 2]) >> 1)) >> 1
+    paddw xmm4, xmm6
+    psraw xmm4, 1
+    psubw xmm5, xmm4
+    psraw xmm5, 1
+    movdqa [rdi], xmm5                  ; out hi
+    movdqa xmm6, xmm5                   ; save hi
+    ; l[n] = src[2n] + ((h[n - 1] + h[n]) >> 1), first row adds h[n] itself (no h[n - 1] yet)
+    paddw xmm5, xmm1
+    movdqa [rdx], xmm5                  ; out lo
+    movdqa xmm7, xmm6                   ; save hi, becomes h[n - 1] for the next row pair
+    ; move down
+    lea rsi, [rsi + 32 * 2 * 2]         ; 2 rows
+    lea rdi, [rdi + 32 * 2]             ; 1 row
+    lea rdx, [rdx + 32 * 2]             ; 1 row
+
+    ; loop, cx counts the inner loop while the outer count waits in the high half of ecx
+    shl ecx, 16
+    mov cx, 14                          ; 14 middle row pairs
+loop2d:
+    movdqa xmm1, xmm3                   ; src[2n], reuses the previous src[2n + 2]
+    movdqa xmm2, [rsi + 32 * 2]         ; src[2n + 1]
+    movdqa xmm3, [rsi + 32 * 2 * 2]     ; src[2n + 2]
+    movdqa xmm4, xmm1
+    movdqa xmm5, xmm2
+    movdqa xmm6, xmm3
+    ; h[n] = (src[2n + 1] - ((src[2n] + src[2n + 2]) >> 1)) >> 1
+    paddw xmm4, xmm6
+    psraw xmm4, 1
+    psubw xmm5, xmm4
+    psraw xmm5, 1
+    movdqa [rdi], xmm5                  ; out hi
+    movdqa xmm6, xmm5                   ; save hi
+    ; l[n] = src[2n] + ((h[n - 1] + h[n]) >> 1)
+    paddw xmm5, xmm7                    ; xmm7 holds h[n - 1]
+    psraw xmm5, 1
+    paddw xmm5, xmm1
+    movdqa [rdx], xmm5                  ; out lo
+    movdqa xmm7, xmm6                   ; save hi
+    ; move down
+    lea rsi, [rsi + 32 * 2 * 2]         ; 2 rows
+    lea rdi, [rdi + 32 * 2]             ; 1 row
+    lea rdx, [rdx + 32 * 2]             ; 1 row
+
+    dec cx
+    jnz loop2d
+    shr ecx, 16                         ; outer count back in the low half
+
+    ; post, last row pair of the strip
+    movdqa xmm1, xmm3                   ; src[2n]
+    movdqa xmm2, [rsi + 32 * 2]         ; src[2n + 1]
+    movdqa xmm4, xmm1
+    movdqa xmm5, xmm2
+    movdqa xmm6, xmm3                   ; no row below, src[2n] is used again as src[2n + 2]
+    ; h[n] = (src[2n + 1] - ((src[2n] + src[2n + 2]) >> 1)) >> 1
+    paddw xmm4, xmm6
+    psraw xmm4, 1
+    psubw xmm5, xmm4
+    psraw xmm5, 1
+    movdqa [rdi], xmm5                  ; out hi
+    ; l[n] = src[2n] + ((h[n - 1] + h[n]) >> 1)
+    paddw xmm5, xmm7
+    psraw xmm5, 1
+    paddw xmm5, xmm1
+    movdqa [rdx], xmm5                  ; out lo
+    ; move down
+    lea rsi, [rsi + 32 * 2 * 2]         ; 2 row
+    lea rdi, [rdi + 32 * 2]             ; 1 row
+    lea rdx, [rdx + 32 * 2]             ; 1 row
+
+    ; move up, back to the first row
+    lea rsi, [rsi - 32 * 32 * 2]        ; 32 source rows of 32 words
+    lea rdi, [rdi - 16 * 32 * 2]        ; 16 output rows of 32 words
+    lea rdx, [rdx - 16 * 32 * 2]        ; 16 output rows of 32 words
+
+    ; move right, next strip of 8 words
+    lea rsi, [rsi + 16]
+    lea rdi, [rdi + 16]
+    lea rdx, [rdx + 16]
+
+    dec ecx
+    jnz loop1d
+
+    ret
+
+;******************************************************************************
+; horizontal DWT, source 16 bit signed, 64 pixel width; in: rsi = src, rdi = dst hi, rdx = dst lo, xmm8 to xmm11
+rfx_dwt_2d_encode_block_horiz_16_64:
+    mov ecx, 32                         ; 32 rows
+loop1e:
+    ; pre, first 16 word chunk of the row
+    movdqa xmm1, [rsi]                  ; src[2n]
+    movdqa xmm2, [rsi + 16]             ; second 8 words of the chunk
+    movdqa xmm6, xmm1                   ; keep unmasked copies
+    movdqa xmm7, xmm2
+    pand xmm1, [rel cdFFFF]             ; keep the low word of each dword
+    pand xmm2, [rel cdFFFF]
+    packusdw xmm1, xmm2                 ; 8 even indexed words
+    movdqa xmm2, xmm6                   ; src[2n + 1]
+    movdqa xmm3, xmm7
+    psrldq xmm2, 2                      ; shift right one word
+    psrldq xmm3, 2
+    pand xmm2, [rel cdFFFF]
+    pand xmm3, [rel cdFFFF]
+    packusdw xmm2, xmm3                 ; 8 odd indexed words
+    movdqa xmm3, xmm6                   ; src[2n + 2]
+    movdqa xmm4, xmm7
+    psrldq xmm3, 4                      ; shift right two words
+    psrldq xmm4, 4
+    movd eax, xmm7                      ; first dword of the second half
+    movd xmm5, eax
+    pslldq xmm5, 12                     ; moved to the top dword
+    por xmm3, xmm5                      ; fills the dword vacated by the shift
+    mov eax, [rsi + 32]                 ; first dword of the next chunk
+    movd xmm5, eax
+    pslldq xmm5, 12
+    por xmm4, xmm5                      ; supplies the last src[2n + 2] of this chunk
+    pand xmm3, [rel cdFFFF]
+    pand xmm4, [rel cdFFFF]
+    packusdw xmm3, xmm4
+    movdqa xmm4, xmm1
+    movdqa xmm5, xmm2
+    movdqa xmm6, xmm3
+    ; h[n] = (src[2n + 1] - ((src[2n] + src[2n + 2]) >> 1)) >> 1
+    paddw xmm4, xmm6
+    psraw xmm4, 1
+    psubw xmm5, xmm4
+    psraw xmm5, 1
+
+    movdqa xmm6, xmm5                   ; out hi
+    paddw xmm6, xmm8                    ; add the words in xmm8
+    psraw xmm6, xmm9                    ; arithmetic shift right by the count in xmm9
+    movdqa [rdi], xmm6
+    movdqa xmm2, xmm5                   ; save hi
+
+    ; l[n] = src[2n] + ((h[n - 1] + h[n]) >> 1)
+    movdqa xmm7, xmm5
+    movd eax, xmm7
+    pslldq xmm7, 2                      ; h shifted up one word, h[n - 1] in slot n
+    and eax, 0xFFFF                     ; h[0]
+    movd xmm6, eax
+    por xmm7, xmm6                      ; left edge, h[0] is used in place of h[-1]
+    paddw xmm5, xmm7
+    psraw xmm5, 1
+    paddw xmm5, xmm1
+
+    psrldq xmm2, 14                     ; last h word of this chunk moves to the low word
+    movd ebx, xmm2                      ; save hi, ebx carries it to the next chunk
+
+    movdqa xmm6, xmm5                   ; out lo
+    paddw xmm6, xmm10                   ; add the words in xmm10
+    psraw xmm6, xmm11                   ; arithmetic shift right by the count in xmm11
+    movdqa [rdx], xmm6
+
+    ; move right
+    lea rsi, [rsi + 16 * 2]
+    lea rdi, [rdi + 8 * 2]
+    lea rdx, [rdx + 8 * 2]
+
+    ; loop, cx counts the inner loop while the outer count waits in the high half of ecx
+    shl ecx, 16
+    mov cx, 2                           ; 2 middle chunks
+loop2e:
+    movdqa xmm1, [rsi]                  ; src[2n]
+    movdqa xmm2, [rsi + 16]
+    movdqa xmm6, xmm1
+    movdqa xmm7, xmm2
+    pand xmm1, [rel cdFFFF]
+    pand xmm2, [rel cdFFFF]
+    packusdw xmm1, xmm2
+    movdqa xmm2, xmm6                   ; src[2n + 1]
+    movdqa xmm3, xmm7
+    psrldq xmm2, 2
+    psrldq xmm3, 2
+    pand xmm2, [rel cdFFFF]
+    pand xmm3, [rel cdFFFF]
+    packusdw xmm2, xmm3
+    movdqa xmm3, xmm6                   ; src[2n + 2]
+    movdqa xmm4, xmm7
+    psrldq xmm3, 4
+    psrldq xmm4, 4
+    movd eax, xmm7
+    movd xmm5, eax
+    pslldq xmm5, 12
+    por xmm3, xmm5
+    mov eax, [rsi + 32]                 ; first dword of the next chunk
+    movd xmm5, eax
+    pslldq xmm5, 12
+    por xmm4, xmm5
+    pand xmm3, [rel cdFFFF]
+    pand xmm4, [rel cdFFFF]
+    packusdw xmm3, xmm4
+    movdqa xmm4, xmm1
+    movdqa xmm5, xmm2
+    movdqa xmm6, xmm3
+    ; h[n] = (src[2n + 1] - ((src[2n] + src[2n + 2]) >> 1)) >> 1
+    paddw xmm4, xmm6
+    psraw xmm4, 1
+    psubw xmm5, xmm4
+    psraw xmm5, 1
+
+    movdqa xmm6, xmm5                   ; out hi
+    paddw xmm6, xmm8
+    psraw xmm6, xmm9
+    movdqa [rdi], xmm6
+    movdqa xmm2, xmm5                   ; save hi
+
+    ; l[n] = src[2n] + ((h[n - 1] + h[n]) >> 1)
+    movdqa xmm7, xmm5
+    pslldq xmm7, 2
+    movd xmm6, ebx                      ; last h word of the previous chunk
+    por xmm7, xmm6                      ; becomes h[n - 1] for the first slot
+    paddw xmm5, xmm7
+    psraw xmm5, 1
+    paddw xmm5, xmm1
+
+    psrldq xmm2, 14
+    movd ebx, xmm2                      ; save hi
+
+    movdqa xmm6, xmm5                   ; out lo
+    paddw xmm6, xmm10
+    psraw xmm6, xmm11
+    movdqa [rdx], xmm6
+
+    ; move right
+    lea rsi, [rsi + 16 * 2]
+    lea rdi, [rdi + 8 * 2]
+    lea rdx, [rdx + 8 * 2]
+
+    dec cx
+    jnz loop2e
+    shr ecx, 16                         ; outer count back in the low half
+
+    ; post, last 16 word chunk of the row
+    movdqa xmm1, [rsi]                  ; src[2n]
+    movdqa xmm2, [rsi + 16]
+    movdqa xmm6, xmm1
+    movdqa xmm7, xmm2
+    pand xmm1, [rel cdFFFF]
+    pand xmm2, [rel cdFFFF]
+    packusdw xmm1, xmm2
+    movdqa xmm2, xmm6                   ; src[2n + 1]
+    movdqa xmm3, xmm7
+    psrldq xmm2, 2
+    psrldq xmm3, 2
+    pand xmm2, [rel cdFFFF]
+    pand xmm3, [rel cdFFFF]
+    packusdw xmm2, xmm3
+    movdqa xmm3, xmm6                   ; src[2n + 2]
+    movdqa xmm4, xmm7
+    psrldq xmm3, 4
+    psrldq xmm4, 4
+    movd eax, xmm7
+    movd xmm5, eax
+    pslldq xmm5, 12
+    por xmm3, xmm5
+    movdqa xmm5, xmm7
+    psrldq xmm5, 12
+    pslldq xmm5, 12                     ; only the top dword of the chunk remains
+    por xmm4, xmm5                      ; right edge, the last even word is used again as src[2n + 2]
+    pand xmm3, [rel cdFFFF]
+    pand xmm4, [rel cdFFFF]
+    packusdw xmm3, xmm4
+    movdqa xmm4, xmm1
+    movdqa xmm5, xmm2
+    movdqa xmm6, xmm3
+    ; h[n] = (src[2n + 1] - ((src[2n] + src[2n + 2]) >> 1)) >> 1
+    paddw xmm4, xmm6
+    psraw xmm4, 1
+    psubw xmm5, xmm4
+    psraw xmm5, 1
+
+    movdqa xmm6, xmm5                   ; out hi
+    paddw xmm6, xmm8
+    psraw xmm6, xmm9
+    movdqa [rdi], xmm6
+
+    ; l[n] = src[2n] + ((h[n - 1] + h[n]) >> 1)
+    movdqa xmm7, xmm5
+    pslldq xmm7, 2
+    movd xmm6, ebx                      ; last h word of the previous chunk
+    por xmm7, xmm6
+    paddw xmm5, xmm7
+    psraw xmm5, 1
+    paddw xmm5, xmm1
+
+    movdqa xmm6, xmm5                   ; out lo
+    paddw xmm6, xmm10
+    psraw xmm6, xmm11
+    movdqa [rdx], xmm6
+
+    ; move right
+    lea rsi, [rsi + 16 * 2]
+    lea rdi, [rdi + 8 * 2]
+    lea rdx, [rdx + 8 * 2]
+
+    ; move left, back to the start of the row
+    lea rsi, [rsi - 64 * 2]
+    lea rdi, [rdi - 32 * 2]
+    lea rdx, [rdx - 32 * 2]
+
+    ; move down, next row
+    lea rsi, [rsi + 64 * 2]
+    lea rdi, [rdi + 32 * 2]
+    lea rdx, [rdx + 32 * 2]
+
+    dec ecx
+    jnz loop1e
+
+    ret
+
+;******************************************************************************
+; source 16 bit signed, 64 pixel width; in: rsi = src, rdi = dst hi (quantized via xmm8/xmm9), rdx = dst lo (stored as is)
+rfx_dwt_2d_encode_block_horiz_16_64_no_lo:
+    mov ecx, 32                         ; row counter, one 64 sample row per pass
+loop1e1:
+    ; pre - first group of 16 samples in the row
+    movdqa xmm1, [rsi]                  ; src[2n]
+    movdqa xmm2, [rsi + 16]
+    movdqa xmm6, xmm1                   ; keep raw copies of both vectors
+    movdqa xmm7, xmm2
+    pand xmm1, [rel cdFFFF]             ; cdFFFF is defined outside this view; presumably keeps the low word of each dword
+    pand xmm2, [rel cdFFFF]
+    packusdw xmm1, xmm2                 ; xmm1 = 8 words for src[2n]
+    movdqa xmm2, xmm6                   ; src[2n + 1]
+    movdqa xmm3, xmm7
+    psrldq xmm2, 2                      ; shift by one sample
+    psrldq xmm3, 2
+    pand xmm2, [rel cdFFFF]
+    pand xmm3, [rel cdFFFF]
+    packusdw xmm2, xmm3
+    movdqa xmm3, xmm6                   ; src[2n + 2]
+    movdqa xmm4, xmm7
+    psrldq xmm3, 4                      ; shift by two samples
+    psrldq xmm4, 4
+    movd eax, xmm7                      ; first dword of the second vector
+    movd xmm5, eax
+    pslldq xmm5, 12                     ; ... moved into the top dword vacated by the shift
+    por xmm3, xmm5
+    mov eax, [rsi + 32]                 ; first dword of the next group
+    movd xmm5, eax
+    pslldq xmm5, 12
+    por xmm4, xmm5
+    pand xmm3, [rel cdFFFF]
+    pand xmm4, [rel cdFFFF]
+    packusdw xmm3, xmm4
+    movdqa xmm4, xmm1
+    movdqa xmm5, xmm2
+    movdqa xmm6, xmm3
+    ; h[n] = (src[2n + 1] - ((src[2n] + src[2n + 2]) >> 1)) >> 1
+    paddw xmm4, xmm6
+    psraw xmm4, 1
+    psubw xmm5, xmm4
+    psraw xmm5, 1
+
+    movdqa xmm6, xmm5                   ; out hi
+    paddw xmm6, xmm8                    ; add xmm8, then shift right by the count in xmm9
+    psraw xmm6, xmm9
+    movdqa [rdi], xmm6
+    movdqa xmm2, xmm5                   ; save hi
+
+    ; l[n] = src[2n] + ((h[n - 1] + h[n]) >> 1)
+    movdqa xmm7, xmm5
+    movd eax, xmm7
+    pslldq xmm7, 2                      ; lane k now holds h[k - 1]
+    and eax, 0xFFFF                     ; start of row: h[0] also stands in for h[-1]
+    movd xmm6, eax
+    por xmm7, xmm6
+    paddw xmm5, xmm7
+    psraw xmm5, 1
+    paddw xmm5, xmm1
+
+    psrldq xmm2, 14                     ; isolate the last h word of this group
+    movd ebx, xmm2                      ; save hi
+
+    movdqa [rdx], xmm5                  ; out lo
+
+    ; move right
+    lea rsi, [rsi + 16 * 2]
+    lea rdi, [rdi + 8 * 2]
+    lea rdx, [rdx + 8 * 2]
+
+    ; loop - the two middle groups of the row
+    shl ecx, 16                         ; park the row counter in the upper half of ecx
+    mov cx, 2                           ; inner counter lives in cx
+loop2e1:
+    movdqa xmm1, [rsi]                  ; src[2n]
+    movdqa xmm2, [rsi + 16]
+    movdqa xmm6, xmm1
+    movdqa xmm7, xmm2
+    pand xmm1, [rel cdFFFF]
+    pand xmm2, [rel cdFFFF]
+    packusdw xmm1, xmm2
+    movdqa xmm2, xmm6                   ; src[2n + 1]
+    movdqa xmm3, xmm7
+    psrldq xmm2, 2
+    psrldq xmm3, 2
+    pand xmm2, [rel cdFFFF]
+    pand xmm3, [rel cdFFFF]
+    packusdw xmm2, xmm3
+    movdqa xmm3, xmm6                   ; src[2n + 2]
+    movdqa xmm4, xmm7
+    psrldq xmm3, 4
+    psrldq xmm4, 4
+    movd eax, xmm7
+    movd xmm5, eax
+    pslldq xmm5, 12
+    por xmm3, xmm5
+    mov eax, [rsi + 32]                 ; first dword of the next group
+    movd xmm5, eax
+    pslldq xmm5, 12
+    por xmm4, xmm5
+    pand xmm3, [rel cdFFFF]
+    pand xmm4, [rel cdFFFF]
+    packusdw xmm3, xmm4
+    movdqa xmm4, xmm1
+    movdqa xmm5, xmm2
+    movdqa xmm6, xmm3
+    ; h[n] = (src[2n + 1] - ((src[2n] + src[2n + 2]) >> 1)) >> 1
+    paddw xmm4, xmm6
+    psraw xmm4, 1
+    psubw xmm5, xmm4
+    psraw xmm5, 1
+
+    movdqa xmm6, xmm5                   ; out hi
+    paddw xmm6, xmm8
+    psraw xmm6, xmm9
+    movdqa [rdi], xmm6
+    movdqa xmm2, xmm5                   ; save hi
+
+    ; l[n] = src[2n] + ((h[n - 1] + h[n]) >> 1)
+    movdqa xmm7, xmm5
+    pslldq xmm7, 2
+    movd xmm6, ebx                      ; h[n - 1] for lane 0 comes from the previous group
+    por xmm7, xmm6
+    paddw xmm5, xmm7
+    psraw xmm5, 1
+    paddw xmm5, xmm1
+
+    psrldq xmm2, 14
+    movd ebx, xmm2                      ; save hi
+
+    movdqa [rdx], xmm5                  ; out lo
+
+    ; move right
+    lea rsi, [rsi + 16 * 2]
+    lea rdi, [rdi + 8 * 2]
+    lea rdx, [rdx + 8 * 2]
+
+    dec cx
+    jnz loop2e1
+    shr ecx, 16                         ; restore the row counter
+
+    ; post - last group of the row, nothing to read past it
+    movdqa xmm1, [rsi]                  ; src[2n]
+    movdqa xmm2, [rsi + 16]
+    movdqa xmm6, xmm1
+    movdqa xmm7, xmm2
+    pand xmm1, [rel cdFFFF]
+    pand xmm2, [rel cdFFFF]
+    packusdw xmm1, xmm2
+    movdqa xmm2, xmm6                   ; src[2n + 1]
+    movdqa xmm3, xmm7
+    psrldq xmm2, 2
+    psrldq xmm3, 2
+    pand xmm2, [rel cdFFFF]
+    pand xmm3, [rel cdFFFF]
+    packusdw xmm2, xmm3
+    movdqa xmm3, xmm6                   ; src[2n + 2]
+    movdqa xmm4, xmm7
+    psrldq xmm3, 4
+    psrldq xmm4, 4
+    movd eax, xmm7
+    movd xmm5, eax
+    pslldq xmm5, 12
+    por xmm3, xmm5
+    movdqa xmm5, xmm7                   ; end of row: reuse the top dword of this group instead of [rsi + 32]
+    psrldq xmm5, 12
+    pslldq xmm5, 12
+    por xmm4, xmm5
+    pand xmm3, [rel cdFFFF]
+    pand xmm4, [rel cdFFFF]
+    packusdw xmm3, xmm4
+    movdqa xmm4, xmm1
+    movdqa xmm5, xmm2
+    movdqa xmm6, xmm3
+    ; h[n] = (src[2n + 1] - ((src[2n] + src[2n + 2]) >> 1)) >> 1
+    paddw xmm4, xmm6
+    psraw xmm4, 1
+    psubw xmm5, xmm4
+    psraw xmm5, 1
+
+    movdqa xmm6, xmm5                   ; out hi
+    paddw xmm6, xmm8
+    psraw xmm6, xmm9
+    movdqa [rdi], xmm6
+
+    ; l[n] = src[2n] + ((h[n - 1] + h[n]) >> 1)
+    movdqa xmm7, xmm5
+    pslldq xmm7, 2
+    movd xmm6, ebx                      ; h[n - 1] for lane 0 comes from the previous group
+    por xmm7, xmm6
+    paddw xmm5, xmm7
+    psraw xmm5, 1
+    paddw xmm5, xmm1
+
+    movdqa [rdx], xmm5                  ; out lo
+
+    ; move right
+    lea rsi, [rsi + 16 * 2]
+    lea rdi, [rdi + 8 * 2]
+    lea rdx, [rdx + 8 * 2]
+
+    ; move left - back to the start of the row
+    lea rsi, [rsi - 64 * 2]
+    lea rdi, [rdi - 32 * 2]
+    lea rdx, [rdx - 32 * 2]
+
+    ; move down - next row: 64 source words, 32 words per output band
+    lea rsi, [rsi + 64 * 2]
+    lea rdi, [rdi + 32 * 2]
+    lea rdx, [rdx + 32 * 2]
+
+    dec ecx
+    jnz loop1e1
+
+    ret
+
+;******************************************************************************
+; source 8 bit unsigned, 64 pixel width; in: rsi = src bytes, rdi = dst hi, rdx = dst lo, xmm0 = 0 (set by the caller)
+rfx_dwt_2d_encode_block_verti_8_64:
+    mov ecx, 8                          ; 8 strips of 8 columns each
+loop1f:
+    ; pre - first row pair of the strip
+    movq xmm1, [rsi]                    ; src[2n]
+    movq xmm2, [rsi + 64 * 1]           ; src[2n + 1]
+    movq xmm3, [rsi + 64 * 1 * 2]       ; src[2n + 2]
+    punpcklbw xmm1, xmm0                ; widen 8 bytes to 8 words (xmm0 supplies the zero high bytes)
+    punpcklbw xmm2, xmm0
+    punpcklbw xmm3, xmm0
+    psubw xmm1, [rel cw128]             ; cw128 is defined outside this view; presumably 128 per word - TODO confirm
+    psubw xmm2, [rel cw128]
+    psubw xmm3, [rel cw128]
+    psllw xmm1, 5                       ; scale samples by 2^5
+    psllw xmm2, 5
+    psllw xmm3, 5
+    movdqa xmm4, xmm1
+    movdqa xmm5, xmm2
+    movdqa xmm6, xmm3
+    ; h[n] = (src[2n + 1] - ((src[2n] + src[2n + 2]) >> 1)) >> 1
+    paddw xmm4, xmm6
+    psraw xmm4, 1
+    psubw xmm5, xmm4
+    psraw xmm5, 1
+    movdqa [rdi], xmm5                  ; out hi
+    movdqa xmm6, xmm5                   ; save hi
+    ; l[n] = src[2n] + ((h[n - 1] + h[n]) >> 1)
+    paddw xmm5, xmm1                    ; first row has no h[n - 1]: h[0] is added on its own
+    movdqa [rdx], xmm5                  ; out lo
+    movdqa xmm7, xmm6                   ; save hi
+    ; move down
+    lea rsi, [rsi + 64 * 1 * 2]         ; 2 rows
+    lea rdi, [rdi + 64 * 2]             ; 1 row
+    lea rdx, [rdx + 64 * 2]             ; 1 row
+
+    ; loop - the 30 middle row pairs
+    shl ecx, 16                         ; park the strip counter in the upper half of ecx
+    mov cx, 30                          ; inner counter lives in cx
+loop2f:
+    movdqa xmm1, xmm3                   ; src[2n]
+    movq xmm2, [rsi + 64 * 1]           ; src[2n + 1]
+    movq xmm3, [rsi + 64 * 1 * 2]       ; src[2n + 2]
+    punpcklbw xmm2, xmm0
+    punpcklbw xmm3, xmm0
+    psubw xmm2, [rel cw128]
+    psubw xmm3, [rel cw128]
+    psllw xmm2, 5
+    psllw xmm3, 5
+    movdqa xmm4, xmm1
+    movdqa xmm5, xmm2
+    movdqa xmm6, xmm3
+    ; h[n] = (src[2n + 1] - ((src[2n] + src[2n + 2]) >> 1)) >> 1
+    paddw xmm4, xmm6
+    psraw xmm4, 1
+    psubw xmm5, xmm4
+    psraw xmm5, 1
+    movdqa [rdi], xmm5                  ; out hi
+    movdqa xmm6, xmm5                   ; save hi
+    ; l[n] = src[2n] + ((h[n - 1] + h[n]) >> 1)
+    paddw xmm5, xmm7                    ; xmm7 = h[n - 1] kept from the previous row pair
+    psraw xmm5, 1
+    paddw xmm5, xmm1
+    movdqa [rdx], xmm5                  ; out lo
+    movdqa xmm7, xmm6                   ; save hi
+    ; move down
+    lea rsi, [rsi + 64 * 1 * 2]         ; 2 rows
+    lea rdi, [rdi + 64 * 2]             ; 1 row
+    lea rdx, [rdx + 64 * 2]             ; 1 row
+
+    dec cx
+    jnz loop2f
+    shr ecx, 16                         ; restore the strip counter
+
+    ; post - last row pair, no row below to load
+    movdqa xmm1, xmm3                   ; src[2n]
+    movq xmm2, [rsi + 64 * 1]           ; src[2n + 1]
+    punpcklbw xmm2, xmm0
+    psubw xmm2, [rel cw128]
+    psllw xmm2, 5
+    movdqa xmm4, xmm1
+    movdqa xmm5, xmm2
+    movdqa xmm6, xmm3                   ; xmm3 was not reloaded: src[2n] stands in for src[2n + 2]
+    ; h[n] = (src[2n + 1] - ((src[2n] + src[2n + 2]) >> 1)) >> 1
+    paddw xmm4, xmm6
+    psraw xmm4, 1
+    psubw xmm5, xmm4
+    psraw xmm5, 1
+    movdqa [rdi], xmm5                  ; out hi
+    ; l[n] = src[2n] + ((h[n - 1] + h[n]) >> 1)
+    paddw xmm5, xmm7
+    psraw xmm5, 1
+    paddw xmm5, xmm1
+    movdqa [rdx], xmm5                  ; out lo
+    ; move down
+    lea rsi, [rsi + 64 * 1 * 2]         ; 2 rows
+    lea rdi, [rdi + 64 * 2]             ; 1 row
+    lea rdx, [rdx + 64 * 2]             ; 1 row
+
+    ; move up - undo the 32 downward steps taken for this strip
+    lea rsi, [rsi - 64 * 1 * 64]
+    lea rdi, [rdi - 32 * 64 * 2]
+    lea rdx, [rdx - 32 * 64 * 2]
+
+    ; move right - next strip: 8 source bytes, 8 output words
+    lea rsi, [rsi + 8]
+    lea rdi, [rdi + 16]
+    lea rdx, [rdx + 16]
+
+    dec ecx
+    jnz loop1f
+
+    ret
+
+set_quants_hi:                          ; in: rax = 4 bit value the caller took from qtable; out: xmm8, xmm9; clobbers rax, rdx
+    sub rax, 6 - 5                      ; shift count = value - 1
+    movd xmm9, eax                      ; xmm9 = count used by psraw on the hi band
+    imul rax, 16                        ; byte offset of a 16 byte entry
+    lea rdx, [rel cwa0]                 ; cwa0 is defined outside this view
+    add rdx, rax
+    movdqa xmm8, [rdx]                  ; xmm8 = entry added before the shift (presumably a rounding term - TODO confirm)
+    ret
+
+set_quants_lo:                          ; in: rax = 4 bit value the caller took from qtable; out: xmm10, xmm11; clobbers rax, rdx
+    sub rax, 6 - 5                      ; shift count = value - 1
+    movd xmm11, eax                     ; xmm11 = count used by psraw on the lo band
+    imul rax, 16                        ; byte offset of a 16 byte entry
+    lea rdx, [rel cwa0]                 ; cwa0 is defined outside this view
+    add rdx, rax
+    movdqa xmm10, [rdx]                 ; xmm10 = entry added before the shift (presumably a rounding term - TODO confirm)
+    ret
+
+;The first six integer or pointer arguments are passed in registers
+;RDI, RSI, RDX, RCX, R8, and R9
+
+;int
+;rfxcodec_encode_dwt_shift_amd64_sse41(const char *qtable,
+;                                      unsigned char *in_buffer,
+;                                      short *out_buffer,
+;                                      short *work_buffer);
+
+;******************************************************************************
+%ifidn __OUTPUT_FORMAT__,elf64
+PROC rfxcodec_encode_dwt_shift_amd64_sse41
+%else
+PROC _rfxcodec_encode_dwt_shift_amd64_sse41
+%endif
+    ; save registers; the pushed argument registers are re-read from the stack below
+    push rbx
+    push rdx                            ; [rsp + 24] after all pushes: out_buffer
+    push rcx                            ; [rsp + 16]: work_buffer
+    push rsi                            ; [rsp + 8]: in_buffer
+    push rdi                            ; [rsp]: qtable
+    pxor xmm0, xmm0                     ; xmm0 = 0, zero source for punpcklbw in the 8 bit pass
+
+    ; vertical DWT to work buffer, level 1
+    mov rsi, [rsp + 8]                  ; src
+    mov rdi, [rsp + 16]                 ; dst hi
+    lea rdi, [rdi + 64 * 32 * 2]        ; dst hi
+    mov rdx, [rsp + 16]                 ; dst lo
+    call rfx_dwt_2d_encode_block_verti_8_64
+
+    ; horizontal DWT to out buffer, level 1, part 1
+    xor rax, rax
+    mov rdx, [rsp]
+    mov al, [rdx + 4]
+    and al, 0xF                         ; low nibble of qtable byte 4
+    call set_quants_hi
+    mov rsi, [rsp + 16]                 ; src
+    mov rdi, [rsp + 24]                 ; dst hi - HL1
+    mov rdx, [rsp + 24]                 ; dst lo - LL1
+    lea rdx, [rdx + 32 * 32 * 6]        ; dst lo - LL1
+    call rfx_dwt_2d_encode_block_horiz_16_64_no_lo
+
+    ; horizontal DWT to out buffer, level 1, part 2
+    xor rax, rax
+    mov rdx, [rsp]
+    mov al, [rdx + 4]
+    shr al, 4                           ; high nibble of qtable byte 4
+    call set_quants_hi
+    xor rax, rax
+    mov rdx, [rsp]
+    mov al, [rdx + 3]
+    shr al, 4                           ; high nibble of qtable byte 3
+    call set_quants_lo
+    mov rsi, [rsp + 16]                 ; src
+    lea rsi, [rsi + 64 * 32 * 2]        ; src
+    mov rdi, [rsp + 24]                 ; dst hi - HH1
+    lea rdi, [rdi + 32 * 32 * 4]        ; dst hi - HH1
+    mov rdx, [rsp + 24]                 ; dst lo - LH1
+    lea rdx, [rdx + 32 * 32 * 2]        ; dst lo - LH1
+    call rfx_dwt_2d_encode_block_horiz_16_64
+
+    ; vertical DWT to work buffer, level 2
+    mov rsi, [rsp + 24]                 ; src
+    lea rsi, [rsi + 32 * 32 * 6]        ; src - where LL1 was written above
+    mov rdi, [rsp + 16]                 ; dst hi
+    lea rdi, [rdi + 32 * 16 * 2]        ; dst hi
+    mov rdx, [rsp + 16]                 ; dst lo
+    call rfx_dwt_2d_encode_block_verti_16_32
+
+    ; horizontal DWT to out buffer, level 2, part 1
+    xor rax, rax
+    mov rdx, [rsp]
+    mov al, [rdx + 2]
+    shr al, 4                           ; high nibble of qtable byte 2
+    call set_quants_hi
+    mov rsi, [rsp + 16]                 ; src
+    ; 32 * 32 * 6 + 16 * 16 * 0 = 6144
+    mov rdi, [rsp + 24]                 ; dst hi - HL2
+    lea rdi, [rdi + 6144]               ; dst hi - HL2
+    ; 32 * 32 * 6 + 16 * 16 * 6 = 7680
+    mov rdx, [rsp + 24]                 ; dst lo - LL2
+    lea rdx, [rdx + 7680]               ; dst lo - LL2
+    call rfx_dwt_2d_encode_block_horiz_16_32_no_lo
+
+    ; horizontal DWT to out buffer, level 2, part 2
+    xor rax, rax
+    mov rdx, [rsp]
+    mov al, [rdx + 3]
+    and al, 0xF                         ; low nibble of qtable byte 3
+    call set_quants_hi
+    xor rax, rax
+    mov rdx, [rsp]
+    mov al, [rdx + 2]
+    and al, 0xF                         ; low nibble of qtable byte 2
+    call set_quants_lo
+    mov rsi, [rsp + 16]                 ; src
+    lea rsi, [rsi + 32 * 16 * 2]        ; src
+    ; 32 * 32 * 6 + 16 * 16 * 4 = 7168
+    mov rdi, [rsp + 24]                 ; dst hi - HH2
+    lea rdi, [rdi + 7168]               ; dst hi - HH2
+    ; 32 * 32 * 6 + 16 * 16 * 2 = 6656
+    mov rdx, [rsp + 24]                 ; dst lo - LH2
+    lea rdx, [rdx + 6656]               ; dst lo - LH2
+    call rfx_dwt_2d_encode_block_horiz_16_32
+
+    ; vertical DWT to work buffer, level 3
+    ; 32 * 32 * 6 + 16 * 16 * 6 = 7680
+    mov rsi, [rsp + 24]                 ; src
+    lea rsi, [rsi + 7680]               ; src - where LL2 was written above
+    mov rdi, [rsp + 16]                 ; dst hi
+    lea rdi, [rdi + 16 * 8 * 2]         ; dst hi
+    mov rdx, [rsp + 16]                 ; dst lo
+    call rfx_dwt_2d_encode_block_verti_16_16
+
+    ; horizontal DWT to out buffer, level 3, part 1
+    xor rax, rax
+    mov rdx, [rsp]
+    mov al, [rdx + 1]
+    and al, 0xF                         ; low nibble of qtable byte 1
+    call set_quants_hi
+    xor rax, rax
+    mov rdx, [rsp]
+    mov al, [rdx + 0]
+    and al, 0xF                         ; low nibble of qtable byte 0
+    call set_quants_lo
+    mov rsi, [rsp + 16]                 ; src
+    ; 32 * 32 * 6 + 16 * 16 * 6 + 8 * 8 * 0 = 7680
+    mov rdi, [rsp + 24]                 ; dst hi - HL3
+    lea rdi, [rdi + 7680]               ; dst hi - HL3
+    ; 32 * 32 * 6 + 16 * 16 * 6 + 8 * 8 * 6 = 8064
+    mov rdx, [rsp + 24]                 ; dst lo - LL3
+    lea rdx, [rdx + 8064]               ; dst lo - LL3
+    call rfx_dwt_2d_encode_block_horiz_16_16
+
+    ; horizontal DWT to out buffer, level 3, part 2
+    xor rax, rax
+    mov rdx, [rsp]
+    mov al, [rdx + 1]
+    shr al, 4                           ; high nibble of qtable byte 1
+    call set_quants_hi
+    xor rax, rax
+    mov rdx, [rsp]
+    mov al, [rdx + 0]
+    shr al, 4                           ; high nibble of qtable byte 0
+    call set_quants_lo
+    mov rsi, [rsp + 16]                 ; src
+    lea rsi, [rsi + 16 * 8 * 2]         ; src
+    ; 32 * 32 * 6 + 16 * 16 * 6 + 8 * 8 * 4 = 7936
+    mov rdi, [rsp + 24]                 ; dst hi - HH3
+    lea rdi, [rdi + 7936]               ; dst hi - HH3
+    ; 32 * 32 * 6 + 16 * 16 * 6 + 8 * 8 * 2 = 7808
+    mov rdx, [rsp + 24]                 ; dst lo - LH3
+    lea rdx, [rdx + 7808]               ; dst lo - LH3
+    call rfx_dwt_2d_encode_block_horiz_16_16
+
+    mov rax, 0                          ; return value: always 0
+    ; restore registers
+    pop rdi
+    pop rsi
+    pop rcx
+    pop rdx
+    pop rbx
+    ret
+    align 16
+
diff --git a/src/amd64/rfxdwt_amd64_sse2.asm b/src/amd64/rfxdwt_amd64_sse2.asm
deleted file mode 100644
index 4648371..0000000
--- a/src/amd64/rfxdwt_amd64_sse2.asm
+++ /dev/null
@@ -1,21 +0,0 @@
-
-section .data
-    const1 times 8 dw 1
-
-%macro PROC 1
-    align 16
-    global %1
-    %1:
-%endmacro
-
-;int
-;dwt_shift_amd64_sse2(const int* qtable, sint8* src, sint16* dst, sint16* temp)
-
-PROC dwt_shift_amd64_sse2
-    ; save registers
-    push rbx
-    mov rax, 0
-    pop rbx
-    ret
-    align 16
-
diff --git a/src/amd64/rfxrlgr1_amd64.asm b/src/amd64/rfxrlgr1_amd64.asm
deleted file mode 100644
index 7c80678..0000000
--- a/src/amd64/rfxrlgr1_amd64.asm
+++ /dev/null
@@ -1,21 +0,0 @@
-
-section .data
-    const1 times 8 dw 1
-
-%macro PROC 1
-    align 16
-    global %1
-    %1:
-%endmacro
-
-;int
-;diff_rlgr1_amd64(sint16 *co, int num_co, uint8 *dst, int dst_bytes);
-
-PROC diff_rlgr1_amd64
-    ; save registers
-    push rbx
-    mov rax, 0
-    pop rbx
-    ret
-    align 16
-
diff --git a/src/amd64/rfxrlgr3_amd64.asm b/src/amd64/rfxrlgr3_amd64.asm
deleted file mode 100644
index 3270760..0000000
--- a/src/amd64/rfxrlgr3_amd64.asm
+++ /dev/null
@@ -1,21 +0,0 @@
-
-section .data
-    const1 times 8 dw 1
-
-%macro PROC 1
-    align 16
-    global %1
-    %1:
-%endmacro
-
-;int
-;diff_rlgr3_amd64(sint16 *co, int num_co, uint8 *dst, int dst_bytes);
-
-PROC diff_rlgr3_amd64
-    ; save registers
-    push rbx
-    mov rax, 0
-    pop rbx
-    ret
-    align 16
-
diff --git a/src/nasm_lt.sh b/src/nasm_lt.sh
new file mode 100755
index 0000000..6cd7329
--- /dev/null
+++ b/src/nasm_lt.sh
@@ -0,0 +1,57 @@
+#! /bin/sh
+command=""    # command line being assembled; presumably the first argument is the assembler itself - TODO confirm
+infile=""     # last *.asm argument seen
+o_opt=no      # becomes yes once an -o option is seen
+pic=no        # becomes yes once -DPIC has been added
+while [ $# -gt 0 ]; do
+    case "$1" in
+        -DPIC|-fPIC|-fpic|-Kpic|-KPIC)
+            if [ "$pic" != "yes" ] ; then    # add -DPIC only once
+                command="$command -DPIC"
+                pic=yes
+            fi
+            ;;
+        -f|-fbin|-faout|-faoutb|-fcoff|-felf|-felf64|-fas86| \
+        -fobj|-fwin32|-fwin64|-frdf|-fieee|-fmacho|-fmacho64)
+            # it's a file format specifier for nasm; pass it through.
+            command="$command $1"
+            ;;
+        -f*)
+            # maybe a code-generation flag for gcc; it is dropped.
+            ;;
+        -[Ii]*)
+            incdir=`echo "$1" | sed 's/^-[Ii]//'`    # directory glued to the option, if any
+            if [ "x$incdir" = x -a "x$2" != x ] ; then    # bare -I: look at the next argument
+                case "$2" in
+                    -*) ;;    # next argument is another option, not a directory
+                    *) incdir="$2"; shift;;
+                esac
+            fi
+            if [ "x$incdir" != x ] ; then
+                # In the case of NASM, the trailing slash is necessary.
+                incdir=`echo "$incdir" | sed 's%/*$%/%'`    # force exactly one trailing slash
+                command="$command -I$incdir"
+            fi
+            ;;
+        -o*)
+            o_opt=yes    # suppresses the default -o added after the loop
+            command="$command $1"
+            ;;
+        *.asm)
+            infile=$1
+            command="$command $1"
+            ;;
+        *)
+            command="$command $1"    # anything else is passed through unchanged
+            ;;
+    esac
+    shift
+done
+if [ "$o_opt" != yes ] ; then
+    # By default, NASM creates an output file
+    # in the same directory as the input file.
+    outfile="-o `echo $infile | sed -e 's%^.*/%%' -e 's%\.[^.]*$%%'`.o"    # input name without directory and extension, plus .o
+    command="$command $outfile"
+fi
+echo $command    # show the command before running it
+exec $command    # unquoted on purpose: the string is split into words here, so arguments containing whitespace are not preserved
diff --git a/src/rfxcommon.h b/src/rfxcommon.h
index 6b4c6f8..74514e9 100644
--- a/src/rfxcommon.h
+++ b/src/rfxcommon.h
@@ -1,7 +1,7 @@
 /**
  * RFX codec
  *
- * Copyright 2014 Jay Sorg <jay.sorg@gmail.com>
+ * Copyright 2014-2015 Jay Sorg <jay.sorg@gmail.com>
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -23,6 +23,8 @@
 #define MAX(_val1, _val2) (_val1) > (_val2) ? (_val1) : (_val2)
 #define MINMAX(_v, _l, _h) ((_v) < (_l) ? (_l) : ((_v) > (_h) ? (_h) : (_v)))
 
+#define DWT_FACTOR 5
+
 typedef signed char sint8;
 typedef unsigned char uint8;
 typedef signed short sint16;
diff --git a/src/rfxcompose.c b/src/rfxcompose.c
index d3af2dd..f208a32 100644
--- a/src/rfxcompose.c
+++ b/src/rfxcompose.c
@@ -3,6 +3,7 @@
  * RemoteFX Codec Library
  *
  * Copyright 2011 Vic Lee
+ * Copyright 2015 Jay Sorg <jay.sorg@gmail.com>
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -28,12 +29,16 @@
 #include "rfxconstants.h"
 #include "rfxencode_tile.h"
 
+#define LLOG_LEVEL 1
+#define LLOGLN(_level, _args) \
+    do { if (_level < LLOG_LEVEL) { printf _args ; printf("\n"); } } while (0)
+
 /*
  * LL3, LH3, HL3, HH3, LH2, HL2, HH2, LH1, HL1, HH1
  */
-static const int g_rfx_default_quantization_values[] =
+static const unsigned char g_rfx_default_quantization_values[] =
 {
-    6, 6, 6, 6, 7, 7, 8, 8, 8, 9
+    0x66, 0x66, 0x77, 0x88, 0x98
 };
 
 /******************************************************************************/
@@ -168,7 +173,7 @@ rfx_compose_message_frame_begin(struct rfxencode* enc, STREAM* s)
 /******************************************************************************/
 static int
 rfx_compose_message_region(struct rfxencode* enc, STREAM* s,
-                           struct rfx_rect *regions, int num_regions)
+                           const struct rfx_rect *regions, int num_regions)
 {
     int size;
     int i;
@@ -200,7 +205,7 @@ rfx_compose_message_region(struct rfxencode* enc, STREAM* s,
 static int
 rfx_compose_message_tile_yuv(struct rfxencode *enc, STREAM *s,
                              char *tile_data, int tile_width, int tile_height,
-                             int stride_bytes, const int *quantVals,
+                             int stride_bytes, const char *quantVals,
                              int quantIdxY, int quantIdxCb, int quantIdxCr,
                              int xIdx, int yIdx)
 {
@@ -221,9 +226,9 @@ rfx_compose_message_tile_yuv(struct rfxencode *enc, STREAM *s,
     stream_seek(s, 6); /* YLen, CbLen, CrLen */
     if (rfx_encode_yuv(enc, tile_data, tile_width, tile_height,
                        stride_bytes,
-                       quantVals + quantIdxY * 10,
-                       quantVals + quantIdxCb * 10,
-                       quantVals + quantIdxCr * 10,
+                       quantVals + quantIdxY * 5,
+                       quantVals + quantIdxCb * 5,
+                       quantVals + quantIdxCr * 5,
                        s, &YLen, &CbLen, &CrLen) != 0)
     {
         return 1;
@@ -239,11 +244,56 @@ rfx_compose_message_tile_yuv(struct rfxencode *enc, STREAM *s,
     return 0;
 }
 
+/******************************************************************************/
+static int /* 0 on success, 1 when rfx_encode_yuva reports failure */
+rfx_compose_message_tile_yuva(struct rfxencode *enc, STREAM *s,
+                              char *tile_data, int tile_width, int tile_height,
+                              int stride_bytes, const char *quantVals,
+                              int quantIdxY, int quantIdxCb, int quantIdxCr,
+                              int xIdx, int yIdx)
+{
+    int YLen = 0; /* per-plane lengths filled in by rfx_encode_yuva */
+    int CbLen = 0;
+    int CrLen = 0;
+    int ALen = 0;
+    int start_pos; /* stream position of the start of this block */
+    int end_pos; /* stream position after the encoded data */
+
+    start_pos = stream_get_pos(s);
+    stream_write_uint16(s, CBT_TILE); /* BlockT.blockType */
+    stream_seek_uint32(s); /* set BlockT.blockLen later */
+    stream_write_uint8(s, quantIdxY);
+    stream_write_uint8(s, quantIdxCb);
+    stream_write_uint8(s, quantIdxCr);
+    stream_write_uint16(s, xIdx);
+    stream_write_uint16(s, yIdx);
+    stream_seek(s, 8); /* reserve 4 x uint16: YLen, CbLen, CrLen, ALen */
+    if (rfx_encode_yuva(enc, tile_data, tile_width, tile_height,
+                        stride_bytes,
+                        quantVals + quantIdxY * 5, /* quant sets are 5 bytes apart */
+                        quantVals + quantIdxCb * 5,
+                        quantVals + quantIdxCr * 5,
+                        s, &YLen, &CbLen, &CrLen, &ALen) != 0)
+    {
+        return 1; /* the stream is left as is, header fields not back-patched */
+    }
+    end_pos = stream_get_pos(s);
+    stream_set_pos(s, start_pos + 2); /* back to the blockLen field */
+    stream_write_uint32(s, 19 + YLen + CbLen + CrLen + ALen); /* BlockT.blockLen; NOTE(review): the fixed part written here is 13 + 8 = 21 bytes, 19 is the size with three length fields - confirm against the decoder */
+    stream_set_pos(s, start_pos + 13); /* 2 + 4 + 1 + 1 + 1 + 2 + 2 bytes precede the length fields */
+    stream_write_uint16(s, YLen);
+    stream_write_uint16(s, CbLen);
+    stream_write_uint16(s, CrLen);
+    stream_write_uint16(s, ALen);
+    stream_set_pos(s, end_pos); /* continue after the encoded data */
+    return 0;
+}
+
 /******************************************************************************/
 static int
 rfx_compose_message_tile_rgb(struct rfxencode *enc, STREAM *s,
                              char *tile_data, int tile_width, int tile_height,
-                             int stride_bytes, const int *quantVals,
+                             int stride_bytes, const char *quantVals,
                              int quantIdxY, int quantIdxCb, int quantIdxCr,
                              int xIdx, int yIdx)
 {
@@ -264,9 +314,9 @@ rfx_compose_message_tile_rgb(struct rfxencode *enc, STREAM *s,
     stream_seek(s, 6); /* YLen, CbLen, CrLen */
     if (rfx_encode_rgb(enc, tile_data, tile_width, tile_height,
                        stride_bytes,
-                       quantVals + quantIdxY * 10,
-                       quantVals + quantIdxCb * 10,
-                       quantVals + quantIdxCr * 10,
+                       quantVals + quantIdxY * 5,
+                       quantVals + quantIdxCb * 5,
+                       quantVals + quantIdxCr * 5,
                        s, &YLen, &CbLen, &CrLen) != 0)
     {
         return 1;
@@ -282,21 +332,68 @@ rfx_compose_message_tile_rgb(struct rfxencode *enc, STREAM *s,
     return 0;
 }
 
+/******************************************************************************/
+static int /* 0 on success, 1 when rfx_encode_argb reports failure */
+rfx_compose_message_tile_argb(struct rfxencode *enc, STREAM *s,
+                              char *tile_data, int tile_width, int tile_height,
+                              int stride_bytes, const char *quantVals,
+                              int quantIdxY, int quantIdxCb, int quantIdxCr,
+                              int xIdx, int yIdx)
+{
+    int YLen = 0; /* per-plane lengths filled in by rfx_encode_argb */
+    int CbLen = 0;
+    int CrLen = 0;
+    int ALen = 0;
+    int start_pos; /* stream position of the start of this block */
+    int end_pos; /* stream position after the encoded data */
+
+    LLOGLN(10, ("rfx_compose_message_tile_argb:")); /* level 10 is not below LLOG_LEVEL (1), so nothing is printed */
+    start_pos = stream_get_pos(s);
+    stream_write_uint16(s, CBT_TILE); /* BlockT.blockType */
+    stream_seek_uint32(s); /* set BlockT.blockLen later */
+    stream_write_uint8(s, quantIdxY);
+    stream_write_uint8(s, quantIdxCb);
+    stream_write_uint8(s, quantIdxCr);
+    stream_write_uint16(s, xIdx);
+    stream_write_uint16(s, yIdx);
+    stream_seek(s, 8); /* reserve 4 x uint16: YLen, CbLen, CrLen, ALen */
+    if (rfx_encode_argb(enc, tile_data, tile_width, tile_height,
+                        stride_bytes,
+                        quantVals + quantIdxY * 5, /* quant sets are 5 bytes apart */
+                        quantVals + quantIdxCb * 5,
+                        quantVals + quantIdxCr * 5,
+                        s, &YLen, &CbLen, &CrLen, &ALen) != 0)
+    {
+        LLOGLN(10, ("rfx_compose_message_tile_argb: rfx_encode_argb failed"));
+        return 1; /* the stream is left as is, header fields not back-patched */
+    }
+    end_pos = stream_get_pos(s);
+    stream_set_pos(s, start_pos + 2); /* back to the blockLen field */
+    stream_write_uint32(s, 19 + YLen + CbLen + CrLen + ALen); /* BlockT.blockLen; NOTE(review): the fixed part written here is 13 + 8 = 21 bytes, 19 is the size with three length fields - confirm against the decoder */
+    stream_set_pos(s, start_pos + 13); /* 2 + 4 + 1 + 1 + 1 + 2 + 2 bytes precede the length fields */
+    stream_write_uint16(s, YLen);
+    stream_write_uint16(s, CbLen);
+    stream_write_uint16(s, CrLen);
+    stream_write_uint16(s, ALen);
+    stream_set_pos(s, end_pos); /* continue after the encoded data */
+    return 0;
+}
+
 /******************************************************************************/
 static int
 rfx_compose_message_tileset(struct rfxencode* enc, STREAM* s,
                             char* buf, int width, int height,
                             int stride_bytes,
-                            struct rfx_tile *tiles, int num_tiles,
-                            const int *quants, int num_quants)
+                            const struct rfx_tile *tiles, int num_tiles,
+                            const char *quants, int num_quants,
+                            int flags)
 {
     int size;
     int start_pos;
     int end_pos;
     int index;
     int numQuants;
-    const int *quantVals;
-    const int *quantValsPtr;
+    const char *quantVals;
     int quantIdxY;
     int quantIdxCb;
     int quantIdxCr;
@@ -308,10 +405,11 @@ rfx_compose_message_tileset(struct rfxencode* enc, STREAM* s,
     int cy;
     char *tile_data;
 
+    LLOGLN(10, ("rfx_compose_message_tileset:"));
     if (quants == 0)
     {
         numQuants = 1;
-        quantVals = g_rfx_default_quantization_values;
+        quantVals = (const char *) g_rfx_default_quantization_values;
     }
     else
     {
@@ -321,7 +419,15 @@ rfx_compose_message_tileset(struct rfxencode* enc, STREAM* s,
     numTiles = num_tiles;
     size = 22 + numQuants * 5;
     start_pos = stream_get_pos(s);
-    stream_write_uint16(s, WBT_EXTENSION); /* CodecChannelT.blockType */
+    if (flags & RFX_FLAGS_ALPHAV1)
+    {
+        LLOGLN(10, ("rfx_compose_message_tileset: RFX_FLAGS_ALPHAV1 set"));
+        stream_write_uint16(s, WBT_EXTENSION_PLUS); /* CodecChannelT.blockType */
+    }
+    else
+    {
+        stream_write_uint16(s, WBT_EXTENSION); /* CodecChannelT.blockType */
+    }
     stream_seek_uint32(s); /* set CodecChannelT.blockLen later */
     stream_write_uint8(s, 1); /* CodecChannelT.codecId */
     stream_write_uint8(s, 0); /* CodecChannelT.channelId */
@@ -332,54 +438,100 @@ rfx_compose_message_tileset(struct rfxencode* enc, STREAM* s,
     stream_write_uint8(s, 0x40); /* tileSize */
     stream_write_uint16(s, numTiles); /* numTiles */
     stream_seek_uint32(s); /* set tilesDataSize later */
-    quantValsPtr = quantVals;
-    for (index = 0; index < numQuants * 5; index++)
-    {
-        stream_write_uint8(s, quantValsPtr[0] + (quantValsPtr[1] << 4));
-        quantValsPtr += 2;
-    }
+    memcpy(s->p, quantVals, numQuants * 5);
+    s->p += numQuants * 5;
     end_pos = stream_get_pos(s);
     if (enc->format == RFX_FORMAT_YUV)
     {
-        for (index = 0; index < numTiles; index++)
+        if (flags & RFX_FLAGS_ALPHAV1)
+        {
+            for (index = 0; index < numTiles; index++)
+            {
+                x = tiles[index].x;
+                y = tiles[index].y;
+                cx = tiles[index].cx;
+                cy = tiles[index].cy;
+                quantIdxY = tiles[index].quant_y;
+                quantIdxCb = tiles[index].quant_cb;
+                quantIdxCr = tiles[index].quant_cr;
+                tile_data = buf + (y << 8) * (stride_bytes >> 8) + (x << 8);
+                if (rfx_compose_message_tile_yuva(enc, s,
+                                                  tile_data, cx, cy, stride_bytes,
+                                                  quantVals,
+                                                  quantIdxY, quantIdxCb, quantIdxCr,
+                                                  x / 64, y / 64) != 0)
+                {
+                    return 1;
+                }
+            }
+        }
+        else
         {
-            x = tiles[index].x;
-            y = tiles[index].y;
-            cx = tiles[index].cx;
-            cy = tiles[index].cy;
-            quantIdxY = tiles[index].quant_y;
-            quantIdxCb = tiles[index].quant_cb;
-            quantIdxCr = tiles[index].quant_cr;
-            tile_data = buf + (y << 8) * (stride_bytes >> 8) + (x << 8);
-            if (rfx_compose_message_tile_yuv(enc, s,
-                                             tile_data, cx, cy, stride_bytes,
-                                             quantVals,
-                                             quantIdxY, quantIdxCb, quantIdxCr,
-                                             x / 64, y / 64) != 0)
+            for (index = 0; index < numTiles; index++)
             {
-                return 1;
+                x = tiles[index].x;
+                y = tiles[index].y;
+                cx = tiles[index].cx;
+                cy = tiles[index].cy;
+                quantIdxY = tiles[index].quant_y;
+                quantIdxCb = tiles[index].quant_cb;
+                quantIdxCr = tiles[index].quant_cr;
+                tile_data = buf + (y << 8) * (stride_bytes >> 8) + (x << 8);
+                if (rfx_compose_message_tile_yuv(enc, s,
+                                                 tile_data, cx, cy, stride_bytes,
+                                                 quantVals,
+                                                 quantIdxY, quantIdxCb, quantIdxCr,
+                                                 x / 64, y / 64) != 0)
+                {
+                    return 1;
+                }
             }
         }
     }
     else
     {
-        for (index = 0; index < numTiles; index++)
+        if (flags & RFX_FLAGS_ALPHAV1)
+        {
+            for (index = 0; index < numTiles; index++)
+            {
+                x = tiles[index].x;
+                y = tiles[index].y;
+                cx = tiles[index].cx;
+                cy = tiles[index].cy;
+                quantIdxY = tiles[index].quant_y;
+                quantIdxCb = tiles[index].quant_cb;
+                quantIdxCr = tiles[index].quant_cr;
+                tile_data = buf + y * stride_bytes + x * (enc->bits_per_pixel / 8);
+                if (rfx_compose_message_tile_argb(enc, s,
+                                                  tile_data, cx, cy, stride_bytes,
+                                                  quantVals,
+                                                  quantIdxY, quantIdxCb, quantIdxCr,
+                                                  x / 64, y / 64) != 0)
+                {
+                    return 1;
+                }
+            }
+        }
+        else
         {
-            x = tiles[index].x;
-            y = tiles[index].y;
-            cx = tiles[index].cx;
-            cy = tiles[index].cy;
-            quantIdxY = tiles[index].quant_y;
-            quantIdxCb = tiles[index].quant_cb;
-            quantIdxCr = tiles[index].quant_cr;
-            tile_data = buf + y * stride_bytes + x * (enc->bits_per_pixel / 8);
-            if (rfx_compose_message_tile_rgb(enc, s,
-                                             tile_data, cx, cy, stride_bytes,
-                                             quantVals,
-                                             quantIdxY, quantIdxCb, quantIdxCr,
-                                             x / 64, y / 64) != 0)
+            for (index = 0; index < numTiles; index++)
             {
-                return 1;
+                x = tiles[index].x;
+                y = tiles[index].y;
+                cx = tiles[index].cx;
+                cy = tiles[index].cy;
+                quantIdxY = tiles[index].quant_y;
+                quantIdxCb = tiles[index].quant_cb;
+                quantIdxCr = tiles[index].quant_cr;
+                tile_data = buf + y * stride_bytes + x * (enc->bits_per_pixel / 8);
+                if (rfx_compose_message_tile_rgb(enc, s,
+                                                 tile_data, cx, cy, stride_bytes,
+                                                 quantVals,
+                                                 quantIdxY, quantIdxCb, quantIdxCr,
+                                                 x / 64, y / 64) != 0)
+                {
+                    return 1;
+                }
             }
         }
     }
@@ -412,10 +564,10 @@ rfx_compose_message_frame_end(struct rfxencode* enc, STREAM* s)
 /******************************************************************************/
 int
 rfx_compose_message_data(struct rfxencode* enc, STREAM* s,
-                         struct rfx_rect *regions, int num_regions,
+                         const struct rfx_rect *regions, int num_regions,
                          char *buf, int width, int height, int stride_bytes,
-                         struct rfx_tile *tiles, int num_tiles,
-                         const int *quants, int num_quants)
+                         const struct rfx_tile *tiles, int num_tiles,
+                         const char *quants, int num_quants, int flags)
 {
     if (rfx_compose_message_frame_begin(enc, s) != 0)
     {
@@ -426,7 +578,8 @@ rfx_compose_message_data(struct rfxencode* enc, STREAM* s,
         return 1;
     }
     if (rfx_compose_message_tileset(enc, s, buf, width, height, stride_bytes,
-                                    tiles, num_tiles, quants, num_quants) != 0)
+                                    tiles, num_tiles, quants, num_quants,
+                                    flags) != 0)
     {
         return 1;
     }
diff --git a/src/rfxcompose.h b/src/rfxcompose.h
index aab4770..7d30233 100644
--- a/src/rfxcompose.h
+++ b/src/rfxcompose.h
@@ -1,7 +1,7 @@
 /**
  * RFX codec encoder
  *
- * Copyright 2014 Jay Sorg <jay.sorg@gmail.com>
+ * Copyright 2014-2015 Jay Sorg <jay.sorg@gmail.com>
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -25,9 +25,9 @@ int
 rfx_compose_message_header(struct rfxencode* enc, STREAM* s);
 int
 rfx_compose_message_data(struct rfxencode* enc, STREAM* s,
-                         struct rfx_rect *regions, int num_regions,
+                         const struct rfx_rect *regions, int num_regions,
                          char *buf, int width, int height, int stride_bytes,
-                         struct rfx_tile *tiles, int num_tiles,
-                         const int *quants, int num_quants);
+                         const struct rfx_tile *tiles, int num_tiles,
+                         const char *quants, int num_quants, int flags);
 
 #endif
diff --git a/src/rfxconstants.h b/src/rfxconstants.h
index 05cb18d..770fccb 100644
--- a/src/rfxconstants.h
+++ b/src/rfxconstants.h
@@ -39,6 +39,7 @@ enum _RLGR_MODE
 #define WBT_FRAME_END           0xCCC5
 #define WBT_REGION              0xCCC6
 #define WBT_EXTENSION           0xCCC7
+#define WBT_EXTENSION_PLUS      0xDDD7
 #define CBT_REGION              0xCAC1
 #define CBT_TILESET             0xCAC2
 #define CBT_TILE                0xCAC3
diff --git a/src/rfxencode.c b/src/rfxencode.c
index 4ad57f8..9bbf103 100644
--- a/src/rfxencode.c
+++ b/src/rfxencode.c
@@ -1,7 +1,7 @@
 /**
  * RFX codec encoder
  *
- * Copyright 2014 Jay Sorg <jay.sorg@gmail.com>
+ * Copyright 2014-2015 Jay Sorg <jay.sorg@gmail.com>
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -28,103 +28,46 @@
 #include "rfxconstants.h"
 #include "rfxencode_tile.h"
 
-/******************************************************************************/
-static void
-cpuid(int func, int *eax, int *ebx, int *ecx, int *edx)
-{
-    *eax = 0;
-    *ebx = 0;
-    *ecx = 0;
-    *edx = 0;
-#ifdef __GNUC__
-#if defined(__i386__) || defined(__x86_64__)
-    *eax = func;
-    __asm volatile
-        (
-            "mov %%ebx, %%edi;"
-            "cpuid;"
-            "mov %%ebx, %%esi;"
-            "mov %%edi, %%ebx;"
-            :"+a" (*eax), "=S" (*ebx), "=c" (*ecx), "=d" (*edx)
-            : :"edi"
-        );
-#endif
+#ifdef RFX_USE_ACCEL_X86
+#include "x86/funcs_x86.h"
 #endif
-}
-
-#if 0
-inline unsigned int get_cpu_feature_flags()
-{
-    unsigned int features;
-
-    __asm
-    {
-        // Save registers
-        push    eax
-        push    ebx
-        push    ecx
-        push    edx
-
-        // Get the feature flags (eax=1) from edx
-        mov     eax, 1
-        cpuid
-        mov     features, edx
-
-        // Restore registers
-        pop     edx
-        pop     ecx
-        pop     ebx
-        pop     eax
-    }
-
-    return features;
-}
-
-#define cpuid(func,a,b,c,d)\
-	asm {\
-	mov	eax, func\
-	cpuid\
-	mov	a, eax\
-	mov	b, ebx\
-	mov	c, ecx\
-	mov	d, edx\
-	}
 
-#endif
-
-// http://softpixel.com/~cwright/programming/simd/cpuid.php
-
-#define SSE4_1_FLAG     0x080000
-#define SSE4_2_FLAG     0x100000
-
-/*
-Function 0x80000001:
-bit (edx) feature
-22        AMD MMX Extensions
-30        3DNow!2
-31        3DNow! 
-*/ 
-
-#if 0
-#define cpuid(_func, _ax, _bx, _cx, _dx) \
-    __asm volatile ("cpuid": \
-    "=a" (_ax), "=b" (_bx), "=c" (_cx), "=d" (_dx) : "a" (_func));
+#ifdef RFX_USE_ACCEL_AMD64
+#include "amd64/funcs_amd64.h"
 #endif
 
 /******************************************************************************/
-void *
-rfxcodec_encode_create(int width, int height, int format, int flags)
+int
+rfxcodec_encode_create_ex(int width, int height, int format, int flags,
+                          void **handle)
 {
     struct rfxencode *enc;
-    int ax, bx, cx, dx;
+    int ax;
+    int bx;
+    int cx;
+    int dx;
 
     enc = (struct rfxencode *) malloc(sizeof(struct rfxencode));
     if (enc == 0)
     {
-        return 0;
+        return 1;
     }
     memset(enc, 0, sizeof(struct rfxencode));
-    cpuid(1, &ax, &bx, &cx, &dx);
+
+    enc->dwt_buffer = (sint16*)(((size_t)(enc->dwt_buffer_a)) & ~15);
+    enc->dwt_buffer1 = (sint16*)(((size_t)(enc->dwt_buffer1_a)) & ~15);
+    enc->dwt_buffer2 = (sint16*)(((size_t)(enc->dwt_buffer2_a)) & ~15);
+
+#if defined(RFX_USE_ACCEL_X86)
+    cpuid_x86(1, 0, &ax, &bx, &cx, &dx);
+#elif defined(RFX_USE_ACCEL_AMD64)
+    cpuid_amd64(1, 0, &ax, &bx, &cx, &dx);
+#else
+    ax = 0;
+    bx = 0;
+    cx = 0;
+    dx = 0;
+#endif
     if (dx & (1 << 26)) /* SSE 2 */
     {
         printf("rfxcodec_encode_create: got sse2\n");
@@ -150,7 +93,16 @@ rfxcodec_encode_create(int width, int height, int format, int flags)
         printf("rfxcodec_encode_create: got popcnt\n");
         enc->got_popcnt = 1;
     }
-    cpuid(0x80000001, &ax, &bx, &cx, &dx);
+#if defined(RFX_USE_ACCEL_X86)
+    cpuid_x86(0x80000001, 0, &ax, &bx, &cx, &dx);
+#elif defined(RFX_USE_ACCEL_AMD64)
+    cpuid_amd64(0x80000001, 0, &ax, &bx, &cx, &dx);
+#else
+    ax = 0;
+    bx = 0;
+    cx = 0;
+    dx = 0;
+#endif
     if (cx & (1 << 5)) /* lzcnt */
     {
         printf("rfxcodec_encode_create: got lzcnt\n");
@@ -169,7 +121,7 @@ rfxcodec_encode_create(int width, int height, int format, int flags)
     {
         enc->mode = RLGR1;
     }
-    switch (format) 
+    switch (format)
     {
         case RFX_FORMAT_BGRA:
             enc->bits_per_pixel = 32;
@@ -188,7 +140,7 @@ rfxcodec_encode_create(int width, int height, int format, int flags)
             break;
         default:
             free(enc);
-            return NULL;
+            return 2;
     }
     enc->format = format;
     /* assign encoding functions */
@@ -196,29 +148,133 @@ rfxcodec_encode_create(int width, int height, int format, int flags)
     {
         if (enc->mode == RLGR3)
         {
+            printf("rfxcodec_encode_create: rfx_encode set to rfx_encode_component_rlgr3\n");
             enc->rfx_encode = rfx_encode_component_rlgr3; /* rfxencode_tile.c */
         }
         else
         {
+            printf("rfxcodec_encode_create: rfx_encode set to rfx_encode_component_rlgr1\n");
             enc->rfx_encode = rfx_encode_component_rlgr1; /* rfxencode_tile.c */
         }
     }
     else
     {
-#if defined(RFX_USE_ACCEL) && RFX_USE_ACCEL
-        enc->rfx_encode = rfx_encode_component_x86_sse4; /* rfxencode_tile.c */
+#if defined(RFX_USE_ACCEL_X86)
+        if (enc->got_sse41)
+        {
+            if (enc->mode == RLGR3)
+            {
+                printf("rfxcodec_encode_create: rfx_encode set to rfx_encode_component_rlgr3_x86_sse41\n");
+                enc->rfx_encode = rfx_encode_component_rlgr3_x86_sse41; /* rfxencode_tile.c */
+            }
+            else
+            {
+                printf("rfxcodec_encode_create: rfx_encode set to rfx_encode_component_rlgr1_x86_sse41\n");
+                enc->rfx_encode = rfx_encode_component_rlgr1_x86_sse41; /* rfxencode_tile.c */
+            }
+        }
+        else if (enc->got_sse2)
+        {
+            if (enc->mode == RLGR3)
+            {
+                printf("rfxcodec_encode_create: rfx_encode set to rfx_encode_component_rlgr3_x86_sse2\n");
+                enc->rfx_encode = rfx_encode_component_rlgr3_x86_sse2; /* rfxencode_tile.c */
+            }
+            else
+            {
+                printf("rfxcodec_encode_create: rfx_encode set to rfx_encode_component_rlgr1_x86_sse2\n");
+                enc->rfx_encode = rfx_encode_component_rlgr1_x86_sse2; /* rfxencode_tile.c */
+            }
+        }
+        else
+        {
+            if (enc->mode == RLGR3)
+            {
+                printf("rfxcodec_encode_create: rfx_encode set to rfx_encode_component_rlgr3\n");
+                enc->rfx_encode = rfx_encode_component_rlgr3; /* rfxencode_tile.c */
+            }
+            else
+            {
+                printf("rfxcodec_encode_create: rfx_encode set to rfx_encode_component_rlgr1\n");
+                enc->rfx_encode = rfx_encode_component_rlgr1; /* rfxencode_tile.c */
+            }
+        }
+#elif defined(RFX_USE_ACCEL_AMD64)
+        if (enc->got_sse41)
+        {
+            if (enc->mode == RLGR3)
+            {
+                printf("rfxcodec_encode_create: rfx_encode set to rfx_encode_component_rlgr3_amd64_sse41\n");
+                enc->rfx_encode = rfx_encode_component_rlgr3_amd64_sse41; /* rfxencode_tile.c */
+            }
+            else
+            {
+                printf("rfxcodec_encode_create: rfx_encode set to rfx_encode_component_rlgr1_amd64_sse41\n");
+                enc->rfx_encode = rfx_encode_component_rlgr1_amd64_sse41; /* rfxencode_tile.c */
+            }
+        }
+        else if (enc->got_sse2)
+        {
+            if (enc->mode == RLGR3)
+            {
+                printf("rfxcodec_encode_create: rfx_encode set to rfx_encode_component_rlgr3_amd64_sse2\n");
+                enc->rfx_encode = rfx_encode_component_rlgr3_amd64_sse2; /* rfxencode_tile.c */
+            }
+            else
+            {
+                printf("rfxcodec_encode_create: rfx_encode set to rfx_encode_component_rlgr1_amd64_sse2\n");
+                enc->rfx_encode = rfx_encode_component_rlgr1_amd64_sse2; /* rfxencode_tile.c */
+            }
+        }
+        else
+        {
+            if (enc->mode == RLGR3)
+            {
+                printf("rfxcodec_encode_create: rfx_encode set to rfx_encode_component_rlgr3\n");
+                enc->rfx_encode = rfx_encode_component_rlgr3; /* rfxencode_tile.c */
+            }
+            else
+            {
+                printf("rfxcodec_encode_create: rfx_encode set to rfx_encode_component_rlgr1\n");
+                enc->rfx_encode = rfx_encode_component_rlgr1; /* rfxencode_tile.c */
+            }
+        }
 #else
         if (enc->mode == RLGR3)
         {
+            printf("rfxcodec_encode_create: rfx_encode set to rfx_encode_component_rlgr3\n");
             enc->rfx_encode = rfx_encode_component_rlgr3; /* rfxencode_tile.c */
         }
         else
         {
+            printf("rfxcodec_encode_create: rfx_encode set to rfx_encode_component_rlgr1\n");
             enc->rfx_encode = rfx_encode_component_rlgr1; /* rfxencode_tile.c */
         }
 #endif
     }
-    return enc; 
+    if (ax == 0)
+    {
+    }
+    if (bx == 0)
+    {
+    }
+    *handle = enc;
+    return 0;
+}
+
+/******************************************************************************/
+void *
+rfxcodec_encode_create(int width, int height, int format, int flags)
+{ /* legacy constructor: forwards to rfxcodec_encode_create_ex() and returns the handle or 0 */
+    int error; /* status from rfxcodec_encode_create_ex(); 0 means success */
+    void *handle; /* filled in by rfxcodec_encode_create_ex(); only read when error == 0 */
+
+    error = rfxcodec_encode_create_ex(width, height, format, flags, &handle);
+    if (error == 0)
+    {
+        return handle; /* success */
+    }
+    return 0; /* failure: the nonzero error code is dropped, the caller only sees a null handle */
 }
 
 /******************************************************************************/
@@ -238,11 +294,11 @@ rfxcodec_encode_destroy(void * handle)
 
 /******************************************************************************/
 int
-rfxcodec_encode(void *handle, char *cdata, int *cdata_bytes,
-                char *buf, int width, int height, int stride_bytes,
-                struct rfx_rect *regions, int num_regions,
-                struct rfx_tile *tiles, int num_tiles,
-                const int *quants, int num_quants)
+rfxcodec_encode_ex(void *handle, char *cdata, int *cdata_bytes,
+                   char *buf, int width, int height, int stride_bytes,
+                   const struct rfx_rect *regions, int num_regions,
+                   const struct rfx_tile *tiles, int num_tiles,
+                   const char *quants, int num_quants, int flags)
 {
     struct rfxencode *enc;
     STREAM s;
@@ -263,10 +319,25 @@ rfxcodec_encode(void *handle, char *cdata, int *cdata_bytes,
     }
     if (rfx_compose_message_data(enc, &s, regions, num_regions,
                                  buf, width, height, stride_bytes,
-                                 tiles, num_tiles, quants, num_quants) != 0)
+                                 tiles, num_tiles, quants, num_quants,
+                                 flags) != 0)
     {
         return 1;
     }
     *cdata_bytes = (int) (s.p - s.data);
     return 0;
 }
+
+/******************************************************************************/
+int
+rfxcodec_encode(void *handle, char *cdata, int *cdata_bytes,
+                char *buf, int width, int height, int stride_bytes,
+                const struct rfx_rect *regions, int num_regions,
+                const struct rfx_tile *tiles, int num_tiles,
+                const char *quants, int num_quants)
+{ /* compatibility wrapper: identical to rfxcodec_encode_ex() with flags fixed to 0 */
+    return rfxcodec_encode_ex(handle, cdata, cdata_bytes, buf, width, height,
+                              stride_bytes, regions, num_regions, tiles,
+                              num_tiles, quants, num_quants, 0); /* last argument is flags */
+}
+
diff --git a/src/rfxencode.h b/src/rfxencode.h
index 4db6a01..c9fc5d0 100644
--- a/src/rfxencode.h
+++ b/src/rfxencode.h
@@ -1,7 +1,7 @@
 /**
  * RFX codec encoder
  *
- * Copyright 2014 Jay Sorg <jay.sorg@gmail.com>
+ * Copyright 2014-2015 Jay Sorg <jay.sorg@gmail.com>
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -21,8 +21,7 @@
 
 struct rfxencode;
 
-typedef int (*rfx_encode_proc)(struct rfxencode *enc,
-                               const int *quantization_values,
+typedef int (*rfx_encode_proc)(struct rfxencode *enc, const char *qtable,
                                uint8 *data, uint8 *buffer,
                                int buffer_size, int *size);
 
@@ -39,13 +38,18 @@ struct rfxencode
     int format;
     int pad0[7];
 
+    uint8 a_buffer[4096];
     uint8 y_r_buffer[4096];
-    uint8 cb_g_buffer[4096];
-    uint8 cr_b_buffer[4096];
-
-    sint16 dwt_buffer[4096];
-    sint16 dwt_buffer1[4096];
-
+    uint8 u_g_buffer[4096];
+    uint8 v_b_buffer[4096];
+    uint8 pad1[16];
+    sint16 dwt_buffer_a[4096];
+    sint16 dwt_buffer1_a[4096];
+    sint16 dwt_buffer2_a[4096];
+    uint8 pad2[16];
+    sint16* dwt_buffer;
+    sint16* dwt_buffer1;
+    sint16* dwt_buffer2;
     rfx_encode_proc rfx_encode;
 
     int got_sse2;
@@ -56,7 +60,6 @@ struct rfxencode
     int got_popcnt;
     int got_lzcnt;
     int got_neon;
-
 };
 
 #endif
diff --git a/src/rfxencode_alpha.c b/src/rfxencode_alpha.c
new file mode 100644
index 0000000..58d8e10
--- /dev/null
+++ b/src/rfxencode_alpha.c
@@ -0,0 +1,279 @@
+/**
+ * librfxcodec: A Remote Desktop Protocol client.
+ * RemoteFX Codec Library
+ *
+ * Copyright 2015 Jay Sorg <jay.sorg@gmail.com>
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+
+#include <rfxcodec_encode.h>
+
+#include "rfxcommon.h"
+#include "rfxencode.h"
+#include "rfxconstants.h"
+#include "rfxencode_tile.h"
+
+#define LLOG_LEVEL 1 /* a message prints only when its level is below this value */
+#define LLOGLN(_level, _args) /* _args is a parenthesized printf argument list */ \
+    do { if ((_level) < LLOG_LEVEL) { printf _args ; printf("\n"); } } while (0)
+
+#if 1
+/*****************************************************************************/
+static int
+fdelta(char *in_plane, char *out_plane, int cx, int cy)
+{ /* delta-codes a cx-by-cy byte plane against the row above into out_plane; always returns 0 */
+    char delta; /* difference between a byte and the byte one row above it */
+    char *src8;
+    char *dst8;
+    int index; /* column counter */
+    int jndex; /* row counter, starts at 1 because row 0 is copied */
+
+    memcpy(out_plane, in_plane, cx); /* row 0 has no row above it, so it is copied unchanged */
+    src8 = in_plane;
+    dst8 = out_plane;
+    for (jndex = 1; jndex < cy; jndex++)
+    {
+        for (index = 0; index < cx; index++)
+        {
+            delta = src8[cx] - src8[0]; /* src8[cx] is the same column one row (cx bytes) further on */
+            if (delta & 0x80) /* bit 7 set: handled as a negative difference */
+            {
+                delta = (((~delta) + 1) << 1) - 1; /* negate, double, minus one: negative d becomes 2 * |d| - 1 */
+            }
+            else
+            {
+                delta = delta << 1; /* non-negative d becomes 2 * d */
+            }
+            dst8[cx] = delta; /* output uses the same row stride (cx) as the input */
+            src8++;
+            dst8++;
+        }
+    }
+    return 0;
+}
+#endif
+
+#if 0
+/*****************************************************************************/
+#define DELTA_ONE /* one delta step without a branch; uses delta, is_neg, src8, dst8, cx of the expansion site */ \
+do { \
+    delta = src8[cx] - src8[0]; \
+    is_neg = (delta >> 7) & 1; /* 1 when bit 7 of delta is set, otherwise 0 */ \
+    dst8[cx] = (((delta ^ -is_neg) + is_neg) << 1) - is_neg; /* is_neg == 1: negate, double, minus one; is_neg == 0: double */ \
+    src8++; \
+    dst8++; \
+} while (0)
+
+/*****************************************************************************/
+static int
+fdelta(char *in_plane, char *out_plane, int cx, int cy)
+{ /* alternate fdelta, compiled out by the enclosing #if 0: one flat loop, unrolled eight times */
+    char delta;
+    char is_neg;
+    char *src8;
+    char *dst8;
+    char *src8_end; /* one past the last source byte that still has a row below it */
+
+    memcpy(out_plane, in_plane, cx); /* first row is copied unchanged */
+    src8 = in_plane;
+    dst8 = out_plane;
+    src8_end = src8 + (cx * cy - cx); /* stop cx bytes early: each step reads src8[cx] */
+    while (src8 + 8 <= src8_end) /* eight DELTA_ONE steps per iteration */
+    {
+        DELTA_ONE;
+        DELTA_ONE;
+        DELTA_ONE;
+        DELTA_ONE;
+        DELTA_ONE;
+        DELTA_ONE;
+        DELTA_ONE;
+        DELTA_ONE;
+    }
+    while (src8 < src8_end) /* fewer than eight bytes left */
+    {
+        DELTA_ONE;
+    }
+    return 0;
+}
+#endif
+
+/*****************************************************************************/
+static int
+fout(int collen, int replen, char *colptr, STREAM *s)
+{ /* writes one segment to s: collen literal bytes from colptr, then a run of replen; always returns 0 */
+    int code; /* control byte: literal count in the high nibble, run field in the low nibble */
+    int lcollen; /* literal bytes emitted by the current control byte */
+    int lreplen; /* run length emitted by the current control byte */
+    int cont;
+
+    LLOGLN(10, ("fout: collen %d replen %d", collen, replen));
+    cont = collen > 13; /* flush literal-only chunks (15 max each) until at most 13 literals remain */
+    while (cont)
+    {
+        lcollen = collen;
+        if (lcollen > 15)
+        {
+            lcollen = 15; /* the count must fit in 4 bits */
+        }
+        code = lcollen << 4; /* literals only, run field 0 */
+        stream_write_uint8(s, code);
+        memcpy(s->p, colptr, lcollen); /* NOTE(review): no capacity check on s is visible here - confirm the caller sizes the stream */
+        s->p += lcollen;
+        colptr += lcollen;
+        collen -= lcollen;
+        cont = collen > 13;
+    }
+    cont = (collen > 0) || (replen > 0);
+    while (cont)
+    {
+        lreplen = replen;
+        if ((collen == 0) && (lreplen > 15))
+        {
+            /* big run: no literals pending and more than 15 repeats left */
+            if (lreplen > 47)
+            {
+                lreplen = 47; /* at most 47 repeats per control byte in this form */
+            }
+            LLOGLN(10, ("fout: big run lreplen %d", lreplen));
+            replen -= lreplen;
+            code = ((lreplen & 0xF) << 4) | ((lreplen & 0xF0) >> 4); /* the two nibbles of lreplen, swapped */
+            stream_write_uint8(s, code);
+            colptr += lreplen; /* step over the source bytes covered by the run */
+        }
+        else
+        {
+            if (lreplen > 15)
+            {
+                lreplen = 15; /* run field is 4 bits wide */
+            }
+            replen -= lreplen;
+            if (lreplen < 3)
+            {
+                collen += lreplen; /* runs of 1 or 2 are written as literals; collen stays <= 13 + 2 */
+                lreplen = 0;
+            }
+            code = (collen << 4) | lreplen; /* literal count | run length */
+            stream_write_uint8(s, code);
+            memcpy(s->p, colptr, collen); /* presumably the run bytes follow the literals in colptr - confirm against fpack */
+            s->p += collen;
+            colptr += collen + lreplen;
+            collen = 0; /* literals are fully flushed after the first pass */
+        }
+        cont = replen > 0; /* loop again only while run length remains */
+    }
+    return 0;
+}
+
+/*****************************************************************************/
+static int
+fpack(char *plane, int cx, int cy, STREAM *s)
+{ /* run-length packs a cx-by-cy byte plane into s, one row at a time; returns the bytes written */
+    char *ptr8; /* scan position inside the current row */
+    char *colptr; /* first byte of the segment not yet passed to fout() */
+    char *lend; /* last byte of the current row */
+    uint8 *holdp; /* stream position on entry, used for the return value */
+    int jndex; /* row counter */
+    int collen; /* literal bytes collected for the pending segment */
+    int replen; /* repeat count collected for the pending segment */
+
+    LLOGLN(10, ("fpack:"));
+    holdp = s->p;
+    for (jndex = 0; jndex < cy; jndex++)
+    {
+        LLOGLN(10, ("line start line %d cx %d cy %d", jndex, cx, cy));
+        ptr8 = (char *) (plane + jndex * cx); /* start of row jndex, row stride is cx */
+        lend = ptr8 + (cx - 1);
+        colptr = ptr8;
+        if (colptr[0] == 0) /* a leading zero byte starts the row as a run, not as a literal */
+        {
+            collen = 0;
+            replen = 1;
+        }
+        else
+        {
+            collen = 1;
+            replen = 0;
+        }
+        while (ptr8 < lend) /* compare every byte with its right-hand neighbour */
+        {
+            if (ptr8[0] == ptr8[1])
+            {
+                replen++; /* neighbour repeats the current byte */
+            }
+            else
+            {
+                if (replen > 0)
+                {
+                    if (replen < 3)
+                    {
+                        collen += replen + 1; /* run too short: fold it and the byte that ended it into the literals */
+                        replen = 0;
+                    }
+                    else
+                    {
+                        fout(collen, replen, colptr, s); /* flush literals + run */
+                        colptr = ptr8 + 1; /* next segment starts at the byte that ended the run */
+                        replen = 0;
+                        collen = 1;
+                    }
+                }
+                else
+                {
+                    collen++; /* no run pending: one more literal */
+                }
+            }
+            ptr8++;
+        }
+        /* end of line: flush whatever segment is still pending for this row */
+        fout(collen, replen, colptr, s);
+    }
+    return (int) (s->p - holdp); /* total bytes appended to s by this call */
+}
+
+/*****************************************************************************/
+int
+rfx_encode_plane(struct rfxencode *enc, uint8 *plane, int cx, int cy,
+                 STREAM *s)
+{ /* appends one cx-by-cy plane to s: delta + RLE, or raw bytes when RLE comes out larger */
+    char *org_plane;
+    char *delta_plane; /* scratch for the delta-coded plane */
+    int bytes;
+    uint8 *holdp; /* stream position before the flags byte, used to rewind */
+
+    org_plane = (char *) plane;
+    delta_plane = (char *) (enc->dwt_buffer1); /* dwt_buffer1 is reused as scratch space here */
+    fdelta(org_plane, delta_plane, cx, cy);
+    holdp = s->p;
+    stream_write_uint8(s, 0x10); /* flags, RLE */
+    bytes = fpack(delta_plane, cx, cy, s); /* count excludes the flags byte; output is written before the size check below */
+    if (bytes > cx * cy) /* RLE output is larger than the raw plane */
+    {
+        LLOGLN(10, ("rfx_encode_plane: too big bytes %d", bytes));
+        s->p = holdp; /* rewind and overwrite the RLE attempt */
+        stream_write_uint8(s, 0); /* flags */
+        memcpy(s->p, plane, cx * cy); /* raw path stores the original plane, not the delta plane */
+        s->p += cx * cy;
+        stream_write_uint8(s, 0); /* pad if not RLE */
+        bytes = cx * cy + 2; /* flags byte + plane + pad byte */
+    }
+    else
+    {
+        LLOGLN(10, ("rfx_encode_plane: ok bytes %d", bytes));
+    }
+    return bytes; /* NOTE(review): the RLE count omits the flags byte, the raw count includes it - confirm callers expect this */
+}
diff --git a/src/rfxencode_alpha.h b/src/rfxencode_alpha.h
new file mode 100644
index 0000000..3f01218
--- /dev/null
+++ b/src/rfxencode_alpha.h
@@ -0,0 +1,28 @@
+/**
+ * librfxcodec: A Remote Desktop Protocol client.
+ * RemoteFX Codec Library
+ *
+ * Copyright 2015 Jay Sorg <jay.sorg@gmail.com>
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef __RFXCODEC_ENCODE_ALPHA_H
+#define __RFXCODEC_ENCODE_ALPHA_H
+
+int
+rfx_encode_plane(struct rfxencode *enc, uint8 *plane, int cx, int cy,
+                 STREAM *s); /* appends the encoded cx-by-cy plane to s and returns a byte count */
+
+#endif
+
diff --git a/src/rfxencode_dwt.c b/src/rfxencode_dwt.c
index b68b765..36c8e93 100644
--- a/src/rfxencode_dwt.c
+++ b/src/rfxencode_dwt.c
@@ -3,7 +3,7 @@
  * RemoteFX Codec Library - DWT
  *
  * Copyright 2011 Vic Lee
- * Copyright 2014 Jay Sorg <jay.sorg@gmail.com>
+ * Copyright 2014-2015 Jay Sorg <jay.sorg@gmail.com>
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -151,6 +151,7 @@ rfx_dwt_2d_encode_block8(uint8 *in_buffer,
 {
     uint8 *src;
     sint16 *l, *h;
+    sint16 s1, s2, s3;
     int total_width;
     int x, y;
     int n;
@@ -166,8 +167,12 @@ rfx_dwt_2d_encode_block8(uint8 *in_buffer,
         l = dwt + x;
         h = l + subband_width * total_width;
         src = in_buffer + x;
-        *h = ((src[total_width] - 128) - (((src[0] - 128) + (src[2 * total_width] - 128)) >> 1)) >> 1;
-        *l = (src[0] - 128) + *h;
+        s1 = (src[total_width] - 128) << DWT_FACTOR;
+        s2 = (src[0] - 128) << DWT_FACTOR;
+        s3 = (src[2 * total_width] - 128) << DWT_FACTOR;
+        *h = (s1 - ((s2 + s3) >> 1)) >> 1;
+        s1 = (src[0] - 128) << DWT_FACTOR;
+        *l = s1 + *h;
 
         /* loop */
         for (n = 1; n < subband_width - 1; n++)
@@ -176,8 +181,12 @@ rfx_dwt_2d_encode_block8(uint8 *in_buffer,
             l = dwt + n * total_width + x;
             h = l + subband_width * total_width;
             src = in_buffer + y * total_width + x;
-            *h = ((src[total_width] - 128) - (((src[0] - 128) + (src[2 * total_width] - 128)) >> 1)) >> 1;
-            *l = (src[0] - 128) + ((*(h - total_width) + *h) >> 1);
+            s1 = (src[total_width] - 128) << DWT_FACTOR;
+            s2 = (src[0] - 128) << DWT_FACTOR;
+            s3 = (src[2 * total_width] - 128) << DWT_FACTOR;
+            *h = (s1 - ((s2 + s3) >> 1)) >> 1;
+            s1 = (src[0] - 128) << DWT_FACTOR;
+            *l = s1 + ((*(h - total_width) + *h) >> 1);
         }
 
         /* post */
@@ -186,8 +195,12 @@ rfx_dwt_2d_encode_block8(uint8 *in_buffer,
         l = dwt + n * total_width + x;
         h = l + subband_width * total_width;
         src = in_buffer + y * total_width + x;
-        *h = ((src[total_width] - 128) - (((src[0] - 128) + (src[0] - 128)) >> 1)) >> 1;
-        *l = (src[0] - 128) + ((*(h - total_width) + *h) >> 1);
+        s1 = (src[total_width] - 128) << DWT_FACTOR;
+        s2 = (src[0] - 128) << DWT_FACTOR;
+        s3 = (src[0] - 128) << DWT_FACTOR;
+        *h = (s1 - ((s2 + s3) >> 1)) >> 1;
+        s1 = (src[0] - 128) << DWT_FACTOR;
+        *l = s1 + ((*(h - total_width) + *h) >> 1);
 
     }
 
diff --git a/src/rfxencode_dwt.h b/src/rfxencode_dwt.h
index 248edc1..36a62ed 100644
--- a/src/rfxencode_dwt.h
+++ b/src/rfxencode_dwt.h
@@ -1,7 +1,7 @@
 /**
  * RFX codec encoder
  *
- * Copyright 2014 Jay Sorg <jay.sorg@gmail.com>
+ * Copyright 2014-2015 Jay Sorg <jay.sorg@gmail.com>
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
diff --git a/src/rfxencode_quantization.c b/src/rfxencode_quantization.c
index 9c65b40..6e3a577 100644
--- a/src/rfxencode_quantization.c
+++ b/src/rfxencode_quantization.c
@@ -3,7 +3,7 @@
  * RemoteFX Codec Library - Quantization
  *
  * Copyright 2011 Vic Lee
- * Copyright 2014 Jay Sorg <jay.sorg@gmail.com>
+ * Copyright 2014-2015 Jay Sorg <jay.sorg@gmail.com>
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -88,7 +88,7 @@ rfx_quantization_encode_block(sint16* buffer, int buffer_size, uint32 factor)
 }
 #endif
 
-#if 1
+#if 0
 /******************************************************************************/
 static int
 rfx_quantization_encode_block(sint16* buffer, int buffer_size, uint32 factor)
@@ -110,20 +110,54 @@ rfx_quantization_encode_block(sint16* buffer, int buffer_size, uint32 factor)
 }
 #endif
 
+#if 1
+/******************************************************************************/
+static int
+rfx_quantization_encode_block(sint16* buffer, int buffer_size, uint32 factor)
+{ /* scales buffer_size coefficients down in place by a right shift; returns 1 when the shift is 0, else 0 */
+    sint16* dst;
+    sint16 half; /* rounding bias: half of 2^factor */
+
+    factor += DWT_FACTOR; /* presumably also undoes the << DWT_FACTOR applied in the DWT - confirm */
+    if (factor == 0) /* zero shift: the buffer is left untouched */
+    {
+        return 1;
+    }
+    half = (1 << (factor - 1)); /* NOTE(review): callers pass (nibble - 6) as uint32; a nibble below 6 would wrap - confirm valid range */
+    for (dst = buffer; buffer_size > 0; dst++, buffer_size--)
+    {
+        *dst = (*dst + half) >> factor; /* add the bias, then shift */
+    }
+    return 0;
+}
+#endif
+
 /******************************************************************************/
 int
-rfx_quantization_encode(sint16* buffer, const int* quantization_values)
+rfx_quantization_encode(sint16* buffer, const char* qtable)
 {
-    rfx_quantization_encode_block(buffer, 1024, quantization_values[8] - 6); /* HL1 */
-    rfx_quantization_encode_block(buffer + 1024, 1024, quantization_values[7] - 6); /* LH1 */
-    rfx_quantization_encode_block(buffer + 2048, 1024, quantization_values[9] - 6); /* HH1 */
-    rfx_quantization_encode_block(buffer + 3072, 256, quantization_values[5] - 6); /* HL2 */
-    rfx_quantization_encode_block(buffer + 3328, 256, quantization_values[4] - 6); /* LH2 */
-    rfx_quantization_encode_block(buffer + 3584, 256, quantization_values[6] - 6); /* HH2 */
-    rfx_quantization_encode_block(buffer + 3840, 64, quantization_values[2] - 6); /* HL3 */
-    rfx_quantization_encode_block(buffer + 3904, 64, quantization_values[1] - 6); /* LH3 */
-    rfx_quantization_encode_block(buffer + 3968, 64, quantization_values[3] - 6); /* HH3 */
-    rfx_quantization_encode_block(buffer + 4032, 64, quantization_values[0] - 6); /* LL3 */
+    uint32 factor; /* shift for one subband: its 4-bit table entry minus 6 */
+    /* entry i of the table sits in qtable[i / 2]: low nibble for even i, high nibble for odd i */
+    factor = ((qtable[4] >> 0) & 0xf) - 6;
+    rfx_quantization_encode_block(buffer, 1024, factor); /* HL1 */
+    factor = ((qtable[3] >> 4) & 0xf) - 6;
+    rfx_quantization_encode_block(buffer + 1024, 1024, factor); /* LH1 */
+    factor = ((qtable[4] >> 4) & 0xf) - 6;
+    rfx_quantization_encode_block(buffer + 2048, 1024, factor); /* HH1 */
+    factor = ((qtable[2] >> 4) & 0xf) - 6;
+    rfx_quantization_encode_block(buffer + 3072, 256, factor); /* HL2 */
+    factor = ((qtable[2] >> 0) & 0xf) - 6;
+    rfx_quantization_encode_block(buffer + 3328, 256, factor); /* LH2 */
+    factor = ((qtable[3] >> 0) & 0xf) - 6;
+    rfx_quantization_encode_block(buffer + 3584, 256, factor); /* HH2 */
+    factor = ((qtable[1] >> 0) & 0xf) - 6;
+    rfx_quantization_encode_block(buffer + 3840, 64, factor); /* HL3 */
+    factor = ((qtable[0] >> 4) & 0xf) - 6;
+    rfx_quantization_encode_block(buffer + 3904, 64, factor); /* LH3 */
+    factor = ((qtable[1] >> 4) & 0xf) - 6;
+    rfx_quantization_encode_block(buffer + 3968, 64, factor); /* HH3 */
+    factor = ((qtable[0] >> 0) & 0xf) - 6;
+    rfx_quantization_encode_block(buffer + 4032, 64, factor); /* LL3 */
     return 0;
 }
 
diff --git a/src/rfxencode_quantization.h b/src/rfxencode_quantization.h
index a0cd802..d246889 100644
--- a/src/rfxencode_quantization.h
+++ b/src/rfxencode_quantization.h
@@ -23,6 +23,6 @@
 #include "rfxcommon.h"
 
 int
-rfx_quantization_encode(sint16* buffer, const int* quantization_values);
+rfx_quantization_encode(sint16 *buffer, const char *quantization_values); /* packed 4-bit entries; the definition names this parameter qtable */
 
 #endif /* __RFX_QUANTIZATION_H */
diff --git a/src/rfxencode_rlgr1.c b/src/rfxencode_rlgr1.c
index e4b9867..638b535 100644
--- a/src/rfxencode_rlgr1.c
+++ b/src/rfxencode_rlgr1.c
@@ -124,7 +124,7 @@ do { \
 } while (0)
 
 int
-rfx_rlgr1_encode(const sint16* data, int data_size, uint8* buffer, int buffer_size)
+rfx_rlgr1_encode(const sint16* data, uint8* buffer, int buffer_size)
 {
     int k;
     int kp;
@@ -137,6 +137,7 @@ rfx_rlgr1_encode(const sint16* data, int data_size, uint8* buffer, int buffer_si
     int sign;
     int processed_size;
     int lmag;
+    int data_size;
 
     RFX_BITSTREAM bs;
 
@@ -150,6 +151,7 @@ rfx_rlgr1_encode(const sint16* data, int data_size, uint8* buffer, int buffer_si
     krp = 1 << LSGR;
 
     /* process all the input coefficients */
+    data_size = 4096;
     while (data_size > 0)
     {
         if (k)
diff --git a/src/rfxencode_rlgr1.h b/src/rfxencode_rlgr1.h
index a08e637..f941e06 100644
--- a/src/rfxencode_rlgr1.h
+++ b/src/rfxencode_rlgr1.h
@@ -23,6 +23,6 @@
 #include "rfxcommon.h"
 
 int
-rfx_rlgr1_encode(const sint16* data, int data_size, uint8* buffer, int buffer_size);
+rfx_rlgr1_encode(const sint16* data, uint8* buffer, int buffer_size);
 
 #endif /* __RFX_RLGR_H */
diff --git a/src/rfxencode_rlgr3.c b/src/rfxencode_rlgr3.c
index 3b1666d..809767d 100644
--- a/src/rfxencode_rlgr3.c
+++ b/src/rfxencode_rlgr3.c
@@ -124,7 +124,7 @@ do { \
 } while (0)
 
 int
-rfx_rlgr3_encode(const sint16* data, int data_size, uint8* buffer, int buffer_size)
+rfx_rlgr3_encode(const sint16* data, uint8* buffer, int buffer_size)
 {
     int k;
     int kp;
@@ -137,6 +137,7 @@ rfx_rlgr3_encode(const sint16* data, int data_size, uint8* buffer, int buffer_si
     int sign;
     int processed_size;
     int lmag;
+    int data_size;
 
     RFX_BITSTREAM bs;
 
@@ -153,6 +154,7 @@ rfx_rlgr3_encode(const sint16* data, int data_size, uint8* buffer, int buffer_si
     krp = 1 << LSGR;
 
     /* process all the input coefficients */
+    data_size = 4096;
     while (data_size > 0)
     {
         if (k)
diff --git a/src/rfxencode_rlgr3.h b/src/rfxencode_rlgr3.h
index 1efdc4c..2743e39 100644
--- a/src/rfxencode_rlgr3.h
+++ b/src/rfxencode_rlgr3.h
@@ -23,6 +23,6 @@
 #include "rfxcommon.h"
 
 int
-rfx_rlgr3_encode(const sint16* data, int data_size, uint8* buffer, int buffer_size);
+rfx_rlgr3_encode(const sint16* data, uint8* buffer, int buffer_size);
 
 #endif /* __RFX_RLGR_H */
diff --git a/src/rfxencode_tile.c b/src/rfxencode_tile.c
index 409121c..e78b746 100644
--- a/src/rfxencode_tile.c
+++ b/src/rfxencode_tile.c
@@ -3,7 +3,7 @@
  * RemoteFX Codec Library - Encode
  *
  * Copyright 2011 Vic Lee
- * Copyright 2014 Jay Sorg <jay.sorg@gmail.com>
+ * Copyright 2014-2015 Jay Sorg <jay.sorg@gmail.com>
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -33,6 +33,15 @@
 #include "rfxencode_differential.h"
 #include "rfxencode_rlgr1.h"
 #include "rfxencode_rlgr3.h"
+#include "rfxencode_alpha.h"
+
+#ifdef RFX_USE_ACCEL_X86
+#include "x86/funcs_x86.h"
+#endif
+
+#ifdef RFX_USE_ACCEL_AMD64
+#include "amd64/funcs_amd64.h"
+#endif
 
 #define LLOG_LEVEL 1
 #define LLOGLN(_level, _args) \
@@ -50,69 +59,337 @@ rfx_encode_format_rgb(char *rgb_data, int width, int height,
     uint8 r;
     uint8 g;
     uint8 b;
+    uint8 *lr_buf;
+    uint8 *lg_buf;
+    uint8 *lb_buf;
 
+    LLOGLN(10, ("rfx_encode_format_rgb: pixel_format %d", pixel_format));
+    b = 0;
+    g = 0;
+    r = 0;
     switch (pixel_format)
     {
         case RFX_FORMAT_BGRA:
             for (y = 0; y < height; y++)
             {
                 src = (uint8*) (rgb_data + y * stride_bytes);
+                lr_buf = r_buf + y * 64;
+                lg_buf = g_buf + y * 64;
+                lb_buf = b_buf + y * 64;
                 for (x = 0; x < width; x++)
                 {
                     b = *src++;
-                    *b_buf++ = b;
+                    *lb_buf++ = b;
                     g = *src++;
-                    *g_buf++ = g;
+                    *lg_buf++ = g;
                     r = *src++;
-                    *r_buf++ = r;
+                    *lr_buf++ = r;
                     src++;
                 }
+                while (x < 64) /* pad the row to 64 with the last pixel */
+                {
+                    *lr_buf++ = r;
+                    *lg_buf++ = g;
+                    *lb_buf++ = b; /* was r: blue plane got red padding */
+                    x++;
+                }
+            }
+            while (y < 64)
+            {
+                lr_buf = r_buf + y * 64;
+                lg_buf = g_buf + y * 64;
+                lb_buf = b_buf + y * 64;
+                memcpy(lr_buf, lr_buf - 64, 64);
+                memcpy(lg_buf, lg_buf - 64, 64);
+                memcpy(lb_buf, lb_buf - 64, 64);
+                y++;
             }
             break;
         case RFX_FORMAT_RGBA:
             for (y = 0; y < height; y++)
             {
                 src = (uint8*) (rgb_data + y * stride_bytes);
+                lr_buf = r_buf + y * 64;
+                lg_buf = g_buf + y * 64;
+                lb_buf = b_buf + y * 64;
                 for (x = 0; x < width; x++)
                 {
                     r = *src++;
-                    *r_buf++ = r;
+                    *lr_buf++ = r;
                     g = *src++;
-                    *g_buf++ = g;
+                    *lg_buf++ = g;
                     b = *src++;
-                    *b_buf++ = b;
+                    *lb_buf++ = b;
                     src++;
                 }
+                while (x < 64)
+                {
+                    *lr_buf++ = r;
+                    *lg_buf++ = g;
+                    *lb_buf++ = b;
+                    x++;
+                }
+            }
+            while (y < 64)
+            {
+                lr_buf = r_buf + y * 64;
+                lg_buf = g_buf + y * 64;
+                lb_buf = b_buf + y * 64;
+                memcpy(lr_buf, lr_buf - 64, 64);
+                memcpy(lg_buf, lg_buf - 64, 64);
+                memcpy(lb_buf, lb_buf - 64, 64);
+                y++;
             }
             break;
         case RFX_FORMAT_BGR:
             for (y = 0; y < height; y++)
             {
                 src = (uint8*) (rgb_data + y * stride_bytes);
+                lr_buf = r_buf + y * 64;
+                lg_buf = g_buf + y * 64;
+                lb_buf = b_buf + y * 64;
                 for (x = 0; x < width; x++)
                 {
                     b = *src++;
-                    *b_buf++ = b;
+                    *lb_buf++ = b;
                     g = *src++;
-                    *g_buf++ = g;
+                    *lg_buf++ = g;
                     r = *src++;
-                    *r_buf++ = r;
+                    *lr_buf++ = r;
+                }
+                while (x < 64)
+                {
+                    *lr_buf++ = r;
+                    *lg_buf++ = g;
+                    *lb_buf++ = b;
+                    x++;
                 }
             }
+            while (y < 64)
+            {
+                lr_buf = r_buf + y * 64;
+                lg_buf = g_buf + y * 64;
+                lb_buf = b_buf + y * 64;
+                memcpy(lr_buf, lr_buf - 64, 64);
+                memcpy(lg_buf, lg_buf - 64, 64);
+                memcpy(lb_buf, lb_buf - 64, 64);
+                y++;
+            }
             break;
         case RFX_FORMAT_RGB:
             for (y = 0; y < height; y++)
             {
                 src = (uint8*) (rgb_data + y * stride_bytes);
+                lr_buf = r_buf + y * 64;
+                lg_buf = g_buf + y * 64;
+                lb_buf = b_buf + y * 64;
+                for (x = 0; x < width; x++)
+                {
+                    r = *src++;
+                    *lr_buf++ = r;
+                    g = *src++;
+                    *lg_buf++ = g;
+                    b = *src++;
+                    *lb_buf++ = b;
+                }
+                while (x < 64)
+                {
+                    *lr_buf++ = r;
+                    *lg_buf++ = g;
+                    *lb_buf++ = b;
+                    x++;
+                }
+            }
+            while (y < 64)
+            {
+                lr_buf = r_buf + y * 64;
+                lg_buf = g_buf + y * 64;
+                lb_buf = b_buf + y * 64;
+                memcpy(lr_buf, lr_buf - 64, 64);
+                memcpy(lg_buf, lg_buf - 64, 64);
+                memcpy(lb_buf, lb_buf - 64, 64);
+                y++;
+            }
+            break;
+    }
+    return 0;
+}
+
+/******************************************************************************/
+static int
+rfx_encode_format_argb(char *argb_data, int width, int height,
+                       int stride_bytes, int pixel_format,
+                       uint8 *a_buf, uint8 *r_buf, uint8 *g_buf, uint8 *b_buf)
+{
+    int x;
+    int y;
+    const uint8 *src;
+    uint8 a;
+    uint8 r;
+    uint8 g;
+    uint8 b;
+    uint8 *la_buf;
+    uint8 *lr_buf;
+    uint8 *lg_buf;
+    uint8 *lb_buf;
+
+    LLOGLN(10, ("rfx_encode_format_argb: pixel_format %d", pixel_format));
+    b = 0;
+    g = 0;
+    r = 0;
+    a = 0;
+    switch (pixel_format)
+    {
+        case RFX_FORMAT_BGRA:
+            for (y = 0; y < height; y++)
+            {
+                src = (uint8*) (argb_data + y * stride_bytes);
+                la_buf = a_buf + y * 64;
+                lr_buf = r_buf + y * 64;
+                lg_buf = g_buf + y * 64;
+                lb_buf = b_buf + y * 64;
+                for (x = 0; x < width; x++)
+                {
+                    b = *src++;
+                    *lb_buf++ = b;
+                    g = *src++;
+                    *lg_buf++ = g;
+                    r = *src++;
+                    *lr_buf++ = r;
+                    a = *src++;
+                    *la_buf++ = a;
+                }
+                while (x < 64) /* pad the row to 64 with the last pixel */
+                {
+                    *la_buf++ = a;
+                    *lr_buf++ = r;
+                    *lg_buf++ = g;
+                    *lb_buf++ = b; /* was r: blue plane got red padding */
+                    x++;
+                }
+            }
+            while (y < 64)
+            {
+                la_buf = a_buf + y * 64;
+                lr_buf = r_buf + y * 64;
+                lg_buf = g_buf + y * 64;
+                lb_buf = b_buf + y * 64;
+                memcpy(la_buf, la_buf - 64, 64);
+                memcpy(lr_buf, lr_buf - 64, 64);
+                memcpy(lg_buf, lg_buf - 64, 64);
+                memcpy(lb_buf, lb_buf - 64, 64);
+                y++;
+            }
+            break;
+        case RFX_FORMAT_RGBA:
+            for (y = 0; y < height; y++)
+            {
+                src = (uint8*) (argb_data + y * stride_bytes);
+                la_buf = a_buf + y * 64;
+                lr_buf = r_buf + y * 64;
+                lg_buf = g_buf + y * 64;
+                lb_buf = b_buf + y * 64;
+                for (x = 0; x < width; x++)
+                {
+                    r = *src++;
+                    *lr_buf++ = r;
+                    g = *src++;
+                    *lg_buf++ = g;
+                    b = *src++;
+                    *lb_buf++ = b;
+                    a = *src++;
+                    *la_buf++ = a;
+                }
+                while (x < 64)
+                {
+                    *la_buf++ = a;
+                    *lr_buf++ = r;
+                    *lg_buf++ = g;
+                    *lb_buf++ = b;
+                    x++;
+                }
+            }
+            while (y < 64)
+            {
+                la_buf = a_buf + y * 64;
+                lr_buf = r_buf + y * 64;
+                lg_buf = g_buf + y * 64;
+                lb_buf = b_buf + y * 64;
+                memcpy(la_buf, la_buf - 64, 64);
+                memcpy(lr_buf, lr_buf - 64, 64);
+                memcpy(lg_buf, lg_buf - 64, 64);
+                memcpy(lb_buf, lb_buf - 64, 64);
+                y++;
+            }
+            break;
+        case RFX_FORMAT_BGR:
+            for (y = 0; y < height; y++)
+            {
+                src = (uint8*) (argb_data + y * stride_bytes);
+                lr_buf = r_buf + y * 64;
+                lg_buf = g_buf + y * 64;
+                lb_buf = b_buf + y * 64;
                 for (x = 0; x < width; x++)
                 {
+                    b = *src++;
+                    *lb_buf++ = b;
+                    g = *src++;
+                    *lg_buf++ = g;
                     r = *src++;
-                    *r_buf++ = r;
+                    *lr_buf++ = r;
+                }
+                while (x < 64)
+                {
+                    *lr_buf++ = r;
+                    *lg_buf++ = g;
+                    *lb_buf++ = b;
+                    x++;
+                }
+            }
+            while (y < 64)
+            {
+                lr_buf = r_buf + y * 64;
+                lg_buf = g_buf + y * 64;
+                lb_buf = b_buf + y * 64;
+                memcpy(lr_buf, lr_buf - 64, 64);
+                memcpy(lg_buf, lg_buf - 64, 64);
+                memcpy(lb_buf, lb_buf - 64, 64);
+                y++;
+            }
+            break;
+        case RFX_FORMAT_RGB:
+            for (y = 0; y < height; y++)
+            {
+                src = (uint8*) (argb_data + y * stride_bytes);
+                lr_buf = r_buf + y * 64;
+                lg_buf = g_buf + y * 64;
+                lb_buf = b_buf + y * 64;
+                for (x = 0; x < width; x++)
+                {
+                    r = *src++;
+                    *lr_buf++ = r;
                     g = *src++;
-                    *g_buf++ = g;
+                    *lg_buf++ = g;
                     b = *src++;
-                    *b_buf++ = b;
+                    *lb_buf++ = b;
                 }
+                while (x < 64)
+                {
+                    *lr_buf++ = r;
+                    *lg_buf++ = g;
+                    *lb_buf++ = b;
+                    x++;
+                }
+            }
+            while (y < 64)
+            {
+                lr_buf = r_buf + y * 64;
+                lg_buf = g_buf + y * 64;
+                lb_buf = b_buf + y * 64;
+                memcpy(lr_buf, lr_buf - 64, 64);
+                memcpy(lg_buf, lg_buf - 64, 64);
+                memcpy(lb_buf, lb_buf - 64, 64);
+                y++;
             }
             break;
     }
@@ -131,25 +408,25 @@ rfx_encode_format_rgb(char *rgb_data, int width, int height,
   -11071 -21736  32807
    32756 -27429  -5327 */
 static int
-rfx_encode_rgb_to_ycbcr(uint8 *y_r_buf, uint8 *cb_g_buf, uint8 *cr_b_buf)
+rfx_encode_rgb_to_yuv(uint8 *y_r_buf, uint8 *u_g_buf, uint8 *v_b_buf)
 {
     int i;
     sint32 r, g, b;
-    sint32 y, cb, cr;
+    sint32 y, u, v;
 
     for (i = 0; i < 4096; i++)
     {
         r = y_r_buf[i];
-        g = cb_g_buf[i];
-        b = cr_b_buf[i];
+        g = u_g_buf[i];
+        b = v_b_buf[i];
 
-        y =  (r *  19595 + g *  38470 + b *   7471) >> 16;
-        cb = (r * -11071 + g * -21736 + b *  32807) >> 16;
-        cr = (r *  32756 + g * -27429 + b *  -5327) >> 16;
+        y = (r *  19595 + g *  38470 + b *   7471) >> 16;
+        u = (r * -11071 + g * -21736 + b *  32807) >> 16;
+        v = (r *  32756 + g * -27429 + b *  -5327) >> 16;
 
         y_r_buf[i] = MINMAX(y, 0, 255);
-        cb_g_buf[i] = MINMAX(cb + 128, 0, 255);
-        cr_b_buf[i] = MINMAX(cr + 128, 0, 255);
+        u_g_buf[i] = MINMAX(u + 128, 0, 255);
+        v_b_buf[i] = MINMAX(v + 128, 0, 255);
 
     }
     return 0;
@@ -157,14 +434,15 @@ rfx_encode_rgb_to_ycbcr(uint8 *y_r_buf, uint8 *cb_g_buf, uint8 *cr_b_buf)
 
 /******************************************************************************/
 int
-rfx_encode_component_rlgr1(struct rfxencode *enc, const int *quantization_values,
+rfx_encode_component_rlgr1(struct rfxencode *enc, const char *qtable,
                            uint8 *data, uint8 *buffer, int buffer_size, int *size)
 {
+    LLOGLN(10, ("rfx_encode_component_rlgr1:"));
     if (rfx_dwt_2d_encode(data, enc->dwt_buffer1, enc->dwt_buffer) != 0)
     {
         return 1;
     }
-    if (rfx_quantization_encode(enc->dwt_buffer1, quantization_values) != 0)
+    if (rfx_quantization_encode(enc->dwt_buffer1, qtable) != 0)
     {
         return 1;
     }
@@ -172,47 +450,220 @@ rfx_encode_component_rlgr1(struct rfxencode *enc, const int *quantization_values
     {
         return 1;
     }
-    *size = rfx_rlgr1_encode(enc->dwt_buffer1, 4096, buffer, buffer_size);
+    *size = rfx_rlgr1_encode(enc->dwt_buffer1, buffer, buffer_size);
     return 0;
 }
 
 /******************************************************************************/
 int
-rfx_encode_component_rlgr3(struct rfxencode *enc, const int *quantization_values,
+rfx_encode_component_rlgr3(struct rfxencode *enc, const char *qtable,
                            uint8 *data, uint8 *buffer, int buffer_size, int *size)
 {
+    LLOGLN(10, ("rfx_encode_component_rlgr3:"));
     if (rfx_dwt_2d_encode(data, enc->dwt_buffer1, enc->dwt_buffer) != 0)
     {
         return 1;
     }
-    if (rfx_quantization_encode(enc->dwt_buffer1, quantization_values) != 0)
+    if (rfx_quantization_encode(enc->dwt_buffer1, qtable) != 0)
+    {
+        return 1;
+    }
+    if (rfx_differential_encode(enc->dwt_buffer1 + 4032, 64) != 0)
+    {
+        return 1;
+    }
+    *size = rfx_rlgr3_encode(enc->dwt_buffer1, buffer, buffer_size);
+    return 0;
+}
+
+/******************************************************************************/
+int
+rfx_encode_component_rlgr1_x86_sse2(struct rfxencode *enc, const char *qtable,
+                                    uint8 *data,
+                                    uint8 *buffer, int buffer_size, int *size)
+{
+    LLOGLN(10, ("rfx_encode_component_rlgr1_x86_sse2:"));
+#if defined(RFX_USE_ACCEL_X86)
+    if (rfxcodec_encode_dwt_shift_x86_sse2(qtable, data, enc->dwt_buffer1,
+                                           enc->dwt_buffer) != 0)
+    {
+        return 1;
+    }
+    //*size = rfxcodec_encode_diff_rlgr1_x86_sse2(enc->dwt_buffer1,
+    //                                            buffer, buffer_size);
+    if (rfx_differential_encode(enc->dwt_buffer1 + 4032, 64) != 0)
+    {
+        return 1;
+    }
+    *size = rfx_rlgr1_encode(enc->dwt_buffer1, buffer, buffer_size);
+#endif
+    return 0;
+}
+
+/******************************************************************************/
+int
+rfx_encode_component_rlgr3_x86_sse2(struct rfxencode *enc, const char *qtable,
+                                    uint8 *data,
+                                    uint8 *buffer, int buffer_size, int *size)
+{
+    LLOGLN(10, ("rfx_encode_component_rlgr3_x86_sse2:"));
+#if defined(RFX_USE_ACCEL_X86)
+    if (rfxcodec_encode_dwt_shift_x86_sse2(qtable, data, enc->dwt_buffer1,
+                                           enc->dwt_buffer) != 0)
+    {
+        return 1;
+    }
+    //*size = rfxcodec_encode_diff_rlgr3_x86_sse2(enc->dwt_buffer1,
+    //                                            buffer, buffer_size);
+    if (rfx_differential_encode(enc->dwt_buffer1 + 4032, 64) != 0)
+    {
+        return 1;
+    }
+    *size = rfx_rlgr3_encode(enc->dwt_buffer1, buffer, buffer_size);
+#endif
+    return 0;
+}
+
+/******************************************************************************/
+int
+rfx_encode_component_rlgr1_x86_sse41(struct rfxencode *enc, const char *qtable,
+                                     uint8 *data,
+                                     uint8 *buffer, int buffer_size, int *size)
+{
+    LLOGLN(10, ("rfx_encode_component_rlgr1_x86_sse41:"));
+#if defined(RFX_USE_ACCEL_X86)
+    if (rfxcodec_encode_dwt_shift_x86_sse41(qtable, data, enc->dwt_buffer1,
+                                            enc->dwt_buffer) != 0)
+    {
+        return 1;
+    }
+    //*size = rfxcodec_encode_diff_rlgr1_x86_sse2(enc->dwt_buffer1,
+    //                                            buffer, buffer_size);
+    if (rfx_differential_encode(enc->dwt_buffer1 + 4032, 64) != 0)
+    {
+        return 1;
+    }
+    *size = rfx_rlgr1_encode(enc->dwt_buffer1, buffer, buffer_size);
+#endif
+    return 0;
+}
+
+/******************************************************************************/
+int
+rfx_encode_component_rlgr3_x86_sse41(struct rfxencode *enc, const char *qtable,
+                                     uint8 *data,
+                                     uint8 *buffer, int buffer_size, int *size)
+{
+    LLOGLN(10, ("rfx_encode_component_rlgr3_x86_sse41:"));
+#if defined(RFX_USE_ACCEL_X86)
+    if (rfxcodec_encode_dwt_shift_x86_sse41(qtable, data, enc->dwt_buffer1,
+                                            enc->dwt_buffer) != 0)
     {
         return 1;
     }
+    //*size = rfxcodec_encode_diff_rlgr3_x86_sse(enc->dwt_buffer1,
+    //                                            buffer, buffer_size);
     if (rfx_differential_encode(enc->dwt_buffer1 + 4032, 64) != 0)
     {
         return 1;
     }
-    *size = rfx_rlgr3_encode(enc->dwt_buffer1, 4096, buffer, buffer_size);
+    *size = rfx_rlgr3_encode(enc->dwt_buffer1, buffer, buffer_size);
+#endif
     return 0;
 }
 
 /******************************************************************************/
 int
-rfx_encode_component_x86_sse2(struct rfxencode *enc,
-                              const int *quantization_values,
-                              uint8 *data,
-                              uint8 *buffer, int buffer_size, int *size)
+rfx_encode_component_rlgr1_amd64_sse2(struct rfxencode *enc, const char *qtable,
+                                      uint8 *data,
+                                      uint8 *buffer, int buffer_size, int *size)
 {
-    LLOGLN(10, ("rfx_encode_component_x86_sse2:"));
-#if defined(RFX_USE_ACCEL) && RFX_USE_ACCEL
-    /* put asm calls here */
-    if (dwt_shift_x86_sse2(quantization_values, data, enc->dwt_buffer1,
-                           enc->dwt_buffer) != 0)
+    LLOGLN(10, ("rfx_encode_component_rlgr1_amd64_sse2:"));
+#if defined(RFX_USE_ACCEL_AMD64)
+    if (rfxcodec_encode_dwt_shift_amd64_sse2(qtable, data, enc->dwt_buffer1,
+                                             enc->dwt_buffer) != 0)
+    {
+        return 1;
+    }
+    //*size = rfxcodec_encode_diff_rlgr1_amd64_sse2(enc->dwt_buffer1,
+    //                                              buffer, buffer_size);
+    if (rfx_differential_encode(enc->dwt_buffer1 + 4032, 64) != 0)
     {
         return 1;
     }
-    *size = diff_rlgr3_x86(enc->dwt_buffer1, 4096, buffer, buffer_size);
+    *size = rfx_rlgr1_encode(enc->dwt_buffer1, buffer, buffer_size);
+#endif
+    return 0;
+}
+
+/******************************************************************************/
+int
+rfx_encode_component_rlgr3_amd64_sse2(struct rfxencode *enc, const char *qtable,
+                                      uint8 *data,
+                                      uint8 *buffer, int buffer_size, int *size)
+{
+    LLOGLN(10, ("rfx_encode_component_rlgr3_amd64_sse2:"));
+#if defined(RFX_USE_ACCEL_AMD64)
+    if (rfxcodec_encode_dwt_shift_amd64_sse2(qtable, data, enc->dwt_buffer1,
+                                             enc->dwt_buffer) != 0)
+    {
+        return 1;
+    }
+    //*size = rfxcodec_encode_diff_rlgr3_amd64_sse2(enc->dwt_buffer1,
+    //                                              buffer, buffer_size);
+    if (rfx_differential_encode(enc->dwt_buffer1 + 4032, 64) != 0)
+    {
+        return 1;
+    }
+    *size = rfx_rlgr3_encode(enc->dwt_buffer1, buffer, buffer_size);
+#endif
+    return 0;
+}
+
+/******************************************************************************/
+int
+rfx_encode_component_rlgr1_amd64_sse41(struct rfxencode *enc, const char *qtable,
+                                       uint8 *data,
+                                       uint8 *buffer, int buffer_size, int *size)
+{
+    LLOGLN(10, ("rfx_encode_component_rlgr1_amd64_sse41:"));
+#if defined(RFX_USE_ACCEL_AMD64)
+    if (rfxcodec_encode_dwt_shift_amd64_sse41(qtable, data, enc->dwt_buffer1,
+                                              enc->dwt_buffer) != 0)
+    {
+        return 1;
+    }
+    //*size = rfxcodec_encode_diff_rlgr1_amd64_sse2(enc->dwt_buffer1,
+    //                                              buffer, buffer_size);
+    if (rfx_differential_encode(enc->dwt_buffer1 + 4032, 64) != 0)
+    {
+        return 1;
+    }
+    *size = rfx_rlgr1_encode(enc->dwt_buffer1, buffer, buffer_size);
+#endif /* RFX_USE_ACCEL_AMD64 */
+    return 0; /* note: size is not written in non-accel builds */
+}
+
+/******************************************************************************/
+int
+rfx_encode_component_rlgr3_amd64_sse41(struct rfxencode *enc, const char *qtable,
+                                       uint8 *data,
+                                       uint8 *buffer, int buffer_size, int *size)
+{
+    LLOGLN(10, ("rfx_encode_component_rlgr3_amd64_sse41:"));
+#if defined(RFX_USE_ACCEL_AMD64)
+    if (rfxcodec_encode_dwt_shift_amd64_sse41(qtable, data, enc->dwt_buffer1,
+                                              enc->dwt_buffer) != 0)
+    {
+        return 1;
+    }
+    //*size = rfxcodec_encode_diff_rlgr3_amd64_sse2(enc->dwt_buffer1,
+    //                                              buffer, buffer_size);
+    if (rfx_differential_encode(enc->dwt_buffer1 + 4032, 64) != 0)
+    {
+        return 1;
+    }
+    *size = rfx_rlgr3_encode(enc->dwt_buffer1, buffer, buffer_size);
 #endif
     return 0;
 }
@@ -221,23 +672,84 @@ rfx_encode_component_x86_sse2(struct rfxencode *enc,
 int
 rfx_encode_rgb(struct rfxencode *enc, char *rgb_data,
                int width, int height, int stride_bytes,
-               const int *y_quants, const int *cb_quants, const int *cr_quants,
-               STREAM *data_out, int *y_size, int *cb_size, int *cr_size)
+               const char *y_quants, const char *u_quants,
+               const char *v_quants,
+               STREAM *data_out, int *y_size, int *u_size, int *v_size)
 {
     uint8 *y_r_buffer;
-    uint8 *cb_g_buffer;
-    uint8 *cr_b_buffer;
+    uint8 *u_g_buffer;
+    uint8 *v_b_buffer;
 
     y_r_buffer = enc->y_r_buffer;
-    cb_g_buffer = enc->cb_g_buffer;
-    cr_b_buffer = enc->cr_b_buffer;
+    u_g_buffer = enc->u_g_buffer;
+    v_b_buffer = enc->v_b_buffer;
     if (rfx_encode_format_rgb(rgb_data, width, height, stride_bytes,
                               enc->format,
-                              y_r_buffer, cb_g_buffer, cr_b_buffer) != 0)
+                              y_r_buffer, u_g_buffer, v_b_buffer) != 0)
+    {
+        return 1;
+    }
+    if (rfx_encode_rgb_to_yuv(y_r_buffer, u_g_buffer, v_b_buffer) != 0)
+    {
+        return 1;
+    }
+    if (enc->rfx_encode(enc, y_quants, y_r_buffer,
+                        stream_get_tail(data_out),
+                        stream_get_left(data_out),
+                        y_size) != 0)
+    {
+        return 1;
+    }
+    LLOGLN(10, ("rfx_encode_rgb: y_size %d", *y_size));
+    stream_seek(data_out, *y_size);
+    if (enc->rfx_encode(enc, u_quants, u_g_buffer,
+                        stream_get_tail(data_out),
+                        stream_get_left(data_out),
+                        u_size) != 0)
     {
         return 1;
     }
-    if (rfx_encode_rgb_to_ycbcr(y_r_buffer, cb_g_buffer, cr_b_buffer) != 0)
+    LLOGLN(10, ("rfx_encode_rgb: u_size %d", *u_size));
+    stream_seek(data_out, *u_size);
+    if (enc->rfx_encode(enc, v_quants, v_b_buffer,
+                        stream_get_tail(data_out),
+                        stream_get_left(data_out),
+                        v_size) != 0)
+    {
+        return 1;
+    }
+    LLOGLN(10, ("rfx_encode_rgb: v_size %d", *v_size));
+    stream_seek(data_out, *v_size);
+    return 0;
+}
+
+/******************************************************************************/
+int
+rfx_encode_argb(struct rfxencode *enc, char *rgb_data,
+                int width, int height, int stride_bytes,
+                const char *y_quants, const char *u_quants,
+                const char *v_quants,
+                STREAM *data_out, int *y_size, int *u_size,
+                int *v_size, int *a_size)
+{
+    uint8 *a_buffer;
+    uint8 *y_r_buffer;
+    uint8 *u_g_buffer;
+    uint8 *v_b_buffer;
+
+    LLOGLN(10, ("rfx_encode_argb:"));
+    a_buffer = enc->a_buffer;
+    y_r_buffer = enc->y_r_buffer;
+    u_g_buffer = enc->u_g_buffer;
+    v_b_buffer = enc->v_b_buffer;
+    if (rfx_encode_format_argb(rgb_data, width, height, stride_bytes,
+                               enc->format,
+                               a_buffer, y_r_buffer,
+                               u_g_buffer, v_b_buffer) != 0)
+    {
+        return 1;
+    }
+    if (rfx_encode_rgb_to_yuv(y_r_buffer, u_g_buffer, v_b_buffer) != 0)
     {
         return 1;
     }
@@ -250,24 +762,25 @@ rfx_encode_rgb(struct rfxencode *enc, char *rgb_data,
     }
     LLOGLN(10, ("rfx_encode_rgb: y_size %d", *y_size));
     stream_seek(data_out, *y_size);
-    if (enc->rfx_encode(enc, cb_quants, cb_g_buffer,
+    if (enc->rfx_encode(enc, u_quants, u_g_buffer,
                         stream_get_tail(data_out),
                         stream_get_left(data_out),
-                        cb_size) != 0)
+                        u_size) != 0)
     {
         return 1;
     }
-    LLOGLN(10, ("rfx_encode_rgb: cb_size %d", *cb_size));
-    stream_seek(data_out, *cb_size);
-    if (enc->rfx_encode(enc, cr_quants, cr_b_buffer,
+    LLOGLN(10, ("rfx_encode_rgb: u_size %d", *u_size));
+    stream_seek(data_out, *u_size);
+    if (enc->rfx_encode(enc, v_quants, v_b_buffer,
                         stream_get_tail(data_out),
                         stream_get_left(data_out),
-                        cr_size) != 0)
+                        v_size) != 0)
     {
         return 1;
     }
-    LLOGLN(10, ("rfx_encode_rgb: cr_size %d", *cr_size));
-    stream_seek(data_out, *cr_size);
+    LLOGLN(10, ("rfx_encode_rgb: v_size %d", *v_size));
+    stream_seek(data_out, *v_size);
+    *a_size = rfx_encode_plane(enc, a_buffer, 64, 64, data_out);
     return 0;
 }
 
@@ -275,7 +788,8 @@ rfx_encode_rgb(struct rfxencode *enc, char *rgb_data,
 int
 rfx_encode_yuv(struct rfxencode *enc, char *yuv_data,
                int width, int height, int stride_bytes,
-               const int *y_quants, const int *u_quants, const int *v_quants,
+               const char *y_quants, const char *u_quants,
+               const char *v_quants,
                STREAM *data_out, int *y_size, int *u_size, int *v_size)
 {
     uint8 *y_buffer;
@@ -311,3 +825,50 @@ rfx_encode_yuv(struct rfxencode *enc, char *yuv_data,
     stream_seek(data_out, *v_size);
     return 0;
 }
+
+/******************************************************************************/
+int
+rfx_encode_yuva(struct rfxencode *enc, char *yuva_data,
+                int width, int height, int stride_bytes, /* not used here */
+                const char *y_quants, const char *u_quants,
+                const char *v_quants,
+                STREAM *data_out, int *y_size, int *u_size,
+                int *v_size, int *a_size)
+{
+    uint8 *y_buffer;
+    uint8 *u_buffer;
+    uint8 *v_buffer;
+    uint8 *a_buffer;
+    /* Y, U, V, A are read at offsets 0, 1, 2, 3 times RFX_YUV_BTES */
+    y_buffer = (uint8 *) yuva_data;
+    u_buffer = (uint8 *) (yuva_data + RFX_YUV_BTES);
+    v_buffer = (uint8 *) (yuva_data + RFX_YUV_BTES * 2);
+    a_buffer = (uint8 *) (yuva_data + RFX_YUV_BTES * 3);
+    if (enc->rfx_encode(enc, y_quants, y_buffer, /* Y via enc's coder */
+                        stream_get_tail(data_out),
+                        stream_get_left(data_out),
+                        y_size) != 0)
+    {
+        return 1;
+    }
+    stream_seek(data_out, *y_size); /* seek by the Y size just reported */
+    if (enc->rfx_encode(enc, u_quants, u_buffer, /* U via enc's coder */
+                        stream_get_tail(data_out),
+                        stream_get_left(data_out),
+                        u_size) != 0)
+    {
+        return 1;
+    }
+    stream_seek(data_out, *u_size); /* seek by the U size just reported */
+    if (enc->rfx_encode(enc, v_quants, v_buffer, /* V via enc's coder */
+                        stream_get_tail(data_out),
+                        stream_get_left(data_out),
+                        v_size) != 0)
+    {
+        return 1;
+    }
+    stream_seek(data_out, *v_size); /* seek by the V size just reported */
+    *a_size = rfx_encode_plane(enc, a_buffer, 64, 64, data_out);
+    return 0; /* rfx_encode_plane result is stored, not checked */
+}
+
diff --git a/src/rfxencode_tile.h b/src/rfxencode_tile.h
index 01604c6..6195d8d 100644
--- a/src/rfxencode_tile.h
+++ b/src/rfxencode_tile.h
@@ -3,7 +3,7 @@
  * RemoteFX Codec Library - Encode
  *
  * Copyright 2011 Vic Lee
- * Copyright 2014 Jay Sorg <jay.sorg@gmail.com>
+ * Copyright 2014-2015 Jay Sorg <jay.sorg@gmail.com>
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -27,34 +27,71 @@
 #define RFX_YUV_BTES (64 * 64)
 
 int
-rfx_encode_component_rlgr1(struct rfxencode *enc,
-                           const int *quantization_values,
+rfx_encode_component_rlgr1(struct rfxencode *enc, const char *qtable,
                            uint8 *data,
                            uint8 *buffer, int buffer_size, int *size);
 int
-rfx_encode_component_rlgr3(struct rfxencode *enc,
-                           const int *quantization_values,
+rfx_encode_component_rlgr3(struct rfxencode *enc, const char *qtable,
                            uint8 *data,
                            uint8 *buffer, int buffer_size, int *size);
 int
-rfx_encode_component_x86_sse2(struct rfxencode *enc,
-                              const int *quantization_values,
-                              uint8 *data,
-                              uint8 *buffer, int buffer_size, int *size);
-int
-rfx_encode_component_amd64_sse2(struct rfxencode *enc,
-                                const int *quantization_values,
-                                uint8 *data,
-                                uint8 *buffer, int buffer_size, int *size);
-int
 rfx_encode_rgb(struct rfxencode *enc, char *rgb_data,
                int width, int height, int stride_bytes,
-               const int *y_quants, const int *cb_quants, const int *cr_quants,
+               const char *y_quants, const char *u_quants,
+               const char *v_quants,
                STREAM *data_out, int *y_size, int *cb_size, int *cr_size);
 int
+rfx_encode_argb(struct rfxencode *enc, char *argb_data,
+                int width, int height, int stride_bytes,
+                const char *y_quants, const char *cb_quants,
+                const char *cr_quants,
+                STREAM *data_out, int *y_size, int *u_size,
+                int *v_size, int *a_size);
+int
 rfx_encode_yuv(struct rfxencode *enc, char *yuv_data,
                int width, int height, int stride_bytes,
-               const int *y_quants, const int *u_quants, const int *v_quants,
+               const char *y_quants, const char *u_quants,
+               const char *v_quants,
                STREAM *data_out, int *y_size, int *u_size, int *v_size);
+int /* 0 on success, 1 when a component encode fails */
+rfx_encode_yuva(struct rfxencode *enc, char *yuva_data,
+                int width, int height, int stride_bytes,
+                const char *y_quants, const char *u_quants,
+                const char *v_quants,
+                STREAM *data_out, int *y_size, int *u_size,
+                int *v_size, int *a_size); /* yuva_data: Y, U, V, A planes */
+/* asm backed component encoders, one per rlgr mode (1 / 3), target (x86 / amd64) and instruction set (sse2 / sse41); same signature as rfx_encode_component_rlgr1 */
+int
+rfx_encode_component_rlgr1_x86_sse2(struct rfxencode *enc, const char *qtable,
+                                    uint8 *data,
+                                    uint8 *buffer, int buffer_size, int *size);
+int
+rfx_encode_component_rlgr3_x86_sse2(struct rfxencode *enc, const char *qtable,
+                                    uint8 *data,
+                                    uint8 *buffer, int buffer_size, int *size);
+int
+rfx_encode_component_rlgr1_x86_sse41(struct rfxencode *enc, const char *qtable,
+                                     uint8 *data,
+                                     uint8 *buffer, int buffer_size, int *size);
+int
+rfx_encode_component_rlgr3_x86_sse41(struct rfxencode *enc, const char *qtable,
+                                     uint8 *data,
+                                     uint8 *buffer, int buffer_size, int *size);
+int
+rfx_encode_component_rlgr1_amd64_sse2(struct rfxencode *enc, const char *qtable,
+                                      uint8 *data,
+                                      uint8 *buffer, int buffer_size, int *size);
+int
+rfx_encode_component_rlgr3_amd64_sse2(struct rfxencode *enc, const char *qtable,
+                                      uint8 *data,
+                                      uint8 *buffer, int buffer_size, int *size);
+int
+rfx_encode_component_rlgr1_amd64_sse41(struct rfxencode *enc, const char *qtable,
+                                       uint8 *data,
+                                       uint8 *buffer, int buffer_size, int *size);
+int
+rfx_encode_component_rlgr3_amd64_sse41(struct rfxencode *enc, const char *qtable,
+                                       uint8 *data,
+                                       uint8 *buffer, int buffer_size, int *size);
 
 #endif
diff --git a/src/x86/cpuid_x86.asm b/src/x86/cpuid_x86.asm
index 6f9e8c2..fe19a90 100644
--- a/src/x86/cpuid_x86.asm
+++ b/src/x86/cpuid_x86.asm
@@ -1,3 +1,6 @@
+%ifidn __OUTPUT_FORMAT__,elf
+SECTION .note.GNU-stack noalloc noexec nowrite progbits
+%endif
 
 SECTION .text
 
@@ -10,7 +13,11 @@ SECTION .text
 ;int
 ;cpuid_x86(int eax_in, int ecx_in, int *eax, int *ebx, int *ecx, int *edx)
 
+%ifidn __OUTPUT_FORMAT__,elf
 PROC cpuid_x86
+%else
+PROC _cpuid_x86
+%endif
     ; save registers
     push ebx
     push ecx
diff --git a/src/x86/funcs_x86.h b/src/x86/funcs_x86.h
index 6025d0a..858bc5c 100644
--- a/src/x86/funcs_x86.h
+++ b/src/x86/funcs_x86.h
@@ -1,5 +1,5 @@
 /*
-Copyright 2014 Jay Sorg
+Copyright 2014-2015 Jay Sorg
 
 Permission to use, copy, modify, distribute, and sell this software and its
 documentation for any purpose is hereby granted without fee, provided that
@@ -24,12 +24,49 @@ x86 asm files
 #ifndef __FUNCS_X86_H
 #define __FUNCS_X86_H
 
+#ifdef __cplusplus
+extern "C" {
+#endif
+
 int
 cpuid_x86(int eax_in, int ecx_in, int *eax, int *ebx, int *ecx, int *edx);
+
 int
-dwt_shift_x86_sse2(unsigned char *yuvs, int width, int height, int *rgbs);
+rfxcodec_encode_dwt_shift_x86_sse2(const char *qtable,
+                                   unsigned char *data,
+                                   short *dwt_buffer1,
+                                   short *dwt_buffer);
 int
-diff_rlgr3_x86(sint16 *co, int num_co, uint8 *dst, int dst_bytes);
+rfxcodec_encode_dwt_shift_x86_sse41(const char *qtable,
+                                    unsigned char *data,
+                                    short *dwt_buffer1,
+                                    short *dwt_buffer);
 
+int
+rfxcodec_encode_diff_rlgr1_x86_sse2(short *co,
+                                    void *dst, int dst_bytes);
+int
+rfxcodec_encode_diff_rlgr3_x86_sse2(short *co,
+                                    void *dst, int dst_bytes);
+/* decode side; NOTE(review): roles below are read from the names only - confirm against the asm sources */
+int
+rfxcodec_decode_rlgr1_diff_x86_sse2(void *data, int data_bytes,
+                                    short *out_data);
+int
+rfxcodec_decode_rlgr3_diff_x86_sse2(void *data, int data_bytes,
+                                    short *out_data);
+int
+rfxcodec_decode_shift_idwt_x86_sse2(const char *qtable, short *src, short *dst);
+int
+rfxcodec_decode_yuv2rgb_x86_sse2(short *ydata, short *udata, short *vdata,
+                                 unsigned int *rgbdata, int stride);
+int
+rfxcodec_decode_yuva2argb_x86_sse2(short *ydata, short *udata,
+                                   short *vdata, char *adata,
+                                   unsigned int *rgbdata, int stride);
+
+#ifdef __cplusplus
+}
 #endif
 
+#endif
diff --git a/src/x86/readme.txt b/src/x86/readme.txt
deleted file mode 100644
index e69de29..0000000
diff --git a/src/x86/rfxcodec_encode_diff_rlgr1_x86_sse2.asm b/src/x86/rfxcodec_encode_diff_rlgr1_x86_sse2.asm
new file mode 100644
index 0000000..13d10e9
--- /dev/null
+++ b/src/x86/rfxcodec_encode_diff_rlgr1_x86_sse2.asm
@@ -0,0 +1,35 @@
+%ifidn __OUTPUT_FORMAT__,elf
+section .note.GNU-stack noalloc noexec nowrite progbits
+%endif
+
+section .data
+    const1 times 8 dw 1
+
+section .text
+
+%macro PROC 1
+    align 16
+    global %1
+    %1:
+%endmacro
+
+;int
+;rfxcodec_encode_diff_rlgr1_x86_sse2(short *co,
+;                                    void *dst, int dst_bytes);
+; stub: the arguments are never read and nothing is stored; eax is 0 on return
+%ifidn __OUTPUT_FORMAT__,elf
+PROC rfxcodec_encode_diff_rlgr1_x86_sse2
+%else
+PROC _rfxcodec_encode_diff_rlgr1_x86_sse2
+%endif
+    push ebx
+    push esi
+    push edi
+    ; placeholder body: no work is done between the register save and restore
+    mov eax, 0                          ; value left in eax for the caller
+    pop edi                             ; restore in reverse push order
+    pop esi
+    pop ebx
+    ret
+    align 16
+
diff --git a/src/x86/rfxcodec_encode_diff_rlgr3_x86_sse2.asm b/src/x86/rfxcodec_encode_diff_rlgr3_x86_sse2.asm
new file mode 100644
index 0000000..a8588f2
--- /dev/null
+++ b/src/x86/rfxcodec_encode_diff_rlgr3_x86_sse2.asm
@@ -0,0 +1,35 @@
+%ifidn __OUTPUT_FORMAT__,elf
+section .note.GNU-stack noalloc noexec nowrite progbits
+%endif
+
+section .data
+    const1 times 8 dw 1
+
+section .text
+
+%macro PROC 1
+    align 16
+    global %1
+    %1:
+%endmacro
+
+;int
+;rfxcodec_encode_diff_rlgr3_x86_sse2(short *co,
+;                                    void *dst, int dst_bytes);
+; stub: the arguments are never read and nothing is stored; eax is 0 on return
+%ifidn __OUTPUT_FORMAT__,elf
+PROC rfxcodec_encode_diff_rlgr3_x86_sse2
+%else
+PROC _rfxcodec_encode_diff_rlgr3_x86_sse2
+%endif
+    push ebx
+    push esi
+    push edi
+    ; placeholder body: no work is done between the register save and restore
+    mov eax, 0                          ; value left in eax for the caller
+    pop edi                             ; restore in reverse push order
+    pop esi
+    pop ebx
+    ret
+    align 16
+
diff --git a/src/x86/rfxcodec_encode_dwt_shift_x86_sse2.asm b/src/x86/rfxcodec_encode_dwt_shift_x86_sse2.asm
new file mode 100644
index 0000000..f6b71b2
--- /dev/null
+++ b/src/x86/rfxcodec_encode_dwt_shift_x86_sse2.asm
@@ -0,0 +1,1533 @@
+;
+;Copyright 2016 Jay Sorg
+;
+;Permission to use, copy, modify, distribute, and sell this software and its
+;documentation for any purpose is hereby granted without fee, provided that
+;the above copyright notice appear in all copies and that both that
+;copyright notice and this permission notice appear in supporting
+;documentation.
+;
+;The above copyright notice and this permission notice shall be included in
+;all copies or substantial portions of the Software.
+;
+;THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+;IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+;FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
+;OPEN GROUP BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN
+;AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+;CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+;
+;x86 asm dwt
+
+%ifidn __OUTPUT_FORMAT__,elf
+section .note.GNU-stack noalloc noexec nowrite progbits
+%endif
+
+section .data
+    align 16
+    cw128    times 8 dw 128             ; 128 in each of the 8 word lanes
+    cdFFFF   times 4 dd 65535           ; per dword mask that keeps only the low word
+    ; one 8 word vector per factor, 0 to 15: 1 << (factor - 1), and 0 for factor 0
+    cwa0     times 8 dw 0     ; 0
+    cwa1     times 8 dw 1     ; 1
+    cwa2     times 8 dw 2     ; 2
+    cwa4     times 8 dw 4     ; 3
+    cwa8     times 8 dw 8     ; 4
+    cwa16    times 8 dw 16    ; 5
+    cwa32    times 8 dw 32    ; 6
+    cwa64    times 8 dw 64    ; 7
+    cwa128   times 8 dw 128   ; 8
+    cwa256   times 8 dw 256   ; 9
+    cwa512   times 8 dw 512   ; 10
+    cwa1024  times 8 dw 1024  ; 11
+    cwa2048  times 8 dw 2048  ; 12
+    cwa4096  times 8 dw 4096  ; 13
+    cwa8192  times 8 dw 8192  ; 14
+    cwa16384 times 8 dw 16384 ; 15
+
+section .text
+; PROC name: align to 16 bytes, declare name global and place its label
+%macro PROC 1
+    align 16
+    global %1
+    %1:
+%endmacro
+; add / shift count operands for the hi and lo outputs, read from the stack; the + 4 presumably skips the helper's return address - confirm with the caller
+%define LHI_ADD  [esp + 1 * 16 + 4]
+%define LHI_SFT  [esp + 2 * 16 + 4]
+%define LLO_ADD  [esp + 3 * 16 + 4]
+%define LLO_SFT  [esp + 4 * 16 + 4]
+
+;******************************************************************************
+; horizontal pass, source 16 bit signed, 16 pixel width, 8 rows; esi = src, edi = hi out, edx = lo out; uses eax, ecx, xmm1-xmm7
+rfx_dwt_2d_encode_block_horiz_16_16:
+    mov ecx, 8                          ; row counter
+loop1a:
+    ; pre / post (a row is a single 16 word block, so both row edges are handled here)
+    movdqa xmm1, [esi]                  ; src[2n]
+    movdqa xmm2, [esi + 16]
+    movdqa xmm6, xmm1                   ; keep the raw 16 words in xmm6 / xmm7
+    movdqa xmm7, xmm2
+    pand xmm1, [cdFFFF]                 ; keep the even words
+    pand xmm2, [cdFFFF]
+    pslld xmm1, 16
+    pslld xmm2, 16
+    psrad xmm1, 16                      ; sign extend so packssdw does not saturate
+    psrad xmm2, 16
+    packssdw xmm1, xmm2                 ; xmm1 = src[0], src[2], ... src[14]
+    movdqa xmm2, xmm6                   ; src[2n + 1]
+    movdqa xmm3, xmm7
+    psrldq xmm2, 2                      ; move the odd words into the even slots
+    psrldq xmm3, 2
+    pand xmm2, [cdFFFF]
+    pand xmm3, [cdFFFF]
+    pslld xmm2, 16
+    pslld xmm3, 16
+    psrad xmm2, 16
+    psrad xmm3, 16
+    packssdw xmm2, xmm3                 ; xmm2 = src[1], src[3], ... src[15]
+    movdqa xmm3, xmm6                   ; src[2n + 2]
+    movdqa xmm4, xmm7
+    psrldq xmm3, 4
+    psrldq xmm4, 4
+    movd eax, xmm7                      ; src[8], src[9] fill the top of the first half
+    movd xmm5, eax
+    pslldq xmm5, 12
+    por xmm3, xmm5
+    movdqa xmm5, xmm7                   ; right edge: src[14] stands in for src[16]
+    psrldq xmm5, 12
+    pslldq xmm5, 12
+    por xmm4, xmm5
+    pand xmm3, [cdFFFF]
+    pand xmm4, [cdFFFF]
+    pslld xmm3, 16
+    pslld xmm4, 16
+    psrad xmm3, 16
+    psrad xmm4, 16
+    packssdw xmm3, xmm4                 ; xmm3 = src[2], src[4], ... src[14], src[14]
+    movdqa xmm4, xmm1
+    movdqa xmm5, xmm2
+    movdqa xmm6, xmm3
+    ; h[n] = (src[2n + 1] - ((src[2n] + src[2n + 2]) >> 1)) >> 1
+    paddw xmm4, xmm6
+    psraw xmm4, 1
+    psubw xmm5, xmm4
+    psraw xmm5, 1
+    movdqa xmm6, xmm5                   ; out hi
+    paddw xmm6, LHI_ADD
+    psraw xmm6, LHI_SFT                 ; stored hi = (h + LHI_ADD) >> LHI_SFT
+    movdqa [edi], xmm6
+    ; l[n] = src[2n] + ((h[n - 1] + h[n]) >> 1)
+    movdqa xmm7, xmm5
+    movd eax, xmm7
+    pslldq xmm7, 2
+    and eax, 0xFFFF                     ; h[0] alone
+    movd xmm6, eax
+    por xmm7, xmm6                      ; xmm7 = h[n - 1], with h[0] used for h[-1]
+    paddw xmm5, xmm7
+    psraw xmm5, 1
+    paddw xmm5, xmm1
+
+    movdqa xmm6, xmm5                   ; out lo
+    paddw xmm6, LLO_ADD
+    psraw xmm6, LLO_SFT                 ; stored lo = (l + LLO_ADD) >> LLO_SFT
+    movdqa [edx], xmm6
+
+    ; move right
+    lea esi, [esi + 16 * 2]
+    lea edi, [edi + 8 * 2]
+    lea edx, [edx + 8 * 2]
+
+    ; move left (back to the start of the row)
+    lea esi, [esi - 16 * 2]
+    lea edi, [edi - 8 * 2]
+    lea edx, [edx - 8 * 2]
+
+    ; move down (16 source words, 8 words in each output)
+    lea esi, [esi + 16 * 2]
+    lea edi, [edi + 8 * 2]
+    lea edx, [edx + 8 * 2]
+
+    dec ecx
+    jnz loop1a
+
+    ret
+
+;******************************************************************************
+; vertical pass, source 16 bit signed, 16 pixel width, 16 rows, 8 columns at a time; esi = src, edi = hi out, edx = lo out; no add / shift applied
+rfx_dwt_2d_encode_block_verti_16_16:
+    mov ecx, 2                          ; two groups of 8 columns
+loop1b:
+    ; pre
+    movdqa xmm1, [esi]                  ; src[2n]
+    movdqa xmm2, [esi + 16 * 2]         ; src[2n + 1]
+    movdqa xmm3, [esi + 16 * 2 * 2]     ; src[2n + 2]
+    movdqa xmm4, xmm1
+    movdqa xmm5, xmm2
+    movdqa xmm6, xmm3
+    ; h[n] = (src[2n + 1] - ((src[2n] + src[2n + 2]) >> 1)) >> 1
+    paddw xmm4, xmm6
+    psraw xmm4, 1
+    psubw xmm5, xmm4
+    psraw xmm5, 1
+    movdqa [edi], xmm5                  ; out hi
+    movdqa xmm6, xmm5                   ; save hi
+    ; l[n] = src[2n] + ((h[n - 1] + h[n]) >> 1), first row: h[0] is added directly
+    paddw xmm5, xmm1
+    movdqa [edx], xmm5                  ; out lo
+    movdqa xmm7, xmm6                   ; save hi
+    ; move down
+    lea esi, [esi + 16 * 2 * 2]         ; 2 rows
+    lea edi, [edi + 16 * 2]             ; 1 row
+    lea edx, [edx + 16 * 2]             ; 1 row
+
+    ; loop, column counter is parked in the high half of ecx while cx counts rows
+    shl ecx, 16
+    mov cx, 6
+loop2b:
+    movdqa xmm1, xmm3                   ; src[2n]
+    movdqa xmm2, [esi + 16 * 2]         ; src[2n + 1]
+    movdqa xmm3, [esi + 16 * 2 * 2]     ; src[2n + 2]
+    movdqa xmm4, xmm1
+    movdqa xmm5, xmm2
+    movdqa xmm6, xmm3
+    ; h[n] = (src[2n + 1] - ((src[2n] + src[2n + 2]) >> 1)) >> 1
+    paddw xmm4, xmm6
+    psraw xmm4, 1
+    psubw xmm5, xmm4
+    psraw xmm5, 1
+    movdqa [edi], xmm5                  ; out hi
+    movdqa xmm6, xmm5                   ; save hi
+    ; l[n] = src[2n] + ((h[n - 1] + h[n]) >> 1)
+    paddw xmm5, xmm7                    ; xmm7 = h[n - 1] from the previous step
+    psraw xmm5, 1
+    paddw xmm5, xmm1
+    movdqa [edx], xmm5                  ; out lo
+    movdqa xmm7, xmm6                   ; save hi
+    ; move down
+    lea esi, [esi + 16 * 2 * 2]         ; 2 rows
+    lea edi, [edi + 16 * 2]             ; 1 row
+    lea edx, [edx + 16 * 2]             ; 1 row
+
+    dec cx
+    jnz loop2b
+    shr ecx, 16                         ; column counter back in the low half
+
+    ; post
+    movdqa xmm1, xmm3                   ; src[2n]
+    movdqa xmm2, [esi + 16 * 2]         ; src[2n + 1]
+    movdqa xmm4, xmm1
+    movdqa xmm5, xmm2
+    movdqa xmm6, xmm3                   ; no row below: src[2n] is used as src[2n + 2]
+    ; h[n] = (src[2n + 1] - ((src[2n] + src[2n + 2]) >> 1)) >> 1
+    paddw xmm4, xmm6
+    psraw xmm4, 1
+    psubw xmm5, xmm4
+    psraw xmm5, 1
+    movdqa [edi], xmm5                  ; out hi
+    ; l[n] = src[2n] + ((h[n - 1] + h[n]) >> 1)
+    paddw xmm5, xmm7
+    psraw xmm5, 1
+    paddw xmm5, xmm1
+    movdqa [edx], xmm5                  ; out lo
+    ; move down
+    lea esi, [esi + 16 * 2 * 2]         ; 2 row
+    lea edi, [edi + 16 * 2]             ; 1 row
+    lea edx, [edx + 16 * 2]             ; 1 row
+
+    ; move up (16 source rows, 8 rows in each output)
+    lea esi, [esi - 16 * 16 * 2]
+    lea edi, [edi - 8 * 16 * 2]
+    lea edx, [edx - 8 * 16 * 2]
+
+    ; move right (next 8 columns)
+    lea esi, [esi + 16]
+    lea edi, [edi + 16]
+    lea edx, [edx + 16]
+
+    dec ecx
+    jnz loop1b
+
+    ret
+
+;******************************************************************************
+; horizontal pass, source 16 bit signed, 32 pixel width, 16 rows; esi = src, edi = hi out, edx = lo out; uses eax, ebx, ecx, xmm1-xmm7
+rfx_dwt_2d_encode_block_horiz_16_32:
+    mov ecx, 16                         ; row counter
+loop1c:
+    ; pre (first 16 words of the row)
+    movdqa xmm1, [esi]                  ; src[2n]
+    movdqa xmm2, [esi + 16]
+    movdqa xmm6, xmm1                   ; keep the raw 16 words in xmm6 / xmm7
+    movdqa xmm7, xmm2
+    pand xmm1, [cdFFFF]                 ; keep the even words
+    pand xmm2, [cdFFFF]
+    pslld xmm1, 16
+    pslld xmm2, 16
+    psrad xmm1, 16                      ; sign extend so packssdw does not saturate
+    psrad xmm2, 16
+    packssdw xmm1, xmm2
+    movdqa xmm2, xmm6                   ; src[2n + 1]
+    movdqa xmm3, xmm7
+    psrldq xmm2, 2                      ; move the odd words into the even slots
+    psrldq xmm3, 2
+    pand xmm2, [cdFFFF]
+    pand xmm3, [cdFFFF]
+    pslld xmm2, 16
+    pslld xmm3, 16
+    psrad xmm2, 16
+    psrad xmm3, 16
+    packssdw xmm2, xmm3
+    movdqa xmm3, xmm6                   ; src[2n + 2]
+    movdqa xmm4, xmm7
+    psrldq xmm3, 4
+    psrldq xmm4, 4
+    movd eax, xmm7                      ; src[8], src[9] fill the top of the first half
+    movd xmm5, eax
+    pslldq xmm5, 12
+    por xmm3, xmm5
+    mov eax, [esi + 32]                 ; src[16], src[17] come from the next block
+    movd xmm5, eax
+    pslldq xmm5, 12
+    por xmm4, xmm5
+    pand xmm3, [cdFFFF]
+    pand xmm4, [cdFFFF]
+    pslld xmm3, 16
+    pslld xmm4, 16
+    psrad xmm3, 16
+    psrad xmm4, 16
+    packssdw xmm3, xmm4
+    movdqa xmm4, xmm1
+    movdqa xmm5, xmm2
+    movdqa xmm6, xmm3
+    ; h[n] = (src[2n + 1] - ((src[2n] + src[2n + 2]) >> 1)) >> 1
+    paddw xmm4, xmm6
+    psraw xmm4, 1
+    psubw xmm5, xmm4
+    psraw xmm5, 1
+
+    movdqa xmm6, xmm5                   ; out hi
+    paddw xmm6, LHI_ADD
+    psraw xmm6, LHI_SFT                 ; stored hi = (h + LHI_ADD) >> LHI_SFT
+    movdqa [edi], xmm6
+    movdqa xmm2, xmm5                   ; save hi
+
+    ; l[n] = src[2n] + ((h[n - 1] + h[n]) >> 1)
+    movdqa xmm7, xmm5
+    movd eax, xmm7
+    pslldq xmm7, 2
+    and eax, 0xFFFF                     ; h[0] alone
+    movd xmm6, eax
+    por xmm7, xmm6                      ; xmm7 = h[n - 1], with h[0] used for h[-1]
+    paddw xmm5, xmm7
+    psraw xmm5, 1
+    paddw xmm5, xmm1
+
+    psrldq xmm2, 14                     ; last h of this block, zero extended
+    movd ebx, xmm2                      ; save hi
+
+    movdqa xmm6, xmm5                   ; out lo
+    paddw xmm6, LLO_ADD
+    psraw xmm6, LLO_SFT                 ; stored lo = (l + LLO_ADD) >> LLO_SFT
+    movdqa [edx], xmm6
+
+    ; move right
+    lea esi, [esi + 16 * 2]
+    lea edi, [edi + 8 * 2]
+    lea edx, [edx + 8 * 2]
+
+    ; post (last 16 words of the row)
+    movdqa xmm1, [esi]                  ; src[2n]
+    movdqa xmm2, [esi + 16]
+    movdqa xmm6, xmm1
+    movdqa xmm7, xmm2
+    pand xmm1, [cdFFFF]
+    pand xmm2, [cdFFFF]
+    pslld xmm1, 16
+    pslld xmm2, 16
+    psrad xmm1, 16
+    psrad xmm2, 16
+    packssdw xmm1, xmm2
+    movdqa xmm2, xmm6                   ; src[2n + 1]
+    movdqa xmm3, xmm7
+    psrldq xmm2, 2
+    psrldq xmm3, 2
+    pand xmm2, [cdFFFF]
+    pand xmm3, [cdFFFF]
+    pslld xmm2, 16
+    pslld xmm3, 16
+    psrad xmm2, 16
+    psrad xmm3, 16
+    packssdw xmm2, xmm3
+    movdqa xmm3, xmm6                   ; src[2n + 2]
+    movdqa xmm4, xmm7
+    psrldq xmm3, 4
+    psrldq xmm4, 4
+    movd eax, xmm7
+    movd xmm5, eax
+    pslldq xmm5, 12
+    por xmm3, xmm5
+    movdqa xmm5, xmm7                   ; right edge: the last even word is reused as src[2n + 2]
+    psrldq xmm5, 12
+    pslldq xmm5, 12
+    por xmm4, xmm5
+    pand xmm3, [cdFFFF]
+    pand xmm4, [cdFFFF]
+    pslld xmm3, 16
+    pslld xmm4, 16
+    psrad xmm3, 16
+    psrad xmm4, 16
+    packssdw xmm3, xmm4
+    movdqa xmm4, xmm1
+    movdqa xmm5, xmm2
+    movdqa xmm6, xmm3
+    ; h[n] = (src[2n + 1] - ((src[2n] + src[2n + 2]) >> 1)) >> 1
+    paddw xmm4, xmm6
+    psraw xmm4, 1
+    psubw xmm5, xmm4
+    psraw xmm5, 1
+
+    movdqa xmm6, xmm5                   ; out hi
+    paddw xmm6, LHI_ADD
+    psraw xmm6, LHI_SFT
+    movdqa [edi], xmm6
+
+    ; l[n] = src[2n] + ((h[n - 1] + h[n]) >> 1)
+    movdqa xmm7, xmm5
+    pslldq xmm7, 2
+    movd xmm6, ebx                      ; h[n - 1] for the first lane is the h saved by pre
+    por xmm7, xmm6
+    paddw xmm5, xmm7
+    psraw xmm5, 1
+    paddw xmm5, xmm1
+
+    movdqa xmm6, xmm5                   ; out lo
+    paddw xmm6, LLO_ADD
+    psraw xmm6, LLO_SFT
+    movdqa [edx], xmm6
+
+    ; move right
+    lea esi, [esi + 16 * 2]
+    lea edi, [edi + 8 * 2]
+    lea edx, [edx + 8 * 2]
+
+    ; move left (back to the start of the row)
+    lea esi, [esi - 32 * 2]
+    lea edi, [edi - 16 * 2]
+    lea edx, [edx - 16 * 2]
+
+    ; move down (32 source words, 16 words in each output)
+    lea esi, [esi + 32 * 2]
+    lea edi, [edi + 16 * 2]
+    lea edx, [edx + 16 * 2]
+
+    dec ecx
+    jnz loop1c
+
+    ret
+
+;******************************************************************************
+; horizontal pass, source 16 bit signed, 32 pixel width, 16 rows; like the 16_32 routine but lo is stored without add / shift; uses eax, ebx, ecx, xmm1-xmm7
+rfx_dwt_2d_encode_block_horiz_16_32_no_lo:
+    mov ecx, 16                         ; row counter
+loop1c1:
+    ; pre (first 16 words of the row)
+    movdqa xmm1, [esi]                  ; src[2n]
+    movdqa xmm2, [esi + 16]
+    movdqa xmm6, xmm1                   ; keep the raw 16 words in xmm6 / xmm7
+    movdqa xmm7, xmm2
+    pand xmm1, [cdFFFF]                 ; keep the even words
+    pand xmm2, [cdFFFF]
+    pslld xmm1, 16
+    pslld xmm2, 16
+    psrad xmm1, 16                      ; sign extend so packssdw does not saturate
+    psrad xmm2, 16
+    packssdw xmm1, xmm2
+    movdqa xmm2, xmm6                   ; src[2n + 1]
+    movdqa xmm3, xmm7
+    psrldq xmm2, 2                      ; move the odd words into the even slots
+    psrldq xmm3, 2
+    pand xmm2, [cdFFFF]
+    pand xmm3, [cdFFFF]
+    pslld xmm2, 16
+    pslld xmm3, 16
+    psrad xmm2, 16
+    psrad xmm3, 16
+    packssdw xmm2, xmm3
+    movdqa xmm3, xmm6                   ; src[2n + 2]
+    movdqa xmm4, xmm7
+    psrldq xmm3, 4
+    psrldq xmm4, 4
+    movd eax, xmm7                      ; src[8], src[9] fill the top of the first half
+    movd xmm5, eax
+    pslldq xmm5, 12
+    por xmm3, xmm5
+    mov eax, [esi + 32]                 ; src[16], src[17] come from the next block
+    movd xmm5, eax
+    pslldq xmm5, 12
+    por xmm4, xmm5
+    pand xmm3, [cdFFFF]
+    pand xmm4, [cdFFFF]
+    pslld xmm3, 16
+    pslld xmm4, 16
+    psrad xmm3, 16
+    psrad xmm4, 16
+    packssdw xmm3, xmm4
+    movdqa xmm4, xmm1
+    movdqa xmm5, xmm2
+    movdqa xmm6, xmm3
+    ; h[n] = (src[2n + 1] - ((src[2n] + src[2n + 2]) >> 1)) >> 1
+    paddw xmm4, xmm6
+    psraw xmm4, 1
+    psubw xmm5, xmm4
+    psraw xmm5, 1
+
+    movdqa xmm6, xmm5                   ; out hi
+    paddw xmm6, LHI_ADD
+    psraw xmm6, LHI_SFT                 ; stored hi = (h + LHI_ADD) >> LHI_SFT
+    movdqa [edi], xmm6
+    movdqa xmm2, xmm5                   ; save hi
+
+    ; l[n] = src[2n] + ((h[n - 1] + h[n]) >> 1)
+    movdqa xmm7, xmm5
+    movd eax, xmm7
+    pslldq xmm7, 2
+    and eax, 0xFFFF                     ; h[0] alone
+    movd xmm6, eax
+    por xmm7, xmm6                      ; xmm7 = h[n - 1], with h[0] used for h[-1]
+    paddw xmm5, xmm7
+    psraw xmm5, 1
+    paddw xmm5, xmm1
+
+    psrldq xmm2, 14                     ; last h of this block, zero extended
+    movd ebx, xmm2                      ; save hi
+
+    movdqa [edx], xmm5                  ; out lo, stored as computed
+
+    ; move right
+    lea esi, [esi + 16 * 2]
+    lea edi, [edi + 8 * 2]
+    lea edx, [edx + 8 * 2]
+
+    ; post (last 16 words of the row)
+    movdqa xmm1, [esi]                  ; src[2n]
+    movdqa xmm2, [esi + 16]
+    movdqa xmm6, xmm1
+    movdqa xmm7, xmm2
+    pand xmm1, [cdFFFF]
+    pand xmm2, [cdFFFF]
+    pslld xmm1, 16
+    pslld xmm2, 16
+    psrad xmm1, 16
+    psrad xmm2, 16
+    packssdw xmm1, xmm2
+    movdqa xmm2, xmm6                   ; src[2n + 1]
+    movdqa xmm3, xmm7
+    psrldq xmm2, 2
+    psrldq xmm3, 2
+    pand xmm2, [cdFFFF]
+    pand xmm3, [cdFFFF]
+    pslld xmm2, 16
+    pslld xmm3, 16
+    psrad xmm2, 16
+    psrad xmm3, 16
+    packssdw xmm2, xmm3
+    movdqa xmm3, xmm6                   ; src[2n + 2]
+    movdqa xmm4, xmm7
+    psrldq xmm3, 4
+    psrldq xmm4, 4
+    movd eax, xmm7
+    movd xmm5, eax
+    pslldq xmm5, 12
+    por xmm3, xmm5
+    movdqa xmm5, xmm7                   ; right edge: the last even word is reused as src[2n + 2]
+    psrldq xmm5, 12
+    pslldq xmm5, 12
+    por xmm4, xmm5
+    pand xmm3, [cdFFFF]
+    pand xmm4, [cdFFFF]
+    pslld xmm3, 16
+    pslld xmm4, 16
+    psrad xmm3, 16
+    psrad xmm4, 16
+    packssdw xmm3, xmm4
+    movdqa xmm4, xmm1
+    movdqa xmm5, xmm2
+    movdqa xmm6, xmm3
+    ; h[n] = (src[2n + 1] - ((src[2n] + src[2n + 2]) >> 1)) >> 1
+    paddw xmm4, xmm6
+    psraw xmm4, 1
+    psubw xmm5, xmm4
+    psraw xmm5, 1
+
+    movdqa xmm6, xmm5                   ; out hi
+    paddw xmm6, LHI_ADD
+    psraw xmm6, LHI_SFT
+    movdqa [edi], xmm6
+
+    ; l[n] = src[2n] + ((h[n - 1] + h[n]) >> 1)
+    movdqa xmm7, xmm5
+    pslldq xmm7, 2
+    movd xmm6, ebx                      ; h[n - 1] for the first lane is the h saved by pre
+    por xmm7, xmm6
+    paddw xmm5, xmm7
+    psraw xmm5, 1
+    paddw xmm5, xmm1
+
+    movdqa [edx], xmm5                  ; out lo, stored as computed
+
+    ; move right
+    lea esi, [esi + 16 * 2]
+    lea edi, [edi + 8 * 2]
+    lea edx, [edx + 8 * 2]
+
+    ; move left (back to the start of the row)
+    lea esi, [esi - 32 * 2]
+    lea edi, [edi - 16 * 2]
+    lea edx, [edx - 16 * 2]
+
+    ; move down (32 source words, 16 words in each output)
+    lea esi, [esi + 32 * 2]
+    lea edi, [edi + 16 * 2]
+    lea edx, [edx + 16 * 2]
+
+    dec ecx
+    jnz loop1c1
+
+    ret
+
+;******************************************************************************
+; vertical pass, source 16 bit signed, 32 pixel width, 32 rows, 8 columns at a time; esi = src, edi = hi out, edx = lo out; no add / shift applied
+rfx_dwt_2d_encode_block_verti_16_32:
+    mov ecx, 4                          ; four groups of 8 columns
+loop1d:
+    ; pre
+    movdqa xmm1, [esi]                  ; src[2n]
+    movdqa xmm2, [esi + 32 * 2]         ; src[2n + 1]
+    movdqa xmm3, [esi + 32 * 2 * 2]     ; src[2n + 2]
+    movdqa xmm4, xmm1
+    movdqa xmm5, xmm2
+    movdqa xmm6, xmm3
+    ; h[n] = (src[2n + 1] - ((src[2n] + src[2n + 2]) >> 1)) >> 1
+    paddw xmm4, xmm6
+    psraw xmm4, 1
+    psubw xmm5, xmm4
+    psraw xmm5, 1
+    movdqa [edi], xmm5                  ; out hi
+    movdqa xmm6, xmm5                   ; save hi
+    ; l[n] = src[2n] + ((h[n - 1] + h[n]) >> 1), first row: h[0] is added directly
+    paddw xmm5, xmm1
+    movdqa [edx], xmm5                  ; out lo
+    movdqa xmm7, xmm6                   ; save hi
+    ; move down
+    lea esi, [esi + 32 * 2 * 2]         ; 2 rows
+    lea edi, [edi + 32 * 2]             ; 1 row
+    lea edx, [edx + 32 * 2]             ; 1 row
+
+    ; loop, column counter is parked in the high half of ecx while cx counts rows
+    shl ecx, 16
+    mov cx, 14
+loop2d:
+    movdqa xmm1, xmm3                   ; src[2n]
+    movdqa xmm2, [esi + 32 * 2]         ; src[2n + 1]
+    movdqa xmm3, [esi + 32 * 2 * 2]     ; src[2n + 2]
+    movdqa xmm4, xmm1
+    movdqa xmm5, xmm2
+    movdqa xmm6, xmm3
+    ; h[n] = (src[2n + 1] - ((src[2n] + src[2n + 2]) >> 1)) >> 1
+    paddw xmm4, xmm6
+    psraw xmm4, 1
+    psubw xmm5, xmm4
+    psraw xmm5, 1
+    movdqa [edi], xmm5                  ; out hi
+    movdqa xmm6, xmm5                   ; save hi
+    ; l[n] = src[2n] + ((h[n - 1] + h[n]) >> 1)
+    paddw xmm5, xmm7                    ; xmm7 = h[n - 1] from the previous step
+    psraw xmm5, 1
+    paddw xmm5, xmm1
+    movdqa [edx], xmm5                  ; out lo
+    movdqa xmm7, xmm6                   ; save hi
+    ; move down
+    lea esi, [esi + 32 * 2 * 2]         ; 2 rows
+    lea edi, [edi + 32 * 2]             ; 1 row
+    lea edx, [edx + 32 * 2]             ; 1 row
+
+    dec cx
+    jnz loop2d
+    shr ecx, 16                         ; column counter back in the low half
+
+    ; post
+    movdqa xmm1, xmm3                   ; src[2n]
+    movdqa xmm2, [esi + 32 * 2]         ; src[2n + 1]
+    movdqa xmm4, xmm1
+    movdqa xmm5, xmm2
+    movdqa xmm6, xmm3                   ; no row below: src[2n] is used as src[2n + 2]
+    ; h[n] = (src[2n + 1] - ((src[2n] + src[2n + 2]) >> 1)) >> 1
+    paddw xmm4, xmm6
+    psraw xmm4, 1
+    psubw xmm5, xmm4
+    psraw xmm5, 1
+    movdqa [edi], xmm5                  ; out hi
+    ; l[n] = src[2n] + ((h[n - 1] + h[n]) >> 1)
+    paddw xmm5, xmm7
+    psraw xmm5, 1
+    paddw xmm5, xmm1
+    movdqa [edx], xmm5                  ; out lo
+    ; move down
+    lea esi, [esi + 32 * 2 * 2]         ; 2 row
+    lea edi, [edi + 32 * 2]             ; 1 row
+    lea edx, [edx + 32 * 2]             ; 1 row
+
+    ; move up (32 source rows, 16 rows in each output)
+    lea esi, [esi - 32 * 32 * 2]
+    lea edi, [edi - 16 * 32 * 2]
+    lea edx, [edx - 16 * 32 * 2]
+
+    ; move right (next 8 columns)
+    lea esi, [esi + 16]
+    lea edi, [edi + 16]
+    lea edx, [edx + 16]
+
+    dec ecx
+    jnz loop1d
+
+    ret
+
+;******************************************************************************
+; horizontal pass, source 16 bit signed, 64 pixel width, 32 rows; esi = src, edi = hi out, edx = lo out; uses eax, ebx, ecx, xmm1-xmm7
+rfx_dwt_2d_encode_block_horiz_16_64:
+    mov ecx, 32                         ; row counter
+loop1e:
+    ; pre (first 16 words of the row)
+    movdqa xmm1, [esi]                  ; src[2n]
+    movdqa xmm2, [esi + 16]
+    movdqa xmm6, xmm1                   ; keep the raw 16 words in xmm6 / xmm7
+    movdqa xmm7, xmm2
+    pand xmm1, [cdFFFF]                 ; keep the even words
+    pand xmm2, [cdFFFF]
+    pslld xmm1, 16
+    pslld xmm2, 16
+    psrad xmm1, 16                      ; sign extend so packssdw does not saturate
+    psrad xmm2, 16
+    packssdw xmm1, xmm2
+    movdqa xmm2, xmm6                   ; src[2n + 1]
+    movdqa xmm3, xmm7
+    psrldq xmm2, 2                      ; move the odd words into the even slots
+    psrldq xmm3, 2
+    pand xmm2, [cdFFFF]
+    pand xmm3, [cdFFFF]
+    pslld xmm2, 16
+    pslld xmm3, 16
+    psrad xmm2, 16
+    psrad xmm3, 16
+    packssdw xmm2, xmm3
+    movdqa xmm3, xmm6                   ; src[2n + 2]
+    movdqa xmm4, xmm7
+    psrldq xmm3, 4
+    psrldq xmm4, 4
+    movd eax, xmm7                      ; src[8], src[9] fill the top of the first half
+    movd xmm5, eax
+    pslldq xmm5, 12
+    por xmm3, xmm5
+    mov eax, [esi + 32]                 ; src[16], src[17] come from the next block
+    movd xmm5, eax
+    pslldq xmm5, 12
+    por xmm4, xmm5
+    pand xmm3, [cdFFFF]
+    pand xmm4, [cdFFFF]
+    pslld xmm3, 16
+    pslld xmm4, 16
+    psrad xmm3, 16
+    psrad xmm4, 16
+    packssdw xmm3, xmm4
+    movdqa xmm4, xmm1
+    movdqa xmm5, xmm2
+    movdqa xmm6, xmm3
+    ; h[n] = (src[2n + 1] - ((src[2n] + src[2n + 2]) >> 1)) >> 1
+    paddw xmm4, xmm6
+    psraw xmm4, 1
+    psubw xmm5, xmm4
+    psraw xmm5, 1
+
+    movdqa xmm6, xmm5                   ; out hi
+    paddw xmm6, LHI_ADD
+    psraw xmm6, LHI_SFT                 ; stored hi = (h + LHI_ADD) >> LHI_SFT
+    movdqa [edi], xmm6
+    movdqa xmm2, xmm5                   ; save hi
+
+    ; l[n] = src[2n] + ((h[n - 1] + h[n]) >> 1)
+    movdqa xmm7, xmm5
+    movd eax, xmm7
+    pslldq xmm7, 2
+    and eax, 0xFFFF                     ; h[0] alone
+    movd xmm6, eax
+    por xmm7, xmm6                      ; xmm7 = h[n - 1], with h[0] used for h[-1]
+    paddw xmm5, xmm7
+    psraw xmm5, 1
+    paddw xmm5, xmm1
+
+    psrldq xmm2, 14                     ; last h of this block, zero extended
+    movd ebx, xmm2                      ; save hi
+
+    movdqa xmm6, xmm5                   ; out lo
+    paddw xmm6, LLO_ADD
+    psraw xmm6, LLO_SFT                 ; stored lo = (l + LLO_ADD) >> LLO_SFT
+    movdqa [edx], xmm6
+
+    ; move right
+    lea esi, [esi + 16 * 2]
+    lea edi, [edi + 8 * 2]
+    lea edx, [edx + 8 * 2]
+
+    ; loop over the two middle blocks, row counter is parked in the high half of ecx
+    shl ecx, 16
+    mov cx, 2
+loop2e:
+    movdqa xmm1, [esi]                  ; src[2n]
+    movdqa xmm2, [esi + 16]
+    movdqa xmm6, xmm1
+    movdqa xmm7, xmm2
+    pand xmm1, [cdFFFF]
+    pand xmm2, [cdFFFF]
+    pslld xmm1, 16
+    pslld xmm2, 16
+    psrad xmm1, 16
+    psrad xmm2, 16
+    packssdw xmm1, xmm2
+    movdqa xmm2, xmm6                   ; src[2n + 1]
+    movdqa xmm3, xmm7
+    psrldq xmm2, 2
+    psrldq xmm3, 2
+    pand xmm2, [cdFFFF]
+    pand xmm3, [cdFFFF]
+    pslld xmm2, 16
+    pslld xmm3, 16
+    psrad xmm2, 16
+    psrad xmm3, 16
+    packssdw xmm2, xmm3
+    movdqa xmm3, xmm6                   ; src[2n + 2]
+    movdqa xmm4, xmm7
+    psrldq xmm3, 4
+    psrldq xmm4, 4
+    movd eax, xmm7
+    movd xmm5, eax
+    pslldq xmm5, 12
+    por xmm3, xmm5
+    mov eax, [esi + 32]                 ; first two words of the next block
+    movd xmm5, eax
+    pslldq xmm5, 12
+    por xmm4, xmm5
+    pand xmm3, [cdFFFF]
+    pand xmm4, [cdFFFF]
+    pslld xmm3, 16
+    pslld xmm4, 16
+    psrad xmm3, 16
+    psrad xmm4, 16
+    packssdw xmm3, xmm4
+    movdqa xmm4, xmm1
+    movdqa xmm5, xmm2
+    movdqa xmm6, xmm3
+    ; h[n] = (src[2n + 1] - ((src[2n] + src[2n + 2]) >> 1)) >> 1
+    paddw xmm4, xmm6
+    psraw xmm4, 1
+    psubw xmm5, xmm4
+    psraw xmm5, 1
+
+    movdqa xmm6, xmm5                   ; out hi
+    paddw xmm6, LHI_ADD
+    psraw xmm6, LHI_SFT
+    movdqa [edi], xmm6
+    movdqa xmm2, xmm5                   ; save hi
+
+    ; l[n] = src[2n] + ((h[n - 1] + h[n]) >> 1)
+    movdqa xmm7, xmm5
+    pslldq xmm7, 2
+    movd xmm6, ebx                      ; h[n - 1] for the first lane is the h saved by the previous block
+    por xmm7, xmm6
+    paddw xmm5, xmm7
+    psraw xmm5, 1
+    paddw xmm5, xmm1
+
+    psrldq xmm2, 14                     ; last h of this block, zero extended
+    movd ebx, xmm2                      ; save hi
+
+    movdqa xmm6, xmm5                   ; out lo
+    paddw xmm6, LLO_ADD
+    psraw xmm6, LLO_SFT
+    movdqa [edx], xmm6
+
+    ; move right
+    lea esi, [esi + 16 * 2]
+    lea edi, [edi + 8 * 2]
+    lea edx, [edx + 8 * 2]
+
+    dec cx
+    jnz loop2e
+    shr ecx, 16                         ; row counter back in the low half
+
+    ; post (last 16 words of the row)
+    movdqa xmm1, [esi]                  ; src[2n]
+    movdqa xmm2, [esi + 16]
+    movdqa xmm6, xmm1
+    movdqa xmm7, xmm2
+    pand xmm1, [cdFFFF]
+    pand xmm2, [cdFFFF]
+    pslld xmm1, 16
+    pslld xmm2, 16
+    psrad xmm1, 16
+    psrad xmm2, 16
+    packssdw xmm1, xmm2
+    movdqa xmm2, xmm6                   ; src[2n + 1]
+    movdqa xmm3, xmm7
+    psrldq xmm2, 2
+    psrldq xmm3, 2
+    pand xmm2, [cdFFFF]
+    pand xmm3, [cdFFFF]
+    pslld xmm2, 16
+    pslld xmm3, 16
+    psrad xmm2, 16
+    psrad xmm3, 16
+    packssdw xmm2, xmm3
+    movdqa xmm3, xmm6                   ; src[2n + 2]
+    movdqa xmm4, xmm7
+    psrldq xmm3, 4
+    psrldq xmm4, 4
+    movd eax, xmm7
+    movd xmm5, eax
+    pslldq xmm5, 12
+    por xmm3, xmm5
+    movdqa xmm5, xmm7                   ; right edge: the last even word is reused as src[2n + 2]
+    psrldq xmm5, 12
+    pslldq xmm5, 12
+    por xmm4, xmm5
+    pand xmm3, [cdFFFF]
+    pand xmm4, [cdFFFF]
+    pslld xmm3, 16
+    pslld xmm4, 16
+    psrad xmm3, 16
+    psrad xmm4, 16
+    packssdw xmm3, xmm4
+    movdqa xmm4, xmm1
+    movdqa xmm5, xmm2
+    movdqa xmm6, xmm3
+    ; h[n] = (src[2n + 1] - ((src[2n] + src[2n + 2]) >> 1)) >> 1
+    paddw xmm4, xmm6
+    psraw xmm4, 1
+    psubw xmm5, xmm4
+    psraw xmm5, 1
+
+    movdqa xmm6, xmm5                   ; out hi
+    paddw xmm6, LHI_ADD
+    psraw xmm6, LHI_SFT
+    movdqa [edi], xmm6
+
+    ; l[n] = src[2n] + ((h[n - 1] + h[n]) >> 1)
+    movdqa xmm7, xmm5
+    pslldq xmm7, 2
+    movd xmm6, ebx                      ; h[n - 1] for the first lane is the h saved by the previous block
+    por xmm7, xmm6
+    paddw xmm5, xmm7
+    psraw xmm5, 1
+    paddw xmm5, xmm1
+
+    movdqa xmm6, xmm5                   ; out lo
+    paddw xmm6, LLO_ADD
+    psraw xmm6, LLO_SFT
+    movdqa [edx], xmm6
+
+    ; move right
+    lea esi, [esi + 16 * 2]
+    lea edi, [edi + 8 * 2]
+    lea edx, [edx + 8 * 2]
+
+    ; move left (back to the start of the row)
+    lea esi, [esi - 64 * 2]
+    lea edi, [edi - 32 * 2]
+    lea edx, [edx - 32 * 2]
+
+    ; move down (64 source words, 32 words in each output)
+    lea esi, [esi + 64 * 2]
+    lea edi, [edi + 32 * 2]
+    lea edx, [edx + 32 * 2]
+
+    dec ecx
+    jnz loop1e
+
+    ret
+
+;******************************************************************************
+; source 16 bit signed, 64 pixel width; esi = src, edi = dst hi, edx = dst lo, lo is stored without LLO_ADD / LLO_SFT
+rfx_dwt_2d_encode_block_horiz_16_64_no_lo:
+    mov ecx, 32                         ; outer counter, one pass per 64 sample row
+loop1e1:
+    ; pre, first 16 samples of the row
+    movdqa xmm1, [esi]                  ; src[2n]
+    movdqa xmm2, [esi + 16]
+    movdqa xmm6, xmm1                   ; keep raw copies for the 2n + 1 and 2n + 2 passes
+    movdqa xmm7, xmm2
+    pand xmm1, [cdFFFF]
+    pand xmm2, [cdFFFF]
+    pslld xmm1, 16                      ; shift up then down to sign extend the low word of each dword
+    pslld xmm2, 16
+    psrad xmm1, 16
+    psrad xmm2, 16
+    packssdw xmm1, xmm2                 ; 8 words of src[2n]
+    movdqa xmm2, xmm6                   ; src[2n + 1]
+    movdqa xmm3, xmm7
+    psrldq xmm2, 2                      ; move the odd words into the low word of each dword
+    psrldq xmm3, 2
+    pand xmm2, [cdFFFF]
+    pand xmm3, [cdFFFF]
+    pslld xmm2, 16
+    pslld xmm3, 16
+    psrad xmm2, 16
+    psrad xmm3, 16
+    packssdw xmm2, xmm3                 ; 8 words of src[2n + 1]
+    movdqa xmm3, xmm6                   ; src[2n + 2]
+    movdqa xmm4, xmm7
+    psrldq xmm3, 4                      ; drop the first pair, top dword becomes zero
+    psrldq xmm4, 4
+    movd eax, xmm7
+    movd xmm5, eax
+    pslldq xmm5, 12
+    por xmm3, xmm5                      ; top dword filled from the first dword of the second half
+    mov eax, [esi + 32]                 ; first dword of the next 16 samples
+    movd xmm5, eax
+    pslldq xmm5, 12
+    por xmm4, xmm5
+    pand xmm3, [cdFFFF]
+    pand xmm4, [cdFFFF]
+    pslld xmm3, 16
+    pslld xmm4, 16
+    psrad xmm3, 16
+    psrad xmm4, 16
+    packssdw xmm3, xmm4                 ; 8 words of src[2n + 2]
+    movdqa xmm4, xmm1
+    movdqa xmm5, xmm2
+    movdqa xmm6, xmm3
+    ; h[n] = (src[2n + 1] - ((src[2n] + src[2n + 2]) >> 1)) >> 1
+    paddw xmm4, xmm6
+    psraw xmm4, 1
+    psubw xmm5, xmm4
+    psraw xmm5, 1
+
+    movdqa xmm6, xmm5                   ; out hi
+    paddw xmm6, LHI_ADD
+    psraw xmm6, LHI_SFT
+    movdqa [edi], xmm6
+    movdqa xmm2, xmm5                   ; save hi
+
+    ; l[n] = src[2n] + ((h[n - 1] + h[n]) >> 1)
+    movdqa xmm7, xmm5
+    movd eax, xmm7
+    pslldq xmm7, 2                      ; h[n - 1] for words 1 to 7
+    and eax, 0xFFFF                     ; h[0] alone
+    movd xmm6, eax
+    por xmm7, xmm6                      ; word 0 uses h[0] in place of h[-1]
+    paddw xmm5, xmm7
+    psraw xmm5, 1
+    paddw xmm5, xmm1
+
+    psrldq xmm2, 14                     ; last h word down to word 0
+    movd ebx, xmm2                      ; save hi, carried to the next group in ebx
+
+    movdqa [edx], xmm5                  ; out lo
+
+    ; move right
+    lea esi, [esi + 16 * 2]
+    lea edi, [edi + 8 * 2]
+    lea edx, [edx + 8 * 2]
+
+    ; loop
+    shl ecx, 16                         ; park the outer counter in the upper half of ecx
+    mov cx, 2                           ; inner counter, 2 middle groups of 16 samples
+loop2e1:
+    movdqa xmm1, [esi]                  ; src[2n]
+    movdqa xmm2, [esi + 16]
+    movdqa xmm6, xmm1
+    movdqa xmm7, xmm2
+    pand xmm1, [cdFFFF]
+    pand xmm2, [cdFFFF]
+    pslld xmm1, 16
+    pslld xmm2, 16
+    psrad xmm1, 16
+    psrad xmm2, 16
+    packssdw xmm1, xmm2
+    movdqa xmm2, xmm6                   ; src[2n + 1]
+    movdqa xmm3, xmm7
+    psrldq xmm2, 2
+    psrldq xmm3, 2
+    pand xmm2, [cdFFFF]
+    pand xmm3, [cdFFFF]
+    pslld xmm2, 16
+    pslld xmm3, 16
+    psrad xmm2, 16
+    psrad xmm3, 16
+    packssdw xmm2, xmm3
+    movdqa xmm3, xmm6                   ; src[2n + 2]
+    movdqa xmm4, xmm7
+    psrldq xmm3, 4
+    psrldq xmm4, 4
+    movd eax, xmm7
+    movd xmm5, eax
+    pslldq xmm5, 12
+    por xmm3, xmm5
+    mov eax, [esi + 32]                 ; first dword of the next 16 samples
+    movd xmm5, eax
+    pslldq xmm5, 12
+    por xmm4, xmm5
+    pand xmm3, [cdFFFF]
+    pand xmm4, [cdFFFF]
+    pslld xmm3, 16
+    pslld xmm4, 16
+    psrad xmm3, 16
+    psrad xmm4, 16
+    packssdw xmm3, xmm4
+    movdqa xmm4, xmm1
+    movdqa xmm5, xmm2
+    movdqa xmm6, xmm3
+    ; h[n] = (src[2n + 1] - ((src[2n] + src[2n + 2]) >> 1)) >> 1
+    paddw xmm4, xmm6
+    psraw xmm4, 1
+    psubw xmm5, xmm4
+    psraw xmm5, 1
+
+    movdqa xmm6, xmm5                   ; out hi
+    paddw xmm6, LHI_ADD
+    psraw xmm6, LHI_SFT
+    movdqa [edi], xmm6
+    movdqa xmm2, xmm5                   ; save hi
+
+    ; l[n] = src[2n] + ((h[n - 1] + h[n]) >> 1)
+    movdqa xmm7, xmm5
+    pslldq xmm7, 2
+    movd xmm6, ebx                      ; h[n - 1] for word 0, carried from the previous group
+    por xmm7, xmm6
+    paddw xmm5, xmm7
+    psraw xmm5, 1
+    paddw xmm5, xmm1
+
+    psrldq xmm2, 14
+    movd ebx, xmm2                      ; save hi
+
+    movdqa [edx], xmm5                  ; out lo
+
+    ; move right
+    lea esi, [esi + 16 * 2]
+    lea edi, [edi + 8 * 2]
+    lea edx, [edx + 8 * 2]
+
+    dec cx
+    jnz loop2e1
+    shr ecx, 16                         ; restore the outer counter
+
+    ; post, last 16 samples of the row
+    movdqa xmm1, [esi]                  ; src[2n]
+    movdqa xmm2, [esi + 16]
+    movdqa xmm6, xmm1
+    movdqa xmm7, xmm2
+    pand xmm1, [cdFFFF]
+    pand xmm2, [cdFFFF]
+    pslld xmm1, 16
+    pslld xmm2, 16
+    psrad xmm1, 16
+    psrad xmm2, 16
+    packssdw xmm1, xmm2
+    movdqa xmm2, xmm6                   ; src[2n + 1]
+    movdqa xmm3, xmm7
+    psrldq xmm2, 2
+    psrldq xmm3, 2
+    pand xmm2, [cdFFFF]
+    pand xmm3, [cdFFFF]
+    pslld xmm2, 16
+    pslld xmm3, 16
+    psrad xmm2, 16
+    psrad xmm3, 16
+    packssdw xmm2, xmm3
+    movdqa xmm3, xmm6                   ; src[2n + 2]
+    movdqa xmm4, xmm7
+    psrldq xmm3, 4
+    psrldq xmm4, 4
+    movd eax, xmm7
+    movd xmm5, eax
+    pslldq xmm5, 12
+    por xmm3, xmm5
+    movdqa xmm5, xmm7
+    psrldq xmm5, 12                     ; keep only the top dword of the row
+    pslldq xmm5, 12
+    por xmm4, xmm5                      ; last src[2n + 2] reuses the last even sample, no read past the row
+    pand xmm3, [cdFFFF]
+    pand xmm4, [cdFFFF]
+    pslld xmm3, 16
+    pslld xmm4, 16
+    psrad xmm3, 16
+    psrad xmm4, 16
+    packssdw xmm3, xmm4
+    movdqa xmm4, xmm1
+    movdqa xmm5, xmm2
+    movdqa xmm6, xmm3
+    ; h[n] = (src[2n + 1] - ((src[2n] + src[2n + 2]) >> 1)) >> 1
+    paddw xmm4, xmm6
+    psraw xmm4, 1
+    psubw xmm5, xmm4
+    psraw xmm5, 1
+
+    movdqa xmm6, xmm5                   ; out hi
+    paddw xmm6, LHI_ADD
+    psraw xmm6, LHI_SFT
+    movdqa [edi], xmm6
+
+    ; l[n] = src[2n] + ((h[n - 1] + h[n]) >> 1)
+    movdqa xmm7, xmm5
+    pslldq xmm7, 2
+    movd xmm6, ebx                      ; h[n - 1] for word 0, carried from the previous group
+    por xmm7, xmm6
+    paddw xmm5, xmm7
+    psraw xmm5, 1
+    paddw xmm5, xmm1
+
+    movdqa [edx], xmm5                  ; out lo
+
+    ; move right
+    lea esi, [esi + 16 * 2]
+    lea edi, [edi + 8 * 2]
+    lea edx, [edx + 8 * 2]
+
+    ; move left, back to the start of the row
+    lea esi, [esi - 64 * 2]
+    lea edi, [edi - 32 * 2]
+    lea edx, [edx - 32 * 2]
+
+    ; move down, 64 source words and 32 output words per row
+    lea esi, [esi + 64 * 2]
+    lea edi, [edi + 32 * 2]
+    lea edx, [edx + 32 * 2]
+
+    dec ecx
+    jnz loop1e1
+
+    ret
+
+;******************************************************************************
+; source 8 bit unsigned, 64 pixel width; esi = src, edi = dst hi, edx = dst lo, xmm0 is read as the zero high bytes for punpcklbw
+rfx_dwt_2d_encode_block_verti_8_64:
+    mov ecx, 8                          ; outer counter, 8 columns per pass
+loop1f:
+    ; pre, source rows 0 to 2
+    movq xmm1, [esi]                    ; src[2n]
+    movq xmm2, [esi + 64 * 1]           ; src[2n + 1]
+    movq xmm3, [esi + 64 * 1 * 2]       ; src[2n + 2]
+    punpcklbw xmm1, xmm0                ; interleave with xmm0 to widen 8 bytes to 8 words
+    punpcklbw xmm2, xmm0
+    punpcklbw xmm3, xmm0
+    psubw xmm1, [cw128]
+    psubw xmm2, [cw128]
+    psubw xmm3, [cw128]
+    psllw xmm1, 5                       ; scale by 1 << 5
+    psllw xmm2, 5
+    psllw xmm3, 5
+    movdqa xmm4, xmm1
+    movdqa xmm5, xmm2
+    movdqa xmm6, xmm3
+    ; h[n] = (src[2n + 1] - ((src[2n] + src[2n + 2]) >> 1)) >> 1
+    paddw xmm4, xmm6
+    psraw xmm4, 1
+    psubw xmm5, xmm4
+    psraw xmm5, 1
+    movdqa [edi], xmm5                  ; out hi
+    movdqa xmm6, xmm5                   ; save hi
+    ; l[n] = src[2n] + ((h[n - 1] + h[n]) >> 1), here with h[n - 1] taken as h[n]
+    paddw xmm5, xmm1                    ; l[0] = src[0] + h[0]
+    movdqa [edx], xmm5                  ; out lo
+    movdqa xmm7, xmm6                   ; save hi
+    ; move down
+    lea esi, [esi + 64 * 1 * 2]         ; 2 rows
+    lea edi, [edi + 64 * 2]             ; 1 row
+    lea edx, [edx + 64 * 2]             ; 1 row
+
+    ; loop
+    shl ecx, 16                         ; park the outer counter in the upper half of ecx
+    mov cx, 30                          ; inner counter, 30 middle row pairs
+loop2f:
+    movdqa xmm1, xmm3                   ; src[2n]
+    movq xmm2, [esi + 64 * 1]           ; src[2n + 1]
+    movq xmm3, [esi + 64 * 1 * 2]       ; src[2n + 2]
+    punpcklbw xmm2, xmm0
+    punpcklbw xmm3, xmm0
+    psubw xmm2, [cw128]
+    psubw xmm3, [cw128]
+    psllw xmm2, 5
+    psllw xmm3, 5
+    movdqa xmm4, xmm1
+    movdqa xmm5, xmm2
+    movdqa xmm6, xmm3
+    ; h[n] = (src[2n + 1] - ((src[2n] + src[2n + 2]) >> 1)) >> 1
+    paddw xmm4, xmm6
+    psraw xmm4, 1
+    psubw xmm5, xmm4
+    psraw xmm5, 1
+    movdqa [edi], xmm5                  ; out hi
+    movdqa xmm6, xmm5                   ; save hi
+    ; l[n] = src[2n] + ((h[n - 1] + h[n]) >> 1)
+    paddw xmm5, xmm7                    ; xmm7 holds h[n - 1] from the previous pass
+    psraw xmm5, 1
+    paddw xmm5, xmm1
+    movdqa [edx], xmm5                  ; out lo
+    movdqa xmm7, xmm6                   ; save hi
+    ; move down
+    lea esi, [esi + 64 * 1 * 2]         ; 2 rows
+    lea edi, [edi + 64 * 2]             ; 1 row
+    lea edx, [edx + 64 * 2]             ; 1 row
+
+    dec cx
+    jnz loop2f
+    shr ecx, 16                         ; restore the outer counter
+
+    ; post, last row pair
+    movdqa xmm1, xmm3                   ; src[2n]
+    movq xmm2, [esi + 64 * 1]           ; src[2n + 1]
+    punpcklbw xmm2, xmm0
+    psubw xmm2, [cw128]
+    psllw xmm2, 5
+    movdqa xmm4, xmm1
+    movdqa xmm5, xmm2
+    movdqa xmm6, xmm3                   ; no row is loaded for src[2n + 2], src[2n] is used again
+    ; h[n] = (src[2n + 1] - ((src[2n] + src[2n + 2]) >> 1)) >> 1
+    paddw xmm4, xmm6
+    psraw xmm4, 1
+    psubw xmm5, xmm4
+    psraw xmm5, 1
+    movdqa [edi], xmm5                  ; out hi
+    ; l[n] = src[2n] + ((h[n - 1] + h[n]) >> 1)
+    paddw xmm5, xmm7
+    psraw xmm5, 1
+    paddw xmm5, xmm1
+    movdqa [edx], xmm5                  ; out lo
+    ; move down
+    lea esi, [esi + 64 * 1 * 2]         ; 2 rows
+    lea edi, [edi + 64 * 2]             ; 1 row
+    lea edx, [edx + 64 * 2]             ; 1 row
+
+    ; move up, undo the 64 source rows and 32 output rows walked above
+    lea esi, [esi - 64 * 1 * 64]
+    lea edi, [edi - 32 * 64 * 2]
+    lea edx, [edx - 32 * 64 * 2]
+
+    ; move right, 8 source bytes and 8 output words
+    lea esi, [esi + 8]
+    lea edi, [edi + 16]
+    lea edx, [edx + 16]
+
+    dec ecx
+    jnz loop1f
+
+    ret
+
+set_quants_hi:                          ; in: eax = quant factor; sets LHI_SFT and LHI_ADD; changes eax, edx, xmm1
+    sub eax, 6 - 5                      ; eax = factor - 1
+    movd xmm1, eax
+    movdqa LHI_SFT, xmm1                ; shift count used by psraw on the hi output
+    imul eax, 16                        ; byte offset, table entries are indexed 16 bytes apart
+    lea edx, [cwa0]
+    add edx, eax
+    movdqa xmm1, [edx]                  ; entry at cwa0 + 16 * (factor - 1)
+    movdqa LHI_ADD, xmm1                ; words added to the hi output before the shift
+    ret
+
+set_quants_lo:                          ; in: eax = quant factor; sets LLO_SFT and LLO_ADD; changes eax, edx, xmm1
+    sub eax, 6 - 5                      ; eax = factor - 1
+    movd xmm1, eax
+    movdqa LLO_SFT, xmm1                ; shift count used by psraw on the lo output
+    imul eax, 16                        ; byte offset, table entries are indexed 16 bytes apart
+    lea edx, [cwa0]
+    add edx, eax
+    movdqa xmm1, [edx]                  ; entry at cwa0 + 16 * (factor - 1)
+    movdqa LLO_ADD, xmm1                ; words added to the lo output before the shift
+    ret
+
+%define LQTABLE           [esp + 144] ; qtable, param copy sits above the 16 * 8 byte area and the 4 saved registers
+%define LIN_BUFFER        [esp + 148] ; in_buffer
+%define LOUT_BUFFER       [esp + 152] ; out_buffer
+%define LWORK_BUFFER      [esp + 156] ; work_buffer
+
+;int
+;rfxcodec_encode_dwt_shift_x86_sse2(const char *qtable,
+;                                   unsigned char *in_buffer,
+;                                   short *out_buffer,
+;                                   short *work_buffer);
+
+;******************************************************************************
+%ifidn __OUTPUT_FORMAT__,elf
+PROC rfxcodec_encode_dwt_shift_x86_sse2
+%else
+PROC _rfxcodec_encode_dwt_shift_x86_sse2
+%endif
+    ; entry: runs 3 DWT levels (vertical then horizontal each) from in_buffer to out_buffer; first align stack
+    mov eax, esp
+    sub eax, 0x10
+    and eax, 0x0F                       ; eax = low 4 bits of esp
+    sub esp, eax                        ; esp is now a multiple of 16
+    push eax                            ; keep the adjustment for the epilogue
+    sub esp, 3 * 4                      ; pad, together with the push this is 16 bytes
+    sub esp, 4 * 4                      ; room for a copy of the 4 params
+    ; copy params to after align
+    movdqu xmm0, [esp + eax + 4 * 4 + 3 * 4 + 4 + 4]
+    movdqu [esp], xmm0
+    ; save registers
+    push ebx
+    push esi
+    push edi
+    push ebp
+    sub esp, 16 * 8                     ; quants area, released at '; quants' below
+    pxor xmm0, xmm0                     ; xmm0 = 0, read by punpcklbw in the 8 bit vertical pass
+
+    ; vertical DWT to work buffer, level 1
+    mov esi, LIN_BUFFER                 ; src
+    mov edi, LWORK_BUFFER               ; dst hi
+    lea edi, [edi + 64 * 32 * 2]        ; dst hi
+    mov edx, LWORK_BUFFER               ; dst lo
+    call rfx_dwt_2d_encode_block_verti_8_64
+
+    ; horizontal DWT to out buffer, level 1, part 1
+    xor eax, eax
+    mov edx, LQTABLE
+    mov al, [edx + 4]
+    and al, 0xF                         ; low nibble of qtable[4], factor for HL1
+    call set_quants_hi
+    mov esi, LWORK_BUFFER               ; src
+    mov edi, LOUT_BUFFER                ; dst hi - HL1
+    mov edx, LOUT_BUFFER                ; dst lo - LL1
+    lea edx, [edx + 32 * 32 * 6]        ; dst lo - LL1
+    call rfx_dwt_2d_encode_block_horiz_16_64_no_lo
+
+    ; horizontal DWT to out buffer, level 1, part 2
+    xor eax, eax
+    mov edx, LQTABLE
+    mov al, [edx + 4]
+    shr al, 4                           ; high nibble of qtable[4], factor for HH1
+    call set_quants_hi
+    xor eax, eax
+    mov edx, LQTABLE
+    mov al, [edx + 3]
+    shr al, 4                           ; high nibble of qtable[3], factor for LH1
+    call set_quants_lo
+    mov esi, LWORK_BUFFER               ; src
+    lea esi, [esi + 64 * 32 * 2]        ; src
+    mov edi, LOUT_BUFFER                ; dst hi - HH1
+    lea edi, [edi + 32 * 32 * 4]        ; dst hi - HH1
+    mov edx, LOUT_BUFFER                ; dst lo - LH1
+    lea edx, [edx + 32 * 32 * 2]        ; dst lo - LH1
+    call rfx_dwt_2d_encode_block_horiz_16_64
+
+    ; vertical DWT to work buffer, level 2
+    mov esi, LOUT_BUFFER                ; src
+    lea esi, [esi + 32 * 32 * 6]        ; src
+    mov edi, LWORK_BUFFER               ; dst hi
+    lea edi, [edi + 32 * 16 * 2]        ; dst hi
+    mov edx, LWORK_BUFFER               ; dst lo
+    call rfx_dwt_2d_encode_block_verti_16_32
+
+    ; horizontal DWT to out buffer, level 2, part 1
+    xor eax, eax
+    mov edx, LQTABLE
+    mov al, [edx + 2]
+    shr al, 4                           ; high nibble of qtable[2], factor for HL2
+    call set_quants_hi
+    mov esi, LWORK_BUFFER               ; src
+    ; 32 * 32 * 6 + 16 * 16 * 0 = 6144
+    mov edi, LOUT_BUFFER                ; dst hi - HL2
+    lea edi, [edi + 6144]               ; dst hi - HL2
+    ; 32 * 32 * 6 + 16 * 16 * 6 = 7680
+    mov edx, LOUT_BUFFER                ; dst lo - LL2
+    lea edx, [edx + 7680]               ; dst lo - LL2
+    call rfx_dwt_2d_encode_block_horiz_16_32_no_lo
+
+    ; horizontal DWT to out buffer, level 2, part 2
+    xor eax, eax
+    mov edx, LQTABLE
+    mov al, [edx + 3]
+    and al, 0xF                         ; low nibble of qtable[3], factor for HH2
+    call set_quants_hi
+    xor eax, eax
+    mov edx, LQTABLE
+    mov al, [edx + 2]
+    and al, 0xF                         ; low nibble of qtable[2], factor for LH2
+    call set_quants_lo
+    mov esi, LWORK_BUFFER               ; src
+    lea esi, [esi + 32 * 16 * 2]        ; src
+    ; 32 * 32 * 6 + 16 * 16 * 4 = 7168
+    mov edi, LOUT_BUFFER                ; dst hi - HH2
+    lea edi, [edi + 7168]               ; dst hi - HH2
+    ; 32 * 32 * 6 + 16 * 16 * 2 = 6656
+    mov edx, LOUT_BUFFER                ; dst lo - LH2
+    lea edx, [edx + 6656]               ; dst lo - LH2
+    call rfx_dwt_2d_encode_block_horiz_16_32
+
+    ; vertical DWT to work buffer, level 3
+    ; 32 * 32 * 6 + 16 * 16 * 6 = 7680
+    mov esi, LOUT_BUFFER                ; src
+    lea esi, [esi + 7680]               ; src
+    mov edi, LWORK_BUFFER               ; dst hi
+    lea edi, [edi + 16 * 8 * 2]         ; dst hi
+    mov edx, LWORK_BUFFER               ; dst lo
+    call rfx_dwt_2d_encode_block_verti_16_16
+
+    ; horizontal DWT to out buffer, level 3, part 1
+    xor eax, eax
+    mov edx, LQTABLE
+    mov al, [edx + 1]
+    and al, 0xF                         ; low nibble of qtable[1], factor for HL3
+    call set_quants_hi
+    xor eax, eax
+    mov edx, LQTABLE
+    mov al, [edx + 0]
+    and al, 0xF                         ; low nibble of qtable[0], factor for LL3
+    call set_quants_lo
+    mov esi, LWORK_BUFFER               ; src
+    ; 32 * 32 * 6 + 16 * 16 * 6 + 8 * 8 * 0 = 7680
+    mov edi, LOUT_BUFFER                ; dst hi - HL3
+    lea edi, [edi + 7680]               ; dst hi - HL3
+    ; 32 * 32 * 6 + 16 * 16 * 6 + 8 * 8 * 6 = 8064
+    mov edx, LOUT_BUFFER                ; dst lo - LL3
+    lea edx, [edx + 8064]               ; dst lo - LL3
+    call rfx_dwt_2d_encode_block_horiz_16_16
+
+    ; horizontal DWT to out buffer, level 3, part 2
+    xor eax, eax
+    mov edx, LQTABLE
+    mov al, [edx + 1]
+    shr al, 4                           ; high nibble of qtable[1], factor for HH3
+    call set_quants_hi
+    xor eax, eax
+    mov edx, LQTABLE
+    mov al, [edx + 0]
+    shr al, 4                           ; high nibble of qtable[0], factor for LH3
+    call set_quants_lo
+    mov esi, LWORK_BUFFER               ; src
+    lea esi, [esi + 16 * 8 * 2]         ; src
+    ; 32 * 32 * 6 + 16 * 16 * 6 + 8 * 8 * 4 = 7936
+    mov edi, LOUT_BUFFER                ; dst hi - HH3
+    lea edi, [edi + 7936]               ; dst hi - HH3
+    ; 32 * 32 * 6 + 16 * 16 * 6 + 8 * 8 * 2 = 7808
+    mov edx, LOUT_BUFFER                ; dst lo - LH3
+    lea edx, [edx + 7808]               ; dst lo - LH3
+    call rfx_dwt_2d_encode_block_horiz_16_16
+
+    ; quants, release the 16 * 8 byte area
+    add esp, 16 * 8
+    ; restore registers
+    pop ebp
+    pop edi
+    pop esi
+    pop ebx
+    ; params, drop the pad and the param copy
+    add esp, 3 * 4
+    add esp, 4 * 4
+    ; align, undo the adjustment saved in the prologue
+    pop eax
+    add esp, eax
+    ; return value, always 0
+    mov eax, 0
+    ret
+    align 16
+
diff --git a/src/x86/rfxcodec_encode_dwt_shift_x86_sse41.asm b/src/x86/rfxcodec_encode_dwt_shift_x86_sse41.asm
new file mode 100644
index 0000000..cb117da
--- /dev/null
+++ b/src/x86/rfxcodec_encode_dwt_shift_x86_sse41.asm
@@ -0,0 +1,1401 @@
+;
+;Copyright 2016 Jay Sorg
+;
+;Permission to use, copy, modify, distribute, and sell this software and its
+;documentation for any purpose is hereby granted without fee, provided that
+;the above copyright notice appear in all copies and that both that
+;copyright notice and this permission notice appear in supporting
+;documentation.
+;
+;The above copyright notice and this permission notice shall be included in
+;all copies or substantial portions of the Software.
+;
+;THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+;IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+;FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
+;OPEN GROUP BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN
+;AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+;CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+;
+;x86 asm dwt
+
+%ifidn __OUTPUT_FORMAT__,elf
+section .note.GNU-stack noalloc noexec nowrite progbits
+%endif
+
+section .data
+    align 16
+    cw128    times 8 dw 128   ; 8 words of 128
+    cdFFFF   times 4 dd 65535 ; 4 dwords of 0x0000FFFF, selects the low word of each dword
+    ; add constants, 1 << (factor - 1) for factor 1 to 15 and 0 for factor 0, one 16 byte entry per factor
+    cwa0     times 8 dw 0     ; 0
+    cwa1     times 8 dw 1     ; 1
+    cwa2     times 8 dw 2     ; 2
+    cwa4     times 8 dw 4     ; 3
+    cwa8     times 8 dw 8     ; 4
+    cwa16    times 8 dw 16    ; 5
+    cwa32    times 8 dw 32    ; 6
+    cwa64    times 8 dw 64    ; 7
+    cwa128   times 8 dw 128   ; 8
+    cwa256   times 8 dw 256   ; 9
+    cwa512   times 8 dw 512   ; 10
+    cwa1024  times 8 dw 1024  ; 11
+    cwa2048  times 8 dw 2048  ; 12
+    cwa4096  times 8 dw 4096  ; 13
+    cwa8192  times 8 dw 8192  ; 14
+    cwa16384 times 8 dw 16384 ; 15
+
+section .text
+
+%macro PROC 1
+    align 16                            ; place the label on a 16 byte boundary
+    global %1                           ; export the symbol given as the macro argument
+    %1:                                 ; define the label itself
+%endmacro
+
+%define LHI_ADD  [esp + 1 * 16 + 4] ; words added to hi output before the shift; NOTE(review): + 4 presumably skips the return address of the call, confirm against PROC
+%define LHI_SFT  [esp + 2 * 16 + 4] ; psraw count applied to hi output
+%define LLO_ADD  [esp + 3 * 16 + 4] ; words added to lo output before the shift
+%define LLO_SFT  [esp + 4 * 16 + 4] ; psraw count applied to lo output
+
+;******************************************************************************
+; source 16 bit signed, 16 pixel width; esi = src, edi = dst hi, edx = dst lo, both outputs get their ADD / SFT applied
+rfx_dwt_2d_encode_block_horiz_16_16:
+    mov ecx, 8                          ; row counter, one pass per 16 sample row
+loop1a:
+    ; pre / post, the single 16 sample group is both first and last of its row
+    movdqa xmm1, [esi]                  ; src[2n]
+    movdqa xmm2, [esi + 16]
+    movdqa xmm6, xmm1                   ; keep raw copies for the 2n + 1 and 2n + 2 passes
+    movdqa xmm7, xmm2
+    pand xmm1, [cdFFFF]                 ; keep the low word of each dword
+    pand xmm2, [cdFFFF]
+    packusdw xmm1, xmm2                 ; masked dwords fit in 16 bits, so the pack keeps them unchanged
+    movdqa xmm2, xmm6                   ; src[2n + 1]
+    movdqa xmm3, xmm7
+    psrldq xmm2, 2                      ; move the odd words into the low word of each dword
+    psrldq xmm3, 2
+    pand xmm2, [cdFFFF]
+    pand xmm3, [cdFFFF]
+    packusdw xmm2, xmm3
+    movdqa xmm3, xmm6                   ; src[2n + 2]
+    movdqa xmm4, xmm7
+    psrldq xmm3, 4                      ; drop the first pair, top dword becomes zero
+    psrldq xmm4, 4
+    movd eax, xmm7
+    movd xmm5, eax
+    pslldq xmm5, 12
+    por xmm3, xmm5                      ; top dword filled from the first dword of the second half
+    movdqa xmm5, xmm7
+    psrldq xmm5, 12                     ; keep only the top dword of the row
+    pslldq xmm5, 12
+    por xmm4, xmm5                      ; last src[2n + 2] reuses the last even sample, no read past the row
+    pand xmm3, [cdFFFF]
+    pand xmm4, [cdFFFF]
+    packusdw xmm3, xmm4
+    movdqa xmm4, xmm1
+    movdqa xmm5, xmm2
+    movdqa xmm6, xmm3
+    ; h[n] = (src[2n + 1] - ((src[2n] + src[2n + 2]) >> 1)) >> 1
+    paddw xmm4, xmm6
+    psraw xmm4, 1
+    psubw xmm5, xmm4
+    psraw xmm5, 1
+    movdqa xmm6, xmm5                   ; out hi
+    paddw xmm6, LHI_ADD
+    psraw xmm6, LHI_SFT
+    movdqa [edi], xmm6
+    ; l[n] = src[2n] + ((h[n - 1] + h[n]) >> 1)
+    movdqa xmm7, xmm5
+    movd eax, xmm7
+    pslldq xmm7, 2                      ; h[n - 1] for words 1 to 7
+    and eax, 0xFFFF                     ; h[0] alone
+    movd xmm6, eax
+    por xmm7, xmm6                      ; word 0 uses h[0] in place of h[-1]
+    paddw xmm5, xmm7
+    psraw xmm5, 1
+    paddw xmm5, xmm1
+
+    movdqa xmm6, xmm5                   ; out lo
+    paddw xmm6, LLO_ADD
+    psraw xmm6, LLO_SFT
+    movdqa [edx], xmm6
+
+    ; move right
+    lea esi, [esi + 16 * 2]
+    lea edi, [edi + 8 * 2]
+    lea edx, [edx + 8 * 2]
+
+    ; move left, back to the start of the row
+    lea esi, [esi - 16 * 2]
+    lea edi, [edi - 8 * 2]
+    lea edx, [edx - 8 * 2]
+
+    ; move down, 16 source words and 8 output words per row
+    lea esi, [esi + 16 * 2]
+    lea edi, [edi + 8 * 2]
+    lea edx, [edx + 8 * 2]
+
+    dec ecx
+    jnz loop1a
+
+    ret
+
+;******************************************************************************
+; source 16 bit signed, 16 pixel width; esi = src, edi = dst hi, edx = dst lo, outputs are stored without ADD / SFT
+rfx_dwt_2d_encode_block_verti_16_16:
+    mov ecx, 2                          ; outer counter, 8 columns per pass
+loop1b:
+    ; pre, source rows 0 to 2
+    movdqa xmm1, [esi]                  ; src[2n]
+    movdqa xmm2, [esi + 16 * 2]         ; src[2n + 1]
+    movdqa xmm3, [esi + 16 * 2 * 2]     ; src[2n + 2]
+    movdqa xmm4, xmm1
+    movdqa xmm5, xmm2
+    movdqa xmm6, xmm3
+    ; h[n] = (src[2n + 1] - ((src[2n] + src[2n + 2]) >> 1)) >> 1
+    paddw xmm4, xmm6
+    psraw xmm4, 1
+    psubw xmm5, xmm4
+    psraw xmm5, 1
+    movdqa [edi], xmm5                  ; out hi
+    movdqa xmm6, xmm5                   ; save hi
+    ; l[n] = src[2n] + ((h[n - 1] + h[n]) >> 1), here with h[n - 1] taken as h[n]
+    paddw xmm5, xmm1                    ; l[0] = src[0] + h[0]
+    movdqa [edx], xmm5                  ; out lo
+    movdqa xmm7, xmm6                   ; save hi
+    ; move down
+    lea esi, [esi + 16 * 2 * 2]         ; 2 rows
+    lea edi, [edi + 16 * 2]             ; 1 row
+    lea edx, [edx + 16 * 2]             ; 1 row
+
+    ; loop
+    shl ecx, 16                         ; park the outer counter in the upper half of ecx
+    mov cx, 6                           ; inner counter, 6 middle row pairs
+loop2b:
+    movdqa xmm1, xmm3                   ; src[2n]
+    movdqa xmm2, [esi + 16 * 2]         ; src[2n + 1]
+    movdqa xmm3, [esi + 16 * 2 * 2]     ; src[2n + 2]
+    movdqa xmm4, xmm1
+    movdqa xmm5, xmm2
+    movdqa xmm6, xmm3
+    ; h[n] = (src[2n + 1] - ((src[2n] + src[2n + 2]) >> 1)) >> 1
+    paddw xmm4, xmm6
+    psraw xmm4, 1
+    psubw xmm5, xmm4
+    psraw xmm5, 1
+    movdqa [edi], xmm5                  ; out hi
+    movdqa xmm6, xmm5                   ; save hi
+    ; l[n] = src[2n] + ((h[n - 1] + h[n]) >> 1)
+    paddw xmm5, xmm7                    ; xmm7 holds h[n - 1] from the previous pass
+    psraw xmm5, 1
+    paddw xmm5, xmm1
+    movdqa [edx], xmm5                  ; out lo
+    movdqa xmm7, xmm6                   ; save hi
+    ; move down
+    lea esi, [esi + 16 * 2 * 2]         ; 2 rows
+    lea edi, [edi + 16 * 2]             ; 1 row
+    lea edx, [edx + 16 * 2]             ; 1 row
+
+    dec cx
+    jnz loop2b
+    shr ecx, 16                         ; restore the outer counter
+
+    ; post, last row pair
+    movdqa xmm1, xmm3                   ; src[2n]
+    movdqa xmm2, [esi + 16 * 2]         ; src[2n + 1]
+    movdqa xmm4, xmm1
+    movdqa xmm5, xmm2
+    movdqa xmm6, xmm3                   ; no row is loaded for src[2n + 2], src[2n] is used again
+    ; h[n] = (src[2n + 1] - ((src[2n] + src[2n + 2]) >> 1)) >> 1
+    paddw xmm4, xmm6
+    psraw xmm4, 1
+    psubw xmm5, xmm4
+    psraw xmm5, 1
+    movdqa [edi], xmm5                  ; out hi
+    ; l[n] = src[2n] + ((h[n - 1] + h[n]) >> 1)
+    paddw xmm5, xmm7
+    psraw xmm5, 1
+    paddw xmm5, xmm1
+    movdqa [edx], xmm5                  ; out lo
+    ; move down
+    lea esi, [esi + 16 * 2 * 2]         ; 2 row
+    lea edi, [edi + 16 * 2]             ; 1 row
+    lea edx, [edx + 16 * 2]             ; 1 row
+
+    ; move up, undo the 16 source rows and 8 output rows walked above
+    lea esi, [esi - 16 * 16 * 2]
+    lea edi, [edi - 8 * 16 * 2]
+    lea edx, [edx - 8 * 16 * 2]
+
+    ; move right, 8 words
+    lea esi, [esi + 16]
+    lea edi, [edi + 16]
+    lea edx, [edx + 16]
+
+    dec ecx
+    jnz loop1b
+
+    ret
+
+;******************************************************************************
+; source 16 bit signed, 32 pixel width; esi = src, edi = dst hi, edx = dst lo, both outputs get their ADD / SFT applied
+rfx_dwt_2d_encode_block_horiz_16_32:
+    mov ecx, 16                         ; row counter, one pass per 32 sample row
+loop1c:
+    ; pre, first 16 samples of the row
+    movdqa xmm1, [esi]                  ; src[2n]
+    movdqa xmm2, [esi + 16]
+    movdqa xmm6, xmm1                   ; keep raw copies for the 2n + 1 and 2n + 2 passes
+    movdqa xmm7, xmm2
+    pand xmm1, [cdFFFF]                 ; keep the low word of each dword
+    pand xmm2, [cdFFFF]
+    packusdw xmm1, xmm2                 ; masked dwords fit in 16 bits, so the pack keeps them unchanged
+    movdqa xmm2, xmm6                   ; src[2n + 1]
+    movdqa xmm3, xmm7
+    psrldq xmm2, 2                      ; move the odd words into the low word of each dword
+    psrldq xmm3, 2
+    pand xmm2, [cdFFFF]
+    pand xmm3, [cdFFFF]
+    packusdw xmm2, xmm3
+    movdqa xmm3, xmm6                   ; src[2n + 2]
+    movdqa xmm4, xmm7
+    psrldq xmm3, 4                      ; drop the first pair, top dword becomes zero
+    psrldq xmm4, 4
+    movd eax, xmm7
+    movd xmm5, eax
+    pslldq xmm5, 12
+    por xmm3, xmm5                      ; top dword filled from the first dword of the second half
+    mov eax, [esi + 32]                 ; first dword of the next 16 samples
+    movd xmm5, eax
+    pslldq xmm5, 12
+    por xmm4, xmm5
+    pand xmm3, [cdFFFF]
+    pand xmm4, [cdFFFF]
+    packusdw xmm3, xmm4
+    movdqa xmm4, xmm1
+    movdqa xmm5, xmm2
+    movdqa xmm6, xmm3
+    ; h[n] = (src[2n + 1] - ((src[2n] + src[2n + 2]) >> 1)) >> 1
+    paddw xmm4, xmm6
+    psraw xmm4, 1
+    psubw xmm5, xmm4
+    psraw xmm5, 1
+
+    movdqa xmm6, xmm5                   ; out hi
+    paddw xmm6, LHI_ADD
+    psraw xmm6, LHI_SFT
+    movdqa [edi], xmm6
+    movdqa xmm2, xmm5                   ; save hi
+
+    ; l[n] = src[2n] + ((h[n - 1] + h[n]) >> 1)
+    movdqa xmm7, xmm5
+    movd eax, xmm7
+    pslldq xmm7, 2                      ; h[n - 1] for words 1 to 7
+    and eax, 0xFFFF                     ; h[0] alone
+    movd xmm6, eax
+    por xmm7, xmm6                      ; word 0 uses h[0] in place of h[-1]
+    paddw xmm5, xmm7
+    psraw xmm5, 1
+    paddw xmm5, xmm1
+
+    psrldq xmm2, 14                     ; last h word down to word 0
+    movd ebx, xmm2                      ; save hi, carried to the post group in ebx
+
+    movdqa xmm6, xmm5                   ; out lo
+    paddw xmm6, LLO_ADD
+    psraw xmm6, LLO_SFT
+    movdqa [edx], xmm6
+
+    ; move right
+    lea esi, [esi + 16 * 2]
+    lea edi, [edi + 8 * 2]
+    lea edx, [edx + 8 * 2]
+
+    ; post, last 16 samples of the row
+    movdqa xmm1, [esi]                  ; src[2n]
+    movdqa xmm2, [esi + 16]
+    movdqa xmm6, xmm1
+    movdqa xmm7, xmm2
+    pand xmm1, [cdFFFF]
+    pand xmm2, [cdFFFF]
+    packusdw xmm1, xmm2
+    movdqa xmm2, xmm6                   ; src[2n + 1]
+    movdqa xmm3, xmm7
+    psrldq xmm2, 2
+    psrldq xmm3, 2
+    pand xmm2, [cdFFFF]
+    pand xmm3, [cdFFFF]
+    packusdw xmm2, xmm3
+    movdqa xmm3, xmm6                   ; src[2n + 2]
+    movdqa xmm4, xmm7
+    psrldq xmm3, 4
+    psrldq xmm4, 4
+    movd eax, xmm7
+    movd xmm5, eax
+    pslldq xmm5, 12
+    por xmm3, xmm5
+    movdqa xmm5, xmm7
+    psrldq xmm5, 12                     ; keep only the top dword of the row
+    pslldq xmm5, 12
+    por xmm4, xmm5                      ; last src[2n + 2] reuses the last even sample, no read past the row
+    pand xmm3, [cdFFFF]
+    pand xmm4, [cdFFFF]
+    packusdw xmm3, xmm4
+    movdqa xmm4, xmm1
+    movdqa xmm5, xmm2
+    movdqa xmm6, xmm3
+    ; h[n] = (src[2n + 1] - ((src[2n] + src[2n + 2]) >> 1)) >> 1
+    paddw xmm4, xmm6
+    psraw xmm4, 1
+    psubw xmm5, xmm4
+    psraw xmm5, 1
+
+    movdqa xmm6, xmm5                   ; out hi
+    paddw xmm6, LHI_ADD
+    psraw xmm6, LHI_SFT
+    movdqa [edi], xmm6
+
+    ; l[n] = src[2n] + ((h[n - 1] + h[n]) >> 1)
+    movdqa xmm7, xmm5
+    pslldq xmm7, 2
+    movd xmm6, ebx                      ; h[n - 1] for word 0, carried from the pre group
+    por xmm7, xmm6
+    paddw xmm5, xmm7
+    psraw xmm5, 1
+    paddw xmm5, xmm1
+
+    movdqa xmm6, xmm5                   ; out lo
+    paddw xmm6, LLO_ADD
+    psraw xmm6, LLO_SFT
+    movdqa [edx], xmm6
+
+    ; move right
+    lea esi, [esi + 16 * 2]
+    lea edi, [edi + 8 * 2]
+    lea edx, [edx + 8 * 2]
+
+    ; move left, back to the start of the row
+    lea esi, [esi - 32 * 2]
+    lea edi, [edi - 16 * 2]
+    lea edx, [edx - 16 * 2]
+
+    ; move down, 32 source words and 16 output words per row
+    lea esi, [esi + 32 * 2]
+    lea edi, [edi + 16 * 2]
+    lea edx, [edx + 16 * 2]
+
+    dec ecx
+    jnz loop1c
+
+    ret
+
+;******************************************************************************
+; source 16 bit signed, 32 pixel width; esi = src, edi = dst hi, edx = dst lo, lo is stored without LLO_ADD / LLO_SFT
+rfx_dwt_2d_encode_block_horiz_16_32_no_lo:
+    mov ecx, 16                         ; row counter, one pass per 32 sample row
+loop1c1:
+    ; pre, first 16 samples of the row
+    movdqa xmm1, [esi]                  ; src[2n]
+    movdqa xmm2, [esi + 16]
+    movdqa xmm6, xmm1                   ; keep raw copies for the 2n + 1 and 2n + 2 passes
+    movdqa xmm7, xmm2
+    pand xmm1, [cdFFFF]                 ; keep the low word of each dword
+    pand xmm2, [cdFFFF]
+    packusdw xmm1, xmm2                 ; masked dwords fit in 16 bits, so the pack keeps them unchanged
+    movdqa xmm2, xmm6                   ; src[2n + 1]
+    movdqa xmm3, xmm7
+    psrldq xmm2, 2                      ; move the odd words into the low word of each dword
+    psrldq xmm3, 2
+    pand xmm2, [cdFFFF]
+    pand xmm3, [cdFFFF]
+    packusdw xmm2, xmm3
+    movdqa xmm3, xmm6                   ; src[2n + 2]
+    movdqa xmm4, xmm7
+    psrldq xmm3, 4                      ; drop the first pair, top dword becomes zero
+    psrldq xmm4, 4
+    movd eax, xmm7
+    movd xmm5, eax
+    pslldq xmm5, 12
+    por xmm3, xmm5                      ; top dword filled from the first dword of the second half
+    mov eax, [esi + 32]                 ; first dword of the next 16 samples
+    movd xmm5, eax
+    pslldq xmm5, 12
+    por xmm4, xmm5
+    pand xmm3, [cdFFFF]
+    pand xmm4, [cdFFFF]
+    packusdw xmm3, xmm4
+    movdqa xmm4, xmm1
+    movdqa xmm5, xmm2
+    movdqa xmm6, xmm3
+    ; h[n] = (src[2n + 1] - ((src[2n] + src[2n + 2]) >> 1)) >> 1
+    paddw xmm4, xmm6
+    psraw xmm4, 1
+    psubw xmm5, xmm4
+    psraw xmm5, 1
+
+    movdqa xmm6, xmm5                   ; out hi
+    paddw xmm6, LHI_ADD
+    psraw xmm6, LHI_SFT
+    movdqa [edi], xmm6
+    movdqa xmm2, xmm5                   ; save hi
+
+    ; l[n] = src[2n] + ((h[n - 1] + h[n]) >> 1)
+    movdqa xmm7, xmm5
+    movd eax, xmm7
+    pslldq xmm7, 2                      ; h[n - 1] for words 1 to 7
+    and eax, 0xFFFF                     ; h[0] alone
+    movd xmm6, eax
+    por xmm7, xmm6                      ; word 0 uses h[0] in place of h[-1]
+    paddw xmm5, xmm7
+    psraw xmm5, 1
+    paddw xmm5, xmm1
+
+    psrldq xmm2, 14                     ; last h word down to word 0
+    movd ebx, xmm2                      ; save hi, carried to the post group in ebx
+
+    movdqa [edx], xmm5                  ; out lo
+
+    ; move right
+    lea esi, [esi + 16 * 2]
+    lea edi, [edi + 8 * 2]
+    lea edx, [edx + 8 * 2]
+
+    ; post, last 16 samples of the row
+    movdqa xmm1, [esi]                  ; src[2n]
+    movdqa xmm2, [esi + 16]
+    movdqa xmm6, xmm1
+    movdqa xmm7, xmm2
+    pand xmm1, [cdFFFF]
+    pand xmm2, [cdFFFF]
+    packusdw xmm1, xmm2
+    movdqa xmm2, xmm6                   ; src[2n + 1]
+    movdqa xmm3, xmm7
+    psrldq xmm2, 2
+    psrldq xmm3, 2
+    pand xmm2, [cdFFFF]
+    pand xmm3, [cdFFFF]
+    packusdw xmm2, xmm3
+    movdqa xmm3, xmm6                   ; src[2n + 2]
+    movdqa xmm4, xmm7
+    psrldq xmm3, 4
+    psrldq xmm4, 4
+    movd eax, xmm7
+    movd xmm5, eax
+    pslldq xmm5, 12
+    por xmm3, xmm5
+    movdqa xmm5, xmm7
+    psrldq xmm5, 12                     ; keep only the top dword of the row
+    pslldq xmm5, 12
+    por xmm4, xmm5                      ; last src[2n + 2] reuses the last even sample, no read past the row
+    pand xmm3, [cdFFFF]
+    pand xmm4, [cdFFFF]
+    packusdw xmm3, xmm4
+    movdqa xmm4, xmm1
+    movdqa xmm5, xmm2
+    movdqa xmm6, xmm3
+    ; h[n] = (src[2n + 1] - ((src[2n] + src[2n + 2]) >> 1)) >> 1
+    paddw xmm4, xmm6
+    psraw xmm4, 1
+    psubw xmm5, xmm4
+    psraw xmm5, 1
+
+    movdqa xmm6, xmm5                   ; out hi
+    paddw xmm6, LHI_ADD
+    psraw xmm6, LHI_SFT
+    movdqa [edi], xmm6
+
+    ; l[n] = src[2n] + ((h[n - 1] + h[n]) >> 1)
+    movdqa xmm7, xmm5
+    pslldq xmm7, 2
+    movd xmm6, ebx                      ; h[n - 1] for word 0, carried from the pre group
+    por xmm7, xmm6
+    paddw xmm5, xmm7
+    psraw xmm5, 1
+    paddw xmm5, xmm1
+
+    movdqa [edx], xmm5                  ; out lo
+
+    ; move right
+    lea esi, [esi + 16 * 2]
+    lea edi, [edi + 8 * 2]
+    lea edx, [edx + 8 * 2]
+
+    ; move left, back to the start of the row
+    lea esi, [esi - 32 * 2]
+    lea edi, [edi - 16 * 2]
+    lea edx, [edx - 16 * 2]
+
+    ; move down, 32 source words and 16 output words per row
+    lea esi, [esi + 32 * 2]
+    lea edi, [edi + 16 * 2]
+    lea edx, [edx + 16 * 2]
+
+    dec ecx
+    jnz loop1c1
+
+    ret
+
+;******************************************************************************
+; source 16 bit signed, 32 pixel width
+; vertical DWT pass over 32 rows (row stride 32 * 2 bytes)
+; in:  esi = src, edi = dst hi, edx = dst lo, all 16 byte aligned (movdqa)
+; the block is done as 4 strips of 8 words; per strip pre + 14 loop passes
+; + post each step 2 src rows down and write 1 hi row and 1 lo row
+; out: esi, edi and edx end 4 * 16 bytes past their entry value, ecx = 0
+; uses xmm1 - xmm7
+rfx_dwt_2d_encode_block_verti_16_32:
+    mov ecx, 4                          ; 4 strips of 8 words
+loop1d:
+    ; pre: first row pair of the strip, there is no h[n - 1] yet
+    movdqa xmm1, [esi]                  ; src[2n]
+    movdqa xmm2, [esi + 32 * 2]         ; src[2n + 1]
+    movdqa xmm3, [esi + 32 * 2 * 2]     ; src[2n + 2]
+    movdqa xmm4, xmm1
+    movdqa xmm5, xmm2
+    movdqa xmm6, xmm3
+    ; h[n] = (src[2n + 1] - ((src[2n] + src[2n + 2]) >> 1)) >> 1
+    paddw xmm4, xmm6
+    psraw xmm4, 1
+    psubw xmm5, xmm4
+    psraw xmm5, 1
+    movdqa [edi], xmm5                  ; out hi
+    movdqa xmm6, xmm5                   ; save hi
+    ; l[n] = src[2n] + ((h[n - 1] + h[n]) >> 1)
+    ; top edge: h[n - 1] is taken as h[n], so this is src[2n] + h[n]
+    paddw xmm5, xmm1
+    movdqa [edx], xmm5                  ; out lo
+    movdqa xmm7, xmm6                   ; save hi
+    ; move down
+    lea esi, [esi + 32 * 2 * 2]         ; 2 rows
+    lea edi, [edi + 32 * 2]             ; 1 row
+    lea edx, [edx + 32 * 2]             ; 1 row
+
+    ; loop
+    shl ecx, 16                         ; park strip count in the high word
+    mov cx, 14                          ; row pair count in the low word
+loop2d:
+    movdqa xmm1, xmm3                   ; src[2n], reuses last src[2n + 2]
+    movdqa xmm2, [esi + 32 * 2]         ; src[2n + 1]
+    movdqa xmm3, [esi + 32 * 2 * 2]     ; src[2n + 2]
+    movdqa xmm4, xmm1
+    movdqa xmm5, xmm2
+    movdqa xmm6, xmm3
+    ; h[n] = (src[2n + 1] - ((src[2n] + src[2n + 2]) >> 1)) >> 1
+    paddw xmm4, xmm6
+    psraw xmm4, 1
+    psubw xmm5, xmm4
+    psraw xmm5, 1
+    movdqa [edi], xmm5                  ; out hi
+    movdqa xmm6, xmm5                   ; save hi
+    ; l[n] = src[2n] + ((h[n - 1] + h[n]) >> 1)
+    paddw xmm5, xmm7                    ; xmm7 = h[n - 1]
+    psraw xmm5, 1
+    paddw xmm5, xmm1
+    movdqa [edx], xmm5                  ; out lo
+    movdqa xmm7, xmm6                   ; save hi
+    ; move down
+    lea esi, [esi + 32 * 2 * 2]         ; 2 rows
+    lea edi, [edi + 32 * 2]             ; 1 row
+    lea edx, [edx + 32 * 2]             ; 1 row
+
+    dec cx
+    jnz loop2d
+    shr ecx, 16                         ; strip count back in the low word
+
+    ; post: last row pair, xmm3 is not reloaded so src[2n + 2] = src[2n]
+    movdqa xmm1, xmm3                   ; src[2n]
+    movdqa xmm2, [esi + 32 * 2]         ; src[2n + 1]
+    movdqa xmm4, xmm1
+    movdqa xmm5, xmm2
+    movdqa xmm6, xmm3
+    ; h[n] = (src[2n + 1] - ((src[2n] + src[2n + 2]) >> 1)) >> 1
+    paddw xmm4, xmm6
+    psraw xmm4, 1
+    psubw xmm5, xmm4
+    psraw xmm5, 1
+    movdqa [edi], xmm5                  ; out hi
+    ; l[n] = src[2n] + ((h[n - 1] + h[n]) >> 1)
+    paddw xmm5, xmm7
+    psraw xmm5, 1
+    paddw xmm5, xmm1
+    movdqa [edx], xmm5                  ; out lo
+    ; move down
+    lea esi, [esi + 32 * 2 * 2]         ; 2 row
+    lea edi, [edi + 32 * 2]             ; 1 row
+    lea edx, [edx + 32 * 2]             ; 1 row
+
+    ; move up: undo the 16 steps taken in this strip
+    lea esi, [esi - 32 * 32 * 2]
+    lea edi, [edi - 16 * 32 * 2]
+    lea edx, [edx - 16 * 32 * 2]
+
+    ; move right: next strip of 8 words
+    lea esi, [esi + 16]
+    lea edi, [edi + 16]
+    lea edx, [edx + 16]
+
+    dec ecx
+    jnz loop1d
+
+    ret
+
+;******************************************************************************
+; source 16 bit signed, 64 pixel width
+; horizontal DWT pass over 32 rows; hi and lo both get their add and
+; arithmetic shift right (LHI_ADD / LHI_SFT, LLO_ADD / LLO_SFT) before store
+; in:  esi = src, edi = dst hi, edx = dst lo, all 16 byte aligned (movdqa)
+; a row is 4 steps of 16 src words (pre, 2 loop passes, post), each step
+; writes 8 hi words and 8 lo words
+; out: esi is 32 * 64 * 2 bytes, edi and edx are 32 * 32 * 2 bytes past
+;      their entry value, ecx = 0
+; uses eax, ebx, xmm1 - xmm7
+; NOTE(review): [cdFFFF] is defined outside this section; it looks like a
+; low word mask per dword so that packusdw gathers every other word - confirm
+rfx_dwt_2d_encode_block_horiz_16_64:
+    mov ecx, 32                         ; 32 rows
+loop1e:
+    ; pre: first 16 words of the row, there is no h[n - 1] yet
+    movdqa xmm1, [esi]                  ; src[2n]
+    movdqa xmm2, [esi + 16]
+    movdqa xmm6, xmm1
+    movdqa xmm7, xmm2
+    pand xmm1, [cdFFFF]
+    pand xmm2, [cdFFFF]
+    packusdw xmm1, xmm2
+    movdqa xmm2, xmm6                   ; src[2n + 1]
+    movdqa xmm3, xmm7
+    psrldq xmm2, 2
+    psrldq xmm3, 2
+    pand xmm2, [cdFFFF]
+    pand xmm3, [cdFFFF]
+    packusdw xmm2, xmm3
+    movdqa xmm3, xmm6                   ; src[2n + 2]
+    movdqa xmm4, xmm7
+    psrldq xmm3, 4
+    psrldq xmm4, 4
+    movd eax, xmm7                      ; first dword of the upper 8 words
+    movd xmm5, eax
+    pslldq xmm5, 12
+    por xmm3, xmm5                      ; fills the slot vacated by the shift
+    mov eax, [esi + 32]                 ; first dword of the next 16 words
+    movd xmm5, eax
+    pslldq xmm5, 12
+    por xmm4, xmm5
+    pand xmm3, [cdFFFF]
+    pand xmm4, [cdFFFF]
+    packusdw xmm3, xmm4
+    movdqa xmm4, xmm1
+    movdqa xmm5, xmm2
+    movdqa xmm6, xmm3
+    ; h[n] = (src[2n + 1] - ((src[2n] + src[2n + 2]) >> 1)) >> 1
+    paddw xmm4, xmm6
+    psraw xmm4, 1
+    psubw xmm5, xmm4
+    psraw xmm5, 1
+
+    movdqa xmm6, xmm5                   ; out hi
+    paddw xmm6, LHI_ADD
+    psraw xmm6, LHI_SFT
+    movdqa [edi], xmm6
+    movdqa xmm2, xmm5                   ; save hi
+
+    ; l[n] = src[2n] + ((h[n - 1] + h[n]) >> 1)
+    movdqa xmm7, xmm5
+    movd eax, xmm7                      ; left edge: h[-1] is taken as h[0]
+    pslldq xmm7, 2                      ; words 1 - 7 now hold h[n - 1]
+    and eax, 0xFFFF
+    movd xmm6, eax
+    por xmm7, xmm6
+    paddw xmm5, xmm7
+    psraw xmm5, 1
+    paddw xmm5, xmm1
+
+    psrldq xmm2, 14                     ; last h of this step, unscaled
+    movd ebx, xmm2                      ; save hi
+
+    movdqa xmm6, xmm5                   ; out lo
+    paddw xmm6, LLO_ADD
+    psraw xmm6, LLO_SFT
+    movdqa [edx], xmm6
+
+    ; move right
+    lea esi, [esi + 16 * 2]
+    lea edi, [edi + 8 * 2]
+    lea edx, [edx + 8 * 2]
+
+    ; loop
+    shl ecx, 16                         ; park row count in the high word
+    mov cx, 2                           ; 2 middle steps in the low word
+loop2e:
+    movdqa xmm1, [esi]                  ; src[2n]
+    movdqa xmm2, [esi + 16]
+    movdqa xmm6, xmm1
+    movdqa xmm7, xmm2
+    pand xmm1, [cdFFFF]
+    pand xmm2, [cdFFFF]
+    packusdw xmm1, xmm2
+    movdqa xmm2, xmm6                   ; src[2n + 1]
+    movdqa xmm3, xmm7
+    psrldq xmm2, 2
+    psrldq xmm3, 2
+    pand xmm2, [cdFFFF]
+    pand xmm3, [cdFFFF]
+    packusdw xmm2, xmm3
+    movdqa xmm3, xmm6                   ; src[2n + 2]
+    movdqa xmm4, xmm7
+    psrldq xmm3, 4
+    psrldq xmm4, 4
+    movd eax, xmm7
+    movd xmm5, eax
+    pslldq xmm5, 12
+    por xmm3, xmm5
+    mov eax, [esi + 32]                 ; first dword of the next 16 words
+    movd xmm5, eax
+    pslldq xmm5, 12
+    por xmm4, xmm5
+    pand xmm3, [cdFFFF]
+    pand xmm4, [cdFFFF]
+    packusdw xmm3, xmm4
+    movdqa xmm4, xmm1
+    movdqa xmm5, xmm2
+    movdqa xmm6, xmm3
+    ; h[n] = (src[2n + 1] - ((src[2n] + src[2n + 2]) >> 1)) >> 1
+    paddw xmm4, xmm6
+    psraw xmm4, 1
+    psubw xmm5, xmm4
+    psraw xmm5, 1
+
+    movdqa xmm6, xmm5                   ; out hi
+    paddw xmm6, LHI_ADD
+    psraw xmm6, LHI_SFT
+    movdqa [edi], xmm6
+    movdqa xmm2, xmm5                   ; save hi
+
+    ; l[n] = src[2n] + ((h[n - 1] + h[n]) >> 1)
+    movdqa xmm7, xmm5
+    pslldq xmm7, 2
+    movd xmm6, ebx                      ; h[n - 1] carried from the last step
+    por xmm7, xmm6
+    paddw xmm5, xmm7
+    psraw xmm5, 1
+    paddw xmm5, xmm1
+
+    psrldq xmm2, 14
+    movd ebx, xmm2                      ; save hi
+
+    movdqa xmm6, xmm5                   ; out lo
+    paddw xmm6, LLO_ADD
+    psraw xmm6, LLO_SFT
+    movdqa [edx], xmm6
+
+    ; move right
+    lea esi, [esi + 16 * 2]
+    lea edi, [edi + 8 * 2]
+    lea edx, [edx + 8 * 2]
+
+    dec cx
+    jnz loop2e
+    shr ecx, 16                         ; row count back in the low word
+
+    ; post: last 16 words of the row, nothing is read past the row end
+    movdqa xmm1, [esi]                  ; src[2n]
+    movdqa xmm2, [esi + 16]
+    movdqa xmm6, xmm1
+    movdqa xmm7, xmm2
+    pand xmm1, [cdFFFF]
+    pand xmm2, [cdFFFF]
+    packusdw xmm1, xmm2
+    movdqa xmm2, xmm6                   ; src[2n + 1]
+    movdqa xmm3, xmm7
+    psrldq xmm2, 2
+    psrldq xmm3, 2
+    pand xmm2, [cdFFFF]
+    pand xmm3, [cdFFFF]
+    packusdw xmm2, xmm3
+    movdqa xmm3, xmm6                   ; src[2n + 2]
+    movdqa xmm4, xmm7
+    psrldq xmm3, 4
+    psrldq xmm4, 4
+    movd eax, xmm7
+    movd xmm5, eax
+    pslldq xmm5, 12
+    por xmm3, xmm5
+    movdqa xmm5, xmm7                   ; right edge: reuse the last dword
+    psrldq xmm5, 12                     ; of the row, so the final
+    pslldq xmm5, 12                     ; src[2n + 2] equals src[2n]
+    por xmm4, xmm5
+    pand xmm3, [cdFFFF]
+    pand xmm4, [cdFFFF]
+    packusdw xmm3, xmm4
+    movdqa xmm4, xmm1
+    movdqa xmm5, xmm2
+    movdqa xmm6, xmm3
+    ; h[n] = (src[2n + 1] - ((src[2n] + src[2n + 2]) >> 1)) >> 1
+    paddw xmm4, xmm6
+    psraw xmm4, 1
+    psubw xmm5, xmm4
+    psraw xmm5, 1
+
+    movdqa xmm6, xmm5                   ; out hi
+    paddw xmm6, LHI_ADD
+    psraw xmm6, LHI_SFT
+    movdqa [edi], xmm6
+
+    ; l[n] = src[2n] + ((h[n - 1] + h[n]) >> 1)
+    movdqa xmm7, xmm5
+    pslldq xmm7, 2
+    movd xmm6, ebx                      ; h[n - 1] carried from the last step
+    por xmm7, xmm6
+    paddw xmm5, xmm7
+    psraw xmm5, 1
+    paddw xmm5, xmm1
+
+    movdqa xmm6, xmm5                   ; out lo
+    paddw xmm6, LLO_ADD
+    psraw xmm6, LLO_SFT
+    movdqa [edx], xmm6
+
+    ; move right
+    lea esi, [esi + 16 * 2]
+    lea edi, [edi + 8 * 2]
+    lea edx, [edx + 8 * 2]
+
+    ; move left: back to the start of this row
+    lea esi, [esi - 64 * 2]
+    lea edi, [edi - 32 * 2]
+    lea edx, [edx - 32 * 2]
+
+    ; move down: start of the next row
+    lea esi, [esi + 64 * 2]
+    lea edi, [edi + 32 * 2]
+    lea edx, [edx + 32 * 2]
+
+    dec ecx
+    jnz loop1e
+
+    ret
+
+;******************************************************************************
+; source 16 bit signed, 64 pixel width
+; horizontal DWT pass over 32 rows; hi gets LHI_ADD / LHI_SFT applied
+; before store, lo is stored as computed (no LLO_ADD / LLO_SFT)
+; in:  esi = src, edi = dst hi, edx = dst lo, all 16 byte aligned (movdqa)
+; a row is 4 steps of 16 src words (pre, 2 loop passes, post), each step
+; writes 8 hi words and 8 lo words
+; out: esi is 32 * 64 * 2 bytes, edi and edx are 32 * 32 * 2 bytes past
+;      their entry value, ecx = 0
+; uses eax, ebx, xmm1 - xmm7
+; NOTE(review): [cdFFFF] is defined outside this section; it looks like a
+; low word mask per dword so that packusdw gathers every other word - confirm
+rfx_dwt_2d_encode_block_horiz_16_64_no_lo:
+    mov ecx, 32                         ; 32 rows
+loop1e1:
+    ; pre: first 16 words of the row, there is no h[n - 1] yet
+    movdqa xmm1, [esi]                  ; src[2n]
+    movdqa xmm2, [esi + 16]
+    movdqa xmm6, xmm1
+    movdqa xmm7, xmm2
+    pand xmm1, [cdFFFF]
+    pand xmm2, [cdFFFF]
+    packusdw xmm1, xmm2
+    movdqa xmm2, xmm6                   ; src[2n + 1]
+    movdqa xmm3, xmm7
+    psrldq xmm2, 2
+    psrldq xmm3, 2
+    pand xmm2, [cdFFFF]
+    pand xmm3, [cdFFFF]
+    packusdw xmm2, xmm3
+    movdqa xmm3, xmm6                   ; src[2n + 2]
+    movdqa xmm4, xmm7
+    psrldq xmm3, 4
+    psrldq xmm4, 4
+    movd eax, xmm7                      ; first dword of the upper 8 words
+    movd xmm5, eax
+    pslldq xmm5, 12
+    por xmm3, xmm5                      ; fills the slot vacated by the shift
+    mov eax, [esi + 32]                 ; first dword of the next 16 words
+    movd xmm5, eax
+    pslldq xmm5, 12
+    por xmm4, xmm5
+    pand xmm3, [cdFFFF]
+    pand xmm4, [cdFFFF]
+    packusdw xmm3, xmm4
+    movdqa xmm4, xmm1
+    movdqa xmm5, xmm2
+    movdqa xmm6, xmm3
+    ; h[n] = (src[2n + 1] - ((src[2n] + src[2n + 2]) >> 1)) >> 1
+    paddw xmm4, xmm6
+    psraw xmm4, 1
+    psubw xmm5, xmm4
+    psraw xmm5, 1
+
+    movdqa xmm6, xmm5                   ; out hi
+    paddw xmm6, LHI_ADD
+    psraw xmm6, LHI_SFT
+    movdqa [edi], xmm6
+    movdqa xmm2, xmm5                   ; save hi
+
+    ; l[n] = src[2n] + ((h[n - 1] + h[n]) >> 1)
+    movdqa xmm7, xmm5
+    movd eax, xmm7                      ; left edge: h[-1] is taken as h[0]
+    pslldq xmm7, 2                      ; words 1 - 7 now hold h[n - 1]
+    and eax, 0xFFFF
+    movd xmm6, eax
+    por xmm7, xmm6
+    paddw xmm5, xmm7
+    psraw xmm5, 1
+    paddw xmm5, xmm1
+
+    psrldq xmm2, 14                     ; last h of this step, unscaled
+    movd ebx, xmm2                      ; save hi
+
+    movdqa [edx], xmm5                  ; out lo
+
+    ; move right
+    lea esi, [esi + 16 * 2]
+    lea edi, [edi + 8 * 2]
+    lea edx, [edx + 8 * 2]
+
+    ; loop
+    shl ecx, 16                         ; park row count in the high word
+    mov cx, 2                           ; 2 middle steps in the low word
+loop2e1:
+    movdqa xmm1, [esi]                  ; src[2n]
+    movdqa xmm2, [esi + 16]
+    movdqa xmm6, xmm1
+    movdqa xmm7, xmm2
+    pand xmm1, [cdFFFF]
+    pand xmm2, [cdFFFF]
+    packusdw xmm1, xmm2
+    movdqa xmm2, xmm6                   ; src[2n + 1]
+    movdqa xmm3, xmm7
+    psrldq xmm2, 2
+    psrldq xmm3, 2
+    pand xmm2, [cdFFFF]
+    pand xmm3, [cdFFFF]
+    packusdw xmm2, xmm3
+    movdqa xmm3, xmm6                   ; src[2n + 2]
+    movdqa xmm4, xmm7
+    psrldq xmm3, 4
+    psrldq xmm4, 4
+    movd eax, xmm7
+    movd xmm5, eax
+    pslldq xmm5, 12
+    por xmm3, xmm5
+    mov eax, [esi + 32]                 ; first dword of the next 16 words
+    movd xmm5, eax
+    pslldq xmm5, 12
+    por xmm4, xmm5
+    pand xmm3, [cdFFFF]
+    pand xmm4, [cdFFFF]
+    packusdw xmm3, xmm4
+    movdqa xmm4, xmm1
+    movdqa xmm5, xmm2
+    movdqa xmm6, xmm3
+    ; h[n] = (src[2n + 1] - ((src[2n] + src[2n + 2]) >> 1)) >> 1
+    paddw xmm4, xmm6
+    psraw xmm4, 1
+    psubw xmm5, xmm4
+    psraw xmm5, 1
+
+    movdqa xmm6, xmm5                   ; out hi
+    paddw xmm6, LHI_ADD
+    psraw xmm6, LHI_SFT
+    movdqa [edi], xmm6
+    movdqa xmm2, xmm5                   ; save hi
+
+    ; l[n] = src[2n] + ((h[n - 1] + h[n]) >> 1)
+    movdqa xmm7, xmm5
+    pslldq xmm7, 2
+    movd xmm6, ebx                      ; h[n - 1] carried from the last step
+    por xmm7, xmm6
+    paddw xmm5, xmm7
+    psraw xmm5, 1
+    paddw xmm5, xmm1
+
+    psrldq xmm2, 14
+    movd ebx, xmm2                      ; save hi
+
+    movdqa [edx], xmm5                  ; out lo
+
+    ; move right
+    lea esi, [esi + 16 * 2]
+    lea edi, [edi + 8 * 2]
+    lea edx, [edx + 8 * 2]
+
+    dec cx
+    jnz loop2e1
+    shr ecx, 16                         ; row count back in the low word
+
+    ; post: last 16 words of the row, nothing is read past the row end
+    movdqa xmm1, [esi]                  ; src[2n]
+    movdqa xmm2, [esi + 16]
+    movdqa xmm6, xmm1
+    movdqa xmm7, xmm2
+    pand xmm1, [cdFFFF]
+    pand xmm2, [cdFFFF]
+    packusdw xmm1, xmm2
+    movdqa xmm2, xmm6                   ; src[2n + 1]
+    movdqa xmm3, xmm7
+    psrldq xmm2, 2
+    psrldq xmm3, 2
+    pand xmm2, [cdFFFF]
+    pand xmm3, [cdFFFF]
+    packusdw xmm2, xmm3
+    movdqa xmm3, xmm6                   ; src[2n + 2]
+    movdqa xmm4, xmm7
+    psrldq xmm3, 4
+    psrldq xmm4, 4
+    movd eax, xmm7
+    movd xmm5, eax
+    pslldq xmm5, 12
+    por xmm3, xmm5
+    movdqa xmm5, xmm7                   ; right edge: reuse the last dword
+    psrldq xmm5, 12                     ; of the row, so the final
+    pslldq xmm5, 12                     ; src[2n + 2] equals src[2n]
+    por xmm4, xmm5
+    pand xmm3, [cdFFFF]
+    pand xmm4, [cdFFFF]
+    packusdw xmm3, xmm4
+    movdqa xmm4, xmm1
+    movdqa xmm5, xmm2
+    movdqa xmm6, xmm3
+    ; h[n] = (src[2n + 1] - ((src[2n] + src[2n + 2]) >> 1)) >> 1
+    paddw xmm4, xmm6
+    psraw xmm4, 1
+    psubw xmm5, xmm4
+    psraw xmm5, 1
+
+    movdqa xmm6, xmm5                   ; out hi
+    paddw xmm6, LHI_ADD
+    psraw xmm6, LHI_SFT
+    movdqa [edi], xmm6
+
+    ; l[n] = src[2n] + ((h[n - 1] + h[n]) >> 1)
+    movdqa xmm7, xmm5
+    pslldq xmm7, 2
+    movd xmm6, ebx                      ; h[n - 1] carried from the last step
+    por xmm7, xmm6
+    paddw xmm5, xmm7
+    psraw xmm5, 1
+    paddw xmm5, xmm1
+
+    movdqa [edx], xmm5                  ; out lo
+
+    ; move right
+    lea esi, [esi + 16 * 2]
+    lea edi, [edi + 8 * 2]
+    lea edx, [edx + 8 * 2]
+
+    ; move left: back to the start of this row
+    lea esi, [esi - 64 * 2]
+    lea edi, [edi - 32 * 2]
+    lea edx, [edx - 32 * 2]
+
+    ; move down: start of the next row
+    lea esi, [esi + 64 * 2]
+    lea edi, [edi + 32 * 2]
+    lea edx, [edx + 32 * 2]
+
+    dec ecx
+    jnz loop1e1
+
+    ret
+
+;******************************************************************************
+; source 8 bit unsigned, 64 pixel width
+; vertical DWT pass over 64 rows (src row stride 64 bytes, dst 64 * 2)
+; in:  esi = src, edi = dst hi, edx = dst lo; edi and edx 16 byte aligned
+;      xmm0 must be zero, it is the high byte source for punpcklbw
+; each pixel is widened to a word, has [cw128] subtracted and is shifted
+; left by 5 before the transform
+; the block is done as 8 strips of 8 pixels; per strip pre + 30 loop passes
+; + post each step 2 src rows down and write 1 hi row and 1 lo row
+; out: esi is 8 * 8 bytes, edi and edx are 8 * 16 bytes past their entry
+;      value, ecx = 0
+; uses xmm1 - xmm7
+; NOTE(review): [cw128] is defined outside this section; presumably 8 words
+; of 128 to centre the unsigned pixels around zero - confirm
+rfx_dwt_2d_encode_block_verti_8_64:
+    mov ecx, 8                          ; 8 strips of 8 pixels
+loop1f:
+    ; pre: first row pair of the strip, there is no h[n - 1] yet
+    movq xmm1, [esi]                    ; src[2n]
+    movq xmm2, [esi + 64 * 1]           ; src[2n + 1]
+    movq xmm3, [esi + 64 * 1 * 2]       ; src[2n + 2]
+    punpcklbw xmm1, xmm0                ; bytes to words
+    punpcklbw xmm2, xmm0
+    punpcklbw xmm3, xmm0
+    psubw xmm1, [cw128]
+    psubw xmm2, [cw128]
+    psubw xmm3, [cw128]
+    psllw xmm1, 5
+    psllw xmm2, 5
+    psllw xmm3, 5
+    movdqa xmm4, xmm1
+    movdqa xmm5, xmm2
+    movdqa xmm6, xmm3
+    ; h[n] = (src[2n + 1] - ((src[2n] + src[2n + 2]) >> 1)) >> 1
+    paddw xmm4, xmm6
+    psraw xmm4, 1
+    psubw xmm5, xmm4
+    psraw xmm5, 1
+    movdqa [edi], xmm5                  ; out hi
+    movdqa xmm6, xmm5                   ; save hi
+    ; l[n] = src[2n] + ((h[n - 1] + h[n]) >> 1)
+    ; top edge: h[n - 1] is taken as h[n], so this is src[2n] + h[n]
+    paddw xmm5, xmm1
+    movdqa [edx], xmm5                  ; out lo
+    movdqa xmm7, xmm6                   ; save hi
+    ; move down
+    lea esi, [esi + 64 * 1 * 2]         ; 2 rows
+    lea edi, [edi + 64 * 2]             ; 1 row
+    lea edx, [edx + 64 * 2]             ; 1 row
+
+    ; loop
+    shl ecx, 16                         ; park strip count in the high word
+    mov cx, 30                          ; row pair count in the low word
+loop2f:
+    movdqa xmm1, xmm3                   ; src[2n], reuses last src[2n + 2]
+    movq xmm2, [esi + 64 * 1]           ; src[2n + 1]
+    movq xmm3, [esi + 64 * 1 * 2]       ; src[2n + 2]
+    punpcklbw xmm2, xmm0
+    punpcklbw xmm3, xmm0
+    psubw xmm2, [cw128]
+    psubw xmm3, [cw128]
+    psllw xmm2, 5
+    psllw xmm3, 5
+    movdqa xmm4, xmm1
+    movdqa xmm5, xmm2
+    movdqa xmm6, xmm3
+    ; h[n] = (src[2n + 1] - ((src[2n] + src[2n + 2]) >> 1)) >> 1
+    paddw xmm4, xmm6
+    psraw xmm4, 1
+    psubw xmm5, xmm4
+    psraw xmm5, 1
+    movdqa [edi], xmm5                  ; out hi
+    movdqa xmm6, xmm5                   ; save hi
+    ; l[n] = src[2n] + ((h[n - 1] + h[n]) >> 1)
+    paddw xmm5, xmm7                    ; xmm7 = h[n - 1]
+    psraw xmm5, 1
+    paddw xmm5, xmm1
+    movdqa [edx], xmm5                  ; out lo
+    movdqa xmm7, xmm6                   ; save hi
+    ; move down
+    lea esi, [esi + 64 * 1 * 2]         ; 2 rows
+    lea edi, [edi + 64 * 2]             ; 1 row
+    lea edx, [edx + 64 * 2]             ; 1 row
+
+    dec cx
+    jnz loop2f
+    shr ecx, 16                         ; strip count back in the low word
+
+    ; post: last row pair, xmm3 is not reloaded so src[2n + 2] = src[2n]
+    movdqa xmm1, xmm3                   ; src[2n]
+    movq xmm2, [esi + 64 * 1]           ; src[2n + 1]
+    punpcklbw xmm2, xmm0
+    psubw xmm2, [cw128]
+    psllw xmm2, 5
+    movdqa xmm4, xmm1
+    movdqa xmm5, xmm2
+    movdqa xmm6, xmm3
+    ; h[n] = (src[2n + 1] - ((src[2n] + src[2n + 2]) >> 1)) >> 1
+    paddw xmm4, xmm6
+    psraw xmm4, 1
+    psubw xmm5, xmm4
+    psraw xmm5, 1
+    movdqa [edi], xmm5                  ; out hi
+    ; l[n] = src[2n] + ((h[n - 1] + h[n]) >> 1)
+    paddw xmm5, xmm7
+    psraw xmm5, 1
+    paddw xmm5, xmm1
+    movdqa [edx], xmm5                  ; out lo
+    ; move down
+    lea esi, [esi + 64 * 1 * 2]         ; 2 rows
+    lea edi, [edi + 64 * 2]             ; 1 row
+    lea edx, [edx + 64 * 2]             ; 1 row
+
+    ; move up: undo the 32 steps taken in this strip
+    lea esi, [esi - 64 * 1 * 64]
+    lea edi, [edi - 32 * 64 * 2]
+    lea edx, [edx - 32 * 64 * 2]
+
+    ; move right: next strip of 8 pixels
+    lea esi, [esi + 8]
+    lea edi, [edi + 16]
+    lea edx, [edx + 16]
+
+    dec ecx
+    jnz loop1f
+
+    ret
+
+; set the shift and add applied to hi outputs by the horizontal passes
+; in:  eax = quant value
+; out: LHI_SFT = eax - 1 in the low dword, rest zero
+;      LHI_ADD = the 16 byte entry at cwa0 + (eax - 1) * 16
+; changes eax, edx, xmm1
+; NOTE(review): cwa0 is defined outside this section; presumably a table of
+; rounding constants, one 16 byte row per shift count - confirm.  The
+; "6 - 5" presumably is a base quant of 6 less the << 5 pre-scale - confirm
+set_quants_hi:
+    sub eax, 6 - 5
+    movd xmm1, eax                      ; shift count for psraw
+    movdqa LHI_SFT, xmm1
+    imul eax, 16                        ; byte offset of the table row
+    lea edx, [cwa0]
+    add edx, eax
+    movdqa xmm1, [edx]
+    movdqa LHI_ADD, xmm1
+    ret
+
+; set the shift and add applied to lo outputs by the horizontal passes
+; in:  eax = quant value
+; out: LLO_SFT = eax - 1 in the low dword, rest zero
+;      LLO_ADD = the 16 byte entry at cwa0 + (eax - 1) * 16
+; changes eax, edx, xmm1
+; NOTE(review): cwa0 is defined outside this section; presumably a table of
+; rounding constants, one 16 byte row per shift count - confirm.  The
+; "6 - 5" presumably is a base quant of 6 less the << 5 pre-scale - confirm
+set_quants_lo:
+    sub eax, 6 - 5
+    movd xmm1, eax                      ; shift count for psraw
+    movdqa LLO_SFT, xmm1
+    imul eax, 16                        ; byte offset of the table row
+    lea edx, [cwa0]
+    add edx, eax
+    movdqa xmm1, [edx]
+    movdqa LLO_ADD, xmm1
+    ret
+
+; copies of the 4 parameters, valid in the PROC body once the prologue is
+; done: 16 * 8 bytes of locals + 4 saved registers * 4 = 144
+%define LQTABLE           [esp + 144] ; qtable
+%define LIN_BUFFER        [esp + 148] ; in_buffer
+%define LOUT_BUFFER       [esp + 152] ; out_buffer
+%define LWORK_BUFFER      [esp + 156] ; work_buffer
+
+;int
+;rfxcodec_encode_dwt_shift_x86_sse41(const char *qtable,
+;                                    unsigned char *in_buffer,
+;                                    short *out_buffer,
+;                                    short *work_buffer);
+;
+; 3 level DWT of one 64 x 64 tile of 8 bit samples.  Every level is a
+; vertical pass into work_buffer followed by two horizontal passes into
+; out_buffer.  qtable is read as 5 bytes holding one 4 bit quant per
+; sub-band: byte 0 = LL3 (low nibble) and LH3 (high nibble), byte 1 = HL3
+; and HH3, byte 2 = LH2 and HL2, byte 3 = HH2 and LH1, byte 4 = HL1 and HH1.
+; out_buffer byte offsets: HL1 0, LH1 2048, HH1 4096, HL2 6144, LH2 6656,
+; HH2 7168, HL3 7680, LH3 7808, HH3 7936, LL3 8064.  LL1 (6144) and LL2
+; (7680) are intermediate and get overwritten by the next level's bands.
+; Always returns 0.  The buffers are accessed with movdqa and so must be
+; 16 byte aligned, apart from in_buffer which is read with movq.
+
+;******************************************************************************
+; elf gets the plain C symbol name, other formats get a leading underscore
+%ifidn __OUTPUT_FORMAT__,elf
+PROC rfxcodec_encode_dwt_shift_x86_sse41
+%else
+PROC _rfxcodec_encode_dwt_shift_x86_sse41
+%endif
+    ; align stack
+    mov eax, esp
+    sub eax, 0x10
+    and eax, 0x0F                       ; eax = esp & 15
+    sub esp, eax                        ; esp is now 16 byte aligned
+    push eax                            ; remember the adjustment
+    sub esp, 3 * 4                      ; pad, push + pad = 16 bytes
+    sub esp, 4 * 4                      ; room for the 4 parameters
+    ; copy params to after align
+    ; source offset = param room + pad + saved eax + return address + eax
+    movdqu xmm0, [esp + eax + 4 * 4 + 3 * 4 + 4 + 4]
+    movdqu [esp], xmm0
+    ; save registers
+    push ebx
+    push esi
+    push edi
+    push ebp
+    sub esp, 16 * 8                     ; locals, released at "quants" below
+    pxor xmm0, xmm0                     ; zero for punpcklbw in verti_8_64
+
+    ; vertical DWT to work buffer, level 1
+    mov esi, LIN_BUFFER                 ; src
+    mov edi, LWORK_BUFFER               ; dst hi
+    lea edi, [edi + 64 * 32 * 2]        ; dst hi
+    mov edx, LWORK_BUFFER               ; dst lo
+    call rfx_dwt_2d_encode_block_verti_8_64
+
+    ; horizontal DWT to out buffer, level 1, part 1
+    ; LL1 is stored unquantized, it is the source for level 2
+    xor eax, eax
+    mov edx, LQTABLE
+    mov al, [edx + 4]
+    and al, 0xF                         ; low nibble of qtable[4], for HL1
+    call set_quants_hi
+    mov esi, LWORK_BUFFER               ; src
+    mov edi, LOUT_BUFFER                ; dst hi - HL1
+    mov edx, LOUT_BUFFER                ; dst lo - LL1
+    lea edx, [edx + 32 * 32 * 6]        ; dst lo - LL1
+    call rfx_dwt_2d_encode_block_horiz_16_64_no_lo
+
+    ; horizontal DWT to out buffer, level 1, part 2
+    xor eax, eax
+    mov edx, LQTABLE
+    mov al, [edx + 4]
+    shr al, 4                           ; high nibble of qtable[4], for HH1
+    call set_quants_hi
+    xor eax, eax
+    mov edx, LQTABLE
+    mov al, [edx + 3]
+    shr al, 4                           ; high nibble of qtable[3], for LH1
+    call set_quants_lo
+    mov esi, LWORK_BUFFER               ; src
+    lea esi, [esi + 64 * 32 * 2]        ; src
+    mov edi, LOUT_BUFFER                ; dst hi - HH1
+    lea edi, [edi + 32 * 32 * 4]        ; dst hi - HH1
+    mov edx, LOUT_BUFFER                ; dst lo - LH1
+    lea edx, [edx + 32 * 32 * 2]        ; dst lo - LH1
+    call rfx_dwt_2d_encode_block_horiz_16_64
+
+    ; vertical DWT to work buffer, level 2
+    mov esi, LOUT_BUFFER                ; src
+    lea esi, [esi + 32 * 32 * 6]        ; src
+    mov edi, LWORK_BUFFER               ; dst hi
+    lea edi, [edi + 32 * 16 * 2]        ; dst hi
+    mov edx, LWORK_BUFFER               ; dst lo
+    call rfx_dwt_2d_encode_block_verti_16_32
+
+    ; horizontal DWT to out buffer, level 2, part 1
+    ; LL2 is stored unquantized, it is the source for level 3
+    xor eax, eax
+    mov edx, LQTABLE
+    mov al, [edx + 2]
+    shr al, 4                           ; high nibble of qtable[2], for HL2
+    call set_quants_hi
+    mov esi, LWORK_BUFFER               ; src
+    ; 32 * 32 * 6 + 16 * 16 * 0 = 6144
+    mov edi, LOUT_BUFFER                ; dst hi - HL2
+    lea edi, [edi + 6144]               ; dst hi - HL2
+    ; 32 * 32 * 6 + 16 * 16 * 6 = 7680
+    mov edx, LOUT_BUFFER                ; dst lo - LL2
+    lea edx, [edx + 7680]               ; dst lo - LL2
+    call rfx_dwt_2d_encode_block_horiz_16_32_no_lo
+
+    ; horizontal DWT to out buffer, level 2, part 2
+    xor eax, eax
+    mov edx, LQTABLE
+    mov al, [edx + 3]
+    and al, 0xF                         ; low nibble of qtable[3], for HH2
+    call set_quants_hi
+    xor eax, eax
+    mov edx, LQTABLE
+    mov al, [edx + 2]
+    and al, 0xF                         ; low nibble of qtable[2], for LH2
+    call set_quants_lo
+    mov esi, LWORK_BUFFER               ; src
+    lea esi, [esi + 32 * 16 * 2]        ; src
+    ; 32 * 32 * 6 + 16 * 16 * 4 = 7168
+    mov edi, LOUT_BUFFER                ; dst hi - HH2
+    lea edi, [edi + 7168]               ; dst hi - HH2
+    ; 32 * 32 * 6 + 16 * 16 * 2 = 6656
+    mov edx, LOUT_BUFFER                ; dst lo - LH2
+    lea edx, [edx + 6656]               ; dst lo - LH2
+    call rfx_dwt_2d_encode_block_horiz_16_32
+
+    ; vertical DWT to work buffer, level 3
+    ; 32 * 32 * 6 + 16 * 16 * 6 = 7680
+    mov esi, LOUT_BUFFER                ; src
+    lea esi, [esi + 7680]               ; src
+    mov edi, LWORK_BUFFER               ; dst hi
+    lea edi, [edi + 16 * 8 * 2]         ; dst hi
+    mov edx, LWORK_BUFFER               ; dst lo
+    call rfx_dwt_2d_encode_block_verti_16_16
+
+    ; horizontal DWT to out buffer, level 3, part 1
+    ; LL3 is final output, so both hi and lo are quantized here
+    xor eax, eax
+    mov edx, LQTABLE
+    mov al, [edx + 1]
+    and al, 0xF                         ; low nibble of qtable[1], for HL3
+    call set_quants_hi
+    xor eax, eax
+    mov edx, LQTABLE
+    mov al, [edx + 0]
+    and al, 0xF                         ; low nibble of qtable[0], for LL3
+    call set_quants_lo
+    mov esi, LWORK_BUFFER               ; src
+    ; 32 * 32 * 6 + 16 * 16 * 6 + 8 * 8 * 0 = 7680
+    mov edi, LOUT_BUFFER                ; dst hi - HL3
+    lea edi, [edi + 7680]               ; dst hi - HL3
+    ; 32 * 32 * 6 + 16 * 16 * 6 + 8 * 8 * 6 = 8064
+    mov edx, LOUT_BUFFER                ; dst lo - LL3
+    lea edx, [edx + 8064]               ; dst lo - LL3
+    call rfx_dwt_2d_encode_block_horiz_16_16
+
+    ; horizontal DWT to out buffer, level 3, part 2
+    xor eax, eax
+    mov edx, LQTABLE
+    mov al, [edx + 1]
+    shr al, 4                           ; high nibble of qtable[1], for HH3
+    call set_quants_hi
+    xor eax, eax
+    mov edx, LQTABLE
+    mov al, [edx + 0]
+    shr al, 4                           ; high nibble of qtable[0], for LH3
+    call set_quants_lo
+    mov esi, LWORK_BUFFER               ; src
+    lea esi, [esi + 16 * 8 * 2]         ; src
+    ; 32 * 32 * 6 + 16 * 16 * 6 + 8 * 8 * 4 = 7936
+    mov edi, LOUT_BUFFER                ; dst hi - HH3
+    lea edi, [edi + 7936]               ; dst hi - HH3
+    ; 32 * 32 * 6 + 16 * 16 * 6 + 8 * 8 * 2 = 7808
+    mov edx, LOUT_BUFFER                ; dst lo - LH3
+    lea edx, [edx + 7808]               ; dst lo - LH3
+    call rfx_dwt_2d_encode_block_horiz_16_16
+
+    ; quants: release the 16 * 8 bytes of locals reserved in the prologue
+    add esp, 16 * 8
+    ; restore registers
+    pop ebp
+    pop edi
+    pop esi
+    pop ebx
+    ; params
+    add esp, 3 * 4
+    add esp, 4 * 4
+    ; align
+    pop eax                             ; adjustment saved in the prologue
+    add esp, eax
+    ; return value
+    mov eax, 0
+    ret
+    align 16
+
diff --git a/src/x86/rfxdwt_x86_sse2.asm b/src/x86/rfxdwt_x86_sse2.asm
deleted file mode 100644
index dd2a2d9..0000000
--- a/src/x86/rfxdwt_x86_sse2.asm
+++ /dev/null
@@ -1,25 +0,0 @@
-
-section .data
-    const1 times 8 dw 1
-
-%macro PROC 1
-    align 16
-    global %1
-    %1:
-%endmacro
-
-;int
-;dwt_shift_x86_sse2(const int* qtable, sint8* src, sint16* dst, sint16* temp)
-
-PROC dwt_shift_x86_sse2
-    push ebx
-    push esi
-    push edi
-
-    mov eax, 0
-    pop edi
-    pop esi
-    pop ebx
-    ret
-    align 16
-
diff --git a/src/x86/rfxrlgr1_x86.asm b/src/x86/rfxrlgr1_x86.asm
deleted file mode 100644
index 8441051..0000000
--- a/src/x86/rfxrlgr1_x86.asm
+++ /dev/null
@@ -1,25 +0,0 @@
-
-section .data
-    const1 times 8 dw 1
-
-%macro PROC 1
-    align 16
-    global %1
-    %1:
-%endmacro
-
-;int
-;diff_rlgr1_x86(sint16 *co, int num_co, uint8 *dst, int dst_bytes);
-
-PROC diff_rlgr1_x86
-    push ebx
-    push esi
-    push edi
-
-    mov eax, 0
-    pop edi
-    pop esi
-    pop ebx
-    ret
-    align 16
-
diff --git a/src/x86/rfxrlgr3_x86.asm b/src/x86/rfxrlgr3_x86.asm
deleted file mode 100644
index 08b278d..0000000
--- a/src/x86/rfxrlgr3_x86.asm
+++ /dev/null
@@ -1,25 +0,0 @@
-
-section .data
-    const1 times 8 dw 1
-
-%macro PROC 1
-    align 16
-    global %1
-    %1:
-%endmacro
-
-;int
-;diff_rlgr3_x86(sint16 *co, int num_co, uint8 *dst, int dst_bytes);
-
-PROC diff_rlgr3_x86
-    push ebx
-    push esi
-    push edi
-
-    mov eax, 0
-    pop edi
-    pop esi
-    pop ebx
-    ret
-    align 16
-
diff --git a/tests/Makefile b/tests/Makefile
deleted file mode 100644
index 36cd57d..0000000
--- a/tests/Makefile
+++ /dev/null
@@ -1,22 +0,0 @@
-
-OBJS = rfxcodectest.o
-
-CFLAGS = -g -O2 -Wall -fPIC -I../include
-
-# this for linking to .so
-#LDFLAGS = $(PROFIL) -L../src -Wl,-rpath=../src
-# this if using .a
-LDFLAGS = $(PROFIL)
-
-# this for linking to .so
-#LIBS = -lrfxencode
-# this for using .a
-LIBS = ../src/librfxencode.a
-
-all: rfxcodectest
-
-rfxcodectest: $(OBJS) Makefile
-	$(CC) -o rfxcodectest $(LDFLAGS) $(OBJS) $(LIBS)
-
-clean:
-	rm -f $(OBJS) rfxcodectest
diff --git a/tests/Makefile.am b/tests/Makefile.am
new file mode 100644
index 0000000..8e24edc
--- /dev/null
+++ b/tests/Makefile.am
@@ -0,0 +1,11 @@
+# Automake input for the encoder test program.
+
+# ship readme.txt in the distribution tarball
+EXTRA_DIST = readme.txt
+
+# public rfxcodec headers live in the top level include directory
+AM_CPPFLAGS = \
+  -I$(top_srcdir)/include
+
+# bin_ prefix: the test program is installed along with the library
+bin_PROGRAMS = rfxcodectest
+
+rfxcodectest_SOURCES = rfxcodectest.c
+
+# link against the libtool library built in src
+rfxcodectest_LDADD = \
+  $(top_builddir)/src/librfxencode.la
diff --git a/tests/rfxcodectest.c b/tests/rfxcodectest.c
index f959185..6733db8 100644
--- a/tests/rfxcodectest.c
+++ b/tests/rfxcodectest.c
@@ -1,7 +1,7 @@
 /**
  * RFX codec encoder test
  *
- * Copyright 2014 Jay Sorg <jay.sorg@gmail.com>
+ * Copyright 2014-2015 Jay Sorg <jay.sorg@gmail.com>
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -27,11 +27,11 @@
 
 #include <rfxcodec_encode.h>
 
-static const int g_rfx_default_quantization_values[] =
+static const unsigned char g_rfx_default_quantization_values[] =
 {
     /* LL3 LH3 HL3 HH3 LH2 HL2 HH2 LH1 HL1 HH1 */
-    6,  6,  6,  6,  7,  7,  8,  8,  8,  9,
-    9,  9,  9,  9,  10,  10,  12,  12,  12,  13
+    0x66,  0x66,  0x77,  0x88,  0x98,
+    0x99,  0x99,  0xaa,  0xcc,  0xdc
 };
 
 /*****************************************************************************/
@@ -46,7 +46,7 @@ get_mstime(void)
 
 /******************************************************************************/
 static int
-speed_random(int count, const int *quants)
+speed_random(int count, const char *quants)
 {
     void *han;
     int error;
@@ -56,34 +56,44 @@ speed_random(int count, const int *quants)
     char *cdata;
     char *buf;
     struct rfx_rect regions[1];
-    struct rfx_tile tiles[1];
+    struct rfx_tile tiles[2];
     int stime;
     int etime;
     int tiles_per_second;
     int num_regions;
     int num_tiles;
     int num_quants;
+    int flags;
 
     printf("speed_random:\n");
-    han = rfxcodec_encode_create(1920, 1024, RFX_FORMAT_BGRA, RFX_FLAGS_RLGR1);
-    if (han == 0)
+    //flags = RFX_FLAGS_RLGR1 | RFX_FLAGS_NOACCEL;
+    flags = RFX_FLAGS_RLGR1;
+    //flags = RFX_FLAGS_RLGR3;
+    //flags = RFX_FLAGS_RLGR1 | RFX_FLAGS_ALPHAV1;
+    error = rfxcodec_encode_create_ex(1920, 1024, RFX_FORMAT_BGRA, flags, &han);
+    if (error != 0)
     {
-        printf("speed_random: rfxcodec_encode_create failed\n");
+        printf("speed_random: rfxcodec_encode_create_ex failed\n");
         return 1;
     }
-    printf("speed_random: rfxcodec_encode_create ok\n");
-    cdata = (char *) malloc(64 * 64 * 4);
-    cdata_bytes = 64 * 64 * 4;
-    buf = (char *) malloc(64 * 64 * 4);
+    printf("speed_random: rfxcodec_encode_create_ex ok\n");
+    cdata = (char *) malloc(128 * 64 * 4);
+    cdata_bytes = 128 * 64 * 4;
+    buf = (char *) malloc(128 * 64 * 4);
+#if 1
     fd = open("/dev/urandom", O_RDONLY);
-    if (read(fd, buf, 64 * 64 * 4) != 64 * 64 * 4)
+    //fd = open("/dev/zero", O_RDONLY);
+    if (read(fd, buf, 128 * 64 * 4) != 128 * 64 * 4)
     {
         printf("speed_random: read error\n");
     }
     close(fd);
+#else
+    memset(buf, 0x7f, 128 * 64 * 4);
+#endif
     regions[0].x = 0;
     regions[0].y = 0;
-    regions[0].cx = 64;
+    regions[0].cx = 128;
     regions[0].cy = 64;
     num_regions = 1;
     tiles[0].x = 0;
@@ -93,22 +103,31 @@ speed_random(int count, const int *quants)
     tiles[0].quant_y = 0;
     tiles[0].quant_cb = 0;
     tiles[0].quant_cr = 0;
+    tiles[1].x = 64;
+    tiles[1].y = 0;
+    tiles[1].cx = 64;
+    tiles[1].cy = 64;
+    tiles[1].quant_y = 0;
+    tiles[1].quant_cb = 0;
+    tiles[1].quant_cr = 0;
     num_tiles = 1;
     num_quants = 1;
     error = 0;
     stime = get_mstime();
+    flags = 0;
+    //flags = RFX_FLAGS_ALPHAV1;
     for (index = 0; index < count; index++)
     {
-        error = rfxcodec_encode(han, cdata, &cdata_bytes, buf, 64, 64, 64 * 4,
-                                regions, num_regions, tiles, num_tiles,
-                                quants, num_quants);
+        error = rfxcodec_encode_ex(han, cdata, &cdata_bytes, buf, 64, 64, 64 * 4,
+                                   regions, num_regions, tiles, num_tiles,
+                                   quants, num_quants, flags);
         if (error != 0)
         {
             break;
         }
     }
     etime = get_mstime();
-    tiles_per_second = count * 1000 / (etime - stime);
+    tiles_per_second = count * num_tiles * 1000 / (etime - stime + 1);
     printf("speed_random: cdata_bytes %d count %d ms time %d "
            "tiles_per_second %d\n",
            cdata_bytes, count, etime - stime, tiles_per_second);
@@ -221,7 +240,7 @@ load_bmp_file(int in_fd, char **data, int *width, int *height)
 /******************************************************************************/
 static int
 encode_file(char *data, int width, int height, char *cdata, int *cdata_bytes,
-            const int *quants, int num_quants)
+            const char *quants, int num_quants)
 {
     int awidth;
     int aheight;
@@ -235,10 +254,10 @@ encode_file(char *data, int width, int height, char *cdata, int *cdata_bytes,
     void *han;
     struct rfx_rect regions[1];
 
-    han = rfxcodec_encode_create(1920, 1024, RFX_FORMAT_BGRA, RFX_FLAGS_RLGR1);
-    if (han == 0)
+    error = rfxcodec_encode_create_ex(1920, 1024, RFX_FORMAT_BGRA, RFX_FLAGS_RLGR1, &han);
+    if (error != 0)
     {
-        printf("encode_file: rfxcodec_encode_create failed\n");
+        printf("encode_file: rfxcodec_encode_create_ex failed\n");
         return 1;
     }
 
@@ -269,9 +288,9 @@ encode_file(char *data, int width, int height, char *cdata, int *cdata_bytes,
     regions[0].cy = height;
     num_regions = 1;
 
-    error = rfxcodec_encode(han, cdata, cdata_bytes, data, width, height, width * 4,
-                            regions, num_regions, tiles, num_tiles,
-                            quants, num_quants);
+    error = rfxcodec_encode_ex(han, cdata, cdata_bytes, data, width, height, width * 4,
+                               regions, num_regions, tiles, num_tiles,
+                               quants, num_quants, 0);
     if (error != 0)
     {
         printf("encode_file: rfxcodec_encode failed error %d\n", error);
@@ -287,7 +306,7 @@ encode_file(char *data, int width, int height, char *cdata, int *cdata_bytes,
 
 /******************************************************************************/
 static int
-read_file(int count, const int *quants, int num_quants,
+read_file(int count, const char *quants, int num_quants,
           const char *in_file, const char *out_file)
 {
     int in_fd;
@@ -380,7 +399,7 @@ main(int argc, char **argv)
     int count;
     char in_file[256];
     char out_file[256];
-    const int *quants = g_rfx_default_quantization_values;
+    const char *quants = (const char *) g_rfx_default_quantization_values;
 
     do_speed = 0;
     do_read = 0;